Add status/error checks during ONOS startup

We check for:
- process exits
- log ERRORs
- log WARNings
- low memory

Also added a "status" CLI command to check for exits
and errors.

Change-Id: I6e65fe49619e338f1827fc616ca52a82de7ba470
diff --git a/tools/dev/mininet/onos.py b/tools/dev/mininet/onos.py
index 96a17f0..6c5b817 100755
--- a/tools/dev/mininet/onos.py
+++ b/tools/dev/mininet/onos.py
@@ -44,10 +44,9 @@
 from mininet.nodelib import LinuxBridge
 from mininet.net import Mininet
 from mininet.topo import SingleSwitchTopo, Topo
-from mininet.log import setLogLevel, info
+from mininet.log import setLogLevel, info, warn, error, debug
 from mininet.cli import CLI
-from mininet.util import quietRun, waitListening
-from mininet.clean import killprocs
+from mininet.util import quietRun, specialClass
 from mininet.examples.controlnet import MininetFacade
 
 from os import environ
@@ -55,7 +54,7 @@
 from sys import argv
 from glob import glob
 import time
-
+from functools import partial
 
 ### ONOS Environment
 
@@ -150,6 +149,36 @@
     return onosDir
 
 
+def waitListening( client=None, server='127.0.0.1', port=80, timeout=None,
+                   callback=None, sleepSecs=.5 ):
+    "Modified mininet.util.waitListening with callback, sleepSecs"
+    runCmd = ( client.cmd if client else
+               partial( quietRun, shell=True ) )
+    if not runCmd( 'which telnet' ):
+        raise Exception('Could not find telnet' )
+    # pylint: disable=maybe-no-member
+    serverIP = server if isinstance( server, basestring ) else server.IP()
+    cmd = ( 'echo A | telnet -e A %s %s' % ( serverIP, port ) )
+    elapsed = 0
+    result = runCmd( cmd )
+    while 'Connected' not in result:
+        if 'No route' in result:
+            rtable = runCmd( 'route' )
+            error( 'no route to %s:\n%s' % ( server, rtable ) )
+            return False
+        if timeout and elapsed >= timeout:
+            error( 'could not connect to %s on port %d\n' % ( server, port ) )
+            return False
+        debug( 'waiting for', server, 'to listen on port', port, '\n' )
+        info( '.' )
+        if callback:
+            callback()
+        time.sleep( sleepSecs )
+        elapsed += sleepSecs
+        result = runCmd( cmd )
+    return True
+
+
 ### Mininet classes
 
 def RenamedTopo( topo, *args, **kwargs ):
@@ -191,7 +220,9 @@
     "ONOS cluster node"
 
     def __init__( self, name, **kwargs ):
+        "alertAction: exception|ignore|warn|exit (exception)"
         kwargs.update( inNamespace=True )
+        self.alertAction = kwargs.pop( 'alertAction', 'exception' )
         Controller.__init__( self, name, **kwargs )
         self.dir = '/tmp/%s' % self.name
         self.client = self.dir + '/karaf/bin/client'
@@ -220,6 +251,7 @@
         self.ucmd( service, 'server 1>../onos.log 2>../onos.log'
                    ' & echo $! > onos.pid; ln -s `pwd`/onos.pid ..' )
         self.onosPid = int( self.cmd( 'cat onos.pid' ).strip() )
+        self.warningCount = 0
 
     # pylint: enable=arguments-differ
 
@@ -228,15 +260,66 @@
         self.cmd( 'pkill -HUP -f karaf.jar && wait' )
         self.cmd( 'rm -rf', self.dir )
 
+    def sanityAlert( self, *args ):
+        "Alert to raise on sanityCheck failure"
+        info( '\n' )
+        if self.alertAction == 'exception':
+            raise Exception( *args )
+        if self.alertAction == 'warn':
+            warn( *args + ( '\n', ) )
+        elif self.alertAction == 'exit':
+            error( '***',  *args +
+                   ( '\nExiting. Run "sudo mn -c" to clean up.\n', ) )
+            exit( 1 )
+
     def isRunning( self ):
         "Is our ONOS process still running?"
-        cmd = 'ps -p %d  >/dev/null 2>&1 && echo "running" || echo "not running"'
-        return self.cmd( cmd % self.onosPid ) == 'running'
+        cmd = ( 'ps -p %d  >/dev/null 2>&1 && echo "running" ||'
+                'echo "not running"' )
+        return self.cmd( cmd % self.onosPid ).strip() == 'running'
 
-    def sanityCheck( self ):
-        "Check whether we've quit or are running out of memory"
+    def checkLog( self ):
+        "Return log file errors and warnings"
+        log = join( self.dir, 'log' )
+        errors, warnings = None, None
+        if isfile( log ):
+            lines = open( log ).read().split( '\n' )
+            errors = [ line for line in lines if 'ERROR' in line ]
+            warnings = [ line for line in lines if 'WARN'in line ]
+        return errors, warnings
+
+    def memAvailable( self ):
+        "Return available memory in KB (or -1 if we can't tell)"
+        lines = open( '/proc/meminfo' ).read().strip().split( '\n' )
+        entries = map( str.split, lines )
+        index = { entry[ 0 ]: entry for entry in entries }
+        # Check MemAvailable if present
+        default = ( None, '-1', 'kB' )
+        _name, count, unit = index.get( 'MemAvailable:', default )
+        if unit.lower() == 'kb':
+            return int( count )
+        return -1
+
+    def sanityCheck( self, lowMem=100000 ):
+        """Check whether we've quit or are running out of memory
+           lowMem: low memory threshold in KB (100000)"""
+        # Are we still running?
         if not self.isRunning():
-            raise Exception( 'ONOS node %s has died' % self.name )
+            self.sanityAlert( 'ONOS node %s has died' % self.name )
+        # Are there errors in the log file?
+        errors, warnings  = self.checkLog()
+        if errors:
+            self.sanityAlert( 'ONOS startup errors:\n<<%s>>' %
+                              '\n'.join( errors ) )
+        warningCount = len( warnings )
+        if warnings and warningCount > self.warningCount:
+            warn( '(%d warnings)' % len( warnings ) )
+            self.warningCount = warningCount
+        # Are we running out of memory?
+        mem = self.memAvailable()
+        if mem > 0 and mem < lowMem:
+            self.sanityAlert( 'Running out of memory (only %d KB available)'
+                              % mem )
 
     def waitStarted( self ):
         "Wait until we've really started"
@@ -246,11 +329,13 @@
             if 'running' in status and 'not running' not in status:
                 break
             info( '.' )
+            self.sanityCheck()
             time.sleep( 1 )
         info( ' ssh-port' )
-        waitListening( server=self, port=KarafPort )
+        waitListening( server=self, port=KarafPort, callback=self.sanityCheck )
         info( ' openflow-port' )
-        waitListening( server=self, port=OpenFlowPort )
+        waitListening( server=self, port=OpenFlowPort,
+                       callback=self.sanityCheck )
         info( ' client' )
         while True:
             result = quietRun( 'echo apps -a | %s -h %s' %
@@ -258,6 +343,7 @@
             if 'openflow' in result:
                 break
             info( '.' )
+            self.sanityCheck()
             time.sleep( 1 )
         info( ')\n' )
 
@@ -284,11 +370,13 @@
            ipBase: IP range for ONOS nodes
            forward: default port forwarding list,
            topo: topology class or instance
+           nodeOpts: ONOSNode options
            **kwargs: additional topology parameters"""
         args = list( args )
         name = args.pop( 0 )
         topo = kwargs.pop( 'topo', None )
         nat = kwargs.pop( 'nat', 'nat0' )
+        nodeOpts = kwargs.pop( 'nodeOpts', {} )
         # Default: single switch with 1 ONOS node
         if not topo:
             topo = SingleSwitchTopo
@@ -303,7 +391,8 @@
         fixIPTables()
         self.env = initONOSEnv()
         self.net = Mininet( topo=topo, ipBase=self.ipBase,
-                            host=ONOSNode, switch=LinuxBridge,
+                            host=partial( ONOSNode, **nodeOpts ),
+                            switch=LinuxBridge,
                             controller=None )
         if nat:
             self.net.addNAT( nat ).configDefault()
@@ -441,15 +530,37 @@
         "Run tail -f /tmp/onos1/log; press control-C to stop"
         self.default( self.onos1().name, 'tail -f /tmp/%s/log' % self.onos1() )
 
+    def do_status( self, line ):
+        "Return status of ONOS cluster(s)"
+        for c in self.mn.controllers:
+            if isinstance( c, ONOSCluster ):
+                for node in c.net.hosts:
+                    if isinstance( node, ONOSNode ):
+                        errors, warnings = node.checkLog()
+                        running = ( 'Running' if node.isRunning()
+                                   else 'Exited' )
+                        status = ''
+                        if errors:
+                            status += '%d ERRORS ' % len( errors )
+                        if warnings:
+                            status += '%d warnings' % len( warnings )
+                        status = status if status else 'OK'
+                        info( node, '\t', running, '\t', status, '\n' )
+
+
+# For interactive use, exit on error
+exitOnError = dict( nodeOpts={ 'alertAction': 'exit' } )
+ONOSClusterInteractive = specialClass( ONOSCluster, defaults=exitOnError )
+
 
 ### Exports for bin/mn
 
 CLI = ONOSCLI
-
-controllers = { 'onos': ONOSCluster, 'default': ONOSCluster }
+controllers = { 'onos': ONOSClusterInteractive,
+                'default': ONOSClusterInteractive }
 
 # XXX Hack to change default controller as above doesn't work
-findController = lambda: ONOSCluster
+findController = lambda: controllers[ 'default' ]
 
 switches = { 'onos': ONOSOVSSwitch,
              'onosovs': ONOSOVSSwitch,