Add status/error checks during ONOS startup
We check for:
- process exits
- log ERRORs
- log WARNings
- low memory
Also added a "status" CLI command to check for exits
and errors.
Change-Id: I6e65fe49619e338f1827fc616ca52a82de7ba470
diff --git a/tools/dev/mininet/onos.py b/tools/dev/mininet/onos.py
index 96a17f0..6c5b817 100755
--- a/tools/dev/mininet/onos.py
+++ b/tools/dev/mininet/onos.py
@@ -44,10 +44,9 @@
from mininet.nodelib import LinuxBridge
from mininet.net import Mininet
from mininet.topo import SingleSwitchTopo, Topo
-from mininet.log import setLogLevel, info
+from mininet.log import setLogLevel, info, warn, error, debug
from mininet.cli import CLI
-from mininet.util import quietRun, waitListening
-from mininet.clean import killprocs
+from mininet.util import quietRun, specialClass
from mininet.examples.controlnet import MininetFacade
from os import environ
@@ -55,7 +54,7 @@
from sys import argv
from glob import glob
import time
-
+from functools import partial
### ONOS Environment
@@ -150,6 +149,36 @@
return onosDir
+def waitListening( client=None, server='127.0.0.1', port=80, timeout=None,
+ callback=None, sleepSecs=.5 ):
+ "Modified mininet.util.waitListening with callback, sleepSecs"
+ runCmd = ( client.cmd if client else
+ partial( quietRun, shell=True ) )
+ if not runCmd( 'which telnet' ):
+ raise Exception('Could not find telnet' )
+ # pylint: disable=maybe-no-member
+ serverIP = server if isinstance( server, basestring ) else server.IP()
+ cmd = ( 'echo A | telnet -e A %s %s' % ( serverIP, port ) )
+ elapsed = 0
+ result = runCmd( cmd )
+ while 'Connected' not in result:
+ if 'No route' in result:
+ rtable = runCmd( 'route' )
+ error( 'no route to %s:\n%s' % ( server, rtable ) )
+ return False
+ if timeout and elapsed >= timeout:
+ error( 'could not connect to %s on port %d\n' % ( server, port ) )
+ return False
+ debug( 'waiting for', server, 'to listen on port', port, '\n' )
+ info( '.' )
+ if callback:
+ callback()
+ time.sleep( sleepSecs )
+ elapsed += sleepSecs
+ result = runCmd( cmd )
+ return True
+
+
### Mininet classes
def RenamedTopo( topo, *args, **kwargs ):
@@ -191,7 +220,9 @@
"ONOS cluster node"
def __init__( self, name, **kwargs ):
+ "alertAction: exception|ignore|warn|exit (exception)"
kwargs.update( inNamespace=True )
+ self.alertAction = kwargs.pop( 'alertAction', 'exception' )
Controller.__init__( self, name, **kwargs )
self.dir = '/tmp/%s' % self.name
self.client = self.dir + '/karaf/bin/client'
@@ -220,6 +251,7 @@
self.ucmd( service, 'server 1>../onos.log 2>../onos.log'
' & echo $! > onos.pid; ln -s `pwd`/onos.pid ..' )
self.onosPid = int( self.cmd( 'cat onos.pid' ).strip() )
+ self.warningCount = 0
# pylint: enable=arguments-differ
@@ -228,15 +260,66 @@
self.cmd( 'pkill -HUP -f karaf.jar && wait' )
self.cmd( 'rm -rf', self.dir )
+ def sanityAlert( self, *args ):
+ "Alert to raise on sanityCheck failure"
+ info( '\n' )
+ if self.alertAction == 'exception':
+ raise Exception( *args )
+ if self.alertAction == 'warn':
+ warn( *args + ( '\n', ) )
+ elif self.alertAction == 'exit':
+ error( '***', *args +
+ ( '\nExiting. Run "sudo mn -c" to clean up.\n', ) )
+ exit( 1 )
+
def isRunning( self ):
"Is our ONOS process still running?"
- cmd = 'ps -p %d >/dev/null 2>&1 && echo "running" || echo "not running"'
- return self.cmd( cmd % self.onosPid ) == 'running'
+ cmd = ( 'ps -p %d >/dev/null 2>&1 && echo "running" ||'
+ 'echo "not running"' )
+ return self.cmd( cmd % self.onosPid ).strip() == 'running'
- def sanityCheck( self ):
- "Check whether we've quit or are running out of memory"
+ def checkLog( self ):
+ "Return log file errors and warnings"
+ log = join( self.dir, 'log' )
+ errors, warnings = None, None
+ if isfile( log ):
+ lines = open( log ).read().split( '\n' )
+ errors = [ line for line in lines if 'ERROR' in line ]
+ warnings = [ line for line in lines if 'WARN'in line ]
+ return errors, warnings
+
+ def memAvailable( self ):
+ "Return available memory in KB (or -1 if we can't tell)"
+ lines = open( '/proc/meminfo' ).read().strip().split( '\n' )
+ entries = map( str.split, lines )
+ index = { entry[ 0 ]: entry for entry in entries }
+ # Check MemAvailable if present
+ default = ( None, '-1', 'kB' )
+ _name, count, unit = index.get( 'MemAvailable:', default )
+ if unit.lower() == 'kb':
+ return int( count )
+ return -1
+
+ def sanityCheck( self, lowMem=100000 ):
+ """Check whether we've quit or are running out of memory
+ lowMem: low memory threshold in KB (100000)"""
+ # Are we still running?
if not self.isRunning():
- raise Exception( 'ONOS node %s has died' % self.name )
+ self.sanityAlert( 'ONOS node %s has died' % self.name )
+ # Are there errors in the log file?
+ errors, warnings = self.checkLog()
+ if errors:
+ self.sanityAlert( 'ONOS startup errors:\n<<%s>>' %
+ '\n'.join( errors ) )
+ warningCount = len( warnings )
+ if warnings and warningCount > self.warningCount:
+ warn( '(%d warnings)' % len( warnings ) )
+ self.warningCount = warningCount
+ # Are we running out of memory?
+ mem = self.memAvailable()
+ if mem > 0 and mem < lowMem:
+ self.sanityAlert( 'Running out of memory (only %d KB available)'
+ % mem )
def waitStarted( self ):
"Wait until we've really started"
@@ -246,11 +329,13 @@
if 'running' in status and 'not running' not in status:
break
info( '.' )
+ self.sanityCheck()
time.sleep( 1 )
info( ' ssh-port' )
- waitListening( server=self, port=KarafPort )
+ waitListening( server=self, port=KarafPort, callback=self.sanityCheck )
info( ' openflow-port' )
- waitListening( server=self, port=OpenFlowPort )
+ waitListening( server=self, port=OpenFlowPort,
+ callback=self.sanityCheck )
info( ' client' )
while True:
result = quietRun( 'echo apps -a | %s -h %s' %
@@ -258,6 +343,7 @@
if 'openflow' in result:
break
info( '.' )
+ self.sanityCheck()
time.sleep( 1 )
info( ')\n' )
@@ -284,11 +370,13 @@
ipBase: IP range for ONOS nodes
forward: default port forwarding list,
topo: topology class or instance
+ nodeOpts: ONOSNode options
**kwargs: additional topology parameters"""
args = list( args )
name = args.pop( 0 )
topo = kwargs.pop( 'topo', None )
nat = kwargs.pop( 'nat', 'nat0' )
+ nodeOpts = kwargs.pop( 'nodeOpts', {} )
# Default: single switch with 1 ONOS node
if not topo:
topo = SingleSwitchTopo
@@ -303,7 +391,8 @@
fixIPTables()
self.env = initONOSEnv()
self.net = Mininet( topo=topo, ipBase=self.ipBase,
- host=ONOSNode, switch=LinuxBridge,
+ host=partial( ONOSNode, **nodeOpts ),
+ switch=LinuxBridge,
controller=None )
if nat:
self.net.addNAT( nat ).configDefault()
@@ -441,15 +530,37 @@
"Run tail -f /tmp/onos1/log; press control-C to stop"
self.default( self.onos1().name, 'tail -f /tmp/%s/log' % self.onos1() )
+ def do_status( self, line ):
+ "Return status of ONOS cluster(s)"
+ for c in self.mn.controllers:
+ if isinstance( c, ONOSCluster ):
+ for node in c.net.hosts:
+ if isinstance( node, ONOSNode ):
+ errors, warnings = node.checkLog()
+ running = ( 'Running' if node.isRunning()
+ else 'Exited' )
+ status = ''
+ if errors:
+ status += '%d ERRORS ' % len( errors )
+ if warnings:
+ status += '%d warnings' % len( warnings )
+ status = status if status else 'OK'
+ info( node, '\t', running, '\t', status, '\n' )
+
+
+# For interactive use, exit on error
+exitOnError = dict( nodeOpts={ 'alertAction': 'exit' } )
+ONOSClusterInteractive = specialClass( ONOSCluster, defaults=exitOnError )
+
### Exports for bin/mn
CLI = ONOSCLI
-
-controllers = { 'onos': ONOSCluster, 'default': ONOSCluster }
+controllers = { 'onos': ONOSClusterInteractive,
+ 'default': ONOSClusterInteractive }
# XXX Hack to change default controller as above doesn't work
-findController = lambda: ONOSCluster
+findController = lambda: controllers[ 'default' ]
switches = { 'onos': ONOSOVSSwitch,
'onosovs': ONOSOVSSwitch,