Choose ONOS nodes to kill so that no partition loses a majority
- Fix counter check to only look at the counters used by the test
- Add the onos-gen-partitions file to the repo and make the test copy it to
  the right place in ONOS
- Clear buffer in clidriver after secureCopy
- Change individual ping timeout for ping functions in mininet driver
- Increase links timeout in mininet driver due to occasional failures
- Kill iperf if it times out to allow continued use of mininet
- Clear buffer at the end of many onos driver functions
Change-Id: I3c9235d1c6082f0ef86266405ed99c07e1b27fb5
diff --git a/TestON/tests/HAminorityRestart/HAminorityRestart.py b/TestON/tests/HAminorityRestart/HAminorityRestart.py
index d175fd7..ca8a194 100644
--- a/TestON/tests/HAminorityRestart/HAminorityRestart.py
+++ b/TestON/tests/HAminorityRestart/HAminorityRestart.py
@@ -48,6 +48,7 @@
start tcpdump
"""
import imp
+ import pexpect
main.log.info( "ONOS HA test: Restart minority of ONOS nodes - " +
"initialization" )
main.case( "Setting up test environment" )
@@ -189,6 +190,16 @@
main.log.wiki(graphs)
main.step( "Creating ONOS package" )
+ # copy gen-partitions file to ONOS
+ # NOTE: this assumes TestON and ONOS are on the same machine
+ srcFile = main.testDir + "/" + main.TEST + "/dependencies/onos-gen-partitions"
+ dstDir = main.ONOSbench.home + "/tools/test/bin/onos-gen-partitions"
+ cpResult = main.ONOSbench.secureCopy( main.ONOSbench.user_name,
+ main.ONOSbench.ip_address,
+ srcFile,
+ dstDir,
+ pwd=main.ONOSbench.pwd,
+ direction="from" )
packageResult = main.ONOSbench.onosPackage()
utilities.assert_equals( expect=main.TRUE, actual=packageResult,
onpass="ONOS package successful",
@@ -203,6 +214,19 @@
utilities.assert_equals( expect=main.TRUE, actual=onosInstallResult,
onpass="ONOS install successful",
onfail="ONOS install failed" )
+ # clean up gen-partitions file
+ try:
+ main.ONOSbench.handle.sendline( "cd " + main.ONOSbench.home )
+ main.ONOSbench.handle.expect( main.ONOSbench.home + "\$" )
+ main.ONOSbench.handle.sendline( "git checkout -- tools/test/bin/onos-gen-partitions" )
+ main.ONOSbench.handle.expect( main.ONOSbench.home + "\$" )
+ main.log.info( " Cleaning custom gen partitions file, response was: \n" +
+ str( main.ONOSbench.handle.before ) )
+ except ( pexpect.TIMEOUT, pexpect.EOF ):
+ main.log.exception( "ONOSbench: pexpect exception found:" +
+ main.ONOSbench.handle.before )
+ main.cleanup()
+ main.exit()
main.step( "Checking if ONOS is up yet" )
for i in range( 2 ):
@@ -1680,17 +1704,19 @@
main.log.debug( "Checking logs for errors on " + node.name + ":" )
main.log.warn( main.ONOSbench.checkLogs( node.ip_address ) )
- main.step( "Killing 3 ONOS nodes" )
+ n = len( main.nodes ) # Number of nodes
+ p = ( ( n + 1 ) / 2 ) + 1 # Number of partitions
+ main.kill = [ 0 ] # ONOS node to kill, listed by index in main.nodes
+ if n > 3:
+ main.kill.append( p - 1 )
+ # NOTE: This only works for cluster sizes of 3, 5, or 7.
+
+ main.step( "Killing " + str( len( main.kill ) ) + " ONOS nodes" )
killTime = time.time()
- # TODO: Randomize these nodes or base this on partitions
- # TODO: use threads in this case
- killResults = main.ONOSbench.onosKill( main.nodes[0].ip_address )
- time.sleep( 10 )
- killResults = killResults and\
- main.ONOSbench.onosKill( main.nodes[1].ip_address )
- time.sleep( 10 )
- killResults = killResults and\
- main.ONOSbench.onosKill( main.nodes[2].ip_address )
+ killResults = main.TRUE
+ for i in main.kill:
+ killResults = killResults and\
+ main.ONOSbench.onosKill( main.nodes[i].ip_address )
utilities.assert_equals( expect=main.TRUE, actual=killResults,
onpass="ONOS Killed successfully",
onfail="ONOS kill NOT successful" )
@@ -1699,21 +1725,20 @@
count = 0
onosIsupResult = main.FALSE
while onosIsupResult == main.FALSE and count < 10:
- onos1Isup = main.ONOSbench.isup( main.nodes[0].ip_address )
- onos2Isup = main.ONOSbench.isup( main.nodes[1].ip_address )
- onos3Isup = main.ONOSbench.isup( main.nodes[2].ip_address )
- onosIsupResult = onos1Isup and onos2Isup and onos3Isup
+ onosIsupResult = main.TRUE
+ for i in main.kill:
+ onosIsupResult = onosIsupResult and\
+ main.ONOSbench.isup( main.nodes[i].ip_address )
count = count + 1
- # TODO: if it becomes an issue, we can retry this step a few times
utilities.assert_equals( expect=main.TRUE, actual=onosIsupResult,
onpass="ONOS restarted successfully",
onfail="ONOS restart NOT successful" )
main.step( "Restarting ONOS main.CLIs" )
- cliResult1 = main.ONOScli1.startOnosCli( main.nodes[0].ip_address )
- cliResult2 = main.ONOScli2.startOnosCli( main.nodes[1].ip_address )
- cliResult3 = main.ONOScli3.startOnosCli( main.nodes[2].ip_address )
- cliResults = cliResult1 and cliResult2 and cliResult3
+ cliResults = main.TRUE
+ for i in main.kill:
+ cliResults = cliResults and\
+ main.CLIs[i].startOnosCli( main.nodes[i].ip_address )
utilities.assert_equals( expect=main.TRUE, actual=cliResults,
onpass="ONOS cli restarted",
onfail="ONOS cli did not restart" )
@@ -1722,17 +1747,6 @@
# protocol has had time to work
main.restartTime = time.time() - killTime
main.log.debug( "Restart time: " + str( main.restartTime ) )
- '''
- # FIXME: revisit test plan for election with madan
- # Rerun for election on restarted nodes
- run1 = main.CLIs[0].electionTestRun()
- run2 = main.CLIs[1].electionTestRun()
- run3 = main.CLIs[2].electionTestRun()
- runResults = run1 and run2 and run3
- utilities.assert_equals( expect=main.TRUE, actual=runResults,
- onpass="Reran for election",
- onfail="Failed to rerun for election" )
- '''
# TODO: Make this configurable. Also, we are breaking the above timer
time.sleep( 60 )
main.log.debug( main.CLIs[0].nodes( jsonFormat=False ) )
@@ -2052,11 +2066,12 @@
main.step( "Leadership Election is still functional" )
# Test of LeadershipElection
leaderList = []
- # FIXME: make sure this matches nodes that were restarted
- restarted = [ main.nodes[0].ip_address, main.nodes[1].ip_address,
- main.nodes[2].ip_address ]
+ restarted = []
+ for i in main.kill:
+ restarted.append( main.nodes[i].ip_address )
leaderResult = main.TRUE
+
for cli in main.CLIs:
leaderN = cli.electionTestLeader()
leaderList.append( leaderN )
@@ -3409,23 +3424,7 @@
onfail="Added counters are incorrect" )
main.step( "Check counters are consistant across nodes" )
- onosCounters = []
- threads = []
- for i in range( main.numCtrls ):
- t = main.Thread( target=main.CLIs[i].counters,
- name="counters-" + str( i ) )
- threads.append( t )
- t.start()
- for t in threads:
- t.join()
- onosCounters.append( t.result )
- tmp = [ i == onosCounters[ 0 ] for i in onosCounters ]
- if all( tmp ):
- main.log.info( "Counters are consistent across all nodes" )
- consistentCounterResults = main.TRUE
- else:
- main.log.error( "Counters are not consistent across all nodes" )
- consistentCounterResults = main.FALSE
+ onosCounters, consistentCounterResults = main.Counters.consistentCheck()
utilities.assert_equals( expect=main.TRUE,
actual=consistentCounterResults,
onpass="ONOS counters are consistent " +
@@ -3441,7 +3440,6 @@
actual=incrementCheck,
onpass="Added counters are correct",
onfail="Added counters are incorrect" )
-
# DISTRIBUTED SETS
main.step( "Distributed Set get" )
size = len( onosSet )
diff --git a/TestON/tests/HAminorityRestart/README b/TestON/tests/HAminorityRestart/README
new file mode 100644
index 0000000..a913f85
--- /dev/null
+++ b/TestON/tests/HAminorityRestart/README
@@ -0,0 +1,24 @@
+This test is designed to verify that an ONOS cluster behaves correctly when
+ONOS nodes die. Currently, we will kill nodes so that each raft partition will
+lose a member, but we make sure that there is always a majority of nodes
+available in each partition.
+
+As written, the test only supports an ONOS cluster of 3, 5, or 7 nodes.
+This is because the test doesn't apply to a single-node cluster, ONOS clusters
+should be deployed in odd numbers, and the partition generation and node
+killing scheme used doesn't give the same properties for clusters of more
+than 7 nodes; namely, not every partition would have exactly one node killed.
+
+The general structure of the test:
+- Startup
+- Assign switches
+- Verify ONOS state and functionality
+ - Device mastership
+ - Intents
+ - Leadership election
+ - Distributed Primitives
+- Kill some ONOS nodes
+- Verify ONOS state and functionality
+- Dataplane failures
+ - link down and up
+ - switch down and up
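For illustration, with a 5-node cluster (and main.nodes ordered OC1 through OC5), the generated 3-member partitions and the two nodes the test kills (OC1 and OC4) work out as follows, so every partition keeps 2 of its 3 members:

    partition p1: OC1 OC2 OC3   -> OC1 killed, 2 of 3 remain
    partition p2: OC3 OC4 OC5   -> OC4 killed, 2 of 3 remain
    partition p3: OC5 OC1 OC2   -> OC1 killed, 2 of 3 remain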
diff --git a/TestON/tests/HAminorityRestart/dependencies/Counters.py b/TestON/tests/HAminorityRestart/dependencies/Counters.py
index 21308c2..6614887 100644
--- a/TestON/tests/HAminorityRestart/dependencies/Counters.py
+++ b/TestON/tests/HAminorityRestart/dependencies/Counters.py
@@ -1,14 +1,19 @@
def __init__( self ):
self.default = ''
-def counterCheck( counterName, counterValue ):
+def consistentCheck():
"""
- Add Text here
+ Checks that TestON counters are consistent across all nodes.
+
+ Returns the tuple (onosCounters, consistent)
+ - onosCounters is the parsed json output of the counters command on all nodes
+ - consistent is main.TRUE if all "TestON" counters are consistent across all
+ nodes, main.FALSE otherwise
"""
import json
correctResults = main.TRUE
# Get onos counters results
- onosCounters = []
+ onosCountersRaw = []
threads = []
for i in range( main.numCtrls ):
t = main.Thread( target=main.CLIs[i].counters,
@@ -17,25 +22,58 @@
t.start()
for t in threads:
t.join()
- onosCounters.append( t.result )
- tmp = [ i == onosCounters[ 0 ] for i in onosCounters ]
+ onosCountersRaw.append( t.result )
+ onosCounters = []
+ for i in range( main.numCtrls ):
+ try:
+ onosCounters.append( json.loads( onosCountersRaw[i] ) )
+ except ( ValueError, TypeError ):
+ main.log.error( "Could not parse counters response from ONOS" +
+ str( i + 1 ) )
+ main.log.warn( repr( onosCountersRaw[ i ] ) )
+ return ( onosCounters, main.FALSE )
+
+ testCounters = {}
+ # make a list of all the "TestON-*" counters in ONOS
+ # looks like a dict whose keys are the names of the ONOS nodes and values
+ # are a list of the counters. I.E.
+ # { "ONOS1": [ {"name":"TestON-inMemory","value":56},
+ # {"name":"TestON-Partitions","value":56} ]
+ # }
+ # NOTE: There is an assumption that all nodes are active
+ # based on the above for loops
+ for controller in enumerate( onosCounters ):
+ for dbType in controller[1]:
+ for dbName, items in dbType.iteritems():
+ for item in items:
+ if 'TestON' in item['name']:
+ node = 'ONOS' + str( controller[0] + 1 )
+ try:
+ testCounters[node].append( item )
+ except KeyError:
+ testCounters[node] = [ item ]
+ # compare the counters on each node
+ tmp = [ v == testCounters['ONOS1'] for k, v in testCounters.iteritems() ]
if all( tmp ):
consistent = main.TRUE
else:
consistent = main.FALSE
- main.log.error( "ONOS nodes have different values for counters" )
- for node in onosCounters:
- main.log.debug( node )
+ main.log.error( "ONOS nodes have different values for counters:\n" +
+ testCounters )
+ return ( onosCounters, consistent )
+def counterCheck( counterName, counterValue ):
+ """
+ Checks that TestON counters are consistent across all nodes and that
+ the specified counter is in ONOS with the given value
+ """
+ import json
+ correctResults = main.TRUE
+ # Get onos counters results and consistentCheck
+ onosCounters, consistent = main.Counters.consistentCheck()
# Check for correct values
for i in range( main.numCtrls ):
- try:
- current = json.loads( onosCounters[i] )
- except ( ValueError, TypeError ):
- main.log.error( "Could not parse counters response from ONOS" +
- str( i + 1 ) )
- main.log.warn( repr( onosCounters[ i ] ) )
- return main.FALSE
+ current = onosCounters[i]
onosValue = None
try:
for database in current:
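A minimal usage sketch of the refactored helpers as called from the test case; the counter name and expected value below are illustrative only:

    # consistentCheck returns the parsed counters plus a consistency flag
    onosCounters, consistent = main.Counters.consistentCheck()
    # counterCheck additionally verifies one named counter's value
    incrementCheck = main.Counters.counterCheck( "TestON-Partitions", 1 )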
diff --git a/TestON/tests/HAminorityRestart/dependencies/onos-gen-partitions b/TestON/tests/HAminorityRestart/dependencies/onos-gen-partitions
new file mode 100755
index 0000000..bf9a77b
--- /dev/null
+++ b/TestON/tests/HAminorityRestart/dependencies/onos-gen-partitions
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+'''
+ Generate the partitions json file from the $OC* environment variables
+
+ Usage: onos-gen-partitions [output file]
+ If output file is not provided, the json is written to stdout.
+'''
+
+from os import environ
+from collections import deque, OrderedDict
+import re
+import json
+import sys
+
+convert = lambda text: int(text) if text.isdigit() else text.lower()
+alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
+
+def get_OC_vars():
+ vars = []
+ for var in environ:
+ if re.match(r"OC[0-9]+", var):
+ vars.append(var)
+ return sorted(vars, key=alphanum_key)
+
+def get_nodes(vars, port=9876):
+ node = lambda k: { 'id': k, 'ip': k, 'tcpPort': port }
+ return [ node(environ[v]) for v in vars ]
+
+def generate_permutations(nodes, k):
+ l = deque(nodes)
+ perms = {}
+ for i in range(1, len(nodes)+1):
+ perms['p%d' % i] = list(l)[:k]
+ l.rotate(-1)
+ return OrderedDict(sorted(perms.iteritems(), key=lambda (k, v): alphanum_key(k)))
+
+def generate_permutations2(nodes, k):
+ l = deque(nodes)
+ perms = {}
+ for i in range(1, (len(nodes) + 1) / 2 + 1):
+ perms['p%d' % i] = list(l)[:k]
+ l.rotate(-2)
+ return OrderedDict(sorted(perms.iteritems(), key=lambda (k, v): alphanum_key(k)))
+
+
+if __name__ == '__main__':
+ vars = get_OC_vars()
+ nodes = get_nodes(vars)
+ partitions = generate_permutations2(nodes, 3)
+ data = {
+ 'nodes': nodes,
+ 'partitions': partitions
+ }
+ output = json.dumps(data, indent=4)
+
+ if len(sys.argv) == 2:
+ filename = sys.argv[1]
+ with open(filename, 'w') as f:
+ f.write(output)
+ else:
+ print output
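For example, assuming only OC1=10.0.0.1, OC2=10.0.0.2, and OC3=10.0.0.3 are exported (placeholder addresses), the script prints roughly the following, matching the 2-partition layout used for a 3-node cluster:

    {
        "nodes": [
            {"id": "10.0.0.1", "ip": "10.0.0.1", "tcpPort": 9876},
            {"id": "10.0.0.2", "ip": "10.0.0.2", "tcpPort": 9876},
            {"id": "10.0.0.3", "ip": "10.0.0.3", "tcpPort": 9876}
        ],
        "partitions": {
            "p1": [
                {"id": "10.0.0.1", "ip": "10.0.0.1", "tcpPort": 9876},
                {"id": "10.0.0.2", "ip": "10.0.0.2", "tcpPort": 9876},
                {"id": "10.0.0.3", "ip": "10.0.0.3", "tcpPort": 9876}
            ],
            "p2": [
                {"id": "10.0.0.3", "ip": "10.0.0.3", "tcpPort": 9876},
                {"id": "10.0.0.1", "ip": "10.0.0.1", "tcpPort": 9876},
                {"id": "10.0.0.2", "ip": "10.0.0.2", "tcpPort": 9876}
            ]
        }
    }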