Choose ONOS nodes to kill so that no partition loses a majority
- Fix counter check to only look at the counters used by the test
- Add the onos-gen-partitions file to the repo and make the test copy it to
  the right place in ONOS
- Clear buffer in clidriver after secureCopy
- Change individual ping timeout for ping functions in mininet driver
- Increase links timeout in mininet driver due to occasional failures
- Kill iperf if it times out to allow continued use of mininet
- Clear buffer at the end of many onos driver functions
Change-Id: I3c9235d1c6082f0ef86266405ed99c07e1b27fb5
diff --git a/TestON/tests/HAminorityRestart/HAminorityRestart.py b/TestON/tests/HAminorityRestart/HAminorityRestart.py
index d175fd7..ca8a194 100644
--- a/TestON/tests/HAminorityRestart/HAminorityRestart.py
+++ b/TestON/tests/HAminorityRestart/HAminorityRestart.py
@@ -48,6 +48,7 @@
start tcpdump
"""
import imp
+ import pexpect
main.log.info( "ONOS HA test: Restart minority of ONOS nodes - " +
"initialization" )
main.case( "Setting up test environment" )
@@ -189,6 +190,16 @@
main.log.wiki(graphs)
main.step( "Creating ONOS package" )
+ # copy gen-partitions file to ONOS
+ # NOTE: this assumes TestON and ONOS are on the same machine
+ srcFile = main.testDir + "/" + main.TEST + "/dependencies/onos-gen-partitions"
+ dstDir = main.ONOSbench.home + "/tools/test/bin/onos-gen-partitions"
+ cpResult = main.ONOSbench.secureCopy( main.ONOSbench.user_name,
+ main.ONOSbench.ip_address,
+ srcFile,
+ dstDir,
+ pwd=main.ONOSbench.pwd,
+ direction="from" )
packageResult = main.ONOSbench.onosPackage()
utilities.assert_equals( expect=main.TRUE, actual=packageResult,
onpass="ONOS package successful",
@@ -203,6 +214,19 @@
utilities.assert_equals( expect=main.TRUE, actual=onosInstallResult,
onpass="ONOS install successful",
onfail="ONOS install failed" )
+ # clean up gen-partitions file
+ try:
+ main.ONOSbench.handle.sendline( "cd " + main.ONOSbench.home )
+ main.ONOSbench.handle.expect( main.ONOSbench.home + "\$" )
+ main.ONOSbench.handle.sendline( "git checkout -- tools/test/bin/onos-gen-partitions" )
+ main.ONOSbench.handle.expect( main.ONOSbench.home + "\$" )
+ main.log.info( " Cleaning custom gen partitions file, response was: \n" +
+ str( main.ONOSbench.handle.before ) )
+ except ( pexpect.TIMEOUT, pexpect.EOF ):
+ main.log.exception( "ONOSbench: pexpect exception found:" +
+ main.ONOSbench.handle.before )
+ main.cleanup()
+ main.exit()
main.step( "Checking if ONOS is up yet" )
for i in range( 2 ):
@@ -1680,17 +1704,19 @@
main.log.debug( "Checking logs for errors on " + node.name + ":" )
main.log.warn( main.ONOSbench.checkLogs( node.ip_address ) )
- main.step( "Killing 3 ONOS nodes" )
+ n = len( main.nodes ) # Number of nodes
+ p = ( ( n + 1 ) / 2 ) + 1 # Number of partitions
+ main.kill = [ 0 ] # ONOS node to kill, listed by index in main.nodes
+ if n > 3:
+ main.kill.append( p - 1 )
+ # NOTE: This only works for cluster sizes of 3, 5, or 7.
+
+ main.step( "Killing " + str( len( main.kill ) ) + " ONOS nodes" )
killTime = time.time()
- # TODO: Randomize these nodes or base this on partitions
- # TODO: use threads in this case
- killResults = main.ONOSbench.onosKill( main.nodes[0].ip_address )
- time.sleep( 10 )
- killResults = killResults and\
- main.ONOSbench.onosKill( main.nodes[1].ip_address )
- time.sleep( 10 )
- killResults = killResults and\
- main.ONOSbench.onosKill( main.nodes[2].ip_address )
+ killResults = main.TRUE
+ for i in main.kill:
+ killResults = killResults and\
+ main.ONOSbench.onosKill( main.nodes[i].ip_address )
utilities.assert_equals( expect=main.TRUE, actual=killResults,
onpass="ONOS Killed successfully",
onfail="ONOS kill NOT successful" )
@@ -1699,21 +1725,20 @@
count = 0
onosIsupResult = main.FALSE
while onosIsupResult == main.FALSE and count < 10:
- onos1Isup = main.ONOSbench.isup( main.nodes[0].ip_address )
- onos2Isup = main.ONOSbench.isup( main.nodes[1].ip_address )
- onos3Isup = main.ONOSbench.isup( main.nodes[2].ip_address )
- onosIsupResult = onos1Isup and onos2Isup and onos3Isup
+ onosIsupResult = main.TRUE
+ for i in main.kill:
+ onosIsupResult = onosIsupResult and\
+ main.ONOSbench.isup( main.nodes[i].ip_address )
count = count + 1
- # TODO: if it becomes an issue, we can retry this step a few times
utilities.assert_equals( expect=main.TRUE, actual=onosIsupResult,
onpass="ONOS restarted successfully",
onfail="ONOS restart NOT successful" )
main.step( "Restarting ONOS main.CLIs" )
- cliResult1 = main.ONOScli1.startOnosCli( main.nodes[0].ip_address )
- cliResult2 = main.ONOScli2.startOnosCli( main.nodes[1].ip_address )
- cliResult3 = main.ONOScli3.startOnosCli( main.nodes[2].ip_address )
- cliResults = cliResult1 and cliResult2 and cliResult3
+ cliResults = main.TRUE
+ for i in main.kill:
+ cliResults = cliResults and\
+ main.CLIs[i].startOnosCli( main.nodes[i].ip_address )
utilities.assert_equals( expect=main.TRUE, actual=cliResults,
onpass="ONOS cli restarted",
onfail="ONOS cli did not restart" )
@@ -1722,17 +1747,6 @@
# protocol has had time to work
main.restartTime = time.time() - killTime
main.log.debug( "Restart time: " + str( main.restartTime ) )
- '''
- # FIXME: revisit test plan for election with madan
- # Rerun for election on restarted nodes
- run1 = main.CLIs[0].electionTestRun()
- run2 = main.CLIs[1].electionTestRun()
- run3 = main.CLIs[2].electionTestRun()
- runResults = run1 and run2 and run3
- utilities.assert_equals( expect=main.TRUE, actual=runResults,
- onpass="Reran for election",
- onfail="Failed to rerun for election" )
- '''
# TODO: Make this configurable. Also, we are breaking the above timer
time.sleep( 60 )
main.log.debug( main.CLIs[0].nodes( jsonFormat=False ) )
@@ -2052,11 +2066,12 @@
main.step( "Leadership Election is still functional" )
# Test of LeadershipElection
leaderList = []
- # FIXME: make sure this matches nodes that were restarted
- restarted = [ main.nodes[0].ip_address, main.nodes[1].ip_address,
- main.nodes[2].ip_address ]
+ restarted = []
+ for i in main.kill:
+ restarted.append( main.nodes[i].ip_address )
leaderResult = main.TRUE
+
for cli in main.CLIs:
leaderN = cli.electionTestLeader()
leaderList.append( leaderN )
@@ -3409,23 +3424,7 @@
onfail="Added counters are incorrect" )
main.step( "Check counters are consistant across nodes" )
- onosCounters = []
- threads = []
- for i in range( main.numCtrls ):
- t = main.Thread( target=main.CLIs[i].counters,
- name="counters-" + str( i ) )
- threads.append( t )
- t.start()
- for t in threads:
- t.join()
- onosCounters.append( t.result )
- tmp = [ i == onosCounters[ 0 ] for i in onosCounters ]
- if all( tmp ):
- main.log.info( "Counters are consistent across all nodes" )
- consistentCounterResults = main.TRUE
- else:
- main.log.error( "Counters are not consistent across all nodes" )
- consistentCounterResults = main.FALSE
+ onosCounters, consistentCounterResults = main.Counters.consistentCheck()
utilities.assert_equals( expect=main.TRUE,
actual=consistentCounterResults,
onpass="ONOS counters are consistent " +
@@ -3441,7 +3440,6 @@
actual=incrementCheck,
onpass="Added counters are correct",
onfail="Added counters are incorrect" )
-
# DISTRIBUTED SETS
main.step( "Distributed Set get" )
size = len( onosSet )
diff --git a/TestON/tests/HAminorityRestart/README b/TestON/tests/HAminorityRestart/README
new file mode 100644
index 0000000..a913f85
--- /dev/null
+++ b/TestON/tests/HAminorityRestart/README
@@ -0,0 +1,24 @@
+This test is designed to verify that an ONOS cluster behaves correctly when
+ONOS nodes die. Currently, we will kill nodes so that each raft partition will
+lose a member, but we make sure that there is always a majority of nodes
+available in each partition.
+
+As written, the test only supports an ONOS cluster of 3, 5, or 7 nodes.
+This is because the test doesn't apply to a single-node cluster, ONOS clusters
+should be deployed in odd numbers, and the partition generation and node
+killing scheme used doesn't give the same properties for clusters of more
+than 7 nodes; namely, not every partition would have exactly one node killed.
+
+The general structure of the test:
+- Startup
+- Assign switches
+- Verify ONOS state and functionality
+ - Device mastership
+ - Intents
+ - Leadership election
+ - Distributed Primitives
+- Kill some ONOS nodes
+- Verify ONOS state and functionality
+- Dataplane failures
+ - link down and up
+ - switch down and up
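For illustration, with a 5-node cluster (and main.nodes ordered OC1 through OC5), the generated 3-member partitions and the two nodes the test kills (OC1 and OC4) work out as follows, so every partition keeps 2 of its 3 members:

    partition p1: OC1 OC2 OC3   -> OC1 killed, 2 of 3 remain
    partition p2: OC3 OC4 OC5   -> OC4 killed, 2 of 3 remain
    partition p3: OC5 OC1 OC2   -> OC1 killed, 2 of 3 remain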
diff --git a/TestON/tests/HAminorityRestart/dependencies/Counters.py b/TestON/tests/HAminorityRestart/dependencies/Counters.py
index 21308c2..6614887 100644
--- a/TestON/tests/HAminorityRestart/dependencies/Counters.py
+++ b/TestON/tests/HAminorityRestart/dependencies/Counters.py
@@ -1,14 +1,19 @@
def __init__( self ):
self.default = ''
-def counterCheck( counterName, counterValue ):
+def consistentCheck():
"""
- Add Text here
+ Checks that TestON counters are consistent across all nodes.
+
+ Returns the tuple (onosCounters, consistent)
+ - onosCounters is the parsed json output of the counters command on all nodes
+ - consistent is main.TRUE if all "TestON" counters are consistent across all
+ nodes, main.FALSE otherwise
"""
import json
correctResults = main.TRUE
# Get onos counters results
- onosCounters = []
+ onosCountersRaw = []
threads = []
for i in range( main.numCtrls ):
t = main.Thread( target=main.CLIs[i].counters,
@@ -17,25 +22,58 @@
t.start()
for t in threads:
t.join()
- onosCounters.append( t.result )
- tmp = [ i == onosCounters[ 0 ] for i in onosCounters ]
+ onosCountersRaw.append( t.result )
+ onosCounters = []
+ for i in range( main.numCtrls ):
+ try:
+ onosCounters.append( json.loads( onosCountersRaw[i] ) )
+ except ( ValueError, TypeError ):
+ main.log.error( "Could not parse counters response from ONOS" +
+ str( i + 1 ) )
+ main.log.warn( repr( onosCountersRaw[ i ] ) )
+ return ( onosCounters, main.FALSE )
+
+ testCounters = {}
+ # make a list of all the "TestON-*" counters in ONOS
+ # looks like a dict whose keys are the names of the ONOS nodes and values
+ # are a list of the counters. I.E.
+ # { "ONOS1": [ {"name":"TestON-inMemory","value":56},
+ # {"name":"TestON-Partitions","value":56} ]
+ # }
+ # NOTE: There is an assumption that all nodes are active
+ # based on the above for loops
+ for controller in enumerate( onosCounters ):
+ for dbType in controller[1]:
+ for dbName, items in dbType.iteritems():
+ for item in items:
+ if 'TestON' in item['name']:
+ node = 'ONOS' + str( controller[0] + 1 )
+ try:
+ testCounters[node].append( item )
+ except KeyError:
+ testCounters[node] = [ item ]
+ # compare the counters on each node
+ tmp = [ v == testCounters['ONOS1'] for k, v in testCounters.iteritems() ]
if all( tmp ):
consistent = main.TRUE
else:
consistent = main.FALSE
- main.log.error( "ONOS nodes have different values for counters" )
- for node in onosCounters:
- main.log.debug( node )
+ main.log.error( "ONOS nodes have different values for counters:\n" +
+ testCounters )
+ return ( onosCounters, consistent )
+def counterCheck( counterName, counterValue ):
+ """
+ Checks that TestON counters are consistent across all nodes and that
+ the specified counter is in ONOS with the given value
+ """
+ import json
+ correctResults = main.TRUE
+ # Get onos counters results and consistentCheck
+ onosCounters, consistent = main.Counters.consistentCheck()
# Check for correct values
for i in range( main.numCtrls ):
- try:
- current = json.loads( onosCounters[i] )
- except ( ValueError, TypeError ):
- main.log.error( "Could not parse counters response from ONOS" +
- str( i + 1 ) )
- main.log.warn( repr( onosCounters[ i ] ) )
- return main.FALSE
+ current = onosCounters[i]
onosValue = None
try:
for database in current:
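A minimal usage sketch of the refactored helpers as called from the test case; the counter name and expected value below are illustrative only:

    # consistentCheck returns the parsed counters plus a consistency flag
    onosCounters, consistent = main.Counters.consistentCheck()
    # counterCheck additionally verifies one named counter's value
    incrementCheck = main.Counters.counterCheck( "TestON-Partitions", 1 )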
diff --git a/TestON/tests/HAminorityRestart/dependencies/onos-gen-partitions b/TestON/tests/HAminorityRestart/dependencies/onos-gen-partitions
new file mode 100755
index 0000000..bf9a77b
--- /dev/null
+++ b/TestON/tests/HAminorityRestart/dependencies/onos-gen-partitions
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+'''
+ Generate the partitions json file from the $OC* environment variables
+
+ Usage: onos-gen-partitions [output file]
+ If output file is not provided, the json is written to stdout.
+'''
+
+from os import environ
+from collections import deque, OrderedDict
+import re
+import json
+import sys
+
+convert = lambda text: int(text) if text.isdigit() else text.lower()
+alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]
+
+def get_OC_vars():
+ vars = []
+ for var in environ:
+ if re.match(r"OC[0-9]+", var):
+ vars.append(var)
+ return sorted(vars, key=alphanum_key)
+
+def get_nodes(vars, port=9876):
+ node = lambda k: { 'id': k, 'ip': k, 'tcpPort': port }
+ return [ node(environ[v]) for v in vars ]
+
+def generate_permutations(nodes, k):
+ l = deque(nodes)
+ perms = {}
+ for i in range(1, len(nodes)+1):
+ perms['p%d' % i] = list(l)[:k]
+ l.rotate(-1)
+ return OrderedDict(sorted(perms.iteritems(), key=lambda (k, v): alphanum_key(k)))
+
+def generate_permutations2(nodes, k):
+ l = deque(nodes)
+ perms = {}
+ for i in range(1, (len(nodes) + 1) / 2 + 1):
+ perms['p%d' % i] = list(l)[:k]
+ l.rotate(-2)
+ return OrderedDict(sorted(perms.iteritems(), key=lambda (k, v): alphanum_key(k)))
+
+
+if __name__ == '__main__':
+ vars = get_OC_vars()
+ nodes = get_nodes(vars)
+ partitions = generate_permutations2(nodes, 3)
+ data = {
+ 'nodes': nodes,
+ 'partitions': partitions
+ }
+ output = json.dumps(data, indent=4)
+
+ if len(sys.argv) == 2:
+ filename = sys.argv[1]
+ with open(filename, 'w') as f:
+ f.write(output)
+ else:
+ print output
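For example, assuming only OC1=10.0.0.1, OC2=10.0.0.2, and OC3=10.0.0.3 are exported (placeholder addresses), the script prints roughly the following, matching the 2-partition layout used for a 3-node cluster:

    {
        "nodes": [
            {"id": "10.0.0.1", "ip": "10.0.0.1", "tcpPort": 9876},
            {"id": "10.0.0.2", "ip": "10.0.0.2", "tcpPort": 9876},
            {"id": "10.0.0.3", "ip": "10.0.0.3", "tcpPort": 9876}
        ],
        "partitions": {
            "p1": [
                {"id": "10.0.0.1", "ip": "10.0.0.1", "tcpPort": 9876},
                {"id": "10.0.0.2", "ip": "10.0.0.2", "tcpPort": 9876},
                {"id": "10.0.0.3", "ip": "10.0.0.3", "tcpPort": 9876}
            ],
            "p2": [
                {"id": "10.0.0.3", "ip": "10.0.0.3", "tcpPort": 9876},
                {"id": "10.0.0.1", "ip": "10.0.0.1", "tcpPort": 9876},
                {"id": "10.0.0.2", "ip": "10.0.0.2", "tcpPort": 9876}
            ]
        }
    }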