Leadership Election HA Tests - minor bug fixes

commit: 669173ba18b68205cc7761fdc4238b29619ff4ac [log] [tgz]
author: Jon Hall <jhall@onlab.us> Wed Dec 17 11:36:30 2014 -0800
committer: Jon Hall <jhall@onlab.us> Wed Dec 17 11:47:16 2014 -0800
tree: 4c8fa7bfab31421991dab9dcbc5f5bd231d8acca
parent: f4bf7c64b153ff0d54433b495f8396b34e9af090 [diff] [blame]
diff --git a/TestON/tests/HATestMinorityRestart/HATestMinorityRestart.py b/TestON/tests/HATestMinorityRestart/HATestMinorityRestart.py
index 3fbc29d..b9d2925 100644
--- a/TestON/tests/HATestMinorityRestart/HATestMinorityRestart.py
+++ b/TestON/tests/HATestMinorityRestart/HATestMinorityRestart.py

@@ -16,6 +16,8 @@
 CASE11: Switch down
 CASE12: Switch up
 CASE13: Clean up
+CASE14: start election app on all onos nodes
+CASE15: Check that Leadership Election is still functional
 '''
 class HATestMinorityRestart:
 
@@ -61,6 +63,7 @@
         global ONOS6_port
         global ONOS7_ip
         global ONOS7_port
+        global num_controllers
 
         ONOS1_ip = main.params['CTRL']['ip1']
         ONOS1_port = main.params['CTRL']['port1']
@@ -76,6 +79,7 @@
         ONOS6_port = main.params['CTRL']['port6']
         ONOS7_ip = main.params['CTRL']['ip7']
         ONOS7_port = main.params['CTRL']['port7']
+        num_controllers = int(main.params['num_controllers'])
 
 
         main.step("Applying cell variable to environment")
@@ -143,8 +147,7 @@
 
 
         main.step("Checking if ONOS is up yet")
-        #TODO: Refactor
-        # check bundle:list?
+        #TODO check bundle:list?
         for i in range(2):
             onos1_isup = main.ONOSbench.isup(ONOS1_ip)
             if not onos1_isup:
@@ -184,6 +187,7 @@
         cli_results = cli_result1 and cli_result2 and cli_result3 and\
                 cli_result4 and cli_result5 and cli_result6 and cli_result7
 
+
         main.step("Start Packet Capture MN")
         main.Mininet2.start_tcpdump(
                 str(main.params['MNtcpdump']['folder'])+str(main.TEST)+"-MN.pcap",
@@ -217,7 +221,7 @@
         main.step("Assign switches to controllers")
 
         for i in range (1,29):
-           main.Mininet1.assign_sw_controller(sw=str(i),count=7,
+           main.Mininet1.assign_sw_controller(sw=str(i),count=num_controllers,
                     ip1=ONOS1_ip,port1=ONOS1_port,
                     ip2=ONOS2_ip,port2=ONOS2_port,
                     ip3=ONOS3_ip,port3=ONOS3_port,
@@ -502,6 +506,8 @@
         main.ONOScli5.feature_uninstall("onos-app-fwd")
         main.ONOScli6.feature_uninstall("onos-app-fwd")
         main.ONOScli7.feature_uninstall("onos-app-fwd")
+        #timeout for fwd flows
+        time.sleep(10)
 
         main.step("Add  host intents")
         #TODO:  move the host numbers to params
@@ -512,11 +518,14 @@
             main.log.info("Adding host intent between h"+str(i)+" and h"+str(i+10))
             host1 =  "00:00:00:00:00:" + str(hex(i)[2:]).zfill(2).upper()
             host2 =  "00:00:00:00:00:" + str(hex(i+10)[2:]).zfill(2).upper()
-            #NOTE: get host can return None
-            #TODO: handle this
             host1_id = main.ONOScli1.get_host(host1)['id']
             host2_id = main.ONOScli1.get_host(host2)['id']
-            tmp_result = main.ONOScli1.add_host_intent(host1_id, host2_id )
+            #NOTE: get host can return None
+            if host1_id and host2_id:
+                tmp_result = main.ONOScli1.add_host_intent(host1_id, host2_id )
+            else:
+                main.log.error("Error, get_host() failed")
+                tmp_result = main.FALSE
             intent_add_result = bool(intent_add_result and tmp_result)
         utilities.assert_equals(expect=True, actual=intent_add_result,
                 onpass="Switch mastership correctly assigned",
@@ -806,13 +815,17 @@
         devices.append( main.ONOScli6.devices() )
         devices.append( main.ONOScli7.devices() )
         hosts = []
-        hosts.append( main.ONOScli1.hosts() )
-        hosts.append( main.ONOScli2.hosts() )
-        hosts.append( main.ONOScli3.hosts() )
-        hosts.append( main.ONOScli4.hosts() )
-        hosts.append( main.ONOScli5.hosts() )
-        hosts.append( main.ONOScli6.hosts() )
-        hosts.append( main.ONOScli7.hosts() )
+        hosts.append( json.loads( main.ONOScli1.hosts() ) )
+        hosts.append( json.loads( main.ONOScli2.hosts() ) )
+        hosts.append( json.loads( main.ONOScli3.hosts() ) )
+        hosts.append( json.loads( main.ONOScli4.hosts() ) )
+        hosts.append( json.loads( main.ONOScli5.hosts() ) )
+        hosts.append( json.loads( main.ONOScli6.hosts() ) )
+        hosts.append( json.loads( main.ONOScli7.hosts() ) )
+        for controller in range(0, len(hosts) ):
+            for host in hosts[controller]:
+                if host['ips'] == []:
+                    main.log.error("DEBUG:Error with host ips on controller"+str(controller+1)+": " + str(host))
         ports = []
         ports.append( main.ONOScli1.ports() )
         ports.append( main.ONOScli2.ports() )
@@ -920,7 +933,7 @@
         devices_results = main.TRUE
         ports_results = main.TRUE
         links_results = main.TRUE
-        for controller in range(7): #TODO parameterize the number of controllers
+        for controller in range(num_controllers):
             if devices[controller] or not "Error" in devices[controller]:
                 current_devices_result =  main.Mininet1.compare_switches(MNTopo, json.loads(devices[controller]))
             else:
@@ -995,6 +1008,8 @@
         cli_result3 = main.ONOScli3.start_onos_cli(ONOS3_ip)
         cli_results = cli_result1 and cli_result2 and cli_result3
 
+        main.log.info("Install leadership election app on restarted node")
+
         case_results = main.TRUE and onos_isup_result and cli_results
         utilities.assert_equals(expect=main.TRUE, actual=case_results,
                 onpass="ONOS restart successful",
@@ -1103,7 +1118,7 @@
                 onpass="Mastership of Switches was not changed",
                 onfail="Mastership of some switches changed")
         #NOTE: we expect mastership to change on controller failure
-        mastership_check = mastership_check #and consistent_mastership
+        mastership_check = consistent_mastership
 
 
 
@@ -1140,7 +1155,7 @@
                     intent_check = main.TRUE
                     main.log.report("Intents are consistent across all ONOS nodes")
         else:
-            main.log.warn("ONOS1 intents: ") 
+            main.log.warn("ONOS1 intents: ")
             print json.dumps(json.loads(ONOS1_intents),
                 sort_keys=True, indent=4, separators=(',', ': '))
             main.log.warn("ONOS2 intents: ")
@@ -1165,7 +1180,7 @@
                 onpass="Intents are consistent across all ONOS nodes",
                 onfail="ONOS nodes have different views of intents")
 
-        #NOTE: Hazelcast has no durability, so intents are lost
+        #NOTE: Hazelcast has no durability, so intents are lost across system restarts
         main.step("Compare current intents with intents before the failure")
         #NOTE: this requires case 5 to pass for intent_state to be set.
         #      maybe we should stop the test if that fails?
@@ -1174,6 +1189,12 @@
             main.log.report("Intents are consistent with before failure")
         #TODO: possibly the states have changed? we may need to figure out what the aceptable states are
         else:
+            try:
+                main.log.warn("ONOS1 intents: ")
+                print json.dumps(json.loads(ONOS1_intents),
+                    sort_keys=True, indent=4, separators=(',', ': '))
+            except:
+                pass
             same_intents = main.FALSE
         utilities.assert_equals(expect = main.TRUE,actual=same_intents,
                 onpass="Intents are consistent with before failure",
@@ -1221,8 +1242,36 @@
                 onfail="Loss of dataplane connectivity detected")
 
 
-        #TODO:add topology to this or leave as a seperate case?
-        result = mastership_check and intent_check and Flow_Tables and (not Loss_In_Pings) and roles_not_null 
+        #Test of LeadershipElection
+        leader_list = []
+        leader_result = main.TRUE
+        for controller in range(1,num_controllers+1):
+            node = getattr( main, ( 'ONOScli' + str( controller ) ) )#loop through ONOScli handlers
+            leaderN = node.election_test_leader()
+            leader_list.append(leaderN)
+            if leaderN == main.FALSE:
+                #error in  response
+                main.log.report("Something is wrong with election_test_leader function, check the error logs")
+                leader_result = main.FALSE
+            elif leaderN == None:
+                main.log.report("ONOS"+str(controller) + " shows no leader for the election-app was elected after the old one died")
+                leader_result = main.FALSE
+            elif leaderN == ONOS1_ip or leaderN == ONOS2_ip or leaderN == ONOS3_ip:
+                main.log.report("ONOS"+str(controller) + " shows "+str(leaderN)+" as leader for the election-app, but it was restarted")
+                leader_result = main.FALSE
+        if len( set( leader_list ) ) != 1:
+            leader_result = main.FALSE
+            main.log.error("Inconsistent view of leader for the election test app")
+            #TODO: print the list
+        if leader_result:
+            main.log.report("Leadership election tests passed(consistent view of leader across listeners and a new leader was re-elected if applicable)")
+        utilities.assert_equals(expect=main.TRUE, actual=leader_result,
+                onpass="Leadership election passed",
+                onfail="Something went wrong with Leadership election")
+
+
+        result = mastership_check and intent_check and Flow_Tables and (not Loss_In_Pings) and roles_not_null\
+                and leader_result
         result = int(result)
         if result == main.TRUE:
             main.log.report("Constant State Tests Passed")
@@ -1282,13 +1331,18 @@
             devices.append( main.ONOScli6.devices() )
             devices.append( main.ONOScli7.devices() )
             hosts = []
-            hosts.append( main.ONOScli1.hosts() )
-            hosts.append( main.ONOScli2.hosts() )
-            hosts.append( main.ONOScli3.hosts() )
-            hosts.append( main.ONOScli4.hosts() )
-            hosts.append( main.ONOScli5.hosts() )
-            hosts.append( main.ONOScli6.hosts() )
-            hosts.append( main.ONOScli7.hosts() )
+            hosts.append( json.loads( main.ONOScli1.hosts() ) )
+            hosts.append( json.loads( main.ONOScli2.hosts() ) )
+            hosts.append( json.loads( main.ONOScli3.hosts() ) )
+            hosts.append( json.loads( main.ONOScli4.hosts() ) )
+            hosts.append( json.loads( main.ONOScli5.hosts() ) )
+            hosts.append( json.loads( main.ONOScli6.hosts() ) )
+            hosts.append( json.loads( main.ONOScli7.hosts() ) )
+            for controller in range(0, len(hosts) ):
+                for host in hosts[controller]:
+                    host
+                    if host['ips'] == []:
+                        main.log.error("DEBUG:Error with host ips on controller"+str(controller+1)+": " + str(host))
             ports = []
             ports.append( main.ONOScli1.ports() )
             ports.append( main.ONOScli2.ports() )
@@ -1334,7 +1388,7 @@
             cli_time = time.time() - cli_start
             print "CLI time: " + str(cli_time)
 
-            for controller in range(7): #TODO parameterize the number of controllers
+            for controller in range(num_controllers):
                 if devices[controller] or not "Error" in devices[controller]:
                     current_devices_result =  main.Mininet1.compare_switches(MNTopo, json.loads(devices[controller]))
                 else:
@@ -1447,7 +1501,7 @@
         '''
         #NOTE: You should probably run a topology check after this
 
-        link_sleep = int(main.params['timers']['LinkDiscovery'])
+        link_sleep = float(main.params['timers']['LinkDiscovery'])
 
         description = "Turn off a link to ensure that Link Discovery is working properly"
         main.log.report(description)
@@ -1469,7 +1523,7 @@
         '''
         #NOTE: You should probably run a topology check after this
 
-        link_sleep = int(main.params['timers']['LinkDiscovery'])
+        link_sleep = float(main.params['timers']['LinkDiscovery'])
 
         description = "Restore a link to ensure that Link Discovery is working properly"
         main.log.report(description)
@@ -1492,7 +1546,7 @@
         #NOTE: You should probably run a topology check after this
         import time
 
-        switch_sleep = int(main.params['timers']['SwitchDiscovery'])
+        switch_sleep = float(main.params['timers']['SwitchDiscovery'])
 
         description = "Killing a switch to ensure it is discovered correctly"
         main.log.report(description)
@@ -1520,6 +1574,8 @@
         '''
         #NOTE: You should probably run a topology check after this
         import time
+
+        switch_sleep = float(main.params['timers']['SwitchDiscovery'])
         description = "Adding a switch to ensure it is discovered correctly"
         main.log.report(description)
         main.case(description)
@@ -1531,7 +1587,7 @@
         main.Mininet1.add_link('s28', 's3')
         main.Mininet1.add_link('s28', 's6')
         main.Mininet1.add_link('s28', 'h28')
-        main.Mininet1.assign_sw_controller(sw="28",count=7,
+        main.Mininet1.assign_sw_controller(sw="28",count=num_controllers,
                 ip1=ONOS1_ip,port1=ONOS1_port,
                 ip2=ONOS2_ip,port2=ONOS2_port,
                 ip3=ONOS3_ip,port3=ONOS3_port,
@@ -1557,6 +1613,15 @@
         '''
         import os
         import time
+        #printing colors to terminal
+        colors = {}
+        colors['cyan']   = '\033[96m'
+        colors['purple'] = '\033[95m'
+        colors['blue']   = '\033[94m'
+        colors['green']  = '\033[92m'
+        colors['yellow'] = '\033[93m'
+        colors['red']    = '\033[91m'
+        colors['end']    = '\033[0m'
         description = "Test Cleanup"
         main.log.report(description)
         main.case(description)
@@ -1564,19 +1629,19 @@
         main.Mininet2.stop_tcpdump()
 
         main.step("Checking ONOS Logs for errors")
-        print "Checking logs for errors on ONOS1:"
+        print colors['purple'] + "Checking logs for errors on ONOS1:" + colors['end']
         print main.ONOSbench.check_logs(ONOS1_ip)
-        print "Checking logs for errors on ONOS2:"
+        print colors['purple'] + "Checking logs for errors on ONOS2:" + colors['end']
         print main.ONOSbench.check_logs(ONOS2_ip)
-        print "Checking logs for errors on ONOS3:"
+        print colors['purple'] + "Checking logs for errors on ONOS3:" + colors['end']
         print main.ONOSbench.check_logs(ONOS3_ip)
-        print "Checking logs for errors on ONOS4:"
+        print colors['purple'] + "Checking logs for errors on ONOS4:" + colors['end']
         print main.ONOSbench.check_logs(ONOS4_ip)
-        print "Checking logs for errors on ONOS5:"
+        print colors['purple'] + "Checking logs for errors on ONOS5:" + colors['end']
         print main.ONOSbench.check_logs(ONOS5_ip)
-        print "Checking logs for errors on ONOS6:"
+        print colors['purple'] + "Checking logs for errors on ONOS6:" + colors['end']
         print main.ONOSbench.check_logs(ONOS6_ip)
-        print "Checking logs for errors on ONOS7:"
+        print colors['purple'] + "Checking logs for errors on ONOS7:" + colors['end']
         print main.ONOSbench.check_logs(ONOS7_ip)
 
         main.step("Copying MN pcap and ONOS log files to test station")
@@ -1656,3 +1721,139 @@
         utilities.assert_equals(expect=main.TRUE, actual=main.TRUE,
                 onpass="Test cleanup successful",
                 onfail="Test cleanup NOT successful")
+
+    def CASE14 ( self, main ) :
+        '''
+        Install the election app on every ONOS node and check the leader
+
+        The app is installed on ONOS1 first and ONOS1 is expected to be
+        reported as the leader. The app is then installed on the remaining
+        nodes one at a time, and each node is asked who the leader is.
+        The case fails if any node reports an error or a leader other
+        than ONOS1.
+        '''
+        election_ok = main.TRUE
+
+        #bring the app up on the first node and ask it for the leader
+        main.log.info("Install leadership election app")
+        main.ONOScli1.feature_install("onos-app-election")
+        #NOTE: the leader is queried right after the install, without a wait
+        first_leader = main.ONOScli1.election_test_leader()
+        if first_leader != ONOS1_ip:
+            #anything other than ONOS1 is a failure, only the message differs
+            if first_leader is None:
+                #No leader elected
+                problem = "No leader was elected"
+            elif first_leader == main.FALSE:
+                #error in  response
+                #TODO: add check for "Command not found:" in the driver, this means the app isn't loaded
+                problem = "Something is wrong with election_test_leader function, check the error logs"
+            else:
+                #error in  response
+                problem = "Unexpected response from election_test_leader function:'"+str(first_leader)+"'"
+            main.log.report(problem)
+            election_ok = main.FALSE
+
+        #install on the other nodes, checking the leader after each install.
+        #Should be onos1 and each app should show the same leader
+        for node_num in range(2,num_controllers+1):
+            cli = getattr( main, ( 'ONOScli' + str( node_num ) ) )#ONOScli handle for this node
+            cli.feature_install("onos-app-election")
+            seen_leader = cli.election_test_leader()
+            if seen_leader == ONOS1_ip:
+                #all is well
+                continue
+            if seen_leader == main.FALSE:
+                #error in  response
+                #TODO: add check for "Command not found:" in the driver, this means the app isn't loaded
+                main.log.report("Something is wrong with election_test_leader function, check the error logs")
+                election_ok = main.FALSE
+            elif first_leader != seen_leader:
+                #this node disagrees with what ONOS1 reported
+                main.log.report("ONOS" + str(node_num) + " sees "+str(seen_leader) +
+                        " as the leader of the election app. Leader should be "+str(first_leader) )
+                election_ok = main.FALSE
+
+        if election_ok:
+            main.log.report("Leadership election tests passed(consistent view of leader across listeners and a leader was elected)")
+        utilities.assert_equals(expect=main.TRUE, actual=election_ok,
+                onpass="Leadership election passed",
+                onfail="Something went wrong with Leadership election")
+
+    def CASE15 ( self, main ) :
+        '''
+        Check that Leadership Election is still functional
+
+        Steps:
+          1. Ask ONOS1 for the current leader and make that node withdraw
+             from the election.
+          2. Ask every node for the leader; fail if any node still reports
+             the old leader, reports an error, or the nodes disagree.
+          3. Make the old leader run for election again and check that the
+             leader seen by ONOS1 did not change because of it.
+        '''
+        leader_result = main.TRUE
+        description = "Check that Leadership Election is still functional"
+        main.log.report(description)
+        main.case(description)
+        main.step("Find current leader and withdraw")
+        leader = main.ONOScli1.election_test_leader()
+        #controller ip -> name of its cli handle. The handle is only looked
+        #up on a match, so handles of nodes that don't exist are never touched
+        cli_names = [ ( ONOS1_ip, "ONOScli1" ),
+                      ( ONOS2_ip, "ONOScli2" ),
+                      ( ONOS3_ip, "ONOScli3" ),
+                      ( ONOS4_ip, "ONOScli4" ),
+                      ( ONOS5_ip, "ONOScli5" ),
+                      ( ONOS6_ip, "ONOScli6" ),
+                      ( ONOS7_ip, "ONOScli7" ) ]
+        old_leader = None
+        for ip, cli_name in cli_names:
+            if leader == ip:
+                old_leader = getattr( main, cli_name )
+                break
+        withdraw_result = main.FALSE
+        if old_leader is None:
+            #leader was None, main.FALSE or something that is not one of
+            #our nodes; there is nobody to withdraw so just record the failure
+            main.log.report("Leader for the election app should be an ONOS node,"\
+                    +"instead got '"+str(leader)+"'")
+            leader_result = main.FALSE
+        else:
+            withdraw_result = old_leader.election_test_withdraw()
+
+
+        main.step("Make sure new leader is elected")
+        leader_list = []
+        for controller in range(1,num_controllers+1):
+            node = getattr( main, ( 'ONOScli' + str( controller ) ) )#loop through ONOScli handlers
+            leader_list.append( node.election_test_leader() )
+        #number the responses from 1 so the report names the node that gave it
+        for controller, leaderN in enumerate( leader_list, 1 ):
+            if leaderN == leader:
+                main.log.report("ONOS"+str(controller)+" still sees " + str(leader) +\
+                        " as leader after they withdrew")
+                leader_result = main.FALSE
+            elif leaderN == main.FALSE:
+                #error in  response
+                #TODO: add check for "Command not found:" in the driver, this means the app isn't loaded
+                main.log.report("Something is wrong with election_test_leader function, check the error logs")
+                leader_result = main.FALSE
+        consistent_leader = main.FALSE
+        if len( set( leader_list ) ) == 1:
+            main.log.info("Each Election-app sees '"+str(leader_list[0])+"' as the leader")
+            consistent_leader = main.TRUE
+        else:
+            #the pass message below promises a consistent view, so
+            #disagreement between the nodes has to fail the check
+            leader_result = main.FALSE
+            main.log.report("Inconsistent responses for leader of Election-app:")
+            for n in range(len(leader_list)):
+                main.log.report("ONOS" + str(n+1) + " response: " + str(leader_list[n]) )
+        if leader_result:
+            main.log.report("Leadership election tests passed(consistent view of leader across listeners and a new leader was elected when the old leader resigned)")
+        utilities.assert_equals(expect=main.TRUE, actual=leader_result,
+                onpass="Leadership election passed",
+                onfail="Something went wrong with Leadership election")
+
+
+        main.step("Run for election on old leader(just so everyone is in the hat)")
+        run_result = main.FALSE
+        if old_leader is not None:
+            run_result = old_leader.election_test_run()
+        if consistent_leader == main.TRUE:
+            after_run = main.ONOScli1.election_test_leader()
+            #verify leader didn't just change. Only ever lower the result
+            #here so a failure from the earlier steps is not masked
+            if after_run != leader_list[0]:
+                leader_result = main.FALSE
+        #TODO: assert on  run and withdraw results?
+
+        utilities.assert_equals(expect=main.TRUE, actual=leader_result,
+                onpass="Leadership election passed",
+                onfail="Something went wrong with Leadership election after the old leader re-ran for election")
+
commit	669173ba18b68205cc7761fdc4238b29619ff4ac	[log] [tgz]
author	Jon Hall <jhall@onlab.us>	Wed Dec 17 11:36:30 2014 -0800
committer	Jon Hall <jhall@onlab.us>	Wed Dec 17 11:47:16 2014 -0800
tree	4c8fa7bfab31421991dab9dcbc5f5bd231d8acca
parent	f4bf7c64b153ff0d54433b495f8396b34e9af090 [diff] [blame]