ONOS node rolling restart test

  - Kill ONOS k8s pods one at a time, cordoning off the k8s node so the
  pod cannot be rescheduled while verifications are running
  - While a node is down, verify the topology and ping between all hosts
  - Uncordon the k8s node to let the ONOS k8s pod restart
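  - The number of restart rounds is configurable via the restartRounds param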

Change-Id: I871704068b633721cf79eb747a7c294575415e54
diff --git a/TestON/drivers/common/cli/onosclidriver.py b/TestON/drivers/common/cli/onosclidriver.py
index bafa284..8e6a90a 100755
--- a/TestON/drivers/common/cli/onosclidriver.py
+++ b/TestON/drivers/common/cli/onosclidriver.py
@@ -495,7 +495,21 @@
                                       self.Prompt(),
                                       pexpect.TIMEOUT ] )
             response = self.handle.before
-            if i == 1:
+            if i == 1:  # Not in ONOS CLI
+                # FIXME: This isn't really the correct place for this, but it works for now
+                # Check if port-forward session is still up first
+                if hasattr( main, "Cluster" ):
+                    ctrl = None
+                    for c in main.Cluster.controllers:
+                        if c.CLI is self:
+                            ctrl = c
+                            break
+                    if not ctrl:
+                        main.log.warn( self.name + ": Could not find this node in Cluster. Can't check port-forward status" )
+                    elif ctrl.k8s:
+                        ctrl.k8s.checkPortForward( ctrl.k8s.podName,
+                                                   kubeconfig=ctrl.k8s.kubeConfig,
+                                                   namespace=main.params[ 'kubernetes' ][ 'namespace' ] )
                 main.log.error( self.name + ": onos cli session closed. " )
                 if self.onosIp:
                     main.log.warn( "Trying to reconnect " + self.onosIp )
diff --git a/TestON/drivers/common/cli/onosclusterdriver.py b/TestON/drivers/common/cli/onosclusterdriver.py
index b4b6c12..d096d1b 100755
--- a/TestON/drivers/common/cli/onosclusterdriver.py
+++ b/TestON/drivers/common/cli/onosclusterdriver.py
@@ -168,7 +168,7 @@
             self.dockerPrompt = self.checkOptions( self.dockerPrompt, "~/onos#" )
             self.maxNodes = int( self.checkOptions( self.maxNodes, 100 ) )
             self.kubeConfig = self.checkOptions( self.kubeConfig, None )
-            self.up4Port = self.checkOptions(self.up4Port, None)
+            self.up4Port = self.checkOptions( self.up4Port, None )
 
             self.name = self.options[ 'name' ]
 
diff --git a/TestON/drivers/common/clidriver.py b/TestON/drivers/common/clidriver.py
index 48277e5..01242a1 100644
--- a/TestON/drivers/common/clidriver.py
+++ b/TestON/drivers/common/clidriver.py
@@ -35,6 +35,7 @@
     def __init__( self ):
         super( CLI, self ).__init__()
         self.inDocker = False
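+        # Ports string used by the most recent port-forward session, if any;
+        # used by checkPortForward to re-establish the session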
+        self.portForwardList = None
 
     def checkPrompt( self ):
         for key in self.options:
@@ -57,7 +58,7 @@
         ssh_newkey = 'Are you sure you want to continue connecting'
         refused = "ssh: connect to host " + \
             self.ip_address + " port 22: Connection refused"
-        ssh_options = "-t -X -A -o ServerAliveInterval=50 -o TCPKeepAlive=yes"
+        ssh_options = "-t -X -A -o ServerAliveInterval=50 -o ServerAliveCountMax=1000 -o TCPKeepAlive=yes"
         ssh_destination = self.user_name + "@" + self.ip_address
         envVars = { "TERM": "vt100" }
         # TODO: Add option to specify which shell/command to use
@@ -1132,7 +1133,7 @@
 
     def kubectlPodNodes( self, dstPath=None, kubeconfig=None, namespace=None ):
         """
-        Use kubectl to get the logs from a pod
+        Use kubectl to get the pod-to-node mappings
         Optional Arguments:
         - dstPath: The location to save the logs to
         - kubeconfig: The path to a kubeconfig file
@@ -1140,7 +1141,6 @@
         Returns main.TRUE if dstPath is given, else the output of the command or
             main.FALSE on Error
         """
-
         try:
             self.handle.sendline( "" )
             self.handle.expect( self.prompt )
@@ -1172,6 +1172,49 @@
             main.log.exception( self.name + ": Uncaught exception!" )
             return main.FALSE
 
+    def kubectlGetPodNode( self, podName, kubeconfig=None, namespace=None ):
+        """
+        Use kubectl to get the node a given pod is running on
+        Arguments:
+        - podName: The name of the pod
+        Optional Arguments:
+        - kubeconfig: The path to a kubeconfig file
+        - namespace: The namespace to search in
+        Returns a string of the node name or None
+        """
+        try:
+            self.handle.sendline( "" )
+            self.handle.expect( self.prompt )
+            main.log.debug( self.handle.before + self.handle.after )
+            cmdStr = "kubectl %s %s get pods %s --output=jsonpath='{.spec.nodeName}{\"\\n\"}'" % (
+                        "--kubeconfig %s" % kubeconfig if kubeconfig else "",
+                        "-n %s" % namespace if namespace else "",
+                        podName )
+            main.log.info( self.name + ": sending: " + repr( cmdStr ) )
+            self.handle.sendline( cmdStr )
+            i = self.handle.expect( [ "not found", "error", "The connection to the server", self.prompt ] )
+            if i == 3:
+                output = self.handle.before
+                main.log.debug( self.name + ": " + output )
+                output = output.splitlines()
+                main.log.warn( output )
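+                # Assumed output format: the echoed command, the node name, and an
+                # empty line before the prompt; the node name is the middle line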
+                return output[1] if len( output ) == 3 else None
+            else:
+                main.log.error( self.name + ": Error executing command" )
+                main.log.debug( self.name + ": " + self.handle.before + str( self.handle.after ) )
+                return None
+        except pexpect.EOF:
+            main.log.error( self.name + ": EOF exception found" )
+            main.log.error( self.name + ":     " + self.handle.before )
+            return None
+        except pexpect.TIMEOUT:
+            main.log.exception( self.name + ": TIMEOUT exception found" )
+            main.log.error( self.name + ":    " + self.handle.before )
+            return None
+        except Exception:
+            main.log.exception( self.name + ": Uncaught exception!" )
+            return None
+
     def sternLogs( self, podString, dstPath, kubeconfig=None, namespace=None, since='1h', wait=60 ):
         """
         Use stern to get the logs from a pod
@@ -1315,11 +1358,11 @@
             main.log.exception( self.name + ": Uncaught exception!" )
             return main.FALSE
 
-    def kubectlPortForward( self, podName, portsList,  kubeconfig=None, namespace=None, ):
+    def kubectlPortForward( self, podName, portsList, kubeconfig=None, namespace=None ):
         """
         Use kubectl to setup port forwarding from the local machine to the kubernetes pod
 
-        Note: This command does not return until the port forwarding session is ended.
+        Note: This CLI command does not return until the port-forwarding session is ended.
 
         Required Arguments:
         - podName: The name of the pod as a string
@@ -1327,9 +1370,7 @@
         Optional Arguments:
         - kubeconfig: The path to a kubeconfig file
         - namespace: The namespace to search in
-        - app: Get pods belonging to a specific app
-        Returns a list containing the names of the pods or
-            main.FALSE on Error
+        Returns main.TRUE if a port-forward session was created or main.FALSE on Error
 
 
         """
@@ -1341,8 +1382,11 @@
                         portsList )
             main.log.info( self.name + ": sending: " + repr( cmdStr ) )
             self.handle.sendline( cmdStr )
+            self.handle.expect( "pod/%s" % podName )
+            output = self.handle.before + self.handle.after
             i = self.handle.expect( [ "not found", "error", "closed/timedout",
                                       self.prompt, "The connection to the server", "Forwarding from" ] )
+            output += self.handle.before + str( self.handle.after )
             # NOTE: This won't clear the buffer entirely, and each time the port forward
             #       is used, another line will be added to the buffer. We need to make
             #       sure we clear the buffer before using this component again.
@@ -1350,10 +1394,11 @@
             if i == 5:
                 # Setup preDisconnect function
                 self.preDisconnect = self.exitFromProcess
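+                # Save the ports so checkPortForward() can re-establish this session later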
+                self.portForwardList = portsList
                 return main.TRUE
             else:
                 main.log.error( self.name + ": Error executing command" )
-                main.log.debug( self.name + ": " + self.handle.before + str( self.handle.after ) )
+                main.log.debug( self.name + ": " + output )
                 return main.FALSE
         except pexpect.EOF:
             main.log.error( self.name + ": EOF exception found" )
@@ -1367,6 +1412,132 @@
             main.log.exception( self.name + ": Uncaught exception!" )
             return main.FALSE
 
+    def checkPortForward( self, podName, portsList=None, kubeconfig=None, namespace=None ):
+        """
+        Check that the kubectl port-forward session is still active and restart it if it was closed.
+
+
+        Required Arguments:
+        - podName: The name of the pod as a string
+        Optional Arguments:
+        - portsList: The list of ports to forward, as a string. See kubectl help for details.
+                     Defaults to the ports used in the last port-forward session on this node.
+        - kubeconfig: The path to a kubeconfig file
+        - namespace: The namespace to search in
+        Returns main.TRUE if a port-forward session was created or is still active, main.FALSE on Error
+
+
+        """
+        try:
+            if not portsList:
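+                # Default to the ports used when kubectlPortForward was last called on this node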
+                portsList = self.portForwardList
+            self.handle.sendline( "" )
+            i = self.handle.expect( [ self.prompt, pexpect.TIMEOUT ], timeout=5 )
+            output = self.handle.before + str( self.handle.after )
+            main.log.debug( "%s: %s" % ( self.name, output ) )
+            if i == 0:
+                # We are not currently in a port-forwarding session, try to re-establish.
+                return self.kubectlPortForward( podName, portsList, kubeconfig, namespace )
+            elif i == 1:
+                # Still in a command, port-forward is probably still active
+                return main.TRUE
+        except pexpect.EOF:
+            main.log.error( self.name + ": EOF exception found" )
+            main.log.error( self.name + ":     " + self.handle.before )
+            return main.FALSE
+        except pexpect.TIMEOUT:
+            main.log.exception( self.name + ": TIMEOUT exception found" )
+            main.log.error( self.name + ":    " + self.handle.before )
+            return main.FALSE
+        except Exception:
+            main.log.exception( self.name + ": Uncaught exception!" )
+            return main.FALSE
+
+    def kubectlCordonNode( self, nodeName, kubeconfig=None, namespace=None, timeout=240, uncordonOnDisconnect=True ):
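+        """
+        Use kubectl to cordon a k8s node so that no new pods are scheduled on it
+        Required Arguments:
+        - nodeName: The name of the k8s node as a string
+        Optional Arguments:
+        - kubeconfig: The path to a kubeconfig file
+        - namespace: The namespace to search in
+        - timeout: Seconds to wait for the command to complete
+        - uncordonOnDisconnect: If True, register a preDisconnect handler that
+                                uncordons the node when this component disconnects
+        Returns main.TRUE if the node is cordoned or was already cordoned, main.FALSE on Error
+        """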
+        try:
+            cmdStr = "kubectl %s %s cordon %s" % (
+                "--kubeconfig %s" % kubeconfig if kubeconfig else "",
+                "-n %s" % namespace if namespace else "",
+                nodeName )
+            main.log.info( self.name + ": sending: " + repr( cmdStr ) )
+            if uncordonOnDisconnect:
+                self.nodeName = nodeName
+                if kubeconfig:
+                    self.kubeconfig = kubeconfig
+                if namespace:
+                    self.namespace = namespace
+                self.preDisconnect = self.kubectlUncordonNode
+            self.handle.sendline( cmdStr )
+            i = self.handle.expect( [ "not found", "error",
+                                      "The connection to the server",
+                                      "node/%s cordoned" % nodeName,
+                                      "node/%s already cordoned" % nodeName, ],
+                                    timeout=timeout )
+            if i == 3 or i == 4:
+                output = self.handle.before + self.handle.after
+                main.log.debug( self.name + ": " + output )
+                self.clearBuffer()
+                return main.TRUE
+            else:
+                main.log.error( self.name + ": Error executing command" )
+                main.log.debug( self.name + ": " + self.handle.before + str( self.handle.after ) )
+                self.clearBuffer()
+                return main.FALSE
+        except pexpect.EOF:
+            main.log.error( self.name + ": EOF exception found" )
+            main.log.error( self.name + ":     " + self.handle.before )
+            return main.FALSE
+        except pexpect.TIMEOUT:
+            main.log.exception( self.name + ": TIMEOUT exception found" )
+            main.log.error( self.name + ":    " + self.handle.before )
+            self.clearBuffer()
+            return main.FALSE
+        except Exception:
+            main.log.exception( self.name + ": Uncaught exception!" )
+            return main.FALSE
+
+    def kubectlUncordonNode( self, nodeName=None, kubeconfig=None, namespace=None, timeout=240 ):
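+        """
+        Use kubectl to uncordon a k8s node so that pods can be scheduled on it again
+        Optional Arguments:
+        - nodeName: The name of the k8s node as a string. Defaults to the node
+                    cordoned by kubectlCordonNode on this component
+        - kubeconfig: The path to a kubeconfig file
+        - namespace: The namespace to search in
+        - timeout: Seconds to wait for the command to complete
+        Returns main.TRUE if the node is uncordoned or was already uncordoned, main.FALSE on Error
+        """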
+        try:
+            if not nodeName:
+                nodeName = getattr( self, "nodeName" )
+            if not kubeconfig:
+                kubeconfig = getattr( self, "kubeconfig", None )
+            if not namespace:
+                namespace = getattr( self, "namespace", None )
+            cmdStr = "kubectl %s %s uncordon %s" % (
+                "--kubeconfig %s" % kubeconfig if kubeconfig else "",
+                "-n %s" % namespace if namespace else "",
+                nodeName )
+            main.log.info( self.name + ": sending: " + repr( cmdStr ) )
+            self.handle.sendline( cmdStr )
+            i = self.handle.expect( [ "not found", "error",
+                                      "The connection to the server",
+                                      "node/%s uncordoned" % nodeName,
+                                      "node/%s already uncordoned" % nodeName, ],
+                                    timeout=timeout )
+            if i == 3 or i == 4:
+                output = self.handle.before + self.handle.after
+                main.log.debug( self.name + ": " + output )
+                self.clearBuffer()
+                return main.TRUE
+            else:
+                main.log.error( self.name + ": Error executing command" )
+                main.log.debug( self.name + ": " + self.handle.before + str( self.handle.after ) )
+                self.clearBuffer()
+                return main.FALSE
+        except pexpect.EOF:
+            main.log.error( self.name + ": EOF exception found" )
+            main.log.error( self.name + ":     " + self.handle.before )
+            return main.FALSE
+        except pexpect.TIMEOUT:
+            main.log.exception( self.name + ": TIMEOUT exception found" )
+            main.log.error( self.name + ":    " + self.handle.before )
+            self.clearBuffer()
+            return main.FALSE
+        except Exception:
+            main.log.exception( self.name + ": Uncaught exception!" )
+            return main.FALSE
+
     def kubectlDeletePod( self, podName, kubeconfig=None, namespace=None, timeout=240 ):
         try:
             cmdStr = "kubectl %s %s delete pod %s" % (
@@ -1411,7 +1582,7 @@
             self.handle.sendline( cmdStr )
             # Since the command contains the prompt ($), we first expect for the
             # last part of the command and then we expect the actual values
-            self.handle.expect("grep --color=never %s" % podName, timeout=1)
+            self.handle.expect( "grep --color=never %s" % podName, timeout=1 )
             i = self.handle.expect( [ podName + " ready",
                                       self.prompt ],
                                     timeout=timeout )
@@ -1445,4 +1616,4 @@
                 self.handle.expect( self.prompt, timeout=5 )
                 response += self.cleanOutput( self.handle.before )
             except pexpect.TIMEOUT:
-                return response
\ No newline at end of file
+                return response
diff --git a/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.params b/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.params
index b582718..4217583 100644
--- a/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.params
+++ b/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.params
@@ -91,6 +91,7 @@
         <TrafficDiscovery>10</TrafficDiscovery>
     </timers>
 
+
     <SLEEP>
         <startup>10</startup>
     </SLEEP>
diff --git a/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.params.tucson b/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.params.tucson
new file mode 100644
index 0000000..5e69e79
--- /dev/null
+++ b/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.params.tucson
@@ -0,0 +1,92 @@
+<PARAMS>
+    <testcases>2</testcases>
+
+    <GRAPH>
+        <nodeCluster>pairedleaves</nodeCluster>
+        <builds>20</builds>
+        <jobName>SRpairedLeaves</jobName>
+        <branch>master</branch>
+    </GRAPH>
+
+    <SCALE>
+        <size>3</size>
+        <max>3</max>
+    </SCALE>
+
+    <DEPENDENCY>
+        <useCommonConf>False</useCommonConf>
+        <useCommonTopo>True</useCommonTopo>
+        <useBmv2>True</useBmv2>
+        <bmv2SwitchType>stratum</bmv2SwitchType>
+        <switchPrefix></switchPrefix>
+        <stratumRoot>~/stratum</stratumRoot>
+        <topology>trellis_fabric.py</topology>
+        <lib>routinglib.py,trellislib.py,stratum.py</lib>
+    </DEPENDENCY>
+
+    <persistent_setup>True</persistent_setup>
+
+    <use_stern>True</use_stern>
+
+    <kubernetes>
+        <appName>onos-classic</appName>
+        <namespace>tost</namespace>
+    </kubernetes>
+
+    <PERF>
+        <traffic_host>Compute1 Compute2</traffic_host>
+        <pcap_host>Compute3</pcap_host>
+        <pcap_cmd_arguments>-t e -F pcap -s 100 </pcap_cmd_arguments>
+        <iterations>1</iterations>
+        <topo>
+            <leaf1>
+                <ports>176 180 184 188</ports>
+                <note>eNB</note>
+            </leaf1>
+            <leaf2>
+                <ports>260 268 276 284</ports>
+                <note>upstream</note>
+            </leaf2>
+        </topo>
+    </PERF>
+    <ONOS_Logging>
+        <org.onosproject.segmentrouting>DEBUG</org.onosproject.segmentrouting>
+    </ONOS_Logging>
+    <ONOS_Logging_Reset>
+        <org.onosproject.segmentrouting>DEBUG</org.onosproject.segmentrouting>
+    </ONOS_Logging_Reset>
+
+
+    <ENV>
+        <cellName>productionCell</cellName>
+        <cellApps>drivers,fpm,lldpprovider,hostprovider,netcfghostprovider,drivers.bmv2,org.opencord.fabric-tofino,pipelines.fabric,org.stratumproject.fabric-tna,drivers.barefoot,segmentrouting,t3</cellApps>
+    </ENV>
+
+    <EXTERNAL_APPS>
+    </EXTERNAL_APPS>
+
+    <CTRL>
+        <port>6653</port>
+    </CTRL>
+
+    <timers>
+        <LinkDiscovery>12</LinkDiscovery>
+        <SwitchDiscovery>12</SwitchDiscovery>
+        <TrafficDiscovery>13</TrafficDiscovery>
+    </timers>
+
+    <restartRounds>2</restartRounds>
+
+    <SLEEP>
+        <startup>10</startup>
+    </SLEEP>
+
+    <TOPO>
+        <switchNum>2</switchNum>
+        <linkNum>2</linkNum>
+    </TOPO>
+
+    <ALARM>
+        <minPassPercent>100</minPassPercent>
+    </ALARM>
+</PARAMS>
diff --git a/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.py b/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.py
index fc3c44c..33aad64 100644
--- a/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.py
+++ b/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.py
@@ -12,4 +12,56 @@
         Perform rolling ONOS failure/recovery test
         Collect logs and analyze results
         """
-        pass
+        try:
+            from tests.USECASE.SegmentRouting.SRStaging.dependencies.SRStagingTest import SRStagingTest
+            import json
+        except ImportError:
+            main.log.error( "SRStagingTest not found. Exiting the test" )
+            main.cleanAndExit()
+        try:
+            main.funcs
+        except ( NameError, AttributeError ):
+            main.funcs = SRStagingTest()
+
+        descPrefix = "Rolling ONOS Restart"
+        pod = main.params[ 'GRAPH' ].get( 'nodeCluster', "hardware" )
+        main.funcs.setupTest( main,
+                              topology='0x2',
+                              onosNodes=3,
+                              description="%s tests on the %s pod" % ( descPrefix, pod ) )
+        switches = int( main.params[ 'TOPO' ][ 'switchNum' ] )
+        links = int( main.params[ 'TOPO' ][ 'linkNum' ] )
+        hosts = [ 'h1', 'h2', 'h3', 'mgmt' ]
+
+        clusterSize = main.Cluster.numCtrls
+        restartRounds = int( main.params.get( 'restartRounds', 1 ) )
+
+        def verifications( main, switches, links, hosts ):
+            """
+            Checks to perform before and after each ONOS node event
+            All asserts should happen within this function
+            """
+            from tests.USECASE.SegmentRouting.dependencies.Testcaselib import Testcaselib as run
+            run.verifyTopology( main, switches, links, main.Cluster.numCtrls )
+            run.pingAllFabricIntfs( main, hosts, dumpFlows=False )
+            run.verifyPing( main, hosts, hosts )
+        verifications( main, switches, links, hosts )
+        # TODO ADD control plane checks: nodes, flows, ...
+        # TODO: Mastership check? look at HA Test
+        # TODO: Any specific fabric checks? APP commands?
+
+        for i in range( 0, clusterSize * restartRounds ):
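+            # Cycle through the controllers round-robin, restarting each node restartRounds times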
+            n = i % clusterSize
+            ctrl = main.Cluster.getControllers( n )
+
+            longDesc = "%s - kill %s" % ( descPrefix, ctrl.name )
+            # TODO: verify flow isn't interrupted
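+            # onosDown returns the name of the cordoned k8s node (None if preventRestart is False)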
+            node = main.funcs.onosDown( main, ctrl, preventRestart=True )
+            verifications( main, switches, links, hosts )
+            main.funcs.onosUp( main, node, ctrl )
+            verifications( main, switches, links, hosts )
+        # Cleanup
+        main.log.warn( json.dumps( main.downtimeResults, indent=4, sort_keys=True ) )
+        main.funcs.cleanup( main )
diff --git a/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.topo.tucson b/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.topo.tucson
new file mode 100644
index 0000000..07c69b4
--- /dev/null
+++ b/TestON/tests/USECASE/SegmentRouting/SRStaging/SRrollingRestart/SRrollingRestart.topo.tucson
@@ -0,0 +1,192 @@
+<TOPOLOGY>
+    <COMPONENT>
+        <ONOScell>
+            <host>localhost</host>  # ONOS "bench" machine
+            <user>jenkins</user>
+            <password></password>
+            <type>OnosClusterDriver</type>
+            <connect_order>50</connect_order>
+            <jump_host></jump_host>
+            <home>~/onos</home>   # defines where onos home is on the build machine. Defaults to "~/onos/" if empty.
+            <COMPONENTS>
+                <kubeConfig>~/.kube/dev-pairedleaves-tucson</kubeConfig>  # If set, will attempt to use this file for setting up port-forwarding
+                <useDocker>True</useDocker>  # Whether to use docker for ONOS nodes
+                <docker_prompt>\$</docker_prompt>
+                <cluster_name></cluster_name>  # Used as a prefix for cluster components. Defaults to 'ONOS'
+                <diff_clihost>True</diff_clihost>  # Set to True if the CLI runs on a different host than localhost. True or empty. OC# will be used if True.
+                <karaf_username>karaf</karaf_username>
+                <karaf_password>karaf</karaf_password>
+                <web_user>karaf</web_user>
+                <web_pass>karaf</web_pass>
+                <karafPrompt_username>karaf</karafPrompt_username>
+                <rest_port></rest_port>
+                <prompt></prompt>  # TODO: we technically need a few of these, one per component
+                <onos_home>~/onos/</onos_home>  # defines where onos home is on the target cell machine. Defaults to entry in "home" if empty.
+                <nodes> 3 </nodes>  # number of nodes in the cluster
+            </COMPONENTS>
+        </ONOScell>
+
+        <Leaf1>
+            <host>10.76.28.70</host>
+            <user>root</user>
+            <password>onl</password>
+            <type>StratumOSSwitchDriver</type>
+            <connect_order>12</connect_order>
+            <jump_host></jump_host>
+            <COMPONENTS>
+                <shortName>leaf1</shortName>
+                <port1></port1>
+                <link1></link1>
+                <port2></port2>
+                <link2></link2>
+                <onosConfigPath></onosConfigPath>
+                <onosConfigFile></onosConfigFile>
+            </COMPONENTS>
+        </Leaf1>
+
+        <Leaf2>
+            <host>10.76.28.71</host>
+            <user>root</user>
+            <password>onl</password>
+            <type>StratumOSSwitchDriver</type>
+            <connect_order>13</connect_order>
+            <jump_host></jump_host>
+            <COMPONENTS>
+                <shortName>leaf2</shortName>
+                <port1></port1>
+                <link1></link1>
+                <port2></port2>
+                <link2></link2>
+                <onosConfigPath></onosConfigPath>
+                <onosConfigFile></onosConfigFile>
+            </COMPONENTS>
+        </Leaf2>
+
+        <Compute1>
+            <host>10.76.28.74</host>
+            <user>jenkins</user>
+            <password></password>
+            <type>HostDriver</type>
+            <connect_order>6</connect_order>
+            <jump_host></jump_host>
+            <COMPONENTS>
+                <mac></mac>
+                <inband>false</inband>
+                <dhcp>True</dhcp>
+                <ip>10.32.11.2</ip>
+                <shortName>h1</shortName>
+                <port1></port1>
+                <link1></link1>
+                <ifaceName>pairbond</ifaceName>
+                <routes>
+                    <route1>
+                        <network></network>
+                        <netmask></netmask>
+                        <gw></gw>
+                        <interface></interface>
+                    </route1>
+                </routes>
+                <sudo_required>true</sudo_required>
+                <scapy_path>/usr/bin/scapy</scapy_path>
+            </COMPONENTS>
+        </Compute1>
+
+        <Compute2>
+            <host>10.76.28.72</host>
+            <user>jenkins</user>
+            <password></password>
+            <type>HostDriver</type>
+            <connect_order>7</connect_order>
+            <jump_host></jump_host>
+            <COMPONENTS>
+                <mac></mac>
+                <inband>false</inband>
+                <dhcp>True</dhcp>
+                <ip>10.32.11.3</ip>
+                <shortName>h2</shortName>
+                <port1></port1>
+                <link1></link1>
+                <ifaceName>pairbond</ifaceName>
+                <routes>
+                    <route1>
+                        <network></network>
+                        <netmask></netmask>
+                        <gw></gw>
+                        <interface></interface>
+                    </route1>
+                </routes>
+                <sudo_required>true</sudo_required>
+                <scapy_path>/usr/bin/scapy</scapy_path>
+            </COMPONENTS>
+        </Compute2>
+
+        <Compute3>
+            <host>10.76.28.68</host>
+            <user>jenkins</user>
+            <password></password>
+            <type>HostDriver</type>
+            <connect_order>8</connect_order>
+            <jump_host></jump_host>
+            <COMPONENTS>
+                <mac></mac>
+                <inband>false</inband>
+                <dhcp>True</dhcp>
+                <ip>10.32.11.194</ip>
+                <shortName>h3</shortName>
+                <port1></port1>
+                <link1></link1>
+                <ifaceName>eno2</ifaceName>
+                <routes>
+                    <route1>
+                        <network></network>
+                        <netmask></netmask>
+                        <gw></gw>
+                        <interface></interface>
+                    </route1>
+                </routes>
+                <sudo_required>true</sudo_required>
+                <scapy_path>/usr/bin/scapy</scapy_path>
+            </COMPONENTS>
+        </Compute3>
+
+        <ManagmentServer>
+            <host>10.76.28.66</host>
+            <user>jenkins</user>
+            <password></password>
+            <type>HostDriver</type>
+            <connect_order>1</connect_order>
+            <COMPONENTS>
+                <mac></mac>
+                <inband>false</inband>
+                <dhcp>True</dhcp>
+                <ip>10.32.11.1</ip>
+                <shortName>mgmt</shortName>
+                <port1></port1>
+                <link1></link1>
+                <ifaceName>pairbond</ifaceName>
+                <routes>
+                    <route1>
+                        <network></network>
+                        <netmask></netmask>
+                        <gw></gw>
+                        <interface></interface>
+                    </route1>
+                </routes>
+                <sudo_required>true</sudo_required>
+                <scapy_path>/usr/bin/scapy</scapy_path>
+
+            </COMPONENTS>
+        </ManagmentServer>
+
+        <NetworkBench>
+            <host>10.76.28.66</host>
+            <user>jenkins</user>
+            <password></password>
+            <type>NetworkDriver</type>
+            <connect_order>1</connect_order>
+            <COMPONENTS>
+            </COMPONENTS>
+        </NetworkBench>
+
+    </COMPONENT>
+</TOPOLOGY>
diff --git a/TestON/tests/USECASE/SegmentRouting/SRStaging/dependencies/SRStagingTest.py b/TestON/tests/USECASE/SegmentRouting/SRStaging/dependencies/SRStagingTest.py
index 70fec33..5ce8b0d 100644
--- a/TestON/tests/USECASE/SegmentRouting/SRStaging/dependencies/SRStagingTest.py
+++ b/TestON/tests/USECASE/SegmentRouting/SRStaging/dependencies/SRStagingTest.py
@@ -1027,9 +1027,10 @@
                 main.log.warn( "Did not find a specific switch pod to kill" )
             startTime = time.time()
             # Delete pod
-            main.ONOSbench.handle.sendline( "kubectl --kubeconfig %s delete pod -n %s %s" % ( kubeConfig, namespace, output[0] ) )
-            main.ONOSbench.handle.expect( main.ONOSbench.prompt )
-            main.log.debug( repr( main.ONOSbench.handle.before ) + repr( main.ONOSbench.handle.after ) )
+            deleted = main.ONOSbench.kubectlDeletePod( output[0], kubeConfig, namespace )
+            utilities.assert_equals( expect=main.TRUE, actual=deleted,
+                                     onpass="Successfully deleted switch pod",
+                                     onfail="Failed to delete switch pod" )
             # TODO ASSERTS
             main.log.info( "Sleeping %s seconds" % sleepTime )
             time.sleep( sleepTime )
@@ -1092,15 +1093,95 @@
             main.log.exception( "Error in killSwitchAgent" )
 
     @staticmethod
-    def onosDown():
+    def onosDown( main, controller, preventRestart=False ):
+        """
+        Brings down an ONOS kubernetes pod. If preventRestart, will attempt to prevent
+        it from coming back on that node by adding a taint.
+        Returns the nodeName of the pod that was killed
+        """
         try:
-            pass
+            # Get pod name to delete
+            podName = controller.k8s.podName
+            kubeConfig = main.Cluster.active( 0 ).k8s.kubeConfig
+            namespace = main.params[ 'kubernetes' ][ 'namespace' ]
+            if preventRestart:
+                # Cordon off the node so no more pods will be scheduled
+                k8sNode = controller.Bench.kubectlGetPodNode( podName,
+                                                              kubeconfig=kubeConfig,
+                                                              namespace=namespace )
+                main.step( "Cordon off k8s node %s, which is hosting onos k8s pod %s" % ( k8sNode,
+                                                                                          controller.name ) )
+                cordoned = controller.Bench.kubectlCordonNode( k8sNode,
+                                                               kubeconfig=kubeConfig,
+                                                               namespace=namespace )
+                utilities.assert_equals( expect=main.TRUE, actual=cordoned,
+                                         onpass="Successfully cordoned k8s node",
+                                         onfail="Failed to cordon off k8s node" )
+                controller.active = False
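+                # Update the cluster bookkeeping so later checks only target active nodes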
+                main.Cluster.setRunningNode( main.Cluster.getRunningPos() )
+            else:
+                k8sNode = None
+            main.step( "Delete onos k8s pod %s" % controller.name )
+            #startTime = time.time()
+            # Delete pod
+            deleted = controller.Bench.kubectlDeletePod( podName, kubeConfig, namespace )
+            utilities.assert_equals( expect=main.TRUE, actual=deleted,
+                                     onpass="Successfully deleted onos pod",
+                                     onfail="Failed to delete onos pod" )
+            return k8sNode
         except SkipCase:
             raise
         except Exception:
             main.log.exception( "Error in onosDown" )
 
     @staticmethod
+    def onosUp( main, k8sNode, controller ):
+        """
+        Brings up an ONOS kubernetes pod by uncordoning the node
+        """
+        try:
+            kubeConfig = main.Cluster.active( 0 ).k8s.kubeConfig
+            namespace = main.params[ 'kubernetes' ][ 'namespace' ]
+            podName = controller.k8s.podName
+            # Uncordon the node so pod will be scheduled
+            main.step( "Uncordon k8s node %s, which is hosting onos k8s pod %s" % ( k8sNode,
+                                                                                    controller.name ) )
+            #startTime = time.time()
+            uncordoned = controller.Bench.kubectlUncordonNode( k8sNode,
+                                                               kubeconfig=kubeConfig,
+                                                               namespace=namespace )
+            utilities.assert_equals( expect=main.TRUE, actual=uncordoned,
+                                     onpass="Successfully uncordoned k8s node",
+                                     onfail="Failed to uncordon k8s node" )
+
+            # Check pod is ready
+            main.step( "Wait for ONOS pod to restart" )
+            ready = utilities.retry( controller.Bench.kubectlCheckPodReady,
+                                     main.FALSE,
+                                     kwargs={ "podName": podName,
+                                              "kubeconfig": kubeConfig,
+                                              "namespace": namespace },
+                                     attempts=50,
+                                     getRetryingTime=True )
+            utilities.assert_equals( expect=main.TRUE, actual=ready,
+                                     onpass="Successfully restarted onos pod",
+                                     onfail="Failed to restart onos pod" )
+            controller.active = True
+            # Set all nodes as "running", then reduce to only "active" nodes
+            main.Cluster.runningNodes = main.Cluster.controllers
+            main.Cluster.setRunningNode( main.Cluster.getRunningPos() )
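+            # Re-establish the port-forward session to the restarted pod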
+            controller.k8s.clearBuffer()
+            controller.k8s.kubectlPortForward( podName,
+                                               controller.k8s.portForwardList,
+                                               kubeConfig,
+                                               namespace )
+            #stopTime = time.time()
+        except SkipCase:
+            raise
+        except Exception:
+            main.log.exception( "Error in onosUp" )
+
+    @staticmethod
     def analyzeIperfPcap( main, pcapFile, filterStr, timeout=240, pingOnly=False ):
         """
         Given a pcap file, will use tshark to create a csv file with iperf fields.
@@ -1196,7 +1277,7 @@
             except SkipCase:
                 raise
             except Exception:
-                main.log.exception( "Error in onosDown" )
+                main.log.exception( "Error in analyzePcap" )
                 return -1
             # Remove first and last packets, sometimes there can be a long gap between
             # these and the other packets