Merge "Add HA test for restoring from and offline backup"
diff --git a/TestON/drivers/common/cli/onosdriver.py b/TestON/drivers/common/cli/onosdriver.py
index f8c77e9..5b12d0c 100755
--- a/TestON/drivers/common/cli/onosdriver.py
+++ b/TestON/drivers/common/cli/onosdriver.py
@@ -2476,3 +2476,67 @@
         except Exception:
             main.log.exception( self.name + ": Uncaught exception!" )
             main.cleanAndExit()
+
+    def backupData( self, location ):
+        """
+            Backs up ONOS data and logs to a given location. Returns main.FALSE
+            if there is an error executing the command, and main.TRUE otherwise.
+            required arguments:
+            location - The file path to save the backup to
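+            Example ( the component handle and path are illustrative ):
+                ctrl.server.backupData( "/tmp/HAbackupRecover.tar.gz" )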
+        """
+        try:
+            cmd = "/opt/onos/bin/onos-backup " + str( location )
+            self.handle.sendline( cmd )
+            self.handle.expect( self.prompt )
+            handle = self.handle.before
+            main.log.debug( handle )
+            assert handle is not None, "Error in sendline"
+            assert "Command not found:" not in handle, handle
+            assert "Error" not in handle, handle
+            assert "Exception:" not in handle, handle
+            return main.TRUE
+        except AssertionError:
+            main.log.exception( "{} Error in onos-backup output:".format( self.name ) )
+            return main.FALSE
+        except TypeError:
+            main.log.exception( self.name + ": Object not as expected" )
+            return main.FALSE
+        except pexpect.EOF:
+            main.log.error( self.name + ": EOF exception found" )
+            main.log.error( self.name + ":    " + self.handle.before )
+            main.cleanAndExit()
+        except Exception:
+            main.log.exception( self.name + ": Uncaught exception!" )
+            main.cleanAndExit()
+
+    def restoreData( self, location ):
+        """
+            Restores ONOS data and logs from a given location. Returns main.FALSE
+            if there is an error executing the command, and main.TRUE otherwise.
+            required arguments:
+            location - The file path of a backup file
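+            Example ( the component handle and path are illustrative ):
+                ctrl.server.restoreData( "/tmp/HAbackupRecover.tar.gz" )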
+        """
+        try:
+            cmd = "/opt/onos/bin/onos-restore " + str( location )
+            self.handle.sendline( cmd )
+            self.handle.expect( self.prompt )
+            handle = self.handle.before
+            main.log.debug( handle )
+            assert handle is not None, "Error in sendline"
+            assert "Command not found:" not in handle, handle
+            assert "Error" not in handle, handle
+            assert "Exception:" not in handle, handle
+            return main.TRUE
+        except AssertionError:
+            main.log.exception( "{} Error in onos-restore output:".format( self.name ) )
+            return main.FALSE
+        except TypeError:
+            main.log.exception( self.name + ": Object not as expected" )
+            return main.FALSE
+        except pexpect.EOF:
+            main.log.error( self.name + ": EOF exception found" )
+            main.log.error( self.name + ":    " + self.handle.before )
+            main.cleanAndExit()
+        except Exception:
+            main.log.exception( self.name + ": Uncaught exception!" )
+            main.cleanAndExit()
diff --git a/TestON/tests/HA/HAbackupRecover/HAbackupRecover.params b/TestON/tests/HA/HAbackupRecover/HAbackupRecover.params
new file mode 100644
index 0000000..7091a1d
--- /dev/null
+++ b/TestON/tests/HA/HAbackupRecover/HAbackupRecover.params
@@ -0,0 +1,96 @@
+<PARAMS>
+    #CASE1: Compile ONOS and push it to the test machines
+    #CASE2: Assign devices to controllers
+    #CASE21: Assign mastership to controllers
+    #CASE3: Assign intents
+    #CASE4: Ping across added host intents
+    #CASE5: Reading state of ONOS
+    #CASE6: The Failure case.
+    #CASE7: Check state after control plane failure
+    #CASE8: Compare topo
+    #CASE9: Link s3-s28 down
+    #CASE10: Link s3-s28 up
+    #CASE11: Switch down
+    #CASE12: Switch up
+    #CASE13: Clean up
+    #CASE14: start election app on all onos nodes
+    #CASE15: Check that Leadership Election is still functional
+    #CASE16: Install Distributed Primitives app
+    #CASE17: Check for basic functionality with distributed primitives
+    <testcases>1,2,8,[21,3,8,4,5,14,16,17]*1,[6],8,[3,7,4,15,17,9,8,4,10,8,4,11,8,4,12,8,4]*1,13</testcases>
+
+    <GRAPH>
+        <nodeCluster>VM</nodeCluster>
+        <builds>20</builds>
+    </GRAPH>
+
+    <apps></apps>
+    <ONOS_Configuration>
+        <org.onosproject.net.intent.impl.compiler.IntentConfigurableRegistrator>
+            <useFlowObjectives>false</useFlowObjectives>
+            <defaultFlowObjectiveCompiler>org.onosproject.net.intent.impl.compiler.LinkCollectionIntentObjectiveCompiler</defaultFlowObjectiveCompiler>
+        </org.onosproject.net.intent.impl.compiler.IntentConfigurableRegistrator>
+    </ONOS_Configuration>
+    <ENV>
+        <cellName>HA</cellName>
+        <appString>drivers,openflow,proxyarp,mobility,events</appString>
+    </ENV>
+    <GIT>
+        <pull>False</pull>
+        <branch>master</branch>
+    </GIT>
+    <num_controllers> 7 </num_controllers>
+    <tcpdump> False </tcpdump>
+
+    <CTRL>
+        <port1>6653</port1>
+        <port2>6653</port2>
+        <port3>6653</port3>
+        <port4>6653</port4>
+        <port5>6653</port5>
+        <port6>6653</port6>
+        <port7>6653</port7>
+    </CTRL>
+    <BACKUP>
+        <ENABLED> False </ENABLED>
+        <TESTONUSER>sdn</TESTONUSER>
+        <TESTONIP>10.128.30.9</TESTONIP>
+    </BACKUP>
+    <PING>
+        <source1>h8</source1>
+        <source2>h9</source2>
+        <source3>h10</source3>
+        <source4>h11</source4>
+        <source5>h12</source5>
+        <source6>h13</source6>
+        <source7>h14</source7>
+        <source8>h15</source8>
+        <source9>h16</source9>
+        <source10>h17</source10>
+        <target1>10.0.0.18</target1>
+        <target2>10.0.0.19</target2>
+        <target3>10.0.0.20</target3>
+        <target4>10.0.0.21</target4>
+        <target5>10.0.0.22</target5>
+        <target6>10.0.0.23</target6>
+        <target7>10.0.0.24</target7>
+        <target8>10.0.0.25</target8>
+        <target9>10.0.0.26</target9>
+        <target10>10.0.0.27</target10>
+    </PING>
+    <timers>
+        <LinkDiscovery>12</LinkDiscovery>
+        <SwitchDiscovery>12</SwitchDiscovery>
+        <gossip>5</gossip>
+    </timers>
+    <kill>
+        <switch> s5 </switch>
+        <dpid> 0000000000005000 </dpid>
+        <links> h5 s2 s1 s6 </links>
+    </kill>
+    <MNtcpdump>
+        <intf>eth0</intf>
+        <port> </port>
+        <folder>~/packet_captures/</folder>
+    </MNtcpdump>
+</PARAMS>
diff --git a/TestON/tests/HA/HAbackupRecover/HAbackupRecover.py b/TestON/tests/HA/HAbackupRecover/HAbackupRecover.py
new file mode 100644
index 0000000..5e9d2bc
--- /dev/null
+++ b/TestON/tests/HA/HAbackupRecover/HAbackupRecover.py
@@ -0,0 +1,323 @@
+"""
+Copyright 2018 Open Networking Foundation ( ONF )
+
+Please refer questions to either the onos test mailing list at <onos-test@onosproject.org>,
+the System Testing Plans and Results wiki page at <https://wiki.onosproject.org/x/voMg>,
+or the System Testing Guide page at <https://wiki.onosproject.org/x/WYQg>
+
+    TestON is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 2 of the License, or
+    ( at your option ) any later version.
+
+    TestON is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with TestON.  If not, see <http://www.gnu.org/licenses/>.
+"""
+"""
+Description: This test is to determine if ONOS can handle
+    restarting all of its nodes after restoring from an offline backup
+
+List of test cases:
+CASE1: Compile ONOS and push it to the test machines
+CASE2: Assign devices to controllers
+CASE21: Assign mastership to controllers
+CASE3: Assign intents
+CASE4: Ping across added host intents
+CASE5: Reading state of ONOS
+CASE6: The Failure case.
+CASE7: Check state after control plane failure
+CASE8: Compare topo
+CASE9: Link s3-s28 down
+CASE10: Link s3-s28 up
+CASE11: Switch down
+CASE12: Switch up
+CASE13: Clean up
+CASE14: start election app on all onos nodes
+CASE15: Check that Leadership Election is still functional
+CASE16: Install Distributed Primitives app
+CASE17: Check for basic functionality with distributed primitives
+"""
+class HAbackupRecover:
+
+    def __init__( self ):
+        self.default = ''
+
+    def CASE1( self, main ):
+        """
+        CASE1 is to compile ONOS and push it to the test machines
+
+        Startup sequence:
+        cell <name>
+        onos-verify-cell
+        NOTE: temporary - onos-remove-raft-logs
+        onos-uninstall
+        start mininet
+        git pull
+        mvn clean install
+        onos-package
+        onos-install -f
+        onos-wait-for-start
+        start cli sessions
+        start tcpdump
+        """
+        main.log.info( "ONOS HA test: Restart all ONOS nodes - " +
+                         "initialization" )
+        # These are for csv plotting in jenkins
+        main.HAlabels = []
+        main.HAdata = []
+        try:
+            from tests.dependencies.ONOSSetup import ONOSSetup
+            main.testSetUp = ONOSSetup()
+        except ImportError:
+            main.log.error( "ONOSSetup not found exiting the test" )
+            main.cleanAndExit()
+        main.testSetUp.envSetupDescription()
+        try:
+            from tests.HA.dependencies.HA import HA
+            main.HA = HA()
+            # load some variables from the params file
+            cellName = main.params[ 'ENV' ][ 'cellName' ]
+            main.apps = main.params[ 'ENV' ][ 'appString' ]
+            stepResult = main.testSetUp.envSetup()
+        except Exception as e:
+            main.testSetUp.envSetupException( e )
+        main.testSetUp.evnSetupConclusion( stepResult )
+
+        main.testSetUp.ONOSSetUp( main.Cluster, cellName=cellName, removeLog=True,
+                                  extraApply=main.HA.startingMininet )
+
+        main.HA.initialSetUp()
+
+    def CASE2( self, main ):
+        """
+        Assign devices to controllers
+        """
+        main.HA.assignDevices( main )
+
+    def CASE21( self, main ):
+        """
+        Assign mastership to controllers
+        """
+        main.HA.assignMastership( main )
+
+    def CASE3( self, main ):
+        """
+        Assign intents
+        """
+        main.HA.assignIntents( main )
+
+    def CASE4( self, main ):
+        """
+        Ping across added host intents
+        """
+        main.HA.pingAcrossHostIntent( main )
+
+    def CASE5( self, main ):
+        """
+        Reading state of ONOS
+        """
+        main.HA.readingState( main )
+
+    def CASE6( self, main ):
+        """
+        The Failure case.
+        """
+        import time
+        assert main, "main not defined"
+        assert utilities.assert_equals, "utilities.assert_equals not defined"
+        try:
+            main.HAlabels
+        except ( NameError, AttributeError ):
+            main.log.error( "main.HAlabels not defined, setting to []" )
+            main.HAlabels = []
+        try:
+            main.HAdata
+        except ( NameError, AttributeError ):
+            main.log.error( "main.HAdata not defined, setting to []" )
+            main.HAdata = []
+
+        main.case( "Restart entire ONOS cluster with backed up state" )
+
+        main.step( "Backup ONOS data" )
+        location = "/tmp/" + main.TEST + ".tar.gz"
+        backupResult = main.HA.backupData( main, location )
+        utilities.assert_equals( expect=True, actual=backupResult,
+                                 onpass="ONOS backup succeded",
+                                 onfail="ONOS backup failed" )
+
+        main.step( "Checking ONOS Logs for errors" )
+        for ctrl in main.Cluster.active():
+            main.log.debug( "Checking logs for errors on " + ctrl.name + ":" )
+            main.log.warn( main.ONOSbench.checkLogs( ctrl.ipAddress ) )
+
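+        # Record when the cluster is taken down so we can measure the restart time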
+        killTime = time.time()
+        main.testSetUp.uninstallOnos( main.Cluster, uninstallMax=True )
+
+        clusterSize = len( main.Cluster.active() )
+        main.Cluster.setRunningNode( 0 )  # So we can install without starting ONOS
+        main.testSetUp.installOnos( main.Cluster, installMax=True )
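+        # Restore the running-node count now that ONOS is installed on all nodes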
+        main.Cluster.setRunningNode( clusterSize )
+
+        main.step( "Restore ONOS data" )
+        restoreResult = main.HA.restoreData( main, location )
+        utilities.assert_equals( expect=True, actual=restoreResult,
+                                 onpass="ONOS restore succeded",
+                                 onfail="ONOS restore failed" )
+
+        main.step( "Restart ONOS nodes" )
+        started = main.Cluster.command( "onosStart",
+                                        args=[ "ipAddress" ],
+                                        getFrom=0,
+                                        funcFromCtrl=True )
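+        # Mark all controllers as active again now that the ONOS nodes have been started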
+        for ctrl in main.Cluster.controllers:
+            ctrl.active = True
+            main.log.debug( repr( ctrl ) )
+
+        main.testSetUp.setupSsh( main.Cluster )
+        main.testSetUp.checkOnosService( main.Cluster )
+        main.testSetUp.startOnosClis( main.Cluster )
+
+        ready = utilities.retry( main.Cluster.command,
+                                 False,
+                                 kwargs={ "function": "summary", "contentCheck": True },
+                                 sleep=30,
+                                 attempts=10 )
+        utilities.assert_equals( expect=True, actual=ready,
+                                 onpass="ONOS summary command succeded",
+                                 onfail="ONOS summary command failed" )
+        if not ready:
+            main.cleanAndExit()
+
+        # Grab the time of restart so we can check how long the gossip
+        # protocol has had time to work
+        main.restartTime = time.time() - killTime
+        main.log.debug( "Restart time: " + str( main.restartTime ) )
+        main.HAlabels.append( "Restart" )
+        main.HAdata.append( str( main.restartTime ) )
+
+        # Rerun for election on restarted nodes
+        runResults = main.Cluster.command( "electionTestRun", returnBool=True )
+        utilities.assert_equals( expect=True, actual=runResults,
+                                 onpass="Reran for election",
+                                 onfail="Failed to rerun for election" )
+
+        main.HA.commonChecks()
+
+    def CASE7( self, main ):
+        """
+        Check state after ONOS failure
+        """
+        # NOTE: Store has no durability, so intents are lost across system
+        #       restarts
+        main.HA.checkStateAfterEvent( main, afterWhich=0, isRestart=True )
+
+        main.step( "Leadership Election is still functional" )
+        # Test of LeadershipElection
+        leaderList = []
+        leaderResult = main.TRUE
+
+        for ctrl in main.Cluster.active():
+            ctrl.CLI.electionTestLeader()
+            leaderN = ctrl.CLI.electionTestLeader()
+            leaderList.append( leaderN )
+            if leaderN == main.FALSE:
+                # error in response
+                main.log.error( "Something is wrong with " +
+                                 "electionTestLeader function, check the" +
+                                 " error logs" )
+                leaderResult = main.FALSE
+            elif leaderN is None:
+                main.log.error( ctrl.name +
+                                 " shows no leader for the election-app." )
+                leaderResult = main.FALSE
+        if len( set( leaderList ) ) != 1:
+            leaderResult = main.FALSE
+            main.log.error(
+                "Inconsistent view of leader for the election test app" )
+            # TODO: print the list
+        utilities.assert_equals(
+            expect=main.TRUE,
+            actual=leaderResult,
+            onpass="Leadership election passed",
+            onfail="Something went wrong with Leadership election" )
+
+    def CASE8( self, main ):
+        """
+        Compare topo
+        """
+        main.HA.compareTopo( main )
+
+    def CASE9( self, main ):
+        """
+        Link s3-s28 down
+        """
+        main.HA.linkDown( main )
+
+    def CASE10( self, main ):
+        """
+        Link s3-s28 up
+        """
+        main.HA.linkUp( main )
+
+    def CASE11( self, main ):
+        """
+        Switch Down
+        """
+        # NOTE: You should probably run a topology check after this
+        main.HA.switchDown( main )
+
+    def CASE12( self, main ):
+        """
+        Switch Up
+        """
+        # NOTE: You should probably run a topology check after this
+        main.HA.switchUp( main )
+
+    def CASE13( self, main ):
+        """
+        Clean up
+        """
+        main.HA.cleanUp( main )
+
+    def CASE14( self, main ):
+        """
+        start election app on all onos nodes
+        """
+        try:
+            main.HA.startElectionApp( main )
+        except Exception as e:
+            main.log.error( e )
+
+    def CASE15( self, main ):
+        """
+        Check that Leadership Election is still functional
+            15.1 Run election on each node
+            15.2 Check that each node has the same leaders and candidates
+            15.3 Find current leader and withdraw
+            15.4 Check that a new node was elected leader
+            15.5 Check that the new leader was a candidate of the old leader
+            15.6 Run for election on old leader
+            15.7 Check that oldLeader is a candidate, and leader if only 1 node
+            15.8 Make sure that the old leader was added to the candidate list
+
+            old and new variable prefixes refer to data from before vs. after
+                withdrawal, and later to before withdrawal vs. after re-election
+        """
+        main.HA.isElectionFunctional( main )
+
+    def CASE16( self, main ):
+        """
+        Install Distributed Primitives app
+        """
+        main.HA.installDistributedPrimitiveApp( main )
+
+    def CASE17( self, main ):
+        """
+        Check for basic functionality with distributed primitives
+        """
+        main.HA.checkDistPrimitivesFunc( main )
diff --git a/TestON/tests/HA/HAbackupRecover/HAbackupRecover.topo b/TestON/tests/HA/HAbackupRecover/HAbackupRecover.topo
new file mode 100644
index 0000000..4bf4bd4
--- /dev/null
+++ b/TestON/tests/HA/HAbackupRecover/HAbackupRecover.topo
@@ -0,0 +1,53 @@
+<TOPOLOGY>
+    <COMPONENT>
+
+        <ONOScell>
+            <host>localhost</host>  # ONOS "bench" machine
+            <user>sdn</user>
+            <password>rocks</password>
+            <type>OnosClusterDriver</type>
+            <connect_order>1</connect_order>
+            <COMPONENTS>
+                <cluster_name></cluster_name>  # Used as a prefix for cluster components. Defaults to 'ONOS'
+                <diff_clihost></diff_clihost>  # if it has different host other than localhost for CLI. True or empty. OC# will be used if True.
+                <karaf_username></karaf_username>
+                <karaf_password></karaf_password>
+                <web_user></web_user>
+                <web_pass></web_pass>
+                <rest_port></rest_port>
+                <prompt></prompt>  # TODO: we technically need a few of these, one per component
+                <onos_home></onos_home>  # defines where onos home is
+                <nodes> 7 </nodes>  # number of nodes in the cluster
+            </COMPONENTS>
+        </ONOScell>
+
+        <Mininet1>
+            <host>OCN</host>
+            <user>sdn</user>
+            <password>rocks</password>
+            <type>MininetCliDriver</type>
+            <connect_order>2</connect_order>
+            <COMPONENTS>
+                # Specify the options for Mininet
+                <arg1> --custom ~/mininet/custom/obelisk.py </arg1>
+                <arg2> --topo obelisk </arg2>
+                <arg3> --switch ovs,protocols=OpenFlow13 </arg3>
+                <controller> none </controller>
+                <home>~/mininet/custom/</home>
+                <prompt></prompt>
+            </COMPONENTS>
+        </Mininet1>
+
+        <Mininet2>
+            <host>OCN</host>
+            <user>sdn</user>
+            <password>rocks</password>
+            <type>RemoteMininetDriver</type>
+            <connect_order>3</connect_order>
+            <COMPONENTS>
+                <prompt></prompt>
+            </COMPONENTS>
+        </Mininet2>
+
+    </COMPONENT>
+</TOPOLOGY>
diff --git a/TestON/tests/HA/HAbackupRecover/README b/TestON/tests/HA/HAbackupRecover/README
new file mode 100644
index 0000000..b5bce27
--- /dev/null
+++ b/TestON/tests/HA/HAbackupRecover/README
@@ -0,0 +1,23 @@
+This test is designed to verify that an ONOS cluster behaves correctly when
+it is restored from backups. We take a backup of ONOS data for each node in
+the cluster, stop and reinstall ONOS on each node, restore the backup data to
+the correct locations, and then restart ONOS and verify correct behavior as
+the cluster comes back up.
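+
+Under the hood the backup and restore steps invoke the onos-backup and
+onos-restore utilities on each node, roughly equivalent to running:
+    /opt/onos/bin/onos-backup /tmp/<TestName>.tar.gz
+    /opt/onos/bin/onos-restore /tmp/<TestName>.tar.gz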
+
+The general structure of the test:
+- Startup
+- Assign switches
+- Verify ONOS state and functionality
+    - Device mastership
+    - Intents
+    - Leadership election
+    - Distributed Primitives
+- Take backup of ONOS data
+- Stop ONOS nodes
+- Reinstall ONOS nodes
+- Restore data from backups
+- Restart ONOS nodes
+- Verify ONOS state and functionality
+- Dataplane failures
+    - link down and up
+    - switch down and up
diff --git a/TestON/tests/HA/HAbackupRecover/__init__.py b/TestON/tests/HA/HAbackupRecover/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/TestON/tests/HA/HAbackupRecover/__init__.py
diff --git a/TestON/tests/HA/HAbackupRecover/dependencies/__init__.py b/TestON/tests/HA/HAbackupRecover/dependencies/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/TestON/tests/HA/HAbackupRecover/dependencies/__init__.py
diff --git a/TestON/tests/HA/dependencies/HA.py b/TestON/tests/HA/dependencies/HA.py
index c61cd4f..262dbaf 100644
--- a/TestON/tests/HA/dependencies/HA.py
+++ b/TestON/tests/HA/dependencies/HA.py
@@ -3795,3 +3795,30 @@
         utilities.assert_equals( expect=True, actual=nodeResults,
                                  onpass="Nodes check successful",
                                  onfail="Nodes check NOT successful" )
+
+    def backupData( self, main, location ):
+        """
+        Backs up ONOS data and logs to a given location on each active node in the cluster
+        """
+        result = True
+        for ctrl in main.Cluster.active():
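+            # Remove any stale backup file from a previous run before invoking
+            # onos-backup on this node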
+            try:
+                ctrl.server.handle.sendline( "rm " + location )
+                ctrl.server.handle.expect( ctrl.server.prompt )
+                main.log.debug( ctrl.server.handle.before + ctrl.server.handle.after )
+            except pexpect.ExceptionPexpect as e:
+                main.log.error( e )
+                main.cleanAndExit()
+            ctrl.CLI.log( "'Starting backup of onos data'", level="INFO" )
+            result = result and ( ctrl.server.backupData( location ) is main.TRUE )
+            ctrl.CLI.log( "'End of backup of onos data'", level="INFO" )
+        return result
+
+    def restoreData( self, main, location ):
+        """
+        Restores ONOS data and logs from a given location on each node in the cluster
+        """
+        result = True
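+        # Note: unlike backupData, this runs on every controller in the cluster,
+        # not just the currently active ones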
+        for ctrl in main.Cluster.controllers:
+            result = result and ( ctrl.server.restoreData( location ) is main.TRUE )
+        return result