Add script to analyze logs from failure/recovery tests
- Read ONOS and Stratum logs + stratum write request logs
- Pull out selected lines
- Break down response by component
- Call this script from failure/recovery tests to get component level
results
Change-Id: Iae558affc66b025813f5316a5398835412b83624
diff --git a/TestON/core/teston.py b/TestON/core/teston.py
index f296c69..632c56c 100644
--- a/TestON/core/teston.py
+++ b/TestON/core/teston.py
@@ -379,6 +379,7 @@
" code: " + str( testCaseNumber ) + "." +
str( stepNo ) + ": " + stepName )
self.log.error( str( e.__class__ ) + str( e.message ) )
+ self.log.exception( "" )
self.stepCount = self.stepCount + 1
self.logger.updateCaseResults( self )
# WIKI results
diff --git a/TestON/tests/USECASE/SegmentRouting/SRBridging/SRBridging.params.tofino b/TestON/tests/USECASE/SegmentRouting/SRBridging/SRBridging.params.tofino
index 5b204a0..b116f3c 100644
--- a/TestON/tests/USECASE/SegmentRouting/SRBridging/SRBridging.params.tofino
+++ b/TestON/tests/USECASE/SegmentRouting/SRBridging/SRBridging.params.tofino
@@ -34,7 +34,7 @@
</kubernetes>
<PERF>
- <traffic_host>Host2</traffic_host>
+ <traffic_host>Host1</traffic_host>
<pcap_host>ng40vm</pcap_host>
<pcap_cmd_arguments>-t e -F pcap -s 100 </pcap_cmd_arguments>
<iterations>1</iterations>
diff --git a/TestON/tests/USECASE/SegmentRouting/SRStaging/dependencies/SRStagingTest.py b/TestON/tests/USECASE/SegmentRouting/SRStaging/dependencies/SRStagingTest.py
index db27d12..ba8fba0 100644
--- a/TestON/tests/USECASE/SegmentRouting/SRStaging/dependencies/SRStagingTest.py
+++ b/TestON/tests/USECASE/SegmentRouting/SRStaging/dependencies/SRStagingTest.py
@@ -20,6 +20,7 @@
"""
from tests.USECASE.SegmentRouting.dependencies.Testcaselib import Testcaselib as run
+import tests.USECASE.SegmentRouting.SRStaging.dependencies.log_breakdown as logParser
import time
import re
import json
@@ -90,6 +91,7 @@
src: the src component that sends the traffic
dst: the dst component that receives the traffic
"""
+ import datetime
try:
# ping right before to make sure arp is cached and sudo is authenticated
for src in srcList:
@@ -188,7 +190,7 @@
main.log.info( "Starting tshark on %s " % src.name )
src.handle.sendline( "sudo /usr/bin/tshark %s &> /dev/null " % tsharkArgsSender )
# Timestamp used for EVENT START
- main.eventStart = time.time()
+ main.eventStart = datetime.datetime.utcnow()
# LOG Event start in ONOS logs
for ctrl in main.Cluster.active():
ctrl.CLI.log( "'%s START'" % longDesc, level="INFO" )
@@ -197,13 +199,16 @@
main.skipCase( result="FAIL", msg=e )
def stopCapturing( self, main, srcList, dst, shortDesc=None, longDesc=None ):
+ import datetime
import time
+ from tests.dependencies.utils import Utils
+ main.utils = Utils()
try:
pcapFileReceiver = "%s/tshark/%s-%s-tsharkReceiver" % ( "~/TestON",
shortDesc if shortDesc else "tshark",
dst.name )
# Timestamp used for EVENT STOP
- main.eventStop = time.time()
+ main.eventStop = datetime.datetime.utcnow()
# LOG Event stop in ONOS logs
for ctrl in main.Cluster.active():
ctrl.CLI.log( "'%s STOP'" % longDesc, level="INFO" )
@@ -233,6 +238,7 @@
onpass="Saved pcap files from %s" % src.name,
onfail="Failed to scp pcap files from %s" % src.name )
# Grab logs
+ main.utils.copyKarafLog( "CASE%d" % main.CurrentTestCaseNumber, before=True, includeCaseDesc=False )
# Grab pcap
receiverSCP = main.ONOSbench.scp( dst, pcapFileReceiver, main.logdir, direction="from" )
utilities.assert_equals( expect=main.TRUE, actual=receiverSCP,
@@ -257,6 +263,42 @@
main.log.exception( "Error in stopCapturing" )
@staticmethod
+ def analyzeLogs( shortDesc, event, logDir=main.logdir ):
+ try:
+ import datetime
+ main.log.step( "Read saved component logs" )
+ main.log.debug( main.eventStart )
+ # Seems like there is clock skew, +/- 30 seconds
+ MINUTE = datetime.timedelta( minutes=1 )
+ lines, podMapping = logParser.readLogsFromTestFolder( main.eventStart-MINUTE,
+ main.eventStop+MINUTE,
+ testDir=logDir )
+ # Write a merged log file
+ ordered_lines = sorted( lines, key=logParser.sortByDateTime )
+ mergedLogFile = "%s/%s-mergedLogs.txt" % ( logDir, shortDesc )
+ with open( mergedLogFile, 'w' ) as output:
+ for line in ordered_lines:
+ output.write( "%s %s" % ( line['pod'], line['line'] ) )
+
+ # Pull out lines related to test. These are stored in logParser
+ logParser.analyzeLogs( ordered_lines, podMapping )
+ # save human readable results to this file
+ outFile = "log_breakdown_output"
+ outputTextFile = "%s/%s-%s.txt" %( logDir, shortDesc, outFile )
+ # Find component times based on these lines
+ resultsDict = logParser.breakdownEvent( event, podMapping, outputFile=outputTextFile )
+ main.log.debug( json.dumps( resultsDict, sort_keys=True, indent=4 ) )
+ # Add these times to be saved to test output
+ componentBreakdownDict = {}
+ for oldKey in resultsDict.keys():
+ newKey = "%s-%s" % ( shortDesc, oldKey )
+ componentBreakdownDict[ newKey ] = resultsDict[ oldKey ]
+ main.downtimeResults.update( componentBreakdownDict )
+ main.log.debug( json.dumps( main.downtimeResults, sort_keys=True, indent=4 ) )
+ except Exception:
+ main.log.exception( "Error while breaking down logs" )
+
+ @staticmethod
def clearBuffer( component, kill=False, debug=False ):
i = 0
if kill:
@@ -332,6 +374,8 @@
dstComponent,
shortDesc=shortDesc,
longDesc=longDesc )
+ # Break down logs
+ main.funcs.analyzeLogs( shortDesc, 'portstate_down' )
return port
except Exception as e:
main.log.exception( "Error in linkDown" )
@@ -381,6 +425,8 @@
dstComponent,
shortDesc=shortDesc,
longDesc=longDesc )
+ # Break down logs
+ main.funcs.analyzeLogs( shortDesc, 'portstate_up' )
except Exception as e:
main.log.exception( "Error in linkUp" )
@@ -431,6 +477,8 @@
dstComponent,
shortDesc=shortDescFailure,
longDesc=longDescFailure )
+ # Break down logs
+ main.funcs.analyzeLogs( shortDescFailure, 'shutdown_onl' )
# Try to minimize the amount of time we are sending 20mb/s while switch is down
# Large pcaps can cause timeouts when analyzing the results
#time.sleep( 60 )
@@ -469,6 +517,8 @@
dstComponent,
shortDesc=shortDescRecovery,
longDesc=longDescRecovery )
+ # Break down logs
+ main.funcs.analyzeLogs( shortDescRecovery, 'start_onl' )
# Check the switch is back in ONOS
except Exception as e:
main.log.exception( "Error in onlReboot" )
@@ -533,6 +583,8 @@
dstComponent,
shortDesc=shortDescFailure,
longDesc=longDescFailure )
+ # Break down logs
+ main.funcs.analyzeLogs( shortDescFailure, 'powerdown_switch' )
# Try to minimize the amount of time we are sending 20mb/s while switch is down
#time.sleep( 60 )
@@ -571,6 +623,8 @@
dstComponent,
shortDesc=shortDescRecovery,
longDesc=longDescRecovery )
+ # Break down logs
+ main.funcs.analyzeLogs( shortDescRecovery, 'powerup_switch' )
except Exception as e:
main.log.exception( "Error in killSwitchAgent" )
diff --git a/TestON/tests/USECASE/SegmentRouting/SRStaging/dependencies/log_breakdown.py b/TestON/tests/USECASE/SegmentRouting/SRStaging/dependencies/log_breakdown.py
new file mode 100755
index 0000000..c41cda8
--- /dev/null
+++ b/TestON/tests/USECASE/SegmentRouting/SRStaging/dependencies/log_breakdown.py
@@ -0,0 +1,1274 @@
+#! /usr/bin/python2
+"""
+Copyright 2021 Open Networking Foundation ( ONF )
+
+Please refer questions to either the onos test mailing list at <onos-test@onosproject.org>,
+the System Testing Plans and Results wiki page at <https://wiki.onosproject.org/x/voMg>,
+or the System Testing Guide page at <https://wiki.onosproject.org/x/WYQg>
+
+ TestON is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 2 of the License, or
+ ( at your option ) any later version.
+
+ TestON is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with TestON. If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import datetime
+import re
+import sys
+import json
+from copy import deepcopy as deepcopy
+
+DEBUG = 0
+TIMEFMT = '%Y-%m-%d %H:%M:%S.%f'
+################## ONOS REGULAR EXPRESSIONS ##################
+# TODO Add log:log from the test for start and end timestamps
+# TODO: Investigate why these don't show up in the logs
+# TODO Using a tail of the logs will help reduce log size
+
+defaultOnosLineRE = r"((?P<pod>onos-tost-onos-classic-\d+) (?P<app>onos-classic) )?(?P<date>\d{4}-\d{2}-\d{2}) (?P<time>\d{2}:\d{2}:\d{2},\d{3}) (?P<level>\w+)"
+
+# Error in RPC Stream
+p4rt_switch_down_re = r"Error on StreamChannel RPC for device:(?P<deviceName>\w+)"
+gnmi_switch_down_re = r"Error on Subscribe RPC for device:(?P<deviceName>\w+)"
+gnmi_switch_state_re = r"Detected channel connectivity change for device:(?P<deviceName>\w+), new state is (?P<state>\w+)"
+p4rt_channel_open_re = r"Notifying event CHANNEL_OPEN for device:(?P<deviceName>\w+)"
+gnmi_update_re = r"Received SubscribeResponse from device:(?P<deviceName>\w+)"
+device_manager_port_change_re = r"Device device:(?P<deviceName>\w+) port \[(\d+)/(\d+)\]\((?P<sdkPort>\d+)\) status changed \(enabled=(?P<state>\w+)\)"
+# Trellis processing
+sr_switch_down_re = r"\*\* DEVICE DOWN Processing device event DEVICE_AVAILABILITY_CHANGED for unavailable device device:(?P<deviceName>\w+)"
+sr_port_updated_re = r"\*\* PORT UPDATED"
+sr_link_added_re = r"\*\* LINK ADDED"
+sr_link_removed_re = r"\*\* LINK REMOVED"
+sr_event_re = r"\*\* "
+# Cause writes to switches
+sr_remove_re = r"removeFromHash in device device:(?P<deviceName>\w+): "
+sr_add_re = r"addToHash in device device:(?P<deviceName>\w+): "
+sr_rule_re = r"Sending MPLS fwd obj ([\d-]+) for SID (\d+)-> next (\d+) in sw: device:(?P<deviceName>\w+)"
+# There should probably be another line between add/remove FAR to write request
+sr_far_installation_re = r"Installing FAR"
+sr_far_removal_re = r"Removing FAR" # VERIFY RE
+# write requests/response
+p4rt_write_request_update_re = r"Adding (?P<updateType>\w+) update to write request for device:(?P<deviceName>\w+):"
+p4rt_write_request_re = r"Sending write request to device:(?P<deviceName>\w+)"
+p4rt_write_response_re = r"Received write response from device:(?P<deviceName>\w+)"
+# exclude next write request/response
+p4rt_cleanup_re = r"Cleaning up (\d+) action profile groups and (\d+) members on device:(?P<deviceName>\w+)..."
+# Remove filtering caused by port down, not related to new path plumbing, but ignore next write for switch
+sr_port_filter_removal_re = r"Switchport device:(?P<deviceName>\w+)/\[(\d+)/(\d+)\]\((?P<sdkPort>\d+)\) disabled..removing filters"
+sr_port_filter_program_re = r"Switchport device:(?P<deviceName>\w+)/\[(\d+)/(\d+)\]\((?P<sdkPort>\d+)\) enabled..programming filters"
+# Unable to delete/modify - write failed
+p4rt_unable_to_re = r"Unable to (?P<action>\w+) action profile group on device:(?P<deviceName>\w+)"
+onos_error_re = r"ERROR"
+
+################## STRATUM REGULAR EXPRESSIONS ##################
+defaultStratumLineRE = r"((?P<pod>stratum-\w+) (?P<app>stratum-bf) \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{9}Z (\x1b\[(0;\d+)?m)*)?(I|E|W)?(?P<date>(\d{4}\d{2}\d{2})|(\d{4}-\d{2}-\d{2})) (?P<time>\d{2}:\d{2}:\d{2}\.\d{6})"
+stratum_disable_port_re = r"Disabling port (?P<sdkPort>\d+) in node"
+stratum_enable_port_re = r"Enabling port (?P<sdkPort>\d+) in node"
+stratum_port_detect_re = r"State of port (?P<sdkPort>\d+) in node (\d+) \(SDK port (\d+)\): (?P<state>UP|DOWN)"
+stratum_p4_write_success_re = r"P4-based forwarding entities written successfully to node"
+stratum_controller_connect_re = r"is connected as (\w+) for node"
+stratum_p4_write_fail_re = r"Failed to write forwarding entries to node 1: One or more write operations failed."
+stratum_error_re = r"ERROR"
+
+################## STRATUM WRITE REQUESTS REGULAR EXPRESSIONS ##################
+defaultStratumWritesLineRE = r"(?P<date>\d{4}-\d{2}-\d{2}) (?P<time>\d{2}:\d{2}:\d{2}\.\d{6})"
+write_entity_RE = r"type: (?P<operation>\w+) entity"
+
+################ LINES USED FOR TIMESTAMPS ##########################
+selectedLines = { 'stratum_disabling_port': [],
+ 'stratum_enabling_port': [],
+ 'onos_receives_port_update': [],
+ 'stratum_port_down': [],
+ 'stratum_port_up': [],
+ 'onos_p4_switch_down': [],
+ 'onos_gnmi_switch_down': [],
+ 'onos_sr_switch_down': [],
+ 'stratum_controller_connect': [],
+ 'onos_gnmi_switch_state': [],
+ 'onos_p4rt_channel_open': [],
+ 'onos_port_down_device_manager': [],
+ 'onos_port_up_device_manager': [],
+ 'onos_sr_port_updated': [],
+ 'onos_sr_link_added': [],
+ 'onos_sr_link_removed': [],
+ 'onos_sr_remove_bucket': [],
+ 'onos_sr_add_bucket': [],
+ 'onos_sr_mpls_rule': [],
+ 'onos_sr_wildcard_event': [],
+ 'onos_p4_write_request_update': [],
+ 'onos_p4_write_request': [],
+ 'onos_p4_write_response': [],
+ 'onos_p4_write_response_fail': [],
+ 'onos_error': [],
+ 'stratum_error': [],
+ 'stratum_p4_write': [],
+ 'stratum_p4_write_fail': [],
+ 'stratum_modify': [],
+ 'stratum_insert': [],
+ 'stratum_delete': [] }
+
+def parseLine( line, lineRE, pod=None, startDT=None, endDT=None ):
+ """
+ Returns a dict of the line and its components
+ expects the lineRE to contain named groups, among them "time" and "pod"
+ """
+ match = re.search( lineRE, line )
+ if not match:
+ if DEBUG > 2:
+ print( repr( line ) )
+ print( "COULDN'T MATCH LINE RE" )
+ return
+
+ output = match.groupdict()
+ output[ 'line' ] = line
+ if pod:
+ output[ 'pod' ] = pod
+ if not output.get('pod') and DEBUG > 1:
+ print( output )
+ print( "COULDN'T FIND NODE" )
+ # Add datetime for easier math later
+ date = output[ 'date' ]
+ time = output[ 'time' ]
+ dateFmt = '%Y-%m-%d' if '-' in date else '%Y%m%d'
+ if '.' in time:
+ timeFmt = '%H:%M:%S.%f'
+ elif ',' in time:
+ timeFmt = '%H:%M:%S,%f'
+ # TODO: Default time format?
+
+ dt = datetime.datetime.strptime( "%s %s" % ( date, time ), '%s %s' % ( dateFmt, timeFmt ) )
+ if startDT and dt < startDT:
+ return None
+ if endDT and dt > endDT:
+ return None
+ outputFmt = '%Y-%m-%d %H:%M:%S.%f'
+ output[ 'datetime' ] = datetime.datetime.strftime( dt, outputFmt )
+ return output
+
+def readLog( log, lineRE, pod=None, startDT=None, endDT=None ):
+ """
+ Parse a log file
+ If using stern, pod will be parsed from the log, if the file is from a
+ single switch, pod can be supplied
+ RE must have 'date' and 'time' as named groups and
+ 'pod' and 'app' as optional named groups
+ """
+ foundLines = []
+ with open( log, "r" ) as logFile:
+ if DEBUG:
+ print( "Reading file: %s" % log )
+ for line in logFile:
+ parsed = parseLine( line, lineRE, pod, startDT, endDT )
+ if not parsed:
+ continue
+ foundLines.append( parsed )
+ return foundLines
+
+def ignoreLine( line, lineType, writeActivity, deviceName=None ):
+ """
+ We try to filter out writes that aren't related to the rerouting caused by topology change.
+ writeActivity is a dictionary with key's being a cause for writes and a map of log lines with counters for how many we include
+ """
+ assert lineType == "onos_write_request" or\
+ lineType == "onos_write_success" or\
+ lineType == "onos_write_fail" or\
+ lineType == "stratum_success" or\
+ lineType == "stratum_fail"
+ if not deviceName:
+ deviceName = line[ 'deviceName' ]
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ acceptCauses = [ 'addBucket', 'removeBucket', 'mplsRule' ]
+ ignored = True
+ for cause in acceptCauses:
+ count = writeActivity[ deviceName ][ cause ]
+ if count['writes']:
+ if count[ 'onos_write_req' ] < count[ 'writes' ]:
+ if lineType == "onos_write_request":
+ count[ 'onos_write_req' ] += 1
+ if DEBUG:
+ print( "Including line for %s" % cause )
+ ignored = False
+ if count[ 'onos_write_resp' ] < count[ 'onos_write_req' ]:
+ if lineType == "onos_write_fail":
+ # We will retry?
+ pass
+ #count[ 'writes' ] += 1
+ if lineType == "onos_write_success" or lineType == "onos_write_fail":
+ count[ 'onos_write_resp' ] += 1
+ if DEBUG:
+ print( "Including line for %s" % cause )
+ ignored = False
+ if count[ 'stratum_write' ] < count[ 'onos_write_req' ]:
+ if lineType == "stratum_fail":
+ pass
+ #count[ 'ignored_write' ] += 1
+ if lineType == "stratum_success" or lineType == "stratum_fail":
+ count[ 'stratum_write' ] += 1
+ if DEBUG:
+ print( "Including line for %s\n" % cause )
+ ignored = False
+ if ignored:
+ if DEBUG:
+ print( "DEBUG: writeActivity - ignored writes during a non ignored write")
+ print( writeActivity[ deviceName ] )
+ else:
+ if DEBUG:
+ print( writeActivity[ deviceName ] )
+ if count[ 'stratum_write' ] == count[ 'writes' ] and\
+ count[ 'onos_write_resp' ] == count[ 'writes' ] and\
+ count[ 'onos_write_req' ] == count[ 'writes' ]:
+ for key, value in count.items():
+ count[ key ] = 0
+ break
+ if ignored:
+ if DEBUG:
+ print( "Line ignored\n" )
+ return ignored
+
+def analyzeLogs( lines, podMapping ):
+ """
+ Analyze a list of parsed log lines and pull out selected log lines. These can be a combined stream of multiple log files
+ """
+ if DEBUG:
+ print( "\n"*3 )
+ print( "Analyzing merged logs" )
+ print( "\n"*3 )
+ writeDict = { "writes": 0,
+ "onos_write_req": 0,
+ "onos_write_resp": 0,
+ "stratum_write": 0 }
+ writeActivity = {
+ node: {
+ "removeBucket": deepcopy(writeDict),
+ "addBucket": deepcopy(writeDict),
+ "mplsRule": deepcopy(writeDict)
+ }
+ for pod, node in podMapping.items()
+ }
+
+ for line in lines:
+ m = re.search( p4rt_switch_down_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_p4_switch_down' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( gnmi_switch_down_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_gnmi_switch_down' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( gnmi_update_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_receives_port_update' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( gnmi_switch_state_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_gnmi_switch_state' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( device_manager_port_change_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ state = m.group( 'state' )
+ if state == "false":
+ selectedLines[ 'onos_port_down_device_manager' ].append( line )
+ if state == "true":
+ selectedLines[ 'onos_port_up_device_manager' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( sr_switch_down_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_sr_switch_down' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( p4rt_channel_open_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_p4rt_channel_open' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( sr_rule_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ writeActivity[ line[ 'deviceName' ] ][ 'mplsRule' ][ 'writes' ] += 1
+ selectedLines[ 'onos_sr_mpls_rule' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( sr_far_installation_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ #writeActivity[ line[ 'deviceName' ] ][ 'farInstall' ][ 'ignored_write' ] = 1
+ # NOTE: Without knowing which device this gets written to, not sure how to correspond writes with this
+ if DEBUG:
+ print( "Line ignored\n" )
+ continue
+ m = re.search( sr_far_removal_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ #writeActivity[ line[ 'deviceName' ] ][ 'farRemove' ][ 'ignored_write' ] = 1
+ # NOTE: Without knowing which device this gets written to, not sure how to correspond writes with this
+ if DEBUG:
+ print( "Line ignored\n" )
+ continue
+ m = re.search( sr_remove_re, line[ 'line' ] )
+ if m:
+ # TODO: measure first to last of these events
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ writeActivity[ line[ 'deviceName' ] ][ 'removeBucket' ][ 'writes' ] += 2
+ selectedLines[ 'onos_sr_remove_bucket' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( sr_add_re, line[ 'line' ] )
+ if m:
+ # TODO: measure first to last of these events
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ writeActivity[ line[ 'deviceName' ] ][ 'addBucket' ][ 'writes' ] += 2
+ selectedLines[ 'onos_sr_add_bucket' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( sr_port_updated_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_sr_port_updated' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( sr_link_added_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_sr_link_added' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( sr_link_removed_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_sr_link_removed' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( sr_event_re, line[ 'line' ] )
+ if m:
+ # This is any other important event from SR
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_sr_wildcard_event' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( p4rt_cleanup_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ #writeActivity[ line[ 'deviceName' ] ][ 'cleanup' ][ 'ignored_write' ] = 1
+ if DEBUG:
+ print( "Line ignored\n" )
+ continue
+ m = re.search( sr_port_filter_removal_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ # Ignore 1 for leaf and 2 for spines?
+ # TODO: This seems really hacky to me
+ deviceName = line[ 'deviceName' ]
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ #if "spine" in deviceName:
+ # writeActivity[ deviceName ][ 'removeFilters' ][ 'ignored_write' ] = 2
+ #else:
+ # writeActivity[ deviceName ][ 'removeFilters' ][ 'ignored_write' ] = 1
+ if DEBUG:
+ print( "Line ignored\n" )
+ continue
+ m = re.search( sr_port_filter_program_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ # Ignore 1 for leaf and 2 for spines?
+ # TODO: This seems really hacky to me
+ deviceName = line[ 'deviceName' ]
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ #if "spine" in deviceName:
+ # writeActivity[ deviceName ][ 'addFilters' ][ 'ignored_write' ] = 2
+ #else:
+ # writeActivity[ deviceName ][ 'addFilters' ][ 'ignored_write' ] = 1
+ if DEBUG:
+ print( "Line ignored\n" )
+ continue
+ m = re.search( p4rt_write_request_update_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_p4_write_request_update' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( p4rt_write_request_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if ignoreLine( line, "onos_write_request", writeActivity ):
+ continue
+ selectedLines[ 'onos_p4_write_request' ].append( line )
+ continue
+ m = re.search( p4rt_write_response_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if ignoreLine( line, "onos_write_success", writeActivity ):
+ continue
+ selectedLines[ 'onos_p4_write_response' ].append( line )
+ continue
+ m = re.search( p4rt_unable_to_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if ignoreLine( line, "onos_write_fail", writeActivity ):
+ continue
+ selectedLines[ 'onos_p4_write_response_fail' ].append( line )
+ continue
+ m = re.search( onos_error_re, line[ 'line' ] )
+ # FIXME: This matches all "ERROR" in any line, since at this point we aren't looking where it came from
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'onos_error' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( stratum_disable_port_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'stratum_disabling_port' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( stratum_enable_port_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'stratum_enabling_port' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( stratum_port_detect_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ state = m.group( 'state' )
+ if state == "DOWN":
+ selectedLines[ 'stratum_port_down' ].append( line )
+ if state == "UP":
+ selectedLines[ 'stratum_port_up' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( stratum_p4_write_success_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ deviceName = podMapping[ line[ 'pod' ] ]
+ if ignoreLine( line, "stratum_success", writeActivity, deviceName ):
+ continue
+ selectedLines[ 'stratum_p4_write' ].append( line )
+ continue
+ m = re.search( stratum_p4_write_fail_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ deviceName = podMapping[ line[ 'pod' ] ]
+ if ignoreLine( line, "stratum_fail", writeActivity, deviceName ):
+ continue
+ selectedLines[ 'stratum_p4_write_fail' ].append( line )
+ continue
+ m = re.search( stratum_controller_connect_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'stratum_controller_connect' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( write_entity_RE, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ # TODO: Ignore Line?
+ """
+ operation = m.group( 'operation' )
+ if operation == "MODIFY":
+ selectedLines[ 'stratum_modify' ].append( line )
+ elif operation == "INSERT":
+ selectedLines[ 'stratum_insert' ].append( line )
+ elif operation == "DELETE":
+ selectedLines[ 'stratum_delete' ].append( line )
+ else:
+ raise NotImplementedError
+ """
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+ m = re.search( stratum_error_re, line[ 'line' ] )
+ if m:
+ line.update( m.groupdict() )
+ if DEBUG:
+ print( "parsing line: " )
+ print( line )
+ selectedLines[ 'stratum_error' ].append( line )
+ if DEBUG:
+ print( "Line added\n" )
+ continue
+
+def sortByDateTime( parsed ):
+ """
+ Returns the key used to sort the parsed line dictionary by date/time
+ """
+ date = parsed[ 'date' ]
+ dateRE = r"(?P<year>\d{4})[:\-/]?(?P<month>\d{2})[:\-/]?(?P<day>\d{2})"
+ m1 = re.search( dateRE, date )
+ time = parsed[ 'time' ]
+ timeRE = r"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})(,|\.)(?P<ms>\d{3,9})"
+ m2 = re.search( timeRE, time )
+ return "%s-%s-%s %s:%s:%s.%s" % ( m1.group( 'year' ),
+ m1.group( 'month' ),
+ m1.group( 'day' ),
+ m2.group( 'hours' ),
+ m2.group( 'minutes' ),
+ m2.group( 'seconds' ),
+ m2.group( 'ms' ) )
+
+def duration( stop, start ):
+ """
+ Returns the difference between two time stamps in ms
+ """
+ #fmt = '%Y-%m-%d %H:%M:%S.%f'
+ stopDT = datetime.datetime.strptime( stop, TIMEFMT )
+ startDT = datetime.datetime.strptime( start, TIMEFMT )
+ result = stopDT - startDT
+ return result.total_seconds() * 1000
+
+def readLogsFromTestFolder( startDT, endDT,
+ testDir=None,
+ onosLineRE=defaultOnosLineRE,
+ stratumLineRE=defaultStratumLineRE,
+ stratumWritesLineRE=defaultStratumWritesLineRE ):
+ import os
+
+ podMappingFile = "podMapping.txt"
+ pod_mapping_re = r"(?P<pod>stratum-\w{5})( +[^ ]+){5}( )+(?P<deviceName>\w+)"
+ onos_log_prefix = "onos-tost-onos-classic-"
+ stratum_log_prefix = "stratum-"
+ stratum_write_log_suffix = "write-reqs.txt"
+ tempMapping = {}
+ tempLines = []
+ onos_files = []
+ stratum_files = []
+ write_files = []
+ fileName = "%s/%s" % ( testDir, podMappingFile ) if testDir else podMappingFile
+ with open( fileName, "r" ) as logFile:
+ for line in logFile:
+ m = re.search( pod_mapping_re, line )
+ if not m:
+ continue
+ tempMapping[ m.group( 'pod' ) ] = m.group( 'deviceName' )
+ if DEBUG:
+ print( tempMapping )
+ # Search for all files
+
+ rootDir = testDir if testDir else "."
+ for root, dirs, files in os.walk( rootDir ):
+ for f in files:
+ if f.startswith( onos_log_prefix ):
+ onos_files.append( os.path.join( rootDir, f ) )
+ elif f.startswith( stratum_log_prefix ):
+ stratum_files.append( os.path.join( rootDir, f ) )
+ elif f.endswith( stratum_write_log_suffix ):
+ write_files.append( os.path.join( rootDir, f ) )
+ # parse each file
+ for f in onos_files:
+ try:
+ startIndex = f.rindex( '/' ) + 1
+ except ValueError:
+ startIndex = 0
+ pod = f[ startIndex : -4 ]
+ tempLines.extend( readLog( f, onosLineRE, pod=pod, startDT=startDT, endDT=endDT ) )
+ for f in stratum_files:
+ try:
+ startIndex = f.rindex( '/' ) + 1
+ except ValueError:
+ startIndex = 0
+ pod = f[ startIndex : -4 ]
+ tempLines.extend( readLog( f, stratumLineRE, pod=pod, startDT=startDT, endDT=endDT ) )
+ for f in write_files:
+ pod = None
+ for p in tempMapping:
+ if p in f:
+ pod = p
+ break
+ # FIXME: Pod name isn't correct
+ tempLines.extend( readLog( f, stratumWritesLineRE, pod=pod, startDT=startDT, endDT=endDT ) )
+
+
+ #print( tempLines )
+ return tempLines, tempMapping
+
+
+
+def main():
+ # First arg is ONOS logs
+ # second arg is stratum logs
+ # third arg is stratum write requests
+ lines = []
+ # TODO: These should be read as arguments
+
+ manualTriggerTime = ""
+ ##### These are for the logs found here:
+ ##### https://drive.google.com/drive/folders/1_Mil7OINIS54McKHN4MngzGR1e6KEKtX?usp=sharing
+ # disable-spine2-leaf1-links.*
+ enbPortDown1StartStr = "2021-01-30 00:06:12.0000"
+ enbPortDown1EndStr = "2021-01-30 00:06:40.00000"
+ enbPortDown2EndStr = "2021-01-30 00:06:47.0000"
+
+ # enable-spine2-leaf1-links.*
+ enbPortUp1StartStr = "2021-01-30 00:08:30.0000"
+ enbPortUp1EndStr = "2021-01-30 00:09:00.00000"
+ enbPortUp2EndStr = "2021-01-30 00:09:10.00000"
+
+ # portstate-spine2-leaf1-links.*
+ enbPortDown3StartStr = "2021-01-30 00:13:00.000"
+ enbPortDown3EndStr = "2021-01-30 00:13:30.00000"
+ enbPortDown4EndStr = "2021-01-30 00:13:50.0000"
+ enbPortUp3StartStr = "2021-01-30 00:16:50.0000"
+ enbPortUp3EndStr = "2021-01-30 00:17:00.00000"
+ enbPortUp4EndStr = "2021-01-30 00:17:50.00000"
+
+
+ # disable-spine2-leaf2-links.*
+ upstreamPortDown1StartStr = "2021-01-29 23:57:30.0000"
+ upstreamPortDown1EndStr = "2021-01-29 23:57:40.00000"
+ upstreamPortDown2EndStr = "2021-01-29 23:58:50.0000"
+
+ # enable-spine2-leaf2-links.*
+ upstreamPortUp1StartStr = "2021-01-30 00:01:50.0000"
+ upstreamPortUp1EndStr = "2021-01-30 00:02:00.00000"
+ upstreamPortUp2EndStr = "2021-01-30 00:02:30.00000"
+
+ # unplug-leaf2-spine1.*
+ unpluglink1Start = "2021-01-29 23:49:30.0000"
+ unpluglink1End = "2021-01-29 23:50:00.0000"
+ pluglink1End = "2021-01-29 23:50:33.0000"
+
+ # powercycle-spine1.*
+ powerDown1Start = "2021-01-29 23:19:17.0000"
+ powerDown1End = "2021-01-29 23:19:19.0000"
+ powerUp1Start = "2021-01-29 23:23:33.0000"
+ powerUp1End = "2021-01-29 23:23:45.0000"
+
+ # onl-reboot-spine1.*
+ onlDown1Start = "2021-01-29 23:32:00.0000"
+ onlDown1End = "2021-01-29 23:36:20.0000"
+ onlUp1Start = "2021-01-29 23:36:20.0000"
+ onlUp1End = "2021-01-29 23:36:35.0000"
+
+
+ event_selection = 3
+ if event_selection == 1:
+ # enb link down1
+ startStr = enbPortDown1StartStr
+ endStr = enbPortDown1EndStr
+ event = "portstate_down"
+ eventType = "failure"
+ elif event_selection == 2:
+ # enb link down2
+ startStr = enbPortDown1EndStr
+ endStr = enbPortDown2EndStr
+ event = "portstate_down"
+ eventType = "failure"
+ elif event_selection == 3:
+ # enb link up1
+ startStr = enbPortUp1StartStr
+ endStr = enbPortUp1EndStr
+ event = "portstate_up"
+ eventType = "recovery"
+ elif event_selection == 4:
+ # enb link up2
+ startStr = enbPortUp1EndStr
+ endStr = enbPortUp2EndStr
+ event = "portstate_up"
+ eventType = "recovery"
+ elif event_selection == 5:
+ # enb link down3
+ startStr = enbPortDown3StartStr
+ endStr = enbPortDown3EndStr
+ event = "portstate_down"
+ eventType = "failure"
+ elif event_selection == 6:
+ # enb link down4
+ startStr = enbPortDown3EndStr
+ endStr = enbPortDown4EndStr
+ event = "portstate_down"
+ eventType = "failure"
+ elif event_selection == 7:
+ # enb link up3
+ startStr = enbPortUp3StartStr
+ endStr = enbPortUp3EndStr
+ event = "portstate_up"
+ eventType = "recovery"
+ elif event_selection == 8:
+ # enb link up4
+ startStr = enbPortUp3EndStr
+ endStr = enbPortUp4EndStr
+ event = "portstate_up"
+ eventType = "recovery"
+ elif event_selection == 9:
+ # upstream link down1
+ startStr = upstreamPortDown1StartStr
+ endStr = upstreamPortDown1EndStr
+ event = "portstate_down"
+ eventType = "failure"
+ elif event_selection == 10:
+ # upstream link down2
+ startStr = upstreamPortDown1EndStr
+ endStr = upstreamPortDown2EndStr
+ event = "portstate_down"
+ eventType = "failure"
+ elif event_selection == 11:
+ # upstream link up1
+ startStr = upstreamPortUp1StartStr
+ endStr = upstreamPortUp1EndStr
+ event = "portstate_up"
+ eventType = "recovery"
+ elif event_selection == 12:
+ # upstream link up2
+ startStr = upstreamPortUp1EndStr
+ endStr = upstreamPortUp2EndStr
+ event = "portstate_up"
+ eventType = "recovery"
+ elif event_selection == 13:
+ # unplug link
+ startStr = unpluglink1Start
+ endStr = unpluglink1End
+ event = "unplug_link"
+ eventType = "failure"
+ elif event_selection == 14:
+ # plug link
+ startStr = unpluglink1End
+ endStr = pluglink1End
+ event = "plug_link"
+ eventType = "recovery"
+ elif event_selection == 15:
+ # power down switch
+ startStr = powerDown1Start
+ endStr = powerDown1End
+ event = "powerdown_switch"
+ eventType = "failure"
+ elif event_selection == 16:
+ # power up switch
+ startStr = powerUp1Start
+ endStr = powerUp1End
+ event = "powerup_switch"
+ eventType = "recovery"
+ elif event_selection == 17:
+ # onl shutdown on switch
+ startStr = onlDown1Start
+ endStr = onlDown1End
+ event = "powerdown_switch"
+ eventType = "failure"
+ elif event_selection == 18:
+ # onl start on switch
+ startStr = onlUp1Start
+ endStr = onlUp1End
+ event = "powerup_switch"
+ eventType = "recovery"
+ elif event_selection == 19:
+ # onl shutdown on switch
+ # SRONLReboot_30_Mar_2021_08_27_02
+ #manualTriggerTime = "2021-03-30 08:29:17.534335"
+ startStr = "2021-03-30 08:28:52.206714"
+ endStr = "2021-03-30 08:30:22.642694"
+ event = "shutdown_onl"
+ eventType = "failure"
+ elif event_selection == 20:
+ # onl start on switch
+ # SRONLReboot_30_Mar_2021_08_27_02
+ startStr = "2021-03-30 08:29:22.642694"
+ endStr = "2021-03-30 08:33:12.293721"
+ event = "start_onl"
+ eventType = "recovery"
+ # TODO: Read from a `get pods -o wide output`
+ podMapping = {"stratum-7dtb8": "leaf1",
+ "stratum-c4p42": "spine2",
+ "stratum-jbb5w": "leaf2",
+ "stratum-49fwj": "spine1" }
+
+ ######### Manual testing, test will do this ########
+ p1down = "2021-06-08 10:02:28.842681"
+ p2down = "2021-06-08 10:05:47.638684"
+ p1up = "2021-06-08 10:08:54.974723"
+ p2up = "2021-06-08 10:11:44.923067"
+ p3down = "2021-06-08 10:14:48.605082"
+ p4down = "2021-06-08 10:18:00.892514"
+ p3up = "2021-06-08 10:20:59.108350"
+ p4up = "2021-06-08 10:23:53.826612"
+ times = [ p1down, p2down, p1up, p2up, p3down, p4down, p3up, p4up ]
+ startStr = times[2]
+ endStr = times[3]
+ event = 'portstate_up'
+ prefix = "recovery1"
+ #####################################################
+
+ #fmt = '%Y-%m-%d %H:%M:%S.%f'
+ endDT = datetime.datetime.strptime( endStr, TIMEFMT )
+ startDT = datetime.datetime.strptime( startStr, TIMEFMT )
+
+ # Read files
+ if len( sys.argv ) > 1:
+ try:
+ onos_logfile = sys.argv[1]
+ lines.extend( readLog( onos_logfile, defaultOnosLineRE, startDT=startDT, endDT=endDT ) )
+ except IndexError:
+ print( "ERROR: COULD NOT FIND ONOS LOG FILE" )
+ try:
+ stratum_logfile = sys.argv[2]
+ lines.extend( readLog( stratum_logfile, defaultStratumLineRE, startDT=startDT, endDT=endDT ) )
+ except IndexError:
+ print( "ERROR: COULD NOT FIND STRATUM LOG FILE" )
+ try:
+ stratum_writes_logfile = sys.argv[3]
+ lines.extend( readLog( stratum_writes_logfile, defaultStratumWritesLineRE, pod="stratum-g6k7", startDT=startDT, endDT=endDT ) )
+ except IndexError:
+ print( "ERROR: COULD NOT FIND STRATUM WRITE REQUESTS FILE" )
+ else:
+ # Read files from cwd
+ lines, podMapping = readLogsFromTestFolder( startDT, endDT )
+
+ # Sort merged files by time
+ ordered_lines = sorted( lines, key=sortByDateTime )
+
+ # TODO: Make a function
+ mergedLogFile = "%s-mergedLogs.txt" % prefix
+ with open( mergedLogFile, 'w' ) as output:
+ for line in ordered_lines:
+ output.write( "%s %s" % ( line['pod'], line['line'] ) )
+
+ if DEBUG:
+ for line in ordered_lines:
+ #print( repr( line['line'] ) )
+ print( line )
+
+ # pull out interesting log lines
+ analyzeLogs( ordered_lines, podMapping )
+ breakdownEvent( event, podMapping )
+
+def breakdownEvent( event, podMapping, manualTriggerTime=None, outputFile="log_breakdown_output.txt", DEBUG=0 ):
+
+ results = {}
+ with open( outputFile, 'w' ) as outFile:
+ if DEBUG:
+ outFile.write("\n\nSelected Lines in order:\n")
+ selectedLinesList = []
+ for cause in selectedLines:
+ selectedLinesList.extend( selectedLines[cause] )
+ sorted_selectedLinesList = sorted( selectedLinesList, key=sortByDateTime )
+ if DEBUG:
+ outFile.write( json.dumps( sorted_selectedLinesList, sort_keys=True, indent=4 ) )
+ # FOR DEBUG
+ if DEBUG:
+ outFile.write("\n\nSelected Lines by group:\n")
+ for eventLabel, eventLines in selectedLines.items():
+ outFile.write( eventLabel )
+ outFile.write( '\n' )
+ outFile.write( json.dumps( eventLines, sort_keys=True, indent=4 ) )
+ outFile.write( '\n' )
+
+ writeCauses = [ 'onos_sr_add_bucket', 'onos_sr_remove_bucket', 'onos_sr_mpls_rule' ]
+ trellisEvents = [ 'onos_sr_port_updated', 'onos_sr_switch_down', 'onos_sr_link_added', 'onos_sr_link_removed' ]
+ if event == 'portstate_down':
+ eventType = "failure"
+ triggerLine = 'stratum_disabling_port'
+ onosDetectEvent = 'onos_receives_port_update'
+ rerouteCauses = [ 'onos_sr_remove_bucket' ]
+ switchEvents = [ 'stratum_port_down' ]
+ elif event == 'portstate_up':
+ eventType = "recovery"
+ triggerLine = 'stratum_enabling_port'
+ onosDetectEvent = 'onos_receives_port_update'
+ rerouteCauses = [ 'onos_sr_add_bucket' ]
+ switchEvents = [ 'stratum_port_up' ]
+ elif event == 'unplug_link':
+ eventType = "failure"
+ onosDetectEvent = 'onos_receives_port_update'
+ rerouteCauses = [ 'onos_sr_remove_bucket' ]
+ switchEvents = [ 'stratum_port_down' ]
+ # TODO: pass in manual trigger time
+ triggerLine = switchEvents[0]
+ elif event == 'plug_link':
+ eventType = "recovery"
+ onosDetectEvent = 'onos_receives_port_update'
+ rerouteCauses = [ 'onos_sr_add_bucket' ]
+ switchEvents = [ 'stratum_port_up' ]
+ # TODO: pass in manual trigger time
+ triggerLine = switchEvents[0]
+ elif event == 'powerdown_switch':
+ eventType = "failure"
+ onosDetectEvent = 'onos_port_down_device_manager'
+ rerouteCauses = [ 'onos_sr_remove_bucket' ]
+ switchEvents = [ 'stratum_port_down' ]
+ # TODO: pass in manual trigger time
+ triggerLine = switchEvents[0]
+ elif event in ['powerup_switch', 'start_onl']:
+ eventType = "recovery"
+ # Even though this is a manual event, we really shouldn't count the time for the switch to boot up
+ triggerLine = 'stratum_controller_connect'
+ """
+ Trigger could be the first of these? Its possible switch's time is out of sync when first starts, often see events out of order on these recovery cases
+ 'stratum_controller_connect': [],
+ 'onos_p4rt_channel_open': [],
+ """
+ onosDetectEvent = 'onos_p4rt_channel_open'
+ rerouteCauses = [ 'onos_sr_mpls_rule' ]
+ switchEvents = [ 'stratum_port_up' ]
+ elif event == 'shutdown_onl':
+ eventType = "failure"
+ onosDetectEvent = 'onos_gnmi_switch_state'
+ rerouteCauses = [ 'onos_sr_remove_bucket' ]
+ switchEvents = [ 'stratum_port_down' ]
+ # TODO: pass in manual trigger time
+ triggerLine = onosDetectEvent
+
+ if eventType == "failure":
+ trellis_link = 'onos_sr_link_removed'
+ elif eventType == "recovery":
+ trellis_link = 'onos_sr_link_added'
+
+ temp_list = []
+ for cause in rerouteCauses:
+ temp_list.extend( selectedLines[cause] )
+ if len( temp_list ) == 0:
+ outFile.write( "\nWARNING Could Not find a cause for rerouting, looking for all causes" )
+ for cause in writeCauses:
+ temp_list.extend( selectedLines[cause] )
+ sorted_reroute_causes = sorted( temp_list, key=sortByDateTime )
+ temp_list = []
+ for e in switchEvents:
+ temp_list.extend( selectedLines[e] )
+ sorted_switch_events = sorted( temp_list, key=sortByDateTime )
+ if sorted_switch_events:
+ last_switch_event = sorted_switch_events[-1]['datetime']
+ if sorted_reroute_causes:
+ n2s_start = sorted_reroute_causes[0]['datetime']
+ trellis_programming_finish = sorted_reroute_causes[-1]['datetime']
+ else:
+ n2s_start = None
+ trellis_programming_finish = None
+
+ temp_list = []
+ for e in trellisEvents:
+ temp_list.extend( selectedLines[e] )
+ sorted_trellis_events = sorted( temp_list, key=sortByDateTime )
+ if DEBUG:
+ outFile.write( "\nSorted Trellis events\n" )
+ for line in sorted_trellis_events:
+ outFile.write( str( line ) )
+
+ if manualTriggerTime:
+ trigger = manualTriggerTime
+ triggerEvent = {}
+ else:
+ if not triggerLine:
+ triggerEvent = sorted_switch_events[0]
+ else:
+ triggerEvent = selectedLines[ triggerLine ][0]
+ trigger = triggerEvent['datetime']
+ triggerDT = datetime.datetime.strptime( trigger, TIMEFMT )
+ # Group events by switch
+ switchDict = { 'onos_sr_add_bucket': [],
+ 'onos_sr_remove_bucket': [],
+ 'onos_sr_mpls_rule': [],
+ 'onos_p4_write_request': [],
+ 'onos_p4_write_response': [],
+ 'onos_p4_write_response_fail': [],
+ 'stratum_p4_write': [],
+ 'stratum_p4_write_fail': [],
+ 'stratum_modify': [],
+ 'stratum_insert': [],
+ 'stratum_delete': [] }
+ switches = {}
+ for eventName in switchDict:
+ for line in selectedLines[eventName]:
+ eventDT = datetime.datetime.strptime(line['datetime'], TIMEFMT)
+ if eventDT < triggerDT:
+ continue
+ try:
+ switch = line.get('deviceName')
+ if not switch:
+ switch = podMapping[line['pod']]
+ except KeyError:
+ outFile.write("\nCould not find device for line: %s" % line)
+ continue
+ if switch not in switches:
+ switches[switch] = deepcopy(switchDict)
+ switches[switch][eventName].append(line)
+
+ if DEBUG:
+ outFile.write( "\nevents by switch\n" )
+ outFile.write( json.dumps( switches, sort_keys=True, indent=4 ) )
+ outFile.write( '\n' )
+
+ if DEBUG:
+ outFile.write( "\nIgnoring late writes..." )
+ for switch in switches:
+ lastWriteCause = {}
+ for cause in writeCauses:
+ if switches[switch][cause]:
+ line = switches[switch][cause][-1]
+ if not lastWriteCause:
+ lastWriteCause = line
+ else:
+ tempDT = datetime.datetime.strptime(line['datetime'], TIMEFMT)
+ lastDT = datetime.datetime.strptime(lastWriteCause['datetime'], TIMEFMT)
+ if tempDT > lastDT:
+ lastWriteCause = line
+ lastDT = datetime.datetime.strptime(lastWriteCause['datetime'], TIMEFMT)
+ for e, lines in switches[switch].items():
+ if lines:
+ tempList = lines
+ #outFile.write( len( lines ) )
+ lateLines = [line for line in tempList if datetime.datetime.strptime(line['datetime'], TIMEFMT) > ( lastDT + datetime.timedelta( 0, 1 ) ) ]
+ if lateLines:
+ outFile.write( "\nWARNING: Ignoring %s %s on %s that came >1 second after write cause(script likely expected more writes from cause than happened)\n" % ( len(lateLines), e, switch ) )
+ outFile.write( json.dumps( lateLines, sort_keys=True, indent=4 ) )
+ # remove from switch dict
+ switches[switch][e] = [ line for line in tempList if line not in lateLines ]
+ # remove from selected lines
+ selectedLines[e] = [ line for line in selectedLines[e] if line not in lateLines ]
+
+ temp_list = []
+ for cause in [ 'onos_p4_write_response', 'onos_p4_write_response_fail' ]:
+ temp_list.extend( selectedLines[cause] )
+ sorted_acks = sorted( temp_list, key=sortByDateTime )
+
+ onos_first_detect = selectedLines[ onosDetectEvent ][0]['datetime'] # Same for port down/up
+ trellis_start = sorted_trellis_events[0]['datetime']
+ trellis_first_link = selectedLines[ trellis_link ][0]['datetime']
+ trellis_last_link = selectedLines[ trellis_link ][-1]['datetime']
+ n2s_finish = sorted_acks[-1]['datetime']
+ entire_duration = duration( n2s_finish, trigger )
+ trigger_to_onos_event = duration( onos_first_detect, trigger )
+ onos_to_trellis = duration( trellis_start, onos_first_detect )
+ onos_to_trellis_link = duration( trellis_first_link, onos_first_detect )
+ # Programming time for each switch
+ trellis_processing = duration( n2s_start, trellis_start )
+ trellis_processing_and_programming = duration( trellis_programming_finish, trellis_start )
+ trellis_processing_and_programming_with_acks = duration( n2s_finish, trellis_start )
+ n2s = duration( n2s_finish, n2s_start )
+
+ if eventType == "failure":
+ worst_case_desc = "Trigger to last write response in onos"
+ worst_case = duration( n2s_finish, trigger )
+ elif eventType == "recovery":
+ worst_case_desc = "Start of Trellis reroute programming to last switch write response in ONOS"
+ worst_case = n2s
+
+ # Lines
+ outFile.write( "\nTrigger:\n%s" % triggerEvent.get( 'line', '' ) )
+ outFile.write( "\nONOS First detect:\n%s" % selectedLines[ onosDetectEvent ][0]['line'] )
+ outFile.write( "\nTrellis first event:\n%s" % sorted_trellis_events[0]['line'] )
+ outFile.write( "\nTrellis first link event:\n%s" % selectedLines[ trellis_link ][0]['line'] )
+ if len( selectedLines[ trellis_link ] ) > 1:
+ outFile.write( "\nTrellis last link event:\n%s" % selectedLines[ trellis_link ][-1]['line'] )
+ outFile.write( "\nTrellis starts programming:\n%s" % sorted_reroute_causes[0]['line'] )
+ outFile.write( "\nTrellis ends programming:\n%s" % sorted_reroute_causes[-1]['line'] )
+ outFile.write( "\nFirst write response in onos:\n%s" % selectedLines[ 'onos_p4_write_response' ][0][ 'line' ] )
+ outFile.write( "\nLast write response in onos:\n%s" % selectedLines[ 'onos_p4_write_response' ][-1][ 'line' ] )
+ outFile.write( "\nFirst write success on switches:\n%s" % selectedLines[ 'stratum_p4_write' ][0][ 'line' ] )
+ outFile.write( "\nLast write success on switches:\n%s" % selectedLines[ 'stratum_p4_write' ][-1][ 'line' ] )
+
+ # Times
+ outFile.write( "\nEntire duration (trigger to last write response in onos): %s ms" % entire_duration )
+ outFile.write( "\nWorst case traffic loss (%s): %s ms" % ( worst_case_desc, worst_case ) )
+ if sorted_switch_events:
+ outFile.write( "\nTrigger to last switch event: %s ms" % ( duration( last_switch_event, trigger ) ) )
+ outFile.write( "\nTrigger to first onos event: %s ms \t\t\t\t\t(top box under the switches)" % trigger_to_onos_event )
+ results[ 'trigger_to_onos_event' ] = trigger_to_onos_event
+ outFile.write( "\nFirst ONOS event to first Trellis event: %s ms\t\t\t\t\t(South to North)" % onos_to_trellis )
+ results[ 's2n' ] = onos_to_trellis
+ outFile.write( "\nFirst ONOS event to first Trellis Link event: %s ms" % onos_to_trellis_link )
+ outFile.write( "\nFirst Trellis link event to last Trellis Link: %s ms" % duration( trellis_last_link, trellis_first_link ) )
+ outFile.write( "\nFirst Trellis event to start of programming devices: %s ms" % trellis_processing )
+ outFile.write( "\nFirst Trellis link event to start of programming devices: %s ms" % duration( n2s_start, trellis_first_link ) )
+ outFile.write( "\nLast Trellis link event to start of programming devices: %s ms" % duration( n2s_start, trellis_last_link ) )
+ outFile.write( "\nFirst Trellis event to end of Trellis programming devices: %s ms\t\t(Trellis)" % trellis_processing_and_programming )
+ results[ 'trellis' ] = trellis_processing_and_programming
+ outFile.write( "\nFirst Trellis link event to end of Trellis programming devices: %s ms" % duration( trellis_programming_finish, trellis_first_link ) )
+ outFile.write( "\nLast Trellis link event to end of Trellis programming devices: %s ms" % duration( trellis_programming_finish, trellis_last_link ) )
+ outFile.write( "\nFirst Trellis event to end of programming devices with acks: %s ms" % trellis_processing_and_programming_with_acks )
+ outFile.write( "\nFirst Trellis link event to end of programming devices with acks: %s ms" % duration( n2s_finish, trellis_first_link ) )
+ outFile.write( "\nStart of programming devices to last ack: %s ms\t\t\t\t(North to South)" % n2s )
+ results[ 'n2s' ] = n2s
+ for switchName, events in switches.items():
+ try:
+ writeCauseList = []
+ for cause in writeCauses:
+ if events[ cause ]:
+ writeCauseList.extend( events[cause] )
+ sorted_write_causes = sorted( writeCauseList, key=sortByDateTime )
+
+ onos_p4_acks = []
+ for cause in [ 'onos_p4_write_response', 'onos_p4_write_response_fail' ]:
+ onos_p4_acks.extend( events[cause] )
+ sorted_onos_p4_acks = sorted( onos_p4_acks, key=sortByDateTime )
+
+ stratum_results = []
+ for cause in [ 'stratum_p4_write', 'stratum_p4_write_fail' ]:
+ stratum_results.extend( events[cause] )
+ sorted_stratum_results = sorted( stratum_results, key=sortByDateTime )
+
+ # Get pod name
+ podName = ""
+ for pod, sw in podMapping.items():
+ if sw == switchName:
+ podName = pod
+
+ outFile.write( "\n----------Results for %s%s----------" % ( switchName, "(%s)" % podName if podName else "" ) )
+ programming_cause = sorted_write_causes[ 0 ]
+ programmingDT = datetime.datetime.strptime( programming_cause['datetime'], TIMEFMT )
+ writes_after_programming = [ line for line in events['onos_p4_write_request'] if datetime.datetime.strptime( line['datetime'], TIMEFMT ) >= programmingDT ]
+ write_responses_after_programming = [ line for line in sorted_onos_p4_acks if datetime.datetime.strptime( line['datetime'], TIMEFMT ) >= programmingDT ]
+ stratum_success_after_programming = [ line for line in events['stratum_p4_write'] if datetime.datetime.strptime( line['datetime'], TIMEFMT ) >= programmingDT ]
+ stratum_result_after_programming = [ line for line in sorted_stratum_results if datetime.datetime.strptime( line['datetime'], TIMEFMT ) >= programmingDT ]
+ onos_responses = len( sorted_onos_p4_acks )
+ stratum_responses = len( events['stratum_p4_write'] ) + len( events['stratum_p4_write_fail'] )
+ if len( events['onos_p4_write_request'] ) != onos_responses:
+ outFile.write( "\nWARNING: # of write requests != # of write responses on %s" % switchName )
+ if onos_responses != stratum_responses:
+ outFile.write( "\nWARNING: # of write requests != # of write results on %s" % switchName )
+ writeDuration = duration( stratum_success_after_programming[-1]['datetime'], programming_cause['datetime'] )
+ outFile.write( "\nFirst programming to last write success for %s: %s ms" % (switchName, writeDuration ) )
+ for cause in writeCauses:
+ if events[ cause ]:
+ outFile.write( "\nNumber of '%s's for %s: %s" % ( cause, switchName, len( events[cause] ) ) )
+ outFile.write( "\nNumber of write requests for %s: %s" % (switchName, len( events['onos_p4_write_request'] ) ) )
+ outFile.write( "\nNumber of write responses for %s: %s success, %s failed" % (switchName, len( events['onos_p4_write_response'] ), len( events['onos_p4_write_response_fail'] ) ) )
+ outFile.write( "\nNumber of write results on stratum for %s: %s success, %s failed" % (switchName, len( events['stratum_p4_write'] ), len( events['stratum_p4_write_fail'] ) ) )
+
+ # Lines per switch
+ outFile.write( "\nFirst programming:\n%s" % programming_cause[ 'line' ] )
+ outFile.write( "\nFirst write request from ONOS:\n%s" % writes_after_programming[0][ 'line' ] )
+ outFile.write( "\nFirst write response on ONOS:\n%s" % write_responses_after_programming[0][ 'line' ] )
+ outFile.write( "\nFirst write result on switch:\n%s" % stratum_success_after_programming[0][ 'line' ] )
+ if len( sorted_write_causes ) > 1:
+ outFile.write( "\nLast programming:\n%s" % sorted_write_causes[-1][ 'line' ] )
+ outFile.write( "\nLast write request from ONOS:\n%s" % writes_after_programming[-1][ 'line' ] )
+ outFile.write( "\nLast write response on ONOS:\n%s" % write_responses_after_programming[-1][ 'line' ] )
+ outFile.write( "\nLast write result on switch:\n%s" % stratum_success_after_programming[-1][ 'line' ] )
+
+ # Times per switch
+ progToReq = duration( writes_after_programming[0]['datetime'], programming_cause['datetime'] )
+ outFile.write( "\nFirst programming from ONOS to first write request for %s: %s ms" % (switchName, progToReq ) )
+ progToLastReq = duration( writes_after_programming[-1]['datetime'], programming_cause['datetime'] )
+ outFile.write( "\nFirst programming from ONOS to last write request for %s: %s ms" % (switchName, progToLastReq ) )
+ reqToRes = duration( write_responses_after_programming[0]['datetime'], writes_after_programming[0]['datetime'] )
+ outFile.write( "\nFirst write request to first write response for %s: %s ms" % (switchName, reqToRes ) )
+ reqToLastRes = duration( write_responses_after_programming[-1]['datetime'], writes_after_programming[0]['datetime'] )
+ outFile.write( "\nFirst write request to last write response for %s: %s ms" % (switchName, reqToLastRes ) )
+ resToSuc = duration( stratum_success_after_programming[0]['datetime'], write_responses_after_programming[0]['datetime'] )
+ outFile.write( "\nFirst write response to first write result for %s: %s ms" % (switchName, resToSuc ) )
+ resToLastSuc = duration( sorted_stratum_results[-1]['datetime'], write_responses_after_programming[0]['datetime'] )
+ outFile.write( "\nFirst write response to last write result for %s: %s ms" % (switchName, resToLastSuc ) )
+ successDuration = duration( stratum_result_after_programming[-1]['datetime'], stratum_result_after_programming[0]['datetime'] )
+ outFile.write( "\nFirst switch write result to last write result for %s: %s ms\t\t(Bottom box under switches)" % ( switchName, successDuration ) )
+ results[ '%s-writes' % switchName ] = successDuration
+ except IndexError as e:
+ outFile.write( "\nError processing logs for switch %s: %s" % ( switchName, e ) )
+ return results
+
+
+if __name__ == '__main__':
+ main()
diff --git a/TestON/tests/dependencies/Cluster.py b/TestON/tests/dependencies/Cluster.py
index b99e069..2efab0c 100644
--- a/TestON/tests/dependencies/Cluster.py
+++ b/TestON/tests/dependencies/Cluster.py
@@ -109,7 +109,7 @@
nodeList = self.runningNodes
for ctrl in nodeList:
- ips.append( ctrl.ipAddress if ctrl.ipAddress is not 'localhost' else ctrl.address )
+ ips.append( ctrl.ipAddress if 'localhost' not in ctrl.ipAddress else ctrl.address )
return ips