[SDFAB-633][SDFAB-634][SDFAB-445] Collection of improvements for SR

Firstly, this patch deeply rewrites the load sharing of the SR instances;
before, we were using a hybrid approach based on MastershipService.
With this patch we completely get rid of the MastershipService
for any task. We just use the MastershipEvent as a way to perform rerouting
if it happens near a cluster event. The aim is to make the forwarding
and the phased recovery more stable.

Then, the patch contains a fix for an issue related to the phased recovery.
pr.init() can be called when there are still no masters (on device config
for example) and when this happens the portstate commands are dropped.

Last but not least, there is a fix for missing device routes in DefaultRoutingHandler.
Device routes (seenBeforeRoutes) are cleaned on DEVICE UP/ADDED events; this can lead
to purging some routes when the device events are handled at different moments by the
ONOS instances and there are already some programmed routes.

Change-Id: Ia03b7c7c5b8a1b80c4b6d17053c2e2e7abf13d17
diff --git a/impl/src/main/java/org/onosproject/segmentrouting/SegmentRoutingManager.java b/impl/src/main/java/org/onosproject/segmentrouting/SegmentRoutingManager.java
index 13a9b99..c81ee56 100644
--- a/impl/src/main/java/org/onosproject/segmentrouting/SegmentRoutingManager.java
+++ b/impl/src/main/java/org/onosproject/segmentrouting/SegmentRoutingManager.java
@@ -35,7 +35,6 @@
 import org.onosproject.cluster.ClusterEvent;
 import org.onosproject.cluster.ClusterEventListener;
 import org.onosproject.cluster.ClusterService;
-import org.onosproject.cluster.LeadershipService;
 import org.onosproject.cluster.NodeId;
 import org.onosproject.core.ApplicationId;
 import org.onosproject.core.CoreService;
@@ -198,7 +197,6 @@
 public class SegmentRoutingManager implements SegmentRoutingService {
 
     private static Logger log = LoggerFactory.getLogger(SegmentRoutingManager.class);
-    private static final String NOT_MASTER = "Current instance is not the master of {}. Ignore.";
 
     @Reference(cardinality = ReferenceCardinality.MANDATORY)
     private ComponentConfigService compCfgService;
@@ -257,9 +255,6 @@
     @Reference(cardinality = ReferenceCardinality.MANDATORY)
     public WorkPartitionService workPartitionService;
 
-    @Reference(cardinality = ReferenceCardinality.MANDATORY)
-    public LeadershipService leadershipService;
-
     @Reference(cardinality = ReferenceCardinality.OPTIONAL,
             policy = ReferencePolicy.DYNAMIC)
     public volatile XconnectService xconnectService;
@@ -1063,19 +1058,23 @@
 
     @Override
     public Map<Set<DeviceId>, NodeId> getShouldProgram() {
+        return ImmutableMap.of();
+    }
+
+    @Override
+    public Map<DeviceId, Boolean> getShouldProgramCache() {
+        return ImmutableMap.of();
+    }
+
+    @Override
+    public Map<DeviceId, NodeId> getShouldProgramLeaders() {
         return defaultRoutingHandler == null ? ImmutableMap.of() :
                 ImmutableMap.copyOf(defaultRoutingHandler.shouldProgram);
     }
 
     @Override
-    public Map<DeviceId, Boolean> getShouldProgramCache() {
-        return defaultRoutingHandler == null ? ImmutableMap.of() :
-                ImmutableMap.copyOf(defaultRoutingHandler.shouldProgramCache);
-    }
-
-    @Override
     public boolean shouldProgram(DeviceId deviceId) {
-        return defaultRoutingHandler.shouldProgram(deviceId);
+        return defaultRoutingHandler != null && defaultRoutingHandler.shouldProgram(deviceId);
     }
 
     @Override
@@ -1343,7 +1342,7 @@
      */
     public void updateMacVlanTreatment(DeviceId deviceId, MacAddress hostMac,
                                        VlanId hostVlanId, PortNumber port, int nextId) {
-        // Check if we are the king of this device
+        // Check if we are the leader of this device
         // just one instance should perform this update
         if (!defaultRoutingHandler.shouldProgram(deviceId)) {
             log.debug("This instance is not handling the routing towards the "
@@ -1462,6 +1461,16 @@
                                 + "for available device {}",
                                  event.type(), ((Device) event.subject()).id());
                         processDeviceAdded((Device) event.subject());
+                        /*
+                         * This is a mere heuristic as there is not yet stable mastership in ONOS, and it is based on
+                         * the fact that DEVICE is marked online only if there is a master around. processDeviceAdded
+                         * can be called on config change and link events and there is no check of the availability.
+                         * In these scenarios, we could not have a master and pr is broken as nobody can admin enable
+                         * the ports. We keep in processDeviceAdded the code that is already idempotent
+                         */
+                        if (event.type() == DeviceEvent.Type.DEVICE_AVAILABILITY_CHANGED) {
+                            phasedRecoveryService.init(deviceId);
+                        }
                     } else {
                         if (event.type() == DeviceEvent.Type.DEVICE_ADDED) {
                             // Note: For p4 devices, the device will be added but unavailable at the beginning.
@@ -1584,13 +1593,13 @@
                 } else if (event.type() == MastershipEvent.Type.MASTER_CHANGED) {
                     MastershipEvent me = (MastershipEvent) event;
                     DeviceId deviceId = me.subject();
-                    Optional<DeviceId> pairDeviceId = getPairDeviceId(deviceId);
-                    log.info(" ** MASTERSHIP CHANGED Invalidating shouldProgram cache"
-                            + " for {}/pair={} due to change", deviceId, pairDeviceId);
-                    defaultRoutingHandler.invalidateShouldProgramCache(deviceId);
-                    pairDeviceId.ifPresent(defaultRoutingHandler::invalidateShouldProgramCache);
+                    log.info(" ** Mastership changed check full reroute for {} due to change", deviceId);
                     defaultRoutingHandler.checkFullRerouteForMasterChange(deviceId, me);
 
+                } else if (event.type() == ClusterEvent.Type.INSTANCE_DEACTIVATED ||
+                        event.type() == ClusterEvent.Type.INSTANCE_REMOVED) {
+                    log.info(" ** Cluster event invalidating shouldProgram");
+                    defaultRoutingHandler.invalidateShouldProgram();
                 } else {
                     log.warn("Unhandled event type: {}", event.type());
                 }
@@ -1618,7 +1627,7 @@
     }
 
     private void processDeviceAddedInternal(DeviceId deviceId) {
-        // Irrespective of whether the local is a MASTER or not for this device,
+        // Irrespective of whether the local is leading the programming or not for this device,
         // we need to create a SR-group-handler instance. This is because in a
         // multi-instance setup, any instance can initiate forwarding/next-objectives
         // for any switch (even if this instance is a SLAVE or not even connected
@@ -1644,13 +1653,11 @@
             groupHandlerMap.put(deviceId, groupHandler);
         }
 
-        if (mastershipService.isLocalMaster(deviceId)) {
+        if (shouldProgram(deviceId)) {
             defaultRoutingHandler.populatePortAddressingRules(deviceId);
-            defaultRoutingHandler.purgeSeenBeforeRoutes(deviceId);
             DefaultGroupHandler groupHandler = groupHandlerMap.get(deviceId);
             groupHandler.createGroupsFromVlanConfig();
             routingRulePopulator.populateSubnetBroadcastRule(deviceId);
-            phasedRecoveryService.init(deviceId);
         }
 
         appCfgHandler.init(deviceId);
@@ -1684,6 +1691,8 @@
         defaultRoutingHandler
             .populateRoutingRulesForLinkStatusChange(null, null, device.id(), true);
         defaultRoutingHandler.purgeEcmpGraph(device.id());
+        // Removes routes having as target the device down
+        defaultRoutingHandler.purgeSeenBeforeRoutes(device.id());
 
         // Cleanup all internal groupHandler stores for this device. Should be
         // done after all rerouting or rehashing has been completed
@@ -1723,8 +1732,8 @@
             lastEdgePortEvent = Instant.now();
         }
 
-        if (!mastershipService.isLocalMaster(device.id()))  {
-            log.debug("Not master for dev:{} .. not handling port updated event "
+        if (!shouldProgram(device.id()))  {
+            log.debug("Should not program dev:{} .. not handling port updated event "
                     + "for port {}", device.id(), port.number());
             return;
         }
@@ -1734,7 +1743,7 @@
     /**
      * Adds or remove filtering rules for the given switchport. If switchport is
      * an edge facing port, additionally handles host probing and broadcast
-     * rules. Must be called by local master of device.
+     * rules. Must be called by the instance leading the programming of the device.
      *
      * @param deviceId the device identifier
      * @param port the port to update
@@ -2116,6 +2125,7 @@
             case INSTANCE_REMOVED:
                 log.info("** Cluster event {}", event.type());
                 lastClusterEvent = Instant.now();
+                mainEventExecutor.execute(new InternalEventHandler(event));
                 break;
             default:
                 break;