[SDFAB-1015] Introduce gracePeriod and defer mastership check For some devices like p4rt devices which are first registered and then updated does not make sense to check the reachability asap: connection is established concurently to the mastership checks and the channel might not be ready. Change-Id: I5a31e09cf382df388a473af338eb72fb9a187fa8

commit: bc91cea9dedcbb221697e4300ad3dcb6dea1e4a9 [log] [tgz]
author: pierventre <pier@opennetworking.org> Fri Feb 18 05:16:48 2022 -0800
committer: Pier Luigi Ventre <pier@opennetworking.org> Fri Feb 25 00:30:01 2022 +0000
tree: 8b1f2eaee0f9f5f003d2bbfb29533c1287110fe7
parent: 116992a795f353242d6b3e801114c177dfeef0de [diff]
diff --git a/core/api/src/main/java/org/onosproject/net/device/DeviceProvider.java b/core/api/src/main/java/org/onosproject/net/device/DeviceProvider.java
index 8dfa730..d625da7 100644
--- a/core/api/src/main/java/org/onosproject/net/device/DeviceProvider.java
+++ b/core/api/src/main/java/org/onosproject/net/device/DeviceProvider.java

@@ -125,4 +125,15 @@
         return CompletableFuture.completedFuture(isReachable(deviceId));
     }
 
+    /**
+     * Return the grace period of this provider in milliseconds. With some devices, it might be
+     * required more time to establish a working channel. This method returns a grace period
+     * that should be respected before checking the reachability with the device.
+     *
+     * @return the grace period of the device. 0 if the device is expected to be immediately functional
+     */
+    default int gracePeriod() {
+        return 0;
+    }
+
 }

diff --git a/core/net/src/main/java/org/onosproject/net/device/impl/DeviceManager.java b/core/net/src/main/java/org/onosproject/net/device/impl/DeviceManager.java
index a85ee8f..e78e6d2 100644
--- a/core/net/src/main/java/org/onosproject/net/device/impl/DeviceManager.java
+++ b/core/net/src/main/java/org/onosproject/net/device/impl/DeviceManager.java

@@ -398,9 +398,22 @@
         return ls.dateTime.toEpochMilli();
     }
 
+    // Returns true if the grace period is still on
+    private boolean inGracePeriod(DeviceId deviceId) {
+        LocalStatus ls = deviceLocalStatus.get(deviceId);
+        if (ls == null) {
+            // This should not be possible, unless the device is removed
+            log.warn("Not found a recent local status for {}", deviceId);
+            return true;
+        }
+        DeviceProvider provider = getProvider(deviceId);
+        return ls.connected &&
+                (Instant.now().toEpochMilli() - ls.dateTime.toEpochMilli()) < provider.gracePeriod();
+    }
+
     // Check a device for control channel connectivity
     // and changes local-status appropriately.
-    private boolean isReachable(DeviceId deviceId) {
+    private boolean isReachable(DeviceId deviceId, boolean inGracePeriod) {
         if (deviceId == null) {
             return false;
         }
@@ -409,7 +422,8 @@
             boolean reachable = probeReachability(deviceId);
             if (reachable && !isLocallyConnected(deviceId)) {
                 deviceLocalStatus.put(deviceId, new LocalStatus(true, Instant.now()));
-            } else if (!reachable && isLocallyConnected(deviceId)) {
+            } else if (!reachable && isLocallyConnected(deviceId) && !inGracePeriod) {
+                // Do not update if the grace period is still on
                 deviceLocalStatus.put(deviceId, new LocalStatus(false, Instant.now()));
             }
             return reachable;
@@ -513,8 +527,12 @@
             final DeviceId deviceId = device.id();
             MastershipRole myRole = mastershipService.getLocalRole(deviceId);
             log.trace("Checking device {}. Current role is {}", deviceId, myRole);
-            log.info("Device local status is {}", localStatus(deviceId));
-            if (!isReachable(deviceId)) {
+
+            log.debug("Device {} local status is {}", deviceId, localStatus(deviceId));
+            final boolean isGracePeriodOn = inGracePeriod(deviceId);
+            final boolean isReachable = isReachable(deviceId, isGracePeriodOn);
+            // Passed the grace period and it is still not reachable
+            if (!isGracePeriodOn && !isReachable) {
                 if (myRole != NONE) {
                     // Verify if the device is fully disconnected from the cluster
                     if (updateMastershipFor(deviceId) == null
@@ -523,10 +541,10 @@
                         post(store.markOffline(deviceId));
                     }
                 } else {
-                    /* Firstly get a role and then check if the device is available in the store.
-                       if it is, if this node is the master and the device is fully disconnected
-                       from the cluster mark the device offline. In principle, this condition should
-                       never be hit unless in a device removed phase for NONE mastership roles. */
+                    // Firstly get a role and then check if the device is available in the store.
+                    // if it is, if this node is the master and the device is fully disconnected
+                    // from the cluster mark the device offline. In principle, this condition should
+                    // never be hit unless in a device removed phase for NONE mastership roles.
                     try {
                         mastershipService.requestRoleFor(deviceId).get();
                     } catch (InterruptedException e) {
@@ -545,19 +563,22 @@
                     }
                 }
                 roleToAcknowledge.remove(deviceId);
-                continue;
+            } else if (isReachable) {
+                // If this node is the master, ensure the device is marked online.
+                if (myRole == MASTER && canMarkOnline(device)) {
+                    log.info("Can mark online {}", deviceId);
+                    post(store.markOnline(deviceId));
+                }
+
+                log.info("{} is reachable - reasserting the role", deviceId);
+
+                // Device is still reachable. It is useful for some protocols
+                // to reassert the role. Note: NONE triggers request to MastershipService
+                reassertRole(deviceId, myRole);
+            } else {
+                // Do not proceed furthermore if the grace period is still on
+                log.debug("Skipping mastership check for {}", deviceId);
             }
-
-            // If this node is the master, ensure the device is marked online.
-            if (myRole == MASTER && canMarkOnline(device)) {
-                post(store.markOnline(deviceId));
-            }
-
-            log.info("{} is reachable - reasserting the role", deviceId);
-
-            /* Device is still reachable. It is useful for some protocols
-               to reassert the role. Note: NONE triggers request to MastershipService */
-            reassertRole(deviceId, myRole);
         }
     }
 
@@ -595,8 +616,8 @@
                 log.debug("Timeout is expired but current role is not MASTER ({}), nothing to do", myRole);
                 continue;
             }
-            /* Switch failed to acknowledge master role we asked for.
-               Yield mastership to other instance*/
+            // Switch failed to acknowledge master role we asked for.
+            // Yield mastership to other instance
             log.warn("Failed to assert role onto device {}. requested={}, no response",
                     deviceId, myRole);
             updateMastershipFor(deviceId);
@@ -705,10 +726,10 @@
             log.info("Device {} disconnected from this node: {}", deviceId,
                      clusterService.getLocalNode().id());
 
-            /* If none can reach the device, we will continue with the disconnection logic.
-               If there is one instance that reported device is still reachable, we hand over
-               the mastership to it if we are the current master, otherwise if we are a backup
-               we demote ourselves to the bottom of the backups list */
+            // If none can reach the device, we will continue with the disconnection logic.
+            // If there is one instance that reported device is still reachable, we hand over
+            // the mastership to it if we are the current master, otherwise if we are a backup
+            // we demote ourselves to the bottom of the backups list
             if (updateMastershipFor(deviceId) == null) {
                 log.info("Device {} is fully disconnected from the cluster", deviceId);
                 List<PortDescription> descs = store.getPortDescriptions(provider().id(), deviceId)
@@ -1058,6 +1079,7 @@
     }
 
     private void handleMastershipEvent(MastershipEvent event) {
+        log.debug("Handling mastership event");
         final DeviceId did = event.subject();
 
         // myNextRole suggested by MastershipService event
@@ -1079,9 +1101,11 @@
             myNextRole = NONE;
         }
 
-        final boolean isReachable = isReachable(did);
-        log.info("Device local status is {}", localStatus(did));
-        if (!isReachable) {
+        log.debug("Device {} local status is {}", did, localStatus(did));
+        final boolean isGracePeriodOn = inGracePeriod(did);
+        final boolean isReachable = isReachable(did, isGracePeriodOn);
+        // Passed the grace period and it is still not reachable
+        if (!isGracePeriodOn && !isReachable) {
             // device is not connected to this node, nevertheless we should get a role
             if (mastershipService.getLocalRole(did) == NONE) {
                 log.debug("Node was instructed to be {} role for {}, "
@@ -1106,20 +1130,23 @@
             // Let's put some order in the candidates list
             roleToAcknowledge.remove(did);
             updateMastershipFor(did);
-            return;
-        }
-
-        /* Device is connected to this node - always reassert the role.
-           Ideally, protocols like OpenFlow would not need to reassert the
-           role because the instances are only identified by the role. However,
-           other protocols like P4RT require to provide also an election id
-           which maybe different over time, by reasserting the role will guarantee
-           that updated election ids are communicated to the devices. It should not
-           cost us a lot as it is equivalent to a probe.*/
-        if (store.getDevice(did) != null) {
-            reassertRole(did, myNextRole);
+        } else if (isReachable) {
+            // Device is connected to this node - always reassert the role.
+            // Ideally, protocols like OpenFlow would not need to reassert the
+            // role because the instances are only identified by the role. However,
+            // other protocols like P4RT require to provide also an election id
+            // which maybe different over time, by reasserting the role will guarantee
+            // that updated election ids are communicated to the devices. It should not
+            // cost us a lot as it is equivalent to a probe.
+            if (store.getDevice(did) != null) {
+                log.info("{} is reachable - reasserting the role", did);
+                reassertRole(did, myNextRole);
+            } else {
+                log.debug("Device is not yet/no longer in the store: {}", did);
+            }
         } else {
-            log.debug("Device is not yet/no longer in the store: {}", did);
+            // Do not proceed furthermore if the grace period is still on
+            log.debug("Skipping mastership event {}", event);
         }
     }
 
@@ -1452,10 +1479,10 @@
             if (myRole == MASTER) {
                 log.info("Handing over the mastership of {} to next master {}", deviceId, nextMaster);
                 mastershipAdminService.setRole(nextMaster, deviceId, MASTER);
-                /* Do not demote here because setRole can return before the mastership has been passed.
-                   Current implementation promotes first the nextMaster as top of candidate list and then
-                   transfer the leadership. We can use the BACKUP events to do demote or leverage periodic
-                   checks.*/
+                // Do not demote here because setRole can return before the mastership has been passed.
+                // Current implementation promotes first the nextMaster as top of candidate list and then
+                // transfer the leadership. We can use the BACKUP events to do demote or leverage periodic
+                // checks.
             } else if (myRole == STANDBY) {
                 log.info("Demote current instance to the bottom of the candidates list for {}", deviceId);
                 mastershipAdminService.demote(localNodeId, deviceId);
@@ -1467,8 +1494,6 @@
         return nextMaster;
     }
 
-
-
     /**
      * Port Enable/Disable message sent to the device's MASTER node.
      */

diff --git a/providers/general/device/src/main/java/org/onosproject/provider/general/device/impl/GeneralDeviceProvider.java b/providers/general/device/src/main/java/org/onosproject/provider/general/device/impl/GeneralDeviceProvider.java
index ac0fda7..78ab98a 100644
--- a/providers/general/device/src/main/java/org/onosproject/provider/general/device/impl/GeneralDeviceProvider.java
+++ b/providers/general/device/src/main/java/org/onosproject/provider/general/device/impl/GeneralDeviceProvider.java

@@ -124,6 +124,11 @@
     private static final int CORE_POOL_SIZE = 10;
     private static final String UNKNOWN = "unknown";
 
+    // We have measured a grace period of 5s with the
+    // current devices - giving some time more to absorb
+    // any fluctuation.
+    private static final int GRACE_PERIOD = 8000;
+
     @Reference(cardinality = ReferenceCardinality.MANDATORY)
     private DeviceProviderRegistry providerRegistry;
 
@@ -411,8 +416,14 @@
         return handshaker.probeReachability();
     }
 
+    @Override
+    public int gracePeriod() {
+        return GRACE_PERIOD;
+    }
+
     private boolean probeReachabilitySync(DeviceId deviceId) {
-        // Wait 3/4 of the checkup interval
+        // Wait 3/4 of the checkup interval and make sure the thread
+        // is not blocked more than the checkUpInterval
         return Tools.futureGetOrElse(probeReachability(deviceId), (checkupInterval * 3000 / 4),
                 TimeUnit.MILLISECONDS, Boolean.FALSE);
     }
@@ -664,22 +675,20 @@
         if (deviceService.getDevice(deviceId) != null &&
                 deviceService.isAvailable(deviceId) == available &&
                 storeDescription != null) {
-            /*
-             * FIXME SDFAB-650 rethink the APIs and abstractions around the DeviceStore.
-             * Device registration is a two-step process for the GDP. Initially, the device is
-             * registered with default avail. to false. Later, the checkup task will update the
-             * description with the default avail to true in order to mark it available. Today,
-             * there is only one API to mark online a device from the device provider which is
-             * deviceConnected which assumes an update on the device description. The device provider
-             * is the only one able to update the device description and we have to make sure that
-             * the default avail. is flipped to true as it is used to mark as online the device when
-             * it is created or updated. Otherwise, if an ONOS instance fails and restarts, when re-joining
-             * the cluster, it will get the device marked as offline and will not be able to update
-             * its status until it become the master. This process concurs with the markOnline done
-             * by the background thread in the DeviceManager and its the reason why we cannot just check
-             * the device availability but we need to compare also the desc. Checking here the equality,
-             * as in general we may want to upgrade the device description at run time.
-             */
+            // FIXME SDFAB-650 rethink the APIs and abstractions around the DeviceStore.
+            //  Device registration is a two-step process for the GDP. Initially, the device is
+            //  registered with default avail. to false. Later, the checkup task will update the
+            //  description with the default avail to true in order to mark it available. Today,
+            //  there is only one API to mark online a device from the device provider which is
+            //  deviceConnected which assumes an update on the device description. The device provider
+            //  is the only one able to update the device description and we have to make sure that
+            //  the default avail. is flipped to true as it is used to mark as online the device when
+            //  it is created or updated. Otherwise, if an ONOS instance fails and restarts, when re-joining
+            //  the cluster, it will get the device marked as offline and will not be able to update
+            //  its status until it become the master. This process concurs with the markOnline done
+            //  by the background thread in the DeviceManager and its the reason why we cannot just check
+            //  the device availability but we need to compare also the desc. Checking here the equality,
+            //  as in general we may want to upgrade the device description at run time.
             DeviceDescription testDeviceDescription = DefaultDeviceDescription.copyReplacingAnnotation(
                     deviceDescription, storeDescription.annotations());
             if (testDeviceDescription.equals(storeDescription)) {
@@ -784,8 +793,8 @@
                 log.warn("Detected role mismatch for {}, core expects {}, " +
                                 "but device reports {}, reassert the role... ",
                         deviceId, expectedRole, deviceRole);
-                /* If we are experience a severe issue, eventually
-                   the DeviceManager will move the mastership */
+                // If we are experiencing a severe issue, eventually
+                // the DeviceManager will move the mastership
                 roleChanged(deviceId, expectedRole);
             } else {
                 log.debug("Detected role mismatch for {}, core expects {}, " +
commit	bc91cea9dedcbb221697e4300ad3dcb6dea1e4a9	[log] [tgz]
author	pierventre <pier@opennetworking.org>	Fri Feb 18 05:16:48 2022 -0800
committer	Pier Luigi Ventre <pier@opennetworking.org>	Fri Feb 25 00:30:01 2022 +0000
tree	8b1f2eaee0f9f5f003d2bbfb29533c1287110fe7
parent	116992a795f353242d6b3e801114c177dfeef0de [diff]