[SDFAB-1015] Introduce gracePeriod and defer mastership check
For some devices like p4rt devices which are first registered
and then updated does not make sense to check the reachability
asap: connection is established concurently to the mastership
checks and the channel might not be ready.
Change-Id: I5a31e09cf382df388a473af338eb72fb9a187fa8
diff --git a/core/api/src/main/java/org/onosproject/net/device/DeviceProvider.java b/core/api/src/main/java/org/onosproject/net/device/DeviceProvider.java
index 8dfa730..d625da7 100644
--- a/core/api/src/main/java/org/onosproject/net/device/DeviceProvider.java
+++ b/core/api/src/main/java/org/onosproject/net/device/DeviceProvider.java
@@ -125,4 +125,15 @@
return CompletableFuture.completedFuture(isReachable(deviceId));
}
+ /**
+ * Return the grace period of this provider in milliseconds. With some devices, it might be
+ * required more time to establish a working channel. This method returns a grace period
+ * that should be respected before checking the reachability with the device.
+ *
+ * @return the grace period of the device. 0 if the device is expected to be immediately functional
+ */
+ default int gracePeriod() {
+ return 0;
+ }
+
}
diff --git a/core/net/src/main/java/org/onosproject/net/device/impl/DeviceManager.java b/core/net/src/main/java/org/onosproject/net/device/impl/DeviceManager.java
index a85ee8f..e78e6d2 100644
--- a/core/net/src/main/java/org/onosproject/net/device/impl/DeviceManager.java
+++ b/core/net/src/main/java/org/onosproject/net/device/impl/DeviceManager.java
@@ -398,9 +398,22 @@
return ls.dateTime.toEpochMilli();
}
+ // Returns true if the grace period is still on
+ private boolean inGracePeriod(DeviceId deviceId) {
+ LocalStatus ls = deviceLocalStatus.get(deviceId);
+ if (ls == null) {
+ // This should not be possible, unless the device is removed
+ log.warn("Not found a recent local status for {}", deviceId);
+ return true;
+ }
+ DeviceProvider provider = getProvider(deviceId);
+ return ls.connected &&
+ (Instant.now().toEpochMilli() - ls.dateTime.toEpochMilli()) < provider.gracePeriod();
+ }
+
// Check a device for control channel connectivity
// and changes local-status appropriately.
- private boolean isReachable(DeviceId deviceId) {
+ private boolean isReachable(DeviceId deviceId, boolean inGracePeriod) {
if (deviceId == null) {
return false;
}
@@ -409,7 +422,8 @@
boolean reachable = probeReachability(deviceId);
if (reachable && !isLocallyConnected(deviceId)) {
deviceLocalStatus.put(deviceId, new LocalStatus(true, Instant.now()));
- } else if (!reachable && isLocallyConnected(deviceId)) {
+ } else if (!reachable && isLocallyConnected(deviceId) && !inGracePeriod) {
+ // Do not update if the grace period is still on
deviceLocalStatus.put(deviceId, new LocalStatus(false, Instant.now()));
}
return reachable;
@@ -513,8 +527,12 @@
final DeviceId deviceId = device.id();
MastershipRole myRole = mastershipService.getLocalRole(deviceId);
log.trace("Checking device {}. Current role is {}", deviceId, myRole);
- log.info("Device local status is {}", localStatus(deviceId));
- if (!isReachable(deviceId)) {
+
+ log.debug("Device {} local status is {}", deviceId, localStatus(deviceId));
+ final boolean isGracePeriodOn = inGracePeriod(deviceId);
+ final boolean isReachable = isReachable(deviceId, isGracePeriodOn);
+ // Passed the grace period and it is still not reachable
+ if (!isGracePeriodOn && !isReachable) {
if (myRole != NONE) {
// Verify if the device is fully disconnected from the cluster
if (updateMastershipFor(deviceId) == null
@@ -523,10 +541,10 @@
post(store.markOffline(deviceId));
}
} else {
- /* Firstly get a role and then check if the device is available in the store.
- if it is, if this node is the master and the device is fully disconnected
- from the cluster mark the device offline. In principle, this condition should
- never be hit unless in a device removed phase for NONE mastership roles. */
+ // Firstly get a role and then check if the device is available in the store.
+ // if it is, if this node is the master and the device is fully disconnected
+ // from the cluster mark the device offline. In principle, this condition should
+ // never be hit unless in a device removed phase for NONE mastership roles.
try {
mastershipService.requestRoleFor(deviceId).get();
} catch (InterruptedException e) {
@@ -545,19 +563,22 @@
}
}
roleToAcknowledge.remove(deviceId);
- continue;
+ } else if (isReachable) {
+ // If this node is the master, ensure the device is marked online.
+ if (myRole == MASTER && canMarkOnline(device)) {
+ log.info("Can mark online {}", deviceId);
+ post(store.markOnline(deviceId));
+ }
+
+ log.info("{} is reachable - reasserting the role", deviceId);
+
+ // Device is still reachable. It is useful for some protocols
+ // to reassert the role. Note: NONE triggers request to MastershipService
+ reassertRole(deviceId, myRole);
+ } else {
+ // Do not proceed furthermore if the grace period is still on
+ log.debug("Skipping mastership check for {}", deviceId);
}
-
- // If this node is the master, ensure the device is marked online.
- if (myRole == MASTER && canMarkOnline(device)) {
- post(store.markOnline(deviceId));
- }
-
- log.info("{} is reachable - reasserting the role", deviceId);
-
- /* Device is still reachable. It is useful for some protocols
- to reassert the role. Note: NONE triggers request to MastershipService */
- reassertRole(deviceId, myRole);
}
}
@@ -595,8 +616,8 @@
log.debug("Timeout is expired but current role is not MASTER ({}), nothing to do", myRole);
continue;
}
- /* Switch failed to acknowledge master role we asked for.
- Yield mastership to other instance*/
+ // Switch failed to acknowledge master role we asked for.
+ // Yield mastership to other instance
log.warn("Failed to assert role onto device {}. requested={}, no response",
deviceId, myRole);
updateMastershipFor(deviceId);
@@ -705,10 +726,10 @@
log.info("Device {} disconnected from this node: {}", deviceId,
clusterService.getLocalNode().id());
- /* If none can reach the device, we will continue with the disconnection logic.
- If there is one instance that reported device is still reachable, we hand over
- the mastership to it if we are the current master, otherwise if we are a backup
- we demote ourselves to the bottom of the backups list */
+ // If none can reach the device, we will continue with the disconnection logic.
+ // If there is one instance that reported device is still reachable, we hand over
+ // the mastership to it if we are the current master, otherwise if we are a backup
+ // we demote ourselves to the bottom of the backups list
if (updateMastershipFor(deviceId) == null) {
log.info("Device {} is fully disconnected from the cluster", deviceId);
List<PortDescription> descs = store.getPortDescriptions(provider().id(), deviceId)
@@ -1058,6 +1079,7 @@
}
private void handleMastershipEvent(MastershipEvent event) {
+ log.debug("Handling mastership event");
final DeviceId did = event.subject();
// myNextRole suggested by MastershipService event
@@ -1079,9 +1101,11 @@
myNextRole = NONE;
}
- final boolean isReachable = isReachable(did);
- log.info("Device local status is {}", localStatus(did));
- if (!isReachable) {
+ log.debug("Device {} local status is {}", did, localStatus(did));
+ final boolean isGracePeriodOn = inGracePeriod(did);
+ final boolean isReachable = isReachable(did, isGracePeriodOn);
+ // Passed the grace period and it is still not reachable
+ if (!isGracePeriodOn && !isReachable) {
// device is not connected to this node, nevertheless we should get a role
if (mastershipService.getLocalRole(did) == NONE) {
log.debug("Node was instructed to be {} role for {}, "
@@ -1106,20 +1130,23 @@
// Let's put some order in the candidates list
roleToAcknowledge.remove(did);
updateMastershipFor(did);
- return;
- }
-
- /* Device is connected to this node - always reassert the role.
- Ideally, protocols like OpenFlow would not need to reassert the
- role because the instances are only identified by the role. However,
- other protocols like P4RT require to provide also an election id
- which maybe different over time, by reasserting the role will guarantee
- that updated election ids are communicated to the devices. It should not
- cost us a lot as it is equivalent to a probe.*/
- if (store.getDevice(did) != null) {
- reassertRole(did, myNextRole);
+ } else if (isReachable) {
+ // Device is connected to this node - always reassert the role.
+ // Ideally, protocols like OpenFlow would not need to reassert the
+ // role because the instances are only identified by the role. However,
+ // other protocols like P4RT require to provide also an election id
+ // which maybe different over time, by reasserting the role will guarantee
+ // that updated election ids are communicated to the devices. It should not
+ // cost us a lot as it is equivalent to a probe.
+ if (store.getDevice(did) != null) {
+ log.info("{} is reachable - reasserting the role", did);
+ reassertRole(did, myNextRole);
+ } else {
+ log.debug("Device is not yet/no longer in the store: {}", did);
+ }
} else {
- log.debug("Device is not yet/no longer in the store: {}", did);
+ // Do not proceed furthermore if the grace period is still on
+ log.debug("Skipping mastership event {}", event);
}
}
@@ -1452,10 +1479,10 @@
if (myRole == MASTER) {
log.info("Handing over the mastership of {} to next master {}", deviceId, nextMaster);
mastershipAdminService.setRole(nextMaster, deviceId, MASTER);
- /* Do not demote here because setRole can return before the mastership has been passed.
- Current implementation promotes first the nextMaster as top of candidate list and then
- transfer the leadership. We can use the BACKUP events to do demote or leverage periodic
- checks.*/
+ // Do not demote here because setRole can return before the mastership has been passed.
+ // Current implementation promotes first the nextMaster as top of candidate list and then
+ // transfer the leadership. We can use the BACKUP events to do demote or leverage periodic
+ // checks.
} else if (myRole == STANDBY) {
log.info("Demote current instance to the bottom of the candidates list for {}", deviceId);
mastershipAdminService.demote(localNodeId, deviceId);
@@ -1467,8 +1494,6 @@
return nextMaster;
}
-
-
/**
* Port Enable/Disable message sent to the device's MASTER node.
*/
diff --git a/providers/general/device/src/main/java/org/onosproject/provider/general/device/impl/GeneralDeviceProvider.java b/providers/general/device/src/main/java/org/onosproject/provider/general/device/impl/GeneralDeviceProvider.java
index ac0fda7..78ab98a 100644
--- a/providers/general/device/src/main/java/org/onosproject/provider/general/device/impl/GeneralDeviceProvider.java
+++ b/providers/general/device/src/main/java/org/onosproject/provider/general/device/impl/GeneralDeviceProvider.java
@@ -124,6 +124,11 @@
private static final int CORE_POOL_SIZE = 10;
private static final String UNKNOWN = "unknown";
+ // We have measured a grace period of 5s with the
+ // current devices - giving some time more to absorb
+ // any fluctuation.
+ private static final int GRACE_PERIOD = 8000;
+
@Reference(cardinality = ReferenceCardinality.MANDATORY)
private DeviceProviderRegistry providerRegistry;
@@ -411,8 +416,14 @@
return handshaker.probeReachability();
}
+ @Override
+ public int gracePeriod() {
+ return GRACE_PERIOD;
+ }
+
private boolean probeReachabilitySync(DeviceId deviceId) {
- // Wait 3/4 of the checkup interval
+ // Wait 3/4 of the checkup interval and make sure the thread
+ // is not blocked more than the checkUpInterval
return Tools.futureGetOrElse(probeReachability(deviceId), (checkupInterval * 3000 / 4),
TimeUnit.MILLISECONDS, Boolean.FALSE);
}
@@ -664,22 +675,20 @@
if (deviceService.getDevice(deviceId) != null &&
deviceService.isAvailable(deviceId) == available &&
storeDescription != null) {
- /*
- * FIXME SDFAB-650 rethink the APIs and abstractions around the DeviceStore.
- * Device registration is a two-step process for the GDP. Initially, the device is
- * registered with default avail. to false. Later, the checkup task will update the
- * description with the default avail to true in order to mark it available. Today,
- * there is only one API to mark online a device from the device provider which is
- * deviceConnected which assumes an update on the device description. The device provider
- * is the only one able to update the device description and we have to make sure that
- * the default avail. is flipped to true as it is used to mark as online the device when
- * it is created or updated. Otherwise, if an ONOS instance fails and restarts, when re-joining
- * the cluster, it will get the device marked as offline and will not be able to update
- * its status until it become the master. This process concurs with the markOnline done
- * by the background thread in the DeviceManager and its the reason why we cannot just check
- * the device availability but we need to compare also the desc. Checking here the equality,
- * as in general we may want to upgrade the device description at run time.
- */
+ // FIXME SDFAB-650 rethink the APIs and abstractions around the DeviceStore.
+ // Device registration is a two-step process for the GDP. Initially, the device is
+ // registered with default avail. to false. Later, the checkup task will update the
+ // description with the default avail to true in order to mark it available. Today,
+ // there is only one API to mark online a device from the device provider which is
+ // deviceConnected which assumes an update on the device description. The device provider
+ // is the only one able to update the device description and we have to make sure that
+ // the default avail. is flipped to true as it is used to mark as online the device when
+ // it is created or updated. Otherwise, if an ONOS instance fails and restarts, when re-joining
+ // the cluster, it will get the device marked as offline and will not be able to update
+ // its status until it become the master. This process concurs with the markOnline done
+ // by the background thread in the DeviceManager and its the reason why we cannot just check
+ // the device availability but we need to compare also the desc. Checking here the equality,
+ // as in general we may want to upgrade the device description at run time.
DeviceDescription testDeviceDescription = DefaultDeviceDescription.copyReplacingAnnotation(
deviceDescription, storeDescription.annotations());
if (testDeviceDescription.equals(storeDescription)) {
@@ -784,8 +793,8 @@
log.warn("Detected role mismatch for {}, core expects {}, " +
"but device reports {}, reassert the role... ",
deviceId, expectedRole, deviceRole);
- /* If we are experience a severe issue, eventually
- the DeviceManager will move the mastership */
+ // If we are experiencing a severe issue, eventually
+ // the DeviceManager will move the mastership
roleChanged(deviceId, expectedRole);
} else {
log.debug("Detected role mismatch for {}, core expects {}, " +