Attempt to fix Link vanishing after mastership change
- LLDPLinkProvider: handle DeviceUpdate event
- DeviceManager: publish Device events caused by Mastership change
- DeviceManager: Always try to markOffLine on deviceDisconnected
- GossipDeviceStore: Silently ignore failure to get Timestamp on port events
Change-Id: I51fbb3f1924007867512f20e62d6d53090c63640
diff --git a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
index 3863449..76d46b0 100644
--- a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
+++ b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
@@ -221,9 +221,15 @@
log.info("Device {} connected", deviceId);
// check my Role
MastershipRole role = mastershipService.requestRoleFor(deviceId);
+ log.info("requestedRole, became {} for {}", role, deviceId);
if (role != MastershipRole.MASTER) {
// TODO: Do we need to explicitly tell the Provider that
// this instance is no longer the MASTER? probably not
+// Device device = getDevice(deviceId);
+// if (device != null) {
+// // FIXME roleChanged should take DeviceId instead of Device
+// provider().roleChanged(device, role);
+// }
return;
}
MastershipTerm term = mastershipService.requestTermService()
@@ -231,6 +237,7 @@
if (!term.master().equals(clusterService.getLocalNode().id())) {
// lost mastership after requestRole told this instance was MASTER.
+ log.info("lost mastership before getting term info.");
return;
}
@@ -251,17 +258,13 @@
// instance is the new Master, but
// event returned from the store is null?
- // TODO: Confirm: Mastership could be lost after requestRole
- // and createOrUpdateDevice call.
- // In that case STANDBY node can
- // claim itself to be master against the Device.
- // Will the Node, chosen by the MastershipService, retry
- // to get the MASTER role when that happen?
-
// FIXME: 1st argument should be deviceId, to allow setting
// certain roles even if the store returned null.
+ log.info("event: {} {}", event.type(), event);
provider().roleChanged(event.subject(), role);
post(event);
+ } else {
+ log.info("No event to publish");
}
}
@@ -270,32 +273,34 @@
checkNotNull(deviceId, DEVICE_ID_NULL);
checkValidity();
- // FIXME: only the MASTER should be marking off-line in normal cases,
- // but if I was the last STANDBY connection, etc. and no one else
- // was there to mark the device offline, this instance may need to
- // temporarily request for Master Role and mark offline.
- if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
- log.debug("Device {} disconnected, but I am not the master", deviceId);
- //let go of ability to be backup
- mastershipService.relinquishMastership(deviceId);
- return;
- }
DeviceEvent event = null;
try {
event = store.markOffline(deviceId);
} catch (IllegalStateException e) {
+ log.warn("Failed to mark {} offline", deviceId);
+ // only the MASTER should be marking off-line in normal cases,
+ // but if I was the last STANDBY connection, etc. and no one else
+ // was there to mark the device offline, this instance may need to
+ // temporarily request for Master Role and mark offline.
+
//there are times when this node will correctly have mastership, BUT
//that isn't reflected in the ClockManager before the device disconnects.
//we want to let go of the device anyways, so make sure this happens.
- // FIXME: Come up with workaround for above scenario.
+ // FIXME: Store semantics leaking out as IllegalStateException.
+ // Consider revising store API to handle this scenario.
+
+ MastershipRole role = mastershipService.requestRoleFor(deviceId);
MastershipTerm term = termService.getMastershipTerm(deviceId);
final NodeId myNodeId = clusterService.getLocalNode().id();
// TODO: Move this type of check inside device clock manager, etc.
if (myNodeId.equals(term.master())) {
+ log.info("Marking {} offline", deviceId);
deviceClockProviderService.setMastershipTerm(deviceId, term);
event = store.markOffline(deviceId);
+ } else {
+ log.error("Failed again marking {} offline. {}", deviceId, role);
}
} finally {
//relinquish master role and ability to be backup.
@@ -315,14 +320,6 @@
checkNotNull(portDescriptions,
"Port descriptions list cannot be null");
checkValidity();
- //XXX what's this doing here?
- this.provider().id();
-
- if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
- // TODO If we become master, then we'll trigger something to update this
- // info to fix any inconsistencies that may result during the handoff.
- return;
- }
List<DeviceEvent> events = store.updatePorts(this.provider().id(),
deviceId, portDescriptions);
@@ -338,13 +335,8 @@
checkNotNull(portDescription, PORT_DESCRIPTION_NULL);
checkValidity();
- if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
- // TODO If we become master, then we'll trigger something to update this
- // info to fix any inconsistencies that may result during the handoff.
- return;
- }
- DeviceEvent event = store.updatePortStatus(this.provider().id(),
- deviceId, portDescription);
+ final DeviceEvent event = store.updatePortStatus(this.provider().id(),
+ deviceId, portDescription);
if (event != null) {
log.info("Device {} port {} status changed", deviceId, event
.port().number());
@@ -407,15 +399,16 @@
Device device = getDevice(did);
if ((device != null) && !isAvailable(did)) {
//flag the device as online. Is there a better way to do this?
- store.createOrUpdateDevice(device.providerId(), did,
+ DeviceEvent devEvent = store.createOrUpdateDevice(device.providerId(), did,
new DefaultDeviceDescription(
did.uri(), device.type(), device.manufacturer(),
device.hwVersion(), device.swVersion(),
device.serialNumber(), device.chassisId()));
+ post(devEvent);
}
- //TODO re-collect device information to fix potential staleness
- queryPortInfo(did);
applyRole(did, MastershipRole.MASTER);
+ // re-collect device information to fix potential staleness
+ queryPortInfo(did);
} else if (event.roleInfo().backups().contains(myNodeId)) {
applyRole(did, MastershipRole.STANDBY);
}
diff --git a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
index 21941b5..6fbc034 100644
--- a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
+++ b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
@@ -328,8 +328,8 @@
final Timestamp timestamp = deviceClockService.getTimestamp(deviceId);
final DeviceEvent event = markOfflineInternal(deviceId, timestamp);
if (event != null) {
- log.info("Notifying peers of a device offline topology event for deviceId: {}",
- deviceId);
+ log.info("Notifying peers of a device offline topology event for deviceId: {} {}",
+ deviceId, timestamp);
try {
notifyPeers(new InternalDeviceOfflineEvent(deviceId, timestamp));
} catch (IOException e) {
@@ -399,7 +399,24 @@
DeviceId deviceId,
List<PortDescription> portDescriptions) {
- final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId);
+ final Timestamp newTimestamp;
+ try {
+ newTimestamp = deviceClockService.getTimestamp(deviceId);
+ } catch (IllegalStateException e) {
+ log.info("Timestamp was not available for device {}", deviceId);
+ log.debug(" discarding {}", portDescriptions);
+ // Failed to generate timestamp.
+
+ // Possible situation:
+ // Device connected and became master for short period of time,
+ // but lost mastership before this instance had the chance to
+ // retrieve term information.
+
+ // Information dropped here is expected to be recoverable by
+ // device probing after mastership change
+
+ return Collections.emptyList();
+ }
log.info("timestamp for {} {}", deviceId, newTimestamp);
final Timestamped<List<PortDescription>> timestampedInput
@@ -580,7 +597,16 @@
DeviceId deviceId,
PortDescription portDescription) {
- final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId);
+ final Timestamp newTimestamp;
+ try {
+ newTimestamp = deviceClockService.getTimestamp(deviceId);
+ } catch (IllegalStateException e) {
+ log.info("Timestamp was not available for device {}", deviceId);
+ log.debug(" discarding {}", portDescription);
+ // Failed to generate timestamp. Ignoring.
+ // See updatePorts comment
+ return null;
+ }
final Timestamped<PortDescription> deltaDesc
= new Timestamped<>(portDescription, newTimestamp);
final DeviceEvent event;
diff --git a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java
index f024414..8e50378 100644
--- a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java
+++ b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java
@@ -111,11 +111,22 @@
log.error("Device is null.");
return;
}
+ log.trace("{} {} {}", event.type(), event.subject(), event);
switch (event.type()) {
case DEVICE_ADDED:
- discoverers.put(device.id(),
- new LinkDiscovery(device, packetSevice, masterService,
- providerService, useBDDP));
+ case DEVICE_UPDATED:
+ ld = discoverers.get(device.id());
+ if (ld == null) {
+ log.debug("Device added ({}) {}", event.type(), device.id());
+ discoverers.put(device.id(),
+ new LinkDiscovery(device, packetSevice, masterService,
+ providerService, useBDDP));
+ } else {
+ if (ld.isStopped()) {
+ log.debug("Device restarted ({}) {}", event.type(), device.id());
+ ld.start();
+ }
+ }
break;
case PORT_ADDED:
case PORT_UPDATED:
@@ -125,6 +136,7 @@
return;
}
if (!port.number().isLogical()) {
+ log.debug("Port added {}", port);
ld.addPort(port);
}
} else {
@@ -134,12 +146,15 @@
}
break;
case PORT_REMOVED:
+ log.debug("Port removed {}", port);
ConnectPoint point = new ConnectPoint(device.id(),
port.number());
providerService.linksVanished(point);
+ // TODO: Don't we need to removePort from ld?
break;
case DEVICE_REMOVED:
case DEVICE_SUSPENDED:
+ log.debug("Device removed {}", device.id());
ld = discoverers.get(device.id());
if (ld == null) {
return;
@@ -153,15 +168,18 @@
return;
}
if (deviceService.isAvailable(device.id())) {
+ log.debug("Device up {}", device.id());
ld.start();
} else {
providerService.linksVanished(device.id());
+ log.debug("Device down {}", device.id());
ld.stop();
}
break;
- case DEVICE_UPDATED:
case DEVICE_MASTERSHIP_CHANGED:
if (!discoverers.containsKey(device.id())) {
+ // TODO: ideally, should never reach here
+ log.debug("Device mastership changed ({}) {}", event.type(), device.id());
discoverers.put(device.id(),
new LinkDiscovery(device, packetSevice, masterService,
providerService, useBDDP));
diff --git a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java
index e60ed90..9bcd8c6 100644
--- a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java
+++ b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java
@@ -139,8 +139,8 @@
* @param port the port
*/
public void addPort(final Port port) {
- this.log.debug("sending init probe to port {}",
- port.number().toLong());
+ this.log.debug("sending init probe to port {}@{}",
+ port.number().toLong(), device.id());
sendProbes(port.number().toLong());
@@ -245,7 +245,7 @@
*/
@Override
public void run(final Timeout t) {
- this.log.trace("sending probes");
+ this.log.trace("sending probes from {}", device.id());
synchronized (this) {
final Iterator<Long> fastIterator = this.fastPorts.iterator();
Long portNumber;
@@ -256,7 +256,7 @@
.getAndIncrement();
if (probeCount < LinkDiscovery.MAX_PROBE_COUNT) {
- this.log.trace("sending fast probe to port");
+ this.log.trace("sending fast probe to port {}", portNumber);
sendProbes(portNumber);
} else {
// Update fast and slow ports
@@ -356,6 +356,7 @@
if (device.type() != Device.Type.ROADM &&
mastershipService.getLocalRole(this.device.id()) ==
MastershipRole.MASTER) {
+ log.debug("sending probes out to {}@{}", portNumber, device.id());
OutboundPacket pkt = this.createOutBoundLLDP(portNumber);
pktService.emit(pkt);
if (useBDDP) {
diff --git a/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java b/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java
index 984e8ae..a7ee9e7 100644
--- a/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java
+++ b/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java
@@ -96,7 +96,9 @@
// FIXME if possible, we might want this to be part of
// OpenFlowSwitch interface so the driver interface isn't misused.
OpenFlowSwitch sw = controller.getSwitch(dpid(device.id().uri()));
- if (!((OpenFlowSwitchDriver) sw).isConnected()) {
+ if (sw == null ||
+ !((OpenFlowSwitchDriver) sw).isConnected()) {
+ LOG.error("Failed to probe device {} on sw={}", device, sw);
providerService.deviceDisconnected(device.id());
return;
}