fixes for mastership handoff race conditions
Change-Id: Ifed733df1bdc3b144b6a341a9322838ea2aacd72
diff --git a/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java b/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
index 4bcaff4a..7cf14fc 100644
--- a/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
+++ b/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
@@ -104,7 +104,6 @@
MastershipEvent event = null;
event = store.relinquishRole(
clusterService.getLocalNode().id(), deviceId);
-
if (event != null) {
post(event);
}
@@ -229,7 +228,8 @@
return true;
}
//else {
- //FIXME: break tie for equal-sized clusters, can we use hz's functions?
+ //FIXME: break tie for equal-sized clusters,
+ // maybe by number of connected switches
// }
return false;
}
diff --git a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
index 28bdac1..b009477 100644
--- a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
+++ b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
@@ -161,6 +161,9 @@
@Override
public void removeDevice(DeviceId deviceId) {
checkNotNull(deviceId, DEVICE_ID_NULL);
+ // XXX is this intended to apply to the full global topology?
+ // if so, we probably don't want the fact that we aren't
+ // MASTER to get in the way, as it would do now.
DeviceEvent event = store.removeDevice(deviceId);
if (event != null) {
log.info("Device {} administratively removed", deviceId);
@@ -203,19 +206,21 @@
log.info("Device {} connected", deviceId);
// check my Role
MastershipRole role = mastershipService.requestRoleFor(deviceId);
-
+ log.info("## - our role for {} is {} [master is {}]", deviceId, role,
+ mastershipService.getMasterFor(deviceId));
if (role != MastershipRole.MASTER) {
// TODO: Do we need to explicitly tell the Provider that
// this instance is no longer the MASTER? probably not
return;
}
-
MastershipTerm term = mastershipService.requestTermService()
.getMastershipTerm(deviceId);
+
if (!term.master().equals(clusterService.getLocalNode().id())) {
// lost mastership after requestRole told this instance was MASTER.
return;
}
+
// tell clock provider if this instance is the master
deviceClockProviderService.setMastershipTerm(deviceId, term);
@@ -256,19 +261,32 @@
// but if I was the last STANDBY connection, etc. and no one else
// was there to mark the device offline, this instance may need to
// temporarily request for Master Role and mark offline.
+ log.info("## for {} role is {}", deviceId, mastershipService.getLocalRole(deviceId));
if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
log.debug("Device {} disconnected, but I am not the master", deviceId);
//let go of ability to be backup
mastershipService.relinquishMastership(deviceId);
return;
}
- DeviceEvent event = store.markOffline(deviceId);
- //relinquish master role and ability to be backup.
- mastershipService.relinquishMastership(deviceId);
- if (event != null) {
- log.info("Device {} disconnected", deviceId);
- post(event);
+ DeviceEvent event = null;
+ try {
+ event = store.markOffline(deviceId);
+ } catch (IllegalStateException e) {
+ //there are times when this node will correctly have mastership, BUT
+ //that isn't reflected in the ClockManager before the device disconnects.
+ //we want to let go of the device anyways, so make sure this happens.
+ MastershipTerm term = termService.getMastershipTerm(deviceId);
+ deviceClockProviderService.setMastershipTerm(deviceId, term);
+ event = store.markOffline(deviceId);
+ } finally {
+ //relinquish master role and ability to be backup.
+ mastershipService.relinquishMastership(deviceId);
+
+ if (event != null) {
+ log.info("Device {} disconnected", deviceId);
+ post(event);
+ }
}
}
@@ -279,7 +297,15 @@
checkNotNull(portDescriptions,
"Port descriptions list cannot be null");
checkValidity();
+ //XXX what's this doing here?
this.provider().id();
+
+ if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
+ // TODO If we become master, then we'll trigger something to update this
+ // info to fix any inconsistencies that may result during the handoff.
+ return;
+ }
+
List<DeviceEvent> events = store.updatePorts(this.provider().id(),
deviceId, portDescriptions);
for (DeviceEvent event : events) {
@@ -293,6 +319,12 @@
checkNotNull(deviceId, DEVICE_ID_NULL);
checkNotNull(portDescription, PORT_DESCRIPTION_NULL);
checkValidity();
+
+ if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
+ // TODO If we become master, then we'll trigger something to update this
+ // info to fix any inconsistencies that may result during the handoff.
+ return;
+ }
DeviceEvent event = store.updatePortStatus(this.provider().id(),
deviceId, portDescription);
if (event != null) {
@@ -328,27 +360,37 @@
final DeviceId did = event.subject();
final NodeId myNodeId = clusterService.getLocalNode().id();
+ log.info("## got Mastershipevent for dev {}", did);
if (myNodeId.equals(event.master())) {
MastershipTerm term = termService.getMastershipTerm(did);
- if (term.master().equals(myNodeId)) {
- // only set the new term if I am the master
- deviceClockProviderService.setMastershipTerm(did, term);
+ if (!myNodeId.equals(term.master())) {
+ // something went wrong in consistency, let go
+ mastershipService.relinquishMastership(did);
+ applyRole(did, MastershipRole.STANDBY);
+ return;
}
+ log.info("## setting term for CPS as new master for {}", did);
+ // only set the new term if I am the master
+ deviceClockProviderService.setMastershipTerm(did, term);
+
// FIXME: we should check that the device is connected on our end.
// currently, this is not straight forward as the actual switch
- // implementation is hidden from the registry.
- if (!isAvailable(did)) {
+ // implementation is hidden from the registry. Maybe we can ask the
+ // provider.
+ // if the device is null here, we are the first master to claim the
+ // device. No worries, the DeviceManager will create one soon.
+ Device device = getDevice(did);
+ if ((device != null) && !isAvailable(did)) {
//flag the device as online. Is there a better way to do this?
- Device device = getDevice(did);
store.createOrUpdateDevice(device.providerId(), did,
new DefaultDeviceDescription(
did.uri(), device.type(), device.manufacturer(),
device.hwVersion(), device.swVersion(),
device.serialNumber()));
}
-
+ //TODO re-collect device information to fix potential staleness
applyRole(did, MastershipRole.MASTER);
} else {
applyRole(did, MastershipRole.STANDBY);
diff --git a/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java b/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java
index 779e1ee..bab9955 100644
--- a/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java
+++ b/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java
@@ -16,6 +16,7 @@
import org.onlab.onos.net.ConnectPoint;
import org.onlab.onos.net.DeviceId;
import org.onlab.onos.net.Link;
+import org.onlab.onos.net.MastershipRole;
import org.onlab.onos.net.device.DeviceEvent;
import org.onlab.onos.net.device.DeviceListener;
import org.onlab.onos.net.device.DeviceService;
@@ -139,11 +140,17 @@
@Override
public void removeLinks(ConnectPoint connectPoint) {
+ if (deviceService.getRole(connectPoint.deviceId()) != MastershipRole.MASTER) {
+ return;
+ }
removeLinks(getLinks(connectPoint));
}
@Override
public void removeLinks(DeviceId deviceId) {
+ if (deviceService.getRole(deviceId) != MastershipRole.MASTER) {
+ return;
+ }
removeLinks(getDeviceLinks(deviceId));
}
@@ -189,6 +196,15 @@
public void linkDetected(LinkDescription linkDescription) {
checkNotNull(linkDescription, LINK_DESC_NULL);
checkValidity();
+
+ ConnectPoint src = linkDescription.src();
+ ConnectPoint dst = linkDescription.dst();
+ // if we aren't master for the device associated with the ConnectPoint
+ // we probably shouldn't be doing this.
+ if ((deviceService.getRole(src.deviceId()) != MastershipRole.MASTER) ||
+ (deviceService.getRole(dst.deviceId()) != MastershipRole.MASTER)) {
+ return;
+ }
LinkEvent event = store.createOrUpdateLink(provider().id(),
linkDescription);
if (event != null) {
@@ -201,6 +217,15 @@
public void linkVanished(LinkDescription linkDescription) {
checkNotNull(linkDescription, LINK_DESC_NULL);
checkValidity();
+
+ ConnectPoint src = linkDescription.src();
+ ConnectPoint dst = linkDescription.dst();
+ // if we aren't master for the device associated with the ConnectPoint
+ // we probably shouldn't be doing this.
+ if ((deviceService.getRole(src.deviceId()) != MastershipRole.MASTER) ||
+ (deviceService.getRole(dst.deviceId()) != MastershipRole.MASTER)) {
+ return;
+ }
LinkEvent event = store.removeLink(linkDescription.src(),
linkDescription.dst());
if (event != null) {
@@ -213,6 +238,11 @@
public void linksVanished(ConnectPoint connectPoint) {
checkNotNull(connectPoint, "Connect point cannot be null");
checkValidity();
+ // if we aren't master for the device associated with the ConnectPoint
+ // we probably shouldn't be doing this.
+ if (deviceService.getRole(connectPoint.deviceId()) != MastershipRole.MASTER) {
+ return;
+ }
log.info("Links for connection point {} vanished", connectPoint);
removeLinks(getLinks(connectPoint));
}
@@ -221,6 +251,11 @@
public void linksVanished(DeviceId deviceId) {
checkNotNull(deviceId, DEVICE_ID_NULL);
checkValidity();
+ // if we aren't master for the device associated with the ConnectPoint
+ // we probably shouldn't be doing this.
+ if (deviceService.getRole(deviceId) != MastershipRole.MASTER) {
+ return;
+ }
log.info("Links for device {} vanished", deviceId);
removeLinks(getDeviceLinks(deviceId));
}
diff --git a/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java b/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java
index 1d52ba3..9a16562 100644
--- a/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java
+++ b/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java
@@ -59,6 +59,7 @@
protected LinkProviderService providerService;
protected TestProvider provider;
protected TestListener listener = new TestListener();
+ protected DeviceManager devmgr = new TestDeviceManager();
@Before
public void setUp() {
@@ -68,7 +69,7 @@
registry = mgr;
mgr.store = new SimpleLinkStore();
mgr.eventDispatcher = new TestEventDispatcher();
- mgr.deviceService = new DeviceManager();
+ mgr.deviceService = devmgr;
mgr.activate();
service.addListener(listener);
@@ -259,4 +260,11 @@
}
}
+ private static class TestDeviceManager extends DeviceManager {
+ @Override
+ public MastershipRole getRole(DeviceId deviceId) {
+ return MastershipRole.MASTER;
+ }
+ }
+
}