fixes for mastership handoff race conditions
Change-Id: Ifed733df1bdc3b144b6a341a9322838ea2aacd72
diff --git a/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java b/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
index 4bcaff4a..7cf14fc 100644
--- a/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
+++ b/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
@@ -104,7 +104,6 @@
MastershipEvent event = null;
event = store.relinquishRole(
clusterService.getLocalNode().id(), deviceId);
-
if (event != null) {
post(event);
}
@@ -229,7 +228,8 @@
return true;
}
//else {
- //FIXME: break tie for equal-sized clusters, can we use hz's functions?
+ //FIXME: break tie for equal-sized clusters,
+ // maybe by number of connected switches
// }
return false;
}
diff --git a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
index 28bdac1..b009477 100644
--- a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
+++ b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
@@ -161,6 +161,9 @@
@Override
public void removeDevice(DeviceId deviceId) {
checkNotNull(deviceId, DEVICE_ID_NULL);
+ // XXX is this intended to apply to the full global topology?
+ // if so, we probably don't want the fact that we aren't
+ // MASTER to get in the way, as it would do now.
DeviceEvent event = store.removeDevice(deviceId);
if (event != null) {
log.info("Device {} administratively removed", deviceId);
@@ -203,19 +206,21 @@
log.info("Device {} connected", deviceId);
// check my Role
MastershipRole role = mastershipService.requestRoleFor(deviceId);
-
+ log.info("## - our role for {} is {} [master is {}]", deviceId, role,
+ mastershipService.getMasterFor(deviceId));
if (role != MastershipRole.MASTER) {
// TODO: Do we need to explicitly tell the Provider that
// this instance is no longer the MASTER? probably not
return;
}
-
MastershipTerm term = mastershipService.requestTermService()
.getMastershipTerm(deviceId);
+
if (!term.master().equals(clusterService.getLocalNode().id())) {
// lost mastership after requestRole told this instance was MASTER.
return;
}
+
// tell clock provider if this instance is the master
deviceClockProviderService.setMastershipTerm(deviceId, term);
@@ -256,19 +261,32 @@
// but if I was the last STANDBY connection, etc. and no one else
// was there to mark the device offline, this instance may need to
// temporarily request for Master Role and mark offline.
+ log.info("## for {} role is {}", deviceId, mastershipService.getLocalRole(deviceId));
if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
log.debug("Device {} disconnected, but I am not the master", deviceId);
//let go of ability to be backup
mastershipService.relinquishMastership(deviceId);
return;
}
- DeviceEvent event = store.markOffline(deviceId);
- //relinquish master role and ability to be backup.
- mastershipService.relinquishMastership(deviceId);
- if (event != null) {
- log.info("Device {} disconnected", deviceId);
- post(event);
+ DeviceEvent event = null;
+ try {
+ event = store.markOffline(deviceId);
+ } catch (IllegalStateException e) {
+ //there are times when this node will correctly have mastership, BUT
+ //that isn't reflected in the ClockManager before the device disconnects.
+ //we want to let go of the device anyways, so make sure this happens.
+ MastershipTerm term = termService.getMastershipTerm(deviceId);
+ deviceClockProviderService.setMastershipTerm(deviceId, term);
+ event = store.markOffline(deviceId);
+ } finally {
+ //relinquish master role and ability to be backup.
+ mastershipService.relinquishMastership(deviceId);
+
+ if (event != null) {
+ log.info("Device {} disconnected", deviceId);
+ post(event);
+ }
}
}
@@ -279,7 +297,15 @@
checkNotNull(portDescriptions,
"Port descriptions list cannot be null");
checkValidity();
+ //XXX what's this doing here?
this.provider().id();
+
+ if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
+ // TODO If we become master, then we'll trigger something to update this
+ // info to fix any inconsistencies that may result during the handoff.
+ return;
+ }
+
List<DeviceEvent> events = store.updatePorts(this.provider().id(),
deviceId, portDescriptions);
for (DeviceEvent event : events) {
@@ -293,6 +319,12 @@
checkNotNull(deviceId, DEVICE_ID_NULL);
checkNotNull(portDescription, PORT_DESCRIPTION_NULL);
checkValidity();
+
+ if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
+ // TODO If we become master, then we'll trigger something to update this
+ // info to fix any inconsistencies that may result during the handoff.
+ return;
+ }
DeviceEvent event = store.updatePortStatus(this.provider().id(),
deviceId, portDescription);
if (event != null) {
@@ -328,27 +360,37 @@
final DeviceId did = event.subject();
final NodeId myNodeId = clusterService.getLocalNode().id();
+ log.info("## got Mastershipevent for dev {}", did);
if (myNodeId.equals(event.master())) {
MastershipTerm term = termService.getMastershipTerm(did);
- if (term.master().equals(myNodeId)) {
- // only set the new term if I am the master
- deviceClockProviderService.setMastershipTerm(did, term);
+ if (!myNodeId.equals(term.master())) {
+ // something went wrong in consistency, let go
+ mastershipService.relinquishMastership(did);
+ applyRole(did, MastershipRole.STANDBY);
+ return;
}
+ log.info("## setting term for CPS as new master for {}", did);
+ // only set the new term if I am the master
+ deviceClockProviderService.setMastershipTerm(did, term);
+
// FIXME: we should check that the device is connected on our end.
// currently, this is not straight forward as the actual switch
- // implementation is hidden from the registry.
- if (!isAvailable(did)) {
+ // implementation is hidden from the registry. Maybe we can ask the
+ // provider.
+ // if the device is null here, we are the first master to claim the
+ // device. No worries, the DeviceManager will create one soon.
+ Device device = getDevice(did);
+ if ((device != null) && !isAvailable(did)) {
//flag the device as online. Is there a better way to do this?
- Device device = getDevice(did);
store.createOrUpdateDevice(device.providerId(), did,
new DefaultDeviceDescription(
did.uri(), device.type(), device.manufacturer(),
device.hwVersion(), device.swVersion(),
device.serialNumber()));
}
-
+ //TODO re-collect device information to fix potential staleness
applyRole(did, MastershipRole.MASTER);
} else {
applyRole(did, MastershipRole.STANDBY);
diff --git a/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java b/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java
index 779e1ee..bab9955 100644
--- a/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java
+++ b/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java
@@ -16,6 +16,7 @@
import org.onlab.onos.net.ConnectPoint;
import org.onlab.onos.net.DeviceId;
import org.onlab.onos.net.Link;
+import org.onlab.onos.net.MastershipRole;
import org.onlab.onos.net.device.DeviceEvent;
import org.onlab.onos.net.device.DeviceListener;
import org.onlab.onos.net.device.DeviceService;
@@ -139,11 +140,17 @@
@Override
public void removeLinks(ConnectPoint connectPoint) {
+ if (deviceService.getRole(connectPoint.deviceId()) != MastershipRole.MASTER) {
+ return;
+ }
removeLinks(getLinks(connectPoint));
}
@Override
public void removeLinks(DeviceId deviceId) {
+ if (deviceService.getRole(deviceId) != MastershipRole.MASTER) {
+ return;
+ }
removeLinks(getDeviceLinks(deviceId));
}
@@ -189,6 +196,15 @@
public void linkDetected(LinkDescription linkDescription) {
checkNotNull(linkDescription, LINK_DESC_NULL);
checkValidity();
+
+ ConnectPoint src = linkDescription.src();
+ ConnectPoint dst = linkDescription.dst();
+ // if we aren't master for the device associated with the ConnectPoint
+ // we probably shouldn't be doing this.
+ if ((deviceService.getRole(src.deviceId()) != MastershipRole.MASTER) ||
+ (deviceService.getRole(dst.deviceId()) != MastershipRole.MASTER)) {
+ return;
+ }
LinkEvent event = store.createOrUpdateLink(provider().id(),
linkDescription);
if (event != null) {
@@ -201,6 +217,15 @@
public void linkVanished(LinkDescription linkDescription) {
checkNotNull(linkDescription, LINK_DESC_NULL);
checkValidity();
+
+ ConnectPoint src = linkDescription.src();
+ ConnectPoint dst = linkDescription.dst();
+ // if we aren't master for the device associated with the ConnectPoint
+ // we probably shouldn't be doing this.
+ if ((deviceService.getRole(src.deviceId()) != MastershipRole.MASTER) ||
+ (deviceService.getRole(dst.deviceId()) != MastershipRole.MASTER)) {
+ return;
+ }
LinkEvent event = store.removeLink(linkDescription.src(),
linkDescription.dst());
if (event != null) {
@@ -213,6 +238,11 @@
public void linksVanished(ConnectPoint connectPoint) {
checkNotNull(connectPoint, "Connect point cannot be null");
checkValidity();
+ // if we aren't master for the device associated with the ConnectPoint
+ // we probably shouldn't be doing this.
+ if (deviceService.getRole(connectPoint.deviceId()) != MastershipRole.MASTER) {
+ return;
+ }
log.info("Links for connection point {} vanished", connectPoint);
removeLinks(getLinks(connectPoint));
}
@@ -221,6 +251,11 @@
public void linksVanished(DeviceId deviceId) {
checkNotNull(deviceId, DEVICE_ID_NULL);
checkValidity();
+ // if we aren't master for the device associated with the ConnectPoint
+ // we probably shouldn't be doing this.
+ if (deviceService.getRole(deviceId) != MastershipRole.MASTER) {
+ return;
+ }
log.info("Links for device {} vanished", deviceId);
removeLinks(getDeviceLinks(deviceId));
}
diff --git a/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java b/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java
index 1d52ba3..9a16562 100644
--- a/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java
+++ b/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java
@@ -59,6 +59,7 @@
protected LinkProviderService providerService;
protected TestProvider provider;
protected TestListener listener = new TestListener();
+ protected DeviceManager devmgr = new TestDeviceManager();
@Before
public void setUp() {
@@ -68,7 +69,7 @@
registry = mgr;
mgr.store = new SimpleLinkStore();
mgr.eventDispatcher = new TestEventDispatcher();
- mgr.deviceService = new DeviceManager();
+ mgr.deviceService = devmgr;
mgr.activate();
service.addListener(listener);
@@ -259,4 +260,11 @@
}
}
+ private static class TestDeviceManager extends DeviceManager {
+ @Override
+ public MastershipRole getRole(DeviceId deviceId) {
+ return MastershipRole.MASTER;
+ }
+ }
+
}
diff --git a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/DeviceClockManager.java b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/DeviceClockManager.java
index 48355cf..73ba735 100644
--- a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/DeviceClockManager.java
+++ b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/DeviceClockManager.java
@@ -44,6 +44,8 @@
@Override
public Timestamp getTimestamp(DeviceId deviceId) {
MastershipTerm term = deviceMastershipTerms.get(deviceId);
+ log.info("term info for {} is: {}", deviceId, term);
+
if (term == null) {
throw new IllegalStateException("Requesting timestamp for a deviceId without mastership");
}
@@ -52,6 +54,7 @@
@Override
public void setMastershipTerm(DeviceId deviceId, MastershipTerm term) {
+ log.info("adding term info {} {}", deviceId, term.master());
deviceMastershipTerms.put(deviceId, term);
}
}
diff --git a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
index 2603da1..31a86a5 100644
--- a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
+++ b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
@@ -390,6 +390,7 @@
List<PortDescription> portDescriptions) {
final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId);
+ log.info("timestamp for {} {}", deviceId, newTimestamp);
final Timestamped<List<PortDescription>> timestampedInput
= new Timestamped<>(portDescriptions, newTimestamp);
diff --git a/core/store/dist/src/main/java/org/onlab/onos/store/link/impl/GossipLinkStore.java b/core/store/dist/src/main/java/org/onlab/onos/store/link/impl/GossipLinkStore.java
index 6e8a367..8cf78e5 100644
--- a/core/store/dist/src/main/java/org/onlab/onos/store/link/impl/GossipLinkStore.java
+++ b/core/store/dist/src/main/java/org/onlab/onos/store/link/impl/GossipLinkStore.java
@@ -360,7 +360,14 @@
final LinkKey key = linkKey(src, dst);
DeviceId dstDeviceId = dst.deviceId();
- Timestamp timestamp = deviceClockService.getTimestamp(dstDeviceId);
+ Timestamp timestamp = null;
+ try {
+ timestamp = deviceClockService.getTimestamp(dstDeviceId);
+ } catch (IllegalStateException e) {
+ //there are times when this is called before mastership
+ // handoff correctly completes.
+ return null;
+ }
LinkEvent event = removeLinkInternal(key, timestamp);
diff --git a/core/store/hz/cluster/src/main/java/org/onlab/onos/store/mastership/impl/DistributedMastershipStore.java b/core/store/hz/cluster/src/main/java/org/onlab/onos/store/mastership/impl/DistributedMastershipStore.java
index d0eae2d..b310b48 100644
--- a/core/store/hz/cluster/src/main/java/org/onlab/onos/store/mastership/impl/DistributedMastershipStore.java
+++ b/core/store/hz/cluster/src/main/java/org/onlab/onos/store/mastership/impl/DistributedMastershipStore.java
@@ -29,7 +29,10 @@
import org.onlab.util.KryoPool;
import com.google.common.collect.ImmutableSet;
+import com.hazelcast.core.EntryEvent;
+import com.hazelcast.core.EntryListener;
import com.hazelcast.core.IAtomicLong;
+import com.hazelcast.core.MapEvent;
import static org.onlab.onos.net.MastershipRole.*;
@@ -78,7 +81,7 @@
roleMap = new SMap(theInstance.getMap("nodeRoles"), this.serializer);
terms = new SMap(theInstance.getMap("terms"), this.serializer);
clusterSize = theInstance.getAtomicLong("clustersize");
- // roleMap.addEntryListener(new RemoteMasterShipEventHandler(), true);
+ roleMap.addEntryListener((new RemoteMasterShipEventHandler()), true);
log.info("Started");
}
@@ -207,6 +210,7 @@
rv.reassign(local, NONE, STANDBY);
roleMap.put(deviceId, rv);
terms.putIfAbsent(deviceId, INIT);
+
break;
case NONE:
//claim mastership
@@ -289,7 +293,8 @@
}
//helper to fetch a new master candidate for a given device.
- private MastershipEvent reelect(NodeId current, DeviceId deviceId, RoleValue rv) {
+ private MastershipEvent reelect(
+ NodeId current, DeviceId deviceId, RoleValue rv) {
//if this is an queue it'd be neater.
NodeId backup = null;
@@ -301,17 +306,18 @@
}
if (backup == null) {
+ log.info("{} giving up and going to NONE for {}", current, deviceId);
rv.remove(MASTER, current);
roleMap.put(deviceId, rv);
return null;
} else {
+ log.info("{} trying to pass mastership for {} to {}", current, deviceId, backup);
rv.replace(current, backup, MASTER);
rv.reassign(backup, STANDBY, NONE);
roleMap.put(deviceId, rv);
Integer term = terms.get(deviceId);
terms.put(deviceId, ++term);
- return new MastershipEvent(
- MASTER_CHANGED, deviceId, backup);
+ return new MastershipEvent(MASTER_CHANGED, deviceId, backup);
}
}
@@ -346,30 +352,51 @@
//adds or updates term information.
private void updateTerm(DeviceId deviceId) {
- Integer term = terms.get(deviceId);
- if (term == null) {
- terms.put(deviceId, INIT);
- } else {
- terms.put(deviceId, ++term);
+ terms.lock(deviceId);
+ try {
+ Integer term = terms.get(deviceId);
+ if (term == null) {
+ terms.put(deviceId, INIT);
+ } else {
+ terms.put(deviceId, ++term);
+ }
+ } finally {
+ terms.unlock(deviceId);
}
}
- private class RemoteMasterShipEventHandler extends RemoteEventHandler<DeviceId, NodeId> {
+ private class RemoteMasterShipEventHandler implements EntryListener<DeviceId, RoleValue> {
@Override
- protected void onAdd(DeviceId deviceId, NodeId nodeId) {
- notifyDelegate(new MastershipEvent(MASTER_CHANGED, deviceId, nodeId));
+ public void entryAdded(EntryEvent<DeviceId, RoleValue> event) {
}
@Override
- protected void onRemove(DeviceId deviceId, NodeId nodeId) {
- //notifyDelegate(new MastershipEvent(MASTER_CHANGED, deviceId, nodeId));
+ public void entryRemoved(EntryEvent<DeviceId, RoleValue> event) {
}
@Override
- protected void onUpdate(DeviceId deviceId, NodeId oldNodeId, NodeId nodeId) {
- //only addition indicates a change in mastership
- //notifyDelegate(new MastershipEvent(MASTER_CHANGED, deviceId, nodeId));
+ public void entryUpdated(EntryEvent<DeviceId, RoleValue> event) {
+ NodeId myId = clusterService.getLocalNode().id();
+ NodeId node = event.getValue().get(MASTER);
+ if (myId.equals(node)) {
+ // XXX or do we just let it get sent and caught by ourself?
+ return;
+ }
+ notifyDelegate(new MastershipEvent(
+ MASTER_CHANGED, event.getKey(), event.getValue().get(MASTER)));
+ }
+
+ @Override
+ public void entryEvicted(EntryEvent<DeviceId, RoleValue> event) {
+ }
+
+ @Override
+ public void mapEvicted(MapEvent event) {
+ }
+
+ @Override
+ public void mapCleared(MapEvent event) {
}
}
diff --git a/core/store/serializers/src/main/java/org/onlab/onos/store/serializers/KryoPoolUtil.java b/core/store/serializers/src/main/java/org/onlab/onos/store/serializers/KryoPoolUtil.java
index 38c4dfd..fe0d82a 100644
--- a/core/store/serializers/src/main/java/org/onlab/onos/store/serializers/KryoPoolUtil.java
+++ b/core/store/serializers/src/main/java/org/onlab/onos/store/serializers/KryoPoolUtil.java
@@ -94,7 +94,6 @@
.register(ConnectPoint.class, new ConnectPointSerializer())
.register(DefaultLink.class, new DefaultLinkSerializer())
.register(MastershipTerm.class, new MastershipTermSerializer())
- .register(MastershipRole.class, new MastershipRoleSerializer())
.register(HostLocation.class, new HostLocationSerializer())
.build();