Fix mastership re-election on single-node failure
Change-Id: Iedcab52bb156643464a97435fcc39c5db7393976
diff --git a/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java b/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
index 125745b..ba3e616 100644
--- a/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
+++ b/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
@@ -4,6 +4,7 @@
import static org.slf4j.LoggerFactory.getLogger;
import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
import org.apache.felix.scr.annotations.Activate;
import org.apache.felix.scr.annotations.Component;
@@ -14,6 +15,7 @@
import org.onlab.onos.cluster.ClusterEvent;
import org.onlab.onos.cluster.ClusterEventListener;
import org.onlab.onos.cluster.ClusterService;
+import org.onlab.onos.cluster.ControllerNode;
import org.onlab.onos.cluster.MastershipAdminService;
import org.onlab.onos.cluster.MastershipEvent;
import org.onlab.onos.cluster.MastershipListener;
@@ -164,21 +166,68 @@
//callback for reacting to cluster events
private class InternalClusterEventListener implements ClusterEventListener {
+ // Locally tracked count of cluster instances, updated on add/remove events and
+ // used by isInMajority() as the reference size. TODO: find a better way to do this.
+ private AtomicInteger clusterSize;
+
+ InternalClusterEventListener() {
+ clusterSize = new AtomicInteger(0);
+ }
+
@Override
public void event(ClusterEvent event) {
switch (event.type()) {
//FIXME: worry about addition when the time comes
case INSTANCE_ADDED:
case INSTANCE_ACTIVATED:
- break;
+ clusterSize.incrementAndGet();
+ log.info("instance {} added/activated", event.subject());
+ break;
case INSTANCE_REMOVED:
case INSTANCE_DEACTIVATED:
+ ControllerNode node = event.subject();
+
+ if (node.equals(clusterService.getLocalNode())) {
+ //If we are in smaller cluster, relinquish and return
+ for (DeviceId device : getDevicesOf(node.id())) {
+ if (!isInMajority()) {
+ //own DeviceManager should catch event and tell switch
+ store.relinquishRole(node.id(), device);
+ }
+ }
+ log.info("broke off from cluster, relinquished devices");
+ break;
+ }
+
+ // If we are in the larger partition and the removed node(s) are unresponsive,
+ // force-relinquish their devices on behalf of the disabled node.
+ // TODO: should we check the network channel before doing this?
+ for (DeviceId device : getDevicesOf(node.id())) {
+ //some things to check:
+ // 1. we didn't break off as well while we're at it
+ // 2. others don't pile in and try too - maybe a lock
+ if (isInMajority()) {
+ store.relinquishRole(node.id(), device);
+ }
+ }
+ clusterSize.decrementAndGet();
+ log.info("instance {} removed/deactivated", event.subject());
break;
default:
log.warn("unknown cluster event {}", event);
}
}
+ private boolean isInMajority() {
+ if (clusterService.getNodes().size() > (clusterSize.intValue() / 2)) {
+ return true;
+ }
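+ // FIXME: equal-sized partitions are not tie-broken; holding exactly half
+ // of the tracked cluster size is treated as "not in majority" (returns false
+ // below). Can we use hz's functions to break the tie?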
+ return false;
+ }
+
}
public class InternalDelegate implements MastershipStoreDelegate {
diff --git a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
index 8cde5a3..36caafb 100644
--- a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
+++ b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
@@ -26,6 +26,7 @@
import org.onlab.onos.net.MastershipRole;
import org.onlab.onos.net.Port;
import org.onlab.onos.net.PortNumber;
+import org.onlab.onos.net.device.DefaultDeviceDescription;
import org.onlab.onos.net.device.DeviceAdminService;
import org.onlab.onos.net.device.DeviceDescription;
import org.onlab.onos.net.device.DeviceEvent;
@@ -257,12 +258,12 @@
// temporarily request for Master Role and mark offline.
if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
log.debug("Device {} disconnected, but I am not the master", deviceId);
- //let go of any role anyways
+ //let go of ability to be backup
mastershipService.relinquishMastership(deviceId);
return;
}
DeviceEvent event = store.markOffline(deviceId);
- //we're no longer capable of being master or a candidate.
+ //relinquish master role and ability to be backup.
mastershipService.relinquishMastership(deviceId);
if (event != null) {
@@ -325,23 +326,31 @@
@Override
public void event(MastershipEvent event) {
final DeviceId did = event.subject();
- if (isAvailable(did)) {
- final NodeId myNodeId = clusterService.getLocalNode().id();
+ final NodeId myNodeId = clusterService.getLocalNode().id();
- if (myNodeId.equals(event.master())) {
- MastershipTerm term = termService.getMastershipTerm(did);
+ if (myNodeId.equals(event.master())) {
+ MastershipTerm term = termService.getMastershipTerm(did);
- if (term.master().equals(myNodeId)) {
- // only set the new term if I am the master
- clockProviderService.setMastershipTerm(did, term);
- }
- applyRole(did, MastershipRole.MASTER);
- } else {
- applyRole(did, MastershipRole.STANDBY);
+ if (term.master().equals(myNodeId)) {
+ // only set the new term if I am the master
+ clockProviderService.setMastershipTerm(did, term);
}
+
+ // FIXME: we should check that the device is connected on our end.
+ // currently, this is not straightforward as the actual switch
+ // implementation is hidden from the registry.
+ if (!isAvailable(did)) {
+ //flag the device as online. Is there a better way to do this?
+ Device device = getDevice(did);
+ store.createOrUpdateDevice(device.providerId(), did,
+ new DefaultDeviceDescription(
+ did.uri(), device.type(), device.manufacturer(),
+ device.hwVersion(), device.swVersion(),
+ device.serialNumber()));
+ }
+
+ applyRole(did, MastershipRole.MASTER);
} else {
- //device dead to node, give up
- mastershipService.relinquishMastership(did);
applyRole(did, MastershipRole.STANDBY);
}
}
diff --git a/core/net/src/test/java/org/onlab/onos/cluster/impl/MastershipManagerTest.java b/core/net/src/test/java/org/onlab/onos/cluster/impl/MastershipManagerTest.java
index 29b4ddf..e6cf542 100644
--- a/core/net/src/test/java/org/onlab/onos/cluster/impl/MastershipManagerTest.java
+++ b/core/net/src/test/java/org/onlab/onos/cluster/impl/MastershipManagerTest.java
@@ -18,6 +18,8 @@
import org.onlab.onos.store.trivial.impl.SimpleMastershipStore;
import org.onlab.packet.IpPrefix;
+import com.google.common.collect.Sets;
+
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.onlab.onos.net.MastershipRole.*;
@@ -143,7 +145,7 @@
@Override
public Set<ControllerNode> getNodes() {
- return null;
+ return Sets.newHashSet();
}
@Override