Fix mastership re-election for single-node failure

Change-Id: Iedcab52bb156643464a97435fcc39c5db7393976
diff --git a/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java b/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
index 125745b..ba3e616 100644
--- a/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
+++ b/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
@@ -4,6 +4,7 @@
 import static org.slf4j.LoggerFactory.getLogger;
 
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.felix.scr.annotations.Activate;
 import org.apache.felix.scr.annotations.Component;
@@ -14,6 +15,7 @@
 import org.onlab.onos.cluster.ClusterEvent;
 import org.onlab.onos.cluster.ClusterEventListener;
 import org.onlab.onos.cluster.ClusterService;
+import org.onlab.onos.cluster.ControllerNode;
 import org.onlab.onos.cluster.MastershipAdminService;
 import org.onlab.onos.cluster.MastershipEvent;
 import org.onlab.onos.cluster.MastershipListener;
@@ -164,21 +166,68 @@
     //callback for reacting to cluster events
     private class InternalClusterEventListener implements ClusterEventListener {
 
+        // Locally tracked cluster size; isInMajority() compares the live node
+        // count against half of it. Think of a better way to do this.
+        private AtomicInteger clusterSize;
+
+        InternalClusterEventListener() {
+            clusterSize = new AtomicInteger(0);
+        }
+
         @Override
         public void event(ClusterEvent event) {
             switch (event.type()) {
                 //FIXME: worry about addition when the time comes
                 case INSTANCE_ADDED:
                 case INSTANCE_ACTIVATED:
-                     break;
+                    clusterSize.incrementAndGet();
+                    log.info("instance {} added/activated", event.subject());
+                    break;
                 case INSTANCE_REMOVED:
                 case INSTANCE_DEACTIVATED:
+                    ControllerNode node = event.subject();
+
+                    if (node.equals(clusterService.getLocalNode())) {
+                        //If we are in the smaller cluster, relinquish our devices and stop
+                        for (DeviceId device : getDevicesOf(node.id())) {
+                            if (!isInMajority()) {
+                                //own DeviceManager should catch event and tell switch
+                                store.relinquishRole(node.id(), device);
+                            }
+                        }
+                        log.info("broke off from cluster, relinquished devices");
+                        break;
+                    }
+
+                    // If we are in the larger cluster and the removed node(s) are dead,
+                    // force relinquish on behalf of the disabled node.
+                    // TODO: check the network channel to do this?
+                    for (DeviceId device : getDevicesOf(node.id())) {
+                        //some things to check:
+                        // 1. we didn't break off as well while we're at it
+                        // 2. others don't pile in and try too - maybe a lock
+                        if (isInMajority()) {
+                            store.relinquishRole(node.id(), device);
+                        }
+                    }
+                    clusterSize.decrementAndGet();
+                    log.info("instance {} removed/deactivated", event.subject());
                     break;
                 default:
                     log.warn("unknown cluster event {}", event);
             }
         }
 
+        private boolean isInMajority() {
+            if (clusterService.getNodes().size() > (clusterSize.intValue() / 2)) {
+                return true;
+            }
+            //else {
+                //FIXME: break tie for equal-sized clusters, can we use hz's functions?
+            // }
+            return false;
+        }
+
     }
 
     public class InternalDelegate implements MastershipStoreDelegate {
diff --git a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
index 8cde5a3..36caafb 100644
--- a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
+++ b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
@@ -26,6 +26,7 @@
 import org.onlab.onos.net.MastershipRole;
 import org.onlab.onos.net.Port;
 import org.onlab.onos.net.PortNumber;
+import org.onlab.onos.net.device.DefaultDeviceDescription;
 import org.onlab.onos.net.device.DeviceAdminService;
 import org.onlab.onos.net.device.DeviceDescription;
 import org.onlab.onos.net.device.DeviceEvent;
@@ -257,12 +258,12 @@
             // temporarily request for Master Role and mark offline.
             if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
                 log.debug("Device {} disconnected, but I am not the master", deviceId);
-                //let go of any role anyways
+                //let go of ability to be backup
                 mastershipService.relinquishMastership(deviceId);
                 return;
             }
             DeviceEvent event = store.markOffline(deviceId);
-            //we're no longer capable of being master or a candidate.
+            //relinquish master role and ability to be backup.
             mastershipService.relinquishMastership(deviceId);
 
             if (event != null) {
@@ -325,23 +326,31 @@
         @Override
         public void event(MastershipEvent event) {
             final DeviceId did = event.subject();
-            if (isAvailable(did)) {
-                final NodeId myNodeId = clusterService.getLocalNode().id();
+            final NodeId myNodeId = clusterService.getLocalNode().id();
 
-                if (myNodeId.equals(event.master())) {
-                    MastershipTerm term = termService.getMastershipTerm(did);
+            if (myNodeId.equals(event.master())) {
+                MastershipTerm term = termService.getMastershipTerm(did);
 
-                    if (term.master().equals(myNodeId)) {
-                        // only set the new term if I am the master
-                        clockProviderService.setMastershipTerm(did, term);
-                    }
-                    applyRole(did, MastershipRole.MASTER);
-                } else {
-                    applyRole(did, MastershipRole.STANDBY);
+                if (term.master().equals(myNodeId)) {
+                    // only set the new term if I am the master
+                    clockProviderService.setMastershipTerm(did, term);
                 }
+
+                // FIXME: we should check that the device is connected on our end.
+                // Currently, this is not straightforward, as the actual switch
+                // implementation is hidden from the registry.
+                if (!isAvailable(did)) {
+                    //flag the device as online. Is there a better way to do this?
+                    Device device = getDevice(did);
+                    store.createOrUpdateDevice(device.providerId(), did,
+                            new DefaultDeviceDescription(
+                                    did.uri(), device.type(), device.manufacturer(),
+                                    device.hwVersion(), device.swVersion(),
+                                    device.serialNumber()));
+                }
+
+                applyRole(did, MastershipRole.MASTER);
             } else {
-                //device dead to node, give up
-                mastershipService.relinquishMastership(did);
                 applyRole(did, MastershipRole.STANDBY);
             }
         }
diff --git a/core/net/src/test/java/org/onlab/onos/cluster/impl/MastershipManagerTest.java b/core/net/src/test/java/org/onlab/onos/cluster/impl/MastershipManagerTest.java
index 29b4ddf..e6cf542 100644
--- a/core/net/src/test/java/org/onlab/onos/cluster/impl/MastershipManagerTest.java
+++ b/core/net/src/test/java/org/onlab/onos/cluster/impl/MastershipManagerTest.java
@@ -18,6 +18,8 @@
 import org.onlab.onos.store.trivial.impl.SimpleMastershipStore;
 import org.onlab.packet.IpPrefix;
 
+import com.google.common.collect.Sets;
+
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
 import static org.onlab.onos.net.MastershipRole.*;
@@ -143,7 +145,7 @@
 
         @Override
         public Set<ControllerNode> getNodes() {
-            return null;
+            return Sets.newHashSet();
         }
 
         @Override