fixes for mastership handoff race conditions

Change-Id: Ifed733df1bdc3b144b6a341a9322838ea2aacd72
diff --git a/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java b/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
index 4bcaff4a..7cf14fc 100644
--- a/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
+++ b/core/net/src/main/java/org/onlab/onos/cluster/impl/MastershipManager.java
@@ -104,7 +104,6 @@
         MastershipEvent event = null;
         event = store.relinquishRole(
                 clusterService.getLocalNode().id(), deviceId);
-
         if (event != null) {
             post(event);
         }
@@ -229,7 +228,8 @@
                 return true;
             }
             //else {
-                //FIXME: break tie for equal-sized clusters, can we use hz's functions?
+                //FIXME: break tie for equal-sized clusters,
+                //       maybe by number of connected switches
             // }
             return false;
         }
diff --git a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
index 28bdac1..b009477 100644
--- a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
+++ b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
@@ -161,6 +161,9 @@
     @Override
     public void removeDevice(DeviceId deviceId) {
         checkNotNull(deviceId, DEVICE_ID_NULL);
+        // XXX is this intended to apply to the full global topology?
+        // if so, we probably don't want the fact that we aren't
+        // MASTER to get in the way, as it would do now.
         DeviceEvent event = store.removeDevice(deviceId);
         if (event != null) {
             log.info("Device {} administratively removed", deviceId);
@@ -203,19 +206,21 @@
             log.info("Device {} connected", deviceId);
             // check my Role
             MastershipRole role = mastershipService.requestRoleFor(deviceId);
-
+            log.info("## - our role for {} is {} [master is {}]", deviceId, role,
+                    mastershipService.getMasterFor(deviceId));
             if (role != MastershipRole.MASTER) {
                 // TODO: Do we need to explicitly tell the Provider that
                 // this instance is no longer the MASTER? probably not
                 return;
             }
-
             MastershipTerm term = mastershipService.requestTermService()
                     .getMastershipTerm(deviceId);
+
             if (!term.master().equals(clusterService.getLocalNode().id())) {
                 // lost mastership after requestRole told this instance was MASTER.
                 return;
             }
+
             // tell clock provider if this instance is the master
             deviceClockProviderService.setMastershipTerm(deviceId, term);
 
@@ -256,19 +261,32 @@
             // but if I was the last STANDBY connection, etc. and no one else
             // was there to mark the device offline, this instance may need to
             // temporarily request for Master Role and mark offline.
+            log.info("## for {} role is {}", deviceId, mastershipService.getLocalRole(deviceId));
             if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
                 log.debug("Device {} disconnected, but I am not the master", deviceId);
                 //let go of ability to be backup
                 mastershipService.relinquishMastership(deviceId);
                 return;
             }
-            DeviceEvent event = store.markOffline(deviceId);
-            //relinquish master role and ability to be backup.
-            mastershipService.relinquishMastership(deviceId);
 
-            if (event != null) {
-                log.info("Device {} disconnected", deviceId);
-                post(event);
+            DeviceEvent event = null;
+            try {
+                event = store.markOffline(deviceId);
+            } catch (IllegalStateException e) {
+                //there are times when this node will correctly have mastership, BUT
+                //that isn't reflected in the ClockManager before the device disconnects.
+                //we want to let go of the device anyways, so make sure this happens.
+                MastershipTerm term = termService.getMastershipTerm(deviceId);
+                deviceClockProviderService.setMastershipTerm(deviceId, term);
+                event = store.markOffline(deviceId);
+            } finally {
+                //relinquish master role and ability to be backup.
+                mastershipService.relinquishMastership(deviceId);
+
+                if (event != null) {
+                    log.info("Device {} disconnected", deviceId);
+                    post(event);
+                }
             }
         }
 
@@ -279,7 +297,15 @@
             checkNotNull(portDescriptions,
                     "Port descriptions list cannot be null");
             checkValidity();
+            //XXX what's this doing here?
             this.provider().id();
+
+            if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
+                // TODO If we become master, then we'll trigger something to update this
+                //      info to fix any inconsistencies that may result during the handoff.
+                return;
+            }
+
             List<DeviceEvent> events = store.updatePorts(this.provider().id(),
                     deviceId, portDescriptions);
             for (DeviceEvent event : events) {
@@ -293,6 +319,12 @@
             checkNotNull(deviceId, DEVICE_ID_NULL);
             checkNotNull(portDescription, PORT_DESCRIPTION_NULL);
             checkValidity();
+
+            if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
+                // TODO If we become master, then we'll trigger something to update this
+                //      info to fix any inconsistencies that may result during the handoff.
+                return;
+            }
             DeviceEvent event = store.updatePortStatus(this.provider().id(),
                     deviceId, portDescription);
             if (event != null) {
@@ -328,27 +360,37 @@
             final DeviceId did = event.subject();
             final NodeId myNodeId = clusterService.getLocalNode().id();
 
+            log.info("## got Mastershipevent for dev {}", did);
             if (myNodeId.equals(event.master())) {
                 MastershipTerm term = termService.getMastershipTerm(did);
 
-                if (term.master().equals(myNodeId)) {
-                    // only set the new term if I am the master
-                    deviceClockProviderService.setMastershipTerm(did, term);
+                if (!myNodeId.equals(term.master())) {
+                    // something went wrong in consistency, let go
+                    mastershipService.relinquishMastership(did);
+                    applyRole(did, MastershipRole.STANDBY);
+                    return;
                 }
 
+                log.info("## setting term for CPS as new master for {}", did);
+                // only set the new term if I am the master
+                deviceClockProviderService.setMastershipTerm(did, term);
+
                 // FIXME: we should check that the device is connected on our end.
                 // currently, this is not straight forward as the actual switch
-                // implementation is hidden from the registry.
-                if (!isAvailable(did)) {
+                // implementation is hidden from the registry. Maybe we can ask the
+                // provider.
+                // if the device is null here, we are the first master to claim the
+                // device. No worries, the DeviceManager will create one soon.
+                Device device = getDevice(did);
+                if ((device != null) && !isAvailable(did)) {
                     //flag the device as online. Is there a better way to do this?
-                    Device device = getDevice(did);
                     store.createOrUpdateDevice(device.providerId(), did,
                             new DefaultDeviceDescription(
                                     did.uri(), device.type(), device.manufacturer(),
                                     device.hwVersion(), device.swVersion(),
                                     device.serialNumber()));
                 }
-
+                //TODO re-collect device information to fix potential staleness
                 applyRole(did, MastershipRole.MASTER);
             } else {
                 applyRole(did, MastershipRole.STANDBY);
diff --git a/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java b/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java
index 779e1ee..bab9955 100644
--- a/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java
+++ b/core/net/src/main/java/org/onlab/onos/net/link/impl/LinkManager.java
@@ -16,6 +16,7 @@
 import org.onlab.onos.net.ConnectPoint;
 import org.onlab.onos.net.DeviceId;
 import org.onlab.onos.net.Link;
+import org.onlab.onos.net.MastershipRole;
 import org.onlab.onos.net.device.DeviceEvent;
 import org.onlab.onos.net.device.DeviceListener;
 import org.onlab.onos.net.device.DeviceService;
@@ -139,11 +140,17 @@
 
     @Override
     public void removeLinks(ConnectPoint connectPoint) {
+        if (deviceService.getRole(connectPoint.deviceId()) != MastershipRole.MASTER) {
+            return;
+        }
         removeLinks(getLinks(connectPoint));
     }
 
     @Override
     public void removeLinks(DeviceId deviceId) {
+        if (deviceService.getRole(deviceId) != MastershipRole.MASTER) {
+            return;
+        }
         removeLinks(getDeviceLinks(deviceId));
     }
 
@@ -189,6 +196,15 @@
         public void linkDetected(LinkDescription linkDescription) {
             checkNotNull(linkDescription, LINK_DESC_NULL);
             checkValidity();
+
+            ConnectPoint src = linkDescription.src();
+            ConnectPoint dst = linkDescription.dst();
+            // if we aren't master for the device associated with the ConnectPoint
+            // we probably shouldn't be doing this.
+            if ((deviceService.getRole(src.deviceId()) != MastershipRole.MASTER) ||
+                    (deviceService.getRole(dst.deviceId()) != MastershipRole.MASTER)) {
+                return;
+            }
             LinkEvent event = store.createOrUpdateLink(provider().id(),
                                                        linkDescription);
             if (event != null) {
@@ -201,6 +217,15 @@
         public void linkVanished(LinkDescription linkDescription) {
             checkNotNull(linkDescription, LINK_DESC_NULL);
             checkValidity();
+
+            ConnectPoint src = linkDescription.src();
+            ConnectPoint dst = linkDescription.dst();
+            // if we aren't master for the device associated with the ConnectPoint
+            // we probably shouldn't be doing this.
+            if ((deviceService.getRole(src.deviceId()) != MastershipRole.MASTER) ||
+                    (deviceService.getRole(dst.deviceId()) != MastershipRole.MASTER)) {
+                return;
+            }
             LinkEvent event = store.removeLink(linkDescription.src(),
                                                linkDescription.dst());
             if (event != null) {
@@ -213,6 +238,11 @@
         public void linksVanished(ConnectPoint connectPoint) {
             checkNotNull(connectPoint, "Connect point cannot be null");
             checkValidity();
+            // if we aren't master for the device associated with the ConnectPoint
+            // we probably shouldn't be doing this.
+            if (deviceService.getRole(connectPoint.deviceId()) != MastershipRole.MASTER) {
+                return;
+            }
             log.info("Links for connection point {} vanished", connectPoint);
             removeLinks(getLinks(connectPoint));
         }
@@ -221,6 +251,11 @@
         public void linksVanished(DeviceId deviceId) {
             checkNotNull(deviceId, DEVICE_ID_NULL);
             checkValidity();
+            // if we aren't master for the device associated with the ConnectPoint
+            // we probably shouldn't be doing this.
+            if (deviceService.getRole(deviceId) != MastershipRole.MASTER) {
+                return;
+            }
             log.info("Links for device {} vanished", deviceId);
             removeLinks(getDeviceLinks(deviceId));
         }
diff --git a/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java b/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java
index 1d52ba3..9a16562 100644
--- a/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java
+++ b/core/net/src/test/java/org/onlab/onos/net/link/impl/LinkManagerTest.java
@@ -59,6 +59,7 @@
     protected LinkProviderService providerService;
     protected TestProvider provider;
     protected TestListener listener = new TestListener();
+    protected DeviceManager devmgr = new TestDeviceManager();
 
     @Before
     public void setUp() {
@@ -68,7 +69,7 @@
         registry = mgr;
         mgr.store = new SimpleLinkStore();
         mgr.eventDispatcher = new TestEventDispatcher();
-        mgr.deviceService = new DeviceManager();
+        mgr.deviceService = devmgr;
         mgr.activate();
 
         service.addListener(listener);
@@ -259,4 +260,11 @@
         }
     }
 
+    private static class TestDeviceManager extends DeviceManager {
+        @Override
+        public MastershipRole getRole(DeviceId deviceId) {
+            return MastershipRole.MASTER;
+        }
+    }
+
 }
diff --git a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/DeviceClockManager.java b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/DeviceClockManager.java
index 48355cf..73ba735 100644
--- a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/DeviceClockManager.java
+++ b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/DeviceClockManager.java
@@ -44,6 +44,8 @@
     @Override
     public Timestamp getTimestamp(DeviceId deviceId) {
         MastershipTerm term = deviceMastershipTerms.get(deviceId);
+        log.info("term info for {} is: {}", deviceId, term);
+
         if (term == null) {
             throw new IllegalStateException("Requesting timestamp for a deviceId without mastership");
         }
@@ -52,6 +54,7 @@
 
     @Override
     public void setMastershipTerm(DeviceId deviceId, MastershipTerm term) {
+        log.info("adding term info {} {}", deviceId, term.master());
         deviceMastershipTerms.put(deviceId, term);
     }
 }
diff --git a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
index 2603da1..31a86a5 100644
--- a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
+++ b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
@@ -390,6 +390,7 @@
                                        List<PortDescription> portDescriptions) {
 
         final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId);
+        log.info("timestamp for {} {}", deviceId, newTimestamp);
 
         final Timestamped<List<PortDescription>> timestampedInput
                 = new Timestamped<>(portDescriptions, newTimestamp);
diff --git a/core/store/dist/src/main/java/org/onlab/onos/store/link/impl/GossipLinkStore.java b/core/store/dist/src/main/java/org/onlab/onos/store/link/impl/GossipLinkStore.java
index 6e8a367..8cf78e5 100644
--- a/core/store/dist/src/main/java/org/onlab/onos/store/link/impl/GossipLinkStore.java
+++ b/core/store/dist/src/main/java/org/onlab/onos/store/link/impl/GossipLinkStore.java
@@ -360,7 +360,14 @@
         final LinkKey key = linkKey(src, dst);
 
         DeviceId dstDeviceId = dst.deviceId();
-        Timestamp timestamp = deviceClockService.getTimestamp(dstDeviceId);
+        Timestamp timestamp = null;
+        try {
+            timestamp = deviceClockService.getTimestamp(dstDeviceId);
+        } catch (IllegalStateException e) {
+            //there are times when this is called before mastership
+            // handoff correctly completes.
+            return null;
+        }
 
         LinkEvent event = removeLinkInternal(key, timestamp);
 
diff --git a/core/store/hz/cluster/src/main/java/org/onlab/onos/store/mastership/impl/DistributedMastershipStore.java b/core/store/hz/cluster/src/main/java/org/onlab/onos/store/mastership/impl/DistributedMastershipStore.java
index d0eae2d..b310b48 100644
--- a/core/store/hz/cluster/src/main/java/org/onlab/onos/store/mastership/impl/DistributedMastershipStore.java
+++ b/core/store/hz/cluster/src/main/java/org/onlab/onos/store/mastership/impl/DistributedMastershipStore.java
@@ -29,7 +29,10 @@
 import org.onlab.util.KryoPool;
 
 import com.google.common.collect.ImmutableSet;
+import com.hazelcast.core.EntryEvent;
+import com.hazelcast.core.EntryListener;
 import com.hazelcast.core.IAtomicLong;
+import com.hazelcast.core.MapEvent;
 
 import static org.onlab.onos.net.MastershipRole.*;
 
@@ -78,7 +81,7 @@
         roleMap = new SMap(theInstance.getMap("nodeRoles"), this.serializer);
         terms = new SMap(theInstance.getMap("terms"), this.serializer);
         clusterSize = theInstance.getAtomicLong("clustersize");
-       // roleMap.addEntryListener(new RemoteMasterShipEventHandler(), true);
+        roleMap.addEntryListener((new RemoteMasterShipEventHandler()), true);
 
         log.info("Started");
     }
@@ -207,6 +210,7 @@
                     rv.reassign(local, NONE, STANDBY);
                     roleMap.put(deviceId, rv);
                     terms.putIfAbsent(deviceId, INIT);
+
                     break;
                 case NONE:
                     //claim mastership
@@ -289,7 +293,8 @@
     }
 
     //helper to fetch a new master candidate for a given device.
-    private MastershipEvent reelect(NodeId current, DeviceId deviceId, RoleValue rv) {
+    private MastershipEvent reelect(
+            NodeId current, DeviceId deviceId, RoleValue rv) {
 
         //if this is an queue it'd be neater.
         NodeId backup = null;
@@ -301,17 +306,18 @@
         }
 
         if (backup == null) {
+            log.info("{} giving up and going to NONE for {}", current, deviceId);
             rv.remove(MASTER, current);
             roleMap.put(deviceId, rv);
             return null;
         } else {
+            log.info("{} trying to pass mastership for {} to {}", current, deviceId, backup);
             rv.replace(current, backup, MASTER);
             rv.reassign(backup, STANDBY, NONE);
             roleMap.put(deviceId, rv);
             Integer term = terms.get(deviceId);
             terms.put(deviceId, ++term);
-            return new MastershipEvent(
-                    MASTER_CHANGED, deviceId, backup);
+            return new MastershipEvent(MASTER_CHANGED, deviceId, backup);
         }
     }
 
@@ -346,30 +352,51 @@
 
     //adds or updates term information.
     private void updateTerm(DeviceId deviceId) {
-        Integer term = terms.get(deviceId);
-        if (term == null) {
-            terms.put(deviceId, INIT);
-        } else {
-            terms.put(deviceId, ++term);
+        terms.lock(deviceId);
+        try {
+            Integer term = terms.get(deviceId);
+            if (term == null) {
+                terms.put(deviceId, INIT);
+            } else {
+                terms.put(deviceId, ++term);
+            }
+        } finally {
+            terms.unlock(deviceId);
         }
     }
 
-    private class RemoteMasterShipEventHandler extends RemoteEventHandler<DeviceId, NodeId> {
+    private class RemoteMasterShipEventHandler implements EntryListener<DeviceId, RoleValue> {
 
         @Override
-        protected void onAdd(DeviceId deviceId, NodeId nodeId) {
-            notifyDelegate(new MastershipEvent(MASTER_CHANGED, deviceId, nodeId));
+        public void entryAdded(EntryEvent<DeviceId, RoleValue> event) {
         }
 
         @Override
-        protected void onRemove(DeviceId deviceId, NodeId nodeId) {
-            //notifyDelegate(new MastershipEvent(MASTER_CHANGED, deviceId, nodeId));
+        public void entryRemoved(EntryEvent<DeviceId, RoleValue> event) {
         }
 
         @Override
-        protected void onUpdate(DeviceId deviceId, NodeId oldNodeId, NodeId nodeId) {
-            //only addition indicates a change in mastership
-            //notifyDelegate(new MastershipEvent(MASTER_CHANGED, deviceId, nodeId));
+        public void entryUpdated(EntryEvent<DeviceId, RoleValue> event) {
+            NodeId myId = clusterService.getLocalNode().id();
+            NodeId node = event.getValue().get(MASTER);
+            if (myId.equals(node)) {
+                // XXX or do we just let it get sent and caught by ourself?
+                return;
+            }
+            notifyDelegate(new MastershipEvent(
+                    MASTER_CHANGED, event.getKey(), event.getValue().get(MASTER)));
+        }
+
+        @Override
+        public void entryEvicted(EntryEvent<DeviceId, RoleValue> event) {
+        }
+
+        @Override
+        public void mapEvicted(MapEvent event) {
+        }
+
+        @Override
+        public void mapCleared(MapEvent event) {
         }
     }
 
diff --git a/core/store/serializers/src/main/java/org/onlab/onos/store/serializers/KryoPoolUtil.java b/core/store/serializers/src/main/java/org/onlab/onos/store/serializers/KryoPoolUtil.java
index 38c4dfd..fe0d82a 100644
--- a/core/store/serializers/src/main/java/org/onlab/onos/store/serializers/KryoPoolUtil.java
+++ b/core/store/serializers/src/main/java/org/onlab/onos/store/serializers/KryoPoolUtil.java
@@ -94,7 +94,6 @@
             .register(ConnectPoint.class, new ConnectPointSerializer())
             .register(DefaultLink.class, new DefaultLinkSerializer())
             .register(MastershipTerm.class, new MastershipTermSerializer())
-            .register(MastershipRole.class, new MastershipRoleSerializer())
             .register(HostLocation.class, new HostLocationSerializer())
 
             .build();