fixes for mastership handoff race conditions

Change-Id: Ifed733df1bdc3b144b6a341a9322838ea2aacd72
diff --git a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
index 28bdac1..b009477 100644
--- a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
+++ b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
@@ -161,6 +161,9 @@
     @Override
     public void removeDevice(DeviceId deviceId) {
         checkNotNull(deviceId, DEVICE_ID_NULL);
+        // XXX is this intended to apply to the full global topology?
+        // if so, we probably don't want the fact that we aren't
+        // MASTER to get in the way, as it would do now.
         DeviceEvent event = store.removeDevice(deviceId);
         if (event != null) {
             log.info("Device {} administratively removed", deviceId);
@@ -203,19 +206,21 @@
             log.info("Device {} connected", deviceId);
             // check my Role
             MastershipRole role = mastershipService.requestRoleFor(deviceId);
-
+            log.info("## - our role for {} is {} [master is {}]", deviceId, role,
+                    mastershipService.getMasterFor(deviceId));
             if (role != MastershipRole.MASTER) {
                 // TODO: Do we need to explicitly tell the Provider that
                 // this instance is no longer the MASTER? probably not
                 return;
             }
-
             MastershipTerm term = mastershipService.requestTermService()
                     .getMastershipTerm(deviceId);
+
             if (!term.master().equals(clusterService.getLocalNode().id())) {
                 // lost mastership after requestRole told this instance was MASTER.
                 return;
             }
+
             // tell clock provider if this instance is the master
             deviceClockProviderService.setMastershipTerm(deviceId, term);
 
@@ -256,19 +261,32 @@
             // but if I was the last STANDBY connection, etc. and no one else
             // was there to mark the device offline, this instance may need to
             // temporarily request for Master Role and mark offline.
+            log.info("## for {} role is {}", deviceId, mastershipService.getLocalRole(deviceId));
             if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
                 log.debug("Device {} disconnected, but I am not the master", deviceId);
                 //let go of ability to be backup
                 mastershipService.relinquishMastership(deviceId);
                 return;
             }
-            DeviceEvent event = store.markOffline(deviceId);
-            //relinquish master role and ability to be backup.
-            mastershipService.relinquishMastership(deviceId);
 
-            if (event != null) {
-                log.info("Device {} disconnected", deviceId);
-                post(event);
+            DeviceEvent event = null;
+            try {
+                event = store.markOffline(deviceId);
+            } catch (IllegalStateException e) {
+                //there are times when this node will correctly have mastership, BUT
+                //that isn't reflected in the ClockManager before the device disconnects.
+                //we want to let go of the device anyways, so make sure this happens.
+                MastershipTerm term = termService.getMastershipTerm(deviceId);
+                deviceClockProviderService.setMastershipTerm(deviceId, term);
+                event = store.markOffline(deviceId);
+            } finally {
+                //relinquish master role and ability to be backup.
+                mastershipService.relinquishMastership(deviceId);
+
+                if (event != null) {
+                    log.info("Device {} disconnected", deviceId);
+                    post(event);
+                }
             }
         }
 
@@ -279,7 +297,15 @@
             checkNotNull(portDescriptions,
                     "Port descriptions list cannot be null");
             checkValidity();
+            //XXX what's this doing here?
             this.provider().id();
+
+            if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
+                // TODO If we become master, then we'll trigger something to update this
+                //      info to fix any inconsistencies that may result during the handoff.
+                return;
+            }
+
             List<DeviceEvent> events = store.updatePorts(this.provider().id(),
                     deviceId, portDescriptions);
             for (DeviceEvent event : events) {
@@ -293,6 +319,12 @@
             checkNotNull(deviceId, DEVICE_ID_NULL);
             checkNotNull(portDescription, PORT_DESCRIPTION_NULL);
             checkValidity();
+
+            if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
+                // TODO If we become master, then we'll trigger something to update this
+                //      info to fix any inconsistencies that may result during the handoff.
+                return;
+            }
             DeviceEvent event = store.updatePortStatus(this.provider().id(),
                     deviceId, portDescription);
             if (event != null) {
@@ -328,27 +360,37 @@
             final DeviceId did = event.subject();
             final NodeId myNodeId = clusterService.getLocalNode().id();
 
+            log.info("## got Mastershipevent for dev {}", did);
             if (myNodeId.equals(event.master())) {
                 MastershipTerm term = termService.getMastershipTerm(did);
 
-                if (term.master().equals(myNodeId)) {
-                    // only set the new term if I am the master
-                    deviceClockProviderService.setMastershipTerm(did, term);
+                if (!myNodeId.equals(term.master())) {
+                    // something went wrong in consistency, let go
+                    mastershipService.relinquishMastership(did);
+                    applyRole(did, MastershipRole.STANDBY);
+                    return;
                 }
 
+                log.info("## setting term for CPS as new master for {}", did);
+                // only set the new term if I am the master
+                deviceClockProviderService.setMastershipTerm(did, term);
+
                 // FIXME: we should check that the device is connected on our end.
                 // currently, this is not straight forward as the actual switch
-                // implementation is hidden from the registry.
-                if (!isAvailable(did)) {
+                // implementation is hidden from the registry. Maybe we can ask the
+                // provider.
+                // if the device is null here, we are the first master to claim the
+                // device. No worries, the DeviceManager will create one soon.
+                Device device = getDevice(did);
+                if ((device != null) && !isAvailable(did)) {
                     //flag the device as online. Is there a better way to do this?
-                    Device device = getDevice(did);
                     store.createOrUpdateDevice(device.providerId(), did,
                             new DefaultDeviceDescription(
                                     did.uri(), device.type(), device.manufacturer(),
                                     device.hwVersion(), device.swVersion(),
                                     device.serialNumber()));
                 }
-
+                //TODO re-collect device information to fix potential staleness
                 applyRole(did, MastershipRole.MASTER);
             } else {
                 applyRole(did, MastershipRole.STANDBY);