Attempt to fix Link vanishing after mastership change

- LLDPLinkProvider: handle DeviceUpdate event
- DeviceManager: publish Device events caused by Mastership change
- DeviceManager: Always try to markOffLine on deviceDisconnected
- GossipDeviceStore: Silently ignore failure to get Timestamp on port events

Change-Id: I51fbb3f1924007867512f20e62d6d53090c63640
diff --git a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
index 3863449..76d46b0 100644
--- a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
+++ b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java
@@ -221,9 +221,15 @@
             log.info("Device {} connected", deviceId);
             // check my Role
             MastershipRole role = mastershipService.requestRoleFor(deviceId);
+            log.info("requestedRole, became {} for {}", role, deviceId);
             if (role != MastershipRole.MASTER) {
                 // TODO: Do we need to explicitly tell the Provider that
                 // this instance is no longer the MASTER? probably not
+//                Device device = getDevice(deviceId);
+//                if (device != null) {
+//                    // FIXME roleChanged should take DeviceId instead of Device
+//                    provider().roleChanged(device, role);
+//                }
                 return;
             }
             MastershipTerm term = mastershipService.requestTermService()
@@ -231,6 +237,7 @@
 
             if (!term.master().equals(clusterService.getLocalNode().id())) {
                 // lost mastership after requestRole told this instance was MASTER.
+                log.info("lost mastership before getting term info.");
                 return;
             }
 
@@ -251,17 +258,13 @@
                 //       instance is the new Master, but
                 //       event returned from the store is null?
 
-                // TODO: Confirm: Mastership could be lost after requestRole
-                //       and createOrUpdateDevice call.
-                //       In that case STANDBY node can
-                //       claim itself to be master against the Device.
-                //       Will the Node, chosen by the MastershipService, retry
-                //       to get the MASTER role when that happen?
-
                 // FIXME: 1st argument should be deviceId, to allow setting
                 //        certain roles even if the store returned null.
+                log.info("event: {} {}", event.type(), event);
                 provider().roleChanged(event.subject(), role);
                 post(event);
+            } else {
+                log.info("No event to publish");
             }
         }
 
@@ -270,32 +273,34 @@
             checkNotNull(deviceId, DEVICE_ID_NULL);
             checkValidity();
 
-            // FIXME: only the MASTER should be marking off-line in normal cases,
-            // but if I was the last STANDBY connection, etc. and no one else
-            // was there to mark the device offline, this instance may need to
-            // temporarily request for Master Role and mark offline.
-            if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
-                log.debug("Device {} disconnected, but I am not the master", deviceId);
-                //let go of ability to be backup
-                mastershipService.relinquishMastership(deviceId);
-                return;
-            }
 
             DeviceEvent event = null;
             try {
                 event = store.markOffline(deviceId);
             } catch (IllegalStateException e) {
+                log.warn("Failed to mark {} offline", deviceId);
+                // only the MASTER should be marking off-line in normal cases,
+                // but if I was the last STANDBY connection, etc. and no one else
+                // was there to mark the device offline, this instance may need to
+                // temporarily request for Master Role and mark offline.
+
                 //there are times when this node will correctly have mastership, BUT
                 //that isn't reflected in the ClockManager before the device disconnects.
                 //we want to let go of the device anyways, so make sure this happens.
 
-                // FIXME: Come up with workaround for above scenario.
+                // FIXME: Store semantics leaking out as IllegalStateException.
+                //  Consider revising store API to handle this scenario.
+
+                MastershipRole role = mastershipService.requestRoleFor(deviceId);
                 MastershipTerm term = termService.getMastershipTerm(deviceId);
                 final NodeId myNodeId = clusterService.getLocalNode().id();
                 // TODO: Move this type of check inside device clock manager, etc.
                 if (myNodeId.equals(term.master())) {
+                    log.info("Marking {} offline", deviceId);
                     deviceClockProviderService.setMastershipTerm(deviceId, term);
                     event = store.markOffline(deviceId);
+                } else {
+                    log.error("Failed again marking {} offline. {}", deviceId, role);
                 }
             } finally {
                 //relinquish master role and ability to be backup.
@@ -315,14 +320,6 @@
             checkNotNull(portDescriptions,
                     "Port descriptions list cannot be null");
             checkValidity();
-            //XXX what's this doing here?
-            this.provider().id();
-
-            if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
-                // TODO If we become master, then we'll trigger something to update this
-                //      info to fix any inconsistencies that may result during the handoff.
-                return;
-            }
 
             List<DeviceEvent> events = store.updatePorts(this.provider().id(),
                     deviceId, portDescriptions);
@@ -338,13 +335,8 @@
             checkNotNull(portDescription, PORT_DESCRIPTION_NULL);
             checkValidity();
 
-            if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
-                // TODO If we become master, then we'll trigger something to update this
-                //      info to fix any inconsistencies that may result during the handoff.
-                return;
-            }
-            DeviceEvent event = store.updatePortStatus(this.provider().id(),
-                    deviceId, portDescription);
+            final DeviceEvent event = store.updatePortStatus(this.provider().id(),
+                        deviceId, portDescription);
             if (event != null) {
                 log.info("Device {} port {} status changed", deviceId, event
                         .port().number());
@@ -407,15 +399,16 @@
                 Device device = getDevice(did);
                 if ((device != null) && !isAvailable(did)) {
                     //flag the device as online. Is there a better way to do this?
-                    store.createOrUpdateDevice(device.providerId(), did,
+                    DeviceEvent devEvent = store.createOrUpdateDevice(device.providerId(), did,
                             new DefaultDeviceDescription(
                                     did.uri(), device.type(), device.manufacturer(),
                                     device.hwVersion(), device.swVersion(),
                                     device.serialNumber(), device.chassisId()));
+                    post(devEvent);
                 }
-                //TODO re-collect device information to fix potential staleness
-                queryPortInfo(did);
                 applyRole(did, MastershipRole.MASTER);
+                // re-collect device information to fix potential staleness
+                queryPortInfo(did);
             } else if (event.roleInfo().backups().contains(myNodeId)) {
                 applyRole(did, MastershipRole.STANDBY);
             }
diff --git a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
index 21941b5..6fbc034 100644
--- a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
+++ b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java
@@ -328,8 +328,8 @@
         final Timestamp timestamp = deviceClockService.getTimestamp(deviceId);
         final DeviceEvent event = markOfflineInternal(deviceId, timestamp);
         if (event != null) {
-            log.info("Notifying peers of a device offline topology event for deviceId: {}",
-                    deviceId);
+            log.info("Notifying peers of a device offline topology event for deviceId: {} {}",
+                    deviceId, timestamp);
             try {
                 notifyPeers(new InternalDeviceOfflineEvent(deviceId, timestamp));
             } catch (IOException e) {
@@ -399,7 +399,24 @@
                                        DeviceId deviceId,
                                        List<PortDescription> portDescriptions) {
 
-        final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId);
+        final Timestamp newTimestamp;
+        try {
+            newTimestamp = deviceClockService.getTimestamp(deviceId);
+        } catch (IllegalStateException e) {
+            log.info("Timestamp was not available for device {}", deviceId);
+            log.debug("  discarding {}", portDescriptions);
+            // Failed to generate timestamp.
+
+            // Possible situation:
+            //  Device connected and became master for short period of time,
+            // but lost mastership before this instance had the chance to
+            // retrieve term information.
+
+            // Information dropped here is expected to be recoverable by
+            // device probing after mastership change
+
+            return Collections.emptyList();
+        }
         log.info("timestamp for {} {}", deviceId, newTimestamp);
 
         final Timestamped<List<PortDescription>> timestampedInput
@@ -580,7 +597,16 @@
                                                      DeviceId deviceId,
                                                      PortDescription portDescription) {
 
-        final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId);
+        final Timestamp newTimestamp;
+        try {
+            newTimestamp = deviceClockService.getTimestamp(deviceId);
+        } catch (IllegalStateException e) {
+            log.info("Timestamp was not available for device {}", deviceId);
+            log.debug("  discarding {}", portDescription);
+            // Failed to generate timestamp. Ignoring.
+            // See updatePorts comment
+            return null;
+        }
         final Timestamped<PortDescription> deltaDesc
             = new Timestamped<>(portDescription, newTimestamp);
         final DeviceEvent event;
diff --git a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java
index f024414..8e50378 100644
--- a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java
+++ b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java
@@ -111,11 +111,22 @@
                 log.error("Device is null.");
                 return;
             }
+            log.trace("{} {} {}", event.type(), event.subject(), event);
             switch (event.type()) {
                 case DEVICE_ADDED:
-                    discoverers.put(device.id(),
-                                    new LinkDiscovery(device, packetSevice, masterService,
-                                                      providerService, useBDDP));
+                case DEVICE_UPDATED:
+                    ld = discoverers.get(device.id());
+                    if (ld == null) {
+                        log.debug("Device added ({}) {}", event.type(), device.id());
+                        discoverers.put(device.id(),
+                               new LinkDiscovery(device, packetSevice, masterService,
+                                      providerService, useBDDP));
+                    } else {
+                        if (ld.isStopped()) {
+                            log.debug("Device restarted ({}) {}", event.type(), device.id());
+                            ld.start();
+                        }
+                    }
                     break;
                 case PORT_ADDED:
                 case PORT_UPDATED:
@@ -125,6 +136,7 @@
                             return;
                         }
                         if (!port.number().isLogical()) {
+                            log.debug("Port added {}", port);
                             ld.addPort(port);
                         }
                     } else {
@@ -134,12 +146,15 @@
                     }
                     break;
                 case PORT_REMOVED:
+                    log.debug("Port removed {}", port);
                     ConnectPoint point = new ConnectPoint(device.id(),
                                                           port.number());
                     providerService.linksVanished(point);
+                    // TODO: Don't we need to removePort from ld?
                     break;
                 case DEVICE_REMOVED:
                 case DEVICE_SUSPENDED:
+                    log.debug("Device removed {}", device.id());
                     ld = discoverers.get(device.id());
                     if (ld == null) {
                         return;
@@ -153,15 +168,18 @@
                         return;
                     }
                     if (deviceService.isAvailable(device.id())) {
+                        log.debug("Device up {}", device.id());
                         ld.start();
                     } else {
                         providerService.linksVanished(device.id());
+                        log.debug("Device down {}", device.id());
                         ld.stop();
                     }
                     break;
-                case DEVICE_UPDATED:
                 case DEVICE_MASTERSHIP_CHANGED:
                     if (!discoverers.containsKey(device.id())) {
+                        // TODO: ideally, should never reach here
+                        log.debug("Device mastership changed ({}) {}", event.type(), device.id());
                         discoverers.put(device.id(),
                                new LinkDiscovery(device, packetSevice, masterService,
                                       providerService, useBDDP));
diff --git a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java
index e60ed90..9bcd8c6 100644
--- a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java
+++ b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java
@@ -139,8 +139,8 @@
      * @param port the port
      */
     public void addPort(final Port port) {
-        this.log.debug("sending init probe to port {}",
-                           port.number().toLong());
+        this.log.debug("sending init probe to port {}@{}",
+                           port.number().toLong(), device.id());
 
         sendProbes(port.number().toLong());
 
@@ -245,7 +245,7 @@
      */
     @Override
     public void run(final Timeout t) {
-        this.log.trace("sending probes");
+        this.log.trace("sending probes from {}", device.id());
         synchronized (this) {
             final Iterator<Long> fastIterator = this.fastPorts.iterator();
             Long portNumber;
@@ -256,7 +256,7 @@
                         .getAndIncrement();
 
                 if (probeCount < LinkDiscovery.MAX_PROBE_COUNT) {
-                    this.log.trace("sending fast probe to port");
+                    this.log.trace("sending fast probe to port {}", portNumber);
                     sendProbes(portNumber);
                 } else {
                     // Update fast and slow ports
@@ -356,6 +356,7 @@
        if (device.type() != Device.Type.ROADM &&
                mastershipService.getLocalRole(this.device.id()) ==
                MastershipRole.MASTER) {
+           log.debug("sending probes out to {}@{}", portNumber, device.id());
            OutboundPacket pkt = this.createOutBoundLLDP(portNumber);
            pktService.emit(pkt);
            if (useBDDP) {
diff --git a/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java b/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java
index 984e8ae..a7ee9e7 100644
--- a/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java
+++ b/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java
@@ -96,7 +96,9 @@
         // FIXME if possible, we might want this to be part of
         // OpenFlowSwitch interface so the driver interface isn't misused.
         OpenFlowSwitch sw = controller.getSwitch(dpid(device.id().uri()));
-        if (!((OpenFlowSwitchDriver) sw).isConnected()) {
+        if (sw == null ||
+            !((OpenFlowSwitchDriver) sw).isConnected()) {
+            LOG.error("Failed to probe device {} on sw={}", device, sw);
             providerService.deviceDisconnected(device.id());
             return;
         }