Bugfix in routing logic to avoid null groups; Improvement in group handling.

Two main changes for the bug-fix:
    - avoid doing a full-reroute in the event of link failure as this will leave behind stale state in stores
      if event has been preceded by mastership change leading to the nuking of the ecmpSpg for one of the
      link's devices; instead do a rehash
    - when full-reroute is attempted, do it only once, with a complete nuke of the next-obj store

Improvement in group handling allows for a max number of retries for a group that failed to be added.
Earlier behavior was to try only once, and if it fails, it gets removed from the group-store. Now it
is removed after a couple of retries - so total 3 attempts to program the group.

Change-Id: I54ca8203cef779463522b01353540d12f8be3c91
diff --git a/apps/segmentrouting/app/src/main/java/org/onosproject/segmentrouting/DefaultRoutingHandler.java b/apps/segmentrouting/app/src/main/java/org/onosproject/segmentrouting/DefaultRoutingHandler.java
index 0d25c10..a5a90a7 100644
--- a/apps/segmentrouting/app/src/main/java/org/onosproject/segmentrouting/DefaultRoutingHandler.java
+++ b/apps/segmentrouting/app/src/main/java/org/onosproject/segmentrouting/DefaultRoutingHandler.java
@@ -75,6 +75,7 @@
     private static final int RETRY_INTERVAL_SCALE = 1;
     private static final long STABLITY_THRESHOLD = 10; //secs
     private static final long MASTER_CHANGE_DELAY = 1000; // ms
+    private static final long PURGE_DELAY = 1000; // ms
     private static Logger log = LoggerFactory.getLogger(DefaultRoutingHandler.class);
 
     private SegmentRoutingManager srManager;
@@ -88,8 +89,11 @@
         = newScheduledThreadPool(1, groupedThreads("retryftr", "retry-%d", log));
     private ScheduledExecutorService executorServiceMstChg
         = newScheduledThreadPool(1, groupedThreads("masterChg", "mstch-%d", log));
+    private ScheduledExecutorService executorServiceFRR
+        = newScheduledThreadPool(1, groupedThreads("fullRR", "fullRR-%d", log));
 
     private Instant lastRoutingChange = Instant.EPOCH;
+    private Instant lastFullReroute = Instant.EPOCH;
 
     // Distributed store to keep track of ONOS instance that should program the
     // device pair. There should be only one instance (the king) that programs the same pair.
@@ -199,6 +203,7 @@
     public void shutdown() {
         executorService.shutdown();
         executorServiceMstChg.shutdown();
+        executorServiceFRR.shutdown();
     }
 
     //////////////////////////////////////
@@ -501,20 +506,10 @@
                 // link has gone down
                 // Compare existing ECMP SPG only with the link that went down
                 routeChanges = computeDamagedRoutes(linkDown);
-                if (routeChanges != null) {
-                    processHashGroupChange(routeChanges, true, null);
-                    // clear out routesChanges so a re-route is not attempted
-                    routeChanges = ImmutableSet.of();
-                    hashGroupsChanged = true;
-                }
-            }
-
-            // do full re-routing if optimized routing returns null routeChanges
-            if (routeChanges == null) {
-                log.warn("Optimized routing failed... opting for full reroute");
-                populationStatus = Status.ABORTED;
-                populateAllRoutingRules();
-                return;
+                processHashGroupChange(routeChanges, true, null);
+                // clear out routesChanges so a re-route is not attempted
+                routeChanges = ImmutableSet.of();
+                hashGroupsChanged = true;
             }
 
             if (routeChanges.isEmpty()) {
@@ -786,7 +781,7 @@
                 DeviceId dstSw = route.get(1); // same as impactedDstDevice
                 Set<DeviceId> nextHops = getNextHops(targetSw, dstSw);
                 if (nextHops.isEmpty()) {
-                    log.warn("Could not find next hop from target:{} --> dst {} "
+                    log.debug("Could not find next hop from target:{} --> dst {} "
                             + "skipping this route", targetSw, dstSw);
                     continue;
                 }
@@ -1066,7 +1061,7 @@
                     (revoke) ? "revoke" : "repopulate", route);
             return false;
         }
-        log.debug("{} hash-groups buckets For Route {} -> {} to next-hops {}",
+        log.debug("{} hash-groups buckets For Route {} -> {} to new next-hops {}",
                   (revoke) ? "revoke" : "repopulating",
                   targetSw, destSw, nextHops);
         return (revoke) ? grpHandler.fixHashGroups(targetSw, nextHops,
@@ -1296,6 +1291,7 @@
         private static final long CLUSTER_EVENT_THRESHOLD = 4500; // ms
         private static final long DEVICE_EVENT_THRESHOLD = 2000; // ms
         private static final long EDGE_PORT_EVENT_THRESHOLD = 10000; //ms
+        private static final long FULL_REROUTE_THRESHOLD = 10000; // ms
 
         MasterChange(DeviceId devId, MastershipEvent me) {
             this.devId = devId;
@@ -1348,12 +1344,29 @@
                 if (srManager.mastershipService.isLocalMaster(devId)) {
                     // old master could have died when populating filters
                     populatePortAddressingRules(devId);
-                    // old master could have died when creating groups
-                    srManager.purgeHashedNextObjectiveStore(devId);
                 }
+                // old master could have died when creating groups
                 // XXX right now we have no fine-grained way to only make changes
-                // for the route paths affected by this device.
-                populateAllRoutingRules();
+                // for the route paths affected by this device. Thus we do a
+                // full reroute after purging all hash groups. We also try to do
+                // it only once, irrespective of the number of devices
+                // that changed mastership when their master instance died.
+                long lfrr = Instant.now().toEpochMilli() - lastFullReroute.toEpochMilli();
+                boolean doFullReroute = lfrr > FULL_REROUTE_THRESHOLD;
+                if (doFullReroute) {
+                    lastFullReroute = Instant.now();
+                    for (Device dev : srManager.deviceService.getDevices()) {
+                        if (shouldProgram(dev.id())) {
+                            srManager.purgeHashedNextObjectiveStore(dev.id());
+                        }
+                    }
+                    // give small delay to ensure entire store is purged
+                    executorServiceFRR.schedule(new FullRerouteAfterPurge(),
+                                                PURGE_DELAY,
+                                                TimeUnit.MILLISECONDS);
+                } else {
+                    log.warn("Full reroute attempted {} ms ago .. skipping", lfrr);
+                }
 
             } else if (edgePortEvent && clusterEvent) {
                 log.warn("Mastership changed for dev: {}/{} due to clusterEvent {} ms ago "
@@ -1376,18 +1389,31 @@
         }
     }
 
+    /**
+     * Performs a full reroute of routing rules in all the switches. Assumes
+     * caller has purged hash groups from the nextObjective store, otherwise
+     * re-uses ones available in the store.
+     */
+    protected final class FullRerouteAfterPurge implements Runnable {
+        @Override
+        public void run() {
+            populateAllRoutingRules();
+        }
+    }
+
+
     //////////////////////////////////////
     //  Routing helper methods and classes
     //////////////////////////////////////
 
     /**
-     * Computes set of affected routes due to failed link. Assumes
-     * previous ecmp shortest-path graph exists for a switch in order to compute
-     * affected routes. If such a graph does not exist, the method returns null.
+     * Computes set of affected routes due to failed link. Assumes previous ecmp
+     * shortest-path graph exists for a switch in order to compute affected
+     * routes. If such a graph does not exist, the method returns null.
      *
      * @param linkFail the failed link
      * @return the set of affected routes which may be empty if no routes were
-     *         affected, or null if no previous ecmp spg was found for comparison
+     *         affected
      */
     private Set<ArrayList<DeviceId>> computeDamagedRoutes(Link linkFail) {
         Set<ArrayList<DeviceId>> routes = new HashSet<>();
@@ -1402,17 +1428,26 @@
             for (DeviceId rootSw : deviceAndItsPair(sw.id())) {
                 // check for mastership change since last run
                 if (!lastProgrammed.contains(sw.id())) {
-                    lastProgrammed.add(sw.id());
-                    log.warn("New reponsibility for this node to program dev:{}"
+                    log.warn("New responsibility for this node to program dev:{}"
                             + " ... nuking current ECMPspg", sw.id());
                     currentEcmpSpgMap.remove(sw.id());
                 }
+                lastProgrammed.add(sw.id());
+
                 EcmpShortestPathGraph ecmpSpg = currentEcmpSpgMap.get(rootSw);
                 if (ecmpSpg == null) {
-                    log.warn("No existing ECMP graph for switch {}. Aborting optimized"
-                            + " rerouting and opting for full-reroute", rootSw);
-                    return null;
+                    log.warn("No existing ECMP graph for switch {}. Assuming "
+                            + "all route-paths have changed towards it.", rootSw);
+                    for (DeviceId targetSw : srManager.deviceConfiguration.getRouters()) {
+                        if (targetSw.equals(rootSw)) {
+                            continue;
+                        }
+                        routes.add(Lists.newArrayList(targetSw, rootSw));
+                        log.debug("Impacted route:{}->{}", targetSw, rootSw);
+                    }
+                    continue;
                 }
+
                 if (log.isDebugEnabled()) {
                     log.debug("Root switch: {}", rootSw);
                     log.debug("  Current/Existing SPG: {}", ecmpSpg);
@@ -1484,11 +1519,11 @@
                 }
                 // check for mastership change since last run
                 if (!lastProgrammed.contains(sw.id())) {
-                    lastProgrammed.add(sw.id());
-                    log.warn("New reponsibility for this node to program dev:{}"
+                    log.warn("New responsibility for this node to program dev:{}"
                             + " ... nuking current ECMPspg", sw.id());
                     currentEcmpSpgMap.remove(sw.id());
                 }
+                lastProgrammed.add(sw.id());
                 EcmpShortestPathGraph currEcmpSpg = currentEcmpSpgMap.get(rootSw);
                 if (currEcmpSpg == null) {
                     log.debug("No existing ECMP graph for device {}.. adding self as "
@@ -1657,19 +1692,19 @@
         NodeId currentNodeId = srManager.clusterService.getLocalNode().id();
         NodeId masterNodeId = srManager.mastershipService.getMasterFor(deviceId);
         Optional<NodeId> pairMasterNodeId = pairDeviceId.map(srManager.mastershipService::getMasterFor);
-        log.debug("Evaluate shouldProgram {}/pair={}. current={}, master={}, pairMaster={}",
+        log.debug("Evaluate shouldProgram {}/pair={}. currentNodeId={}, master={}, pairMaster={}",
                 deviceId, pairDeviceId, currentNodeId, masterNodeId, pairMasterNodeId);
 
         // No pair device configured. Only handle when current instance is the master of the device
         if (!pairDeviceId.isPresent()) {
-            log.debug("No pair device. current={}, master={}", currentNodeId, masterNodeId);
+            log.debug("No pair device. currentNodeId={}, master={}", currentNodeId, masterNodeId);
             return currentNodeId.equals(masterNodeId);
         }
 
         // Should not handle if current instance is not the master of either switch
         if (!currentNodeId.equals(masterNodeId) &&
                 !(pairMasterNodeId.isPresent() && currentNodeId.equals(pairMasterNodeId.get()))) {
-            log.debug("Current node {} is neither the master of target device {} nor pair device {}",
+            log.debug("Current nodeId {} is neither the master of target device {} nor pair device {}",
                     currentNodeId, deviceId, pairDeviceId);
             return false;
         }
@@ -1692,7 +1727,7 @@
         }));
 
         if (king != null) {
-            log.debug("{} should handle routing for {}/pair={}", king, deviceId, pairDeviceId);
+            log.debug("{} is king, should handle routing for {}/pair={}", king, deviceId, pairDeviceId);
             shouldProgramCache.put(deviceId, king.equals(currentNodeId));
             return king.equals(currentNodeId);
         } else {
@@ -1778,7 +1813,7 @@
                                 log.warn(e.getMessage());
                             }
                             if (pathdevIsEdge) {
-                                log.debug("Avoiding {} hop path for non-edge targetSw:{}"
+                                log.debug("Avoiding {} hop path for targetSw:{}"
                                         + " --> dstSw:{} which goes through an edge"
                                         + " device {} in path {}", itrIdx,
                                           targetSw, dstSw, pathdev, via);
diff --git a/apps/segmentrouting/app/src/main/java/org/onosproject/segmentrouting/grouphandler/DefaultGroupHandler.java b/apps/segmentrouting/app/src/main/java/org/onosproject/segmentrouting/grouphandler/DefaultGroupHandler.java
index 9e28149..a569873 100644
--- a/apps/segmentrouting/app/src/main/java/org/onosproject/segmentrouting/grouphandler/DefaultGroupHandler.java
+++ b/apps/segmentrouting/app/src/main/java/org/onosproject/segmentrouting/grouphandler/DefaultGroupHandler.java
@@ -486,7 +486,7 @@
                           currNeighbors, nextHops, nextId);
                 if ((revoke && !nextHops.isEmpty())
                         || (!revoke && !nextHops.equals(currNeighbors))) {
-                    log.warn("Simple next objective cannot be edited to "
+                    log.debug("Simple next objective cannot be edited to "
                             + "move from {} to {}", currNeighbors, nextHops);
                 }
                 continue;
diff --git a/core/api/src/main/java/org/onosproject/net/group/DefaultGroup.java b/core/api/src/main/java/org/onosproject/net/group/DefaultGroup.java
index 90da929..d0777ec 100644
--- a/core/api/src/main/java/org/onosproject/net/group/DefaultGroup.java
+++ b/core/api/src/main/java/org/onosproject/net/group/DefaultGroup.java
@@ -36,6 +36,7 @@
     private long referenceCount;
     private GroupId id;
     private int age;
+    private int failedRetryCount;
 
     /**
      * Initializes default values.
@@ -50,6 +51,7 @@
         bytes = 0;
         referenceCount = 0;
         age = 0;
+        failedRetryCount = 0;
     }
 
     /**
@@ -135,6 +137,11 @@
         return age;
     }
 
+    @Override
+    public int failedRetryCount() {
+        return failedRetryCount;
+    }
+
     /**
      * Sets the new state for this entry.
      *
@@ -190,6 +197,16 @@
         return referenceCount;
     }
 
+    @Override
+    public void incrFailedRetryCount() {
+        failedRetryCount++;
+    }
+
+    @Override
+    public void setFailedRetryCount(int failedRetryCount) {
+        this.failedRetryCount = failedRetryCount;
+    }
+
     /*
      * The deviceId, type and buckets are used for hash.
      *
diff --git a/core/api/src/main/java/org/onosproject/net/group/Group.java b/core/api/src/main/java/org/onosproject/net/group/Group.java
index 39c6a3d..dd2aa63 100644
--- a/core/api/src/main/java/org/onosproject/net/group/Group.java
+++ b/core/api/src/main/java/org/onosproject/net/group/Group.java
@@ -105,4 +105,12 @@
      * @return the age of the group as an integer
      */
     int age();
+
+    /**
+     * Returns the count for the number of attempts at programming a failed
+     * group.
+     *
+     * @return the count for the number of failed attempts at programming this group
+     */
+    int failedRetryCount();
 }
diff --git a/core/api/src/main/java/org/onosproject/net/group/StoredGroupEntry.java b/core/api/src/main/java/org/onosproject/net/group/StoredGroupEntry.java
index 6d22fa7..760fc5a 100644
--- a/core/api/src/main/java/org/onosproject/net/group/StoredGroupEntry.java
+++ b/core/api/src/main/java/org/onosproject/net/group/StoredGroupEntry.java
@@ -72,4 +72,20 @@
      * @param referenceCount reference count
      */
     void setReferenceCount(long referenceCount);
+
+    /**
+     * Increments the count for the number of failed attempts in programming
+     * this group.
+     *
+     */
+    void incrFailedRetryCount();
+
+    /**
+     * Sets the count for the number of failed attempts in programming this
+     * group.
+     *
+     * @param failedRetryCount count for number of failed attempts in
+     *            programming this group
+     */
+    void setFailedRetryCount(int failedRetryCount);
 }
diff --git a/core/api/src/test/java/org/onosproject/net/group/DefaultGroupTest.java b/core/api/src/test/java/org/onosproject/net/group/DefaultGroupTest.java
index 0c7df81..e8fb7b3 100644
--- a/core/api/src/test/java/org/onosproject/net/group/DefaultGroupTest.java
+++ b/core/api/src/test/java/org/onosproject/net/group/DefaultGroupTest.java
@@ -85,6 +85,7 @@
         assertThat(group1.referenceCount(), is(0L));
         assertThat(group1.buckets(), is(groupBuckets));
         assertThat(group1.state(), is(Group.GroupState.PENDING_ADD));
+        assertThat(group1.failedRetryCount(), is(0));
     }
 
     /**
@@ -101,5 +102,22 @@
         assertThat(group.referenceCount(), is(0L));
         assertThat(group.deviceId(), is(NetTestTools.did("1")));
         assertThat(group.buckets(), is(groupBuckets));
+        assertThat(group.failedRetryCount(), is(0));
+    }
+
+    /**
+     * Test failedRetryCount field.
+     */
+    @Test
+    public void checkFailedRetyCount() {
+        assertThat(group1.failedRetryCount(), is(0));
+        group1.incrFailedRetryCount();
+        assertThat(group1.failedRetryCount(), is(1));
+        group1.setFailedRetryCount(3);
+        assertThat(group1.failedRetryCount(), is(3));
+        group1.incrFailedRetryCount();
+        assertThat(group1.failedRetryCount(), is(4));
+        group1.setFailedRetryCount(0);
+        assertThat(group1.failedRetryCount(), is(0));
     }
 }
diff --git a/core/store/dist/src/main/java/org/onosproject/store/group/impl/DistributedGroupStore.java b/core/store/dist/src/main/java/org/onosproject/store/group/impl/DistributedGroupStore.java
index 58417a8..6e8b126 100644
--- a/core/store/dist/src/main/java/org/onosproject/store/group/impl/DistributedGroupStore.java
+++ b/core/store/dist/src/main/java/org/onosproject/store/group/impl/DistributedGroupStore.java
@@ -112,6 +112,7 @@
     private static final boolean GARBAGE_COLLECT = false;
     private static final int GC_THRESH = 6;
     private static final boolean ALLOW_EXTRANEOUS_GROUPS = true;
+    private static final int MAX_FAILED_ATTEMPTS = 3;
 
     private final int dummyId = 0xffffffff;
     private final GroupId dummyGroupId = new GroupId(dummyId);
@@ -152,6 +153,7 @@
             extraneousGroupEntriesById = new ConcurrentHashMap<>();
     private ExecutorService messageHandlingExecutor;
     private static final int MESSAGE_HANDLER_THREAD_POOL_SIZE = 1;
+
     private final HashMap<DeviceId, Boolean> deviceAuditStatus = new HashMap<>();
 
     private final AtomicInteger groupIdGen = new AtomicInteger();
@@ -941,6 +943,7 @@
                 existing.setPackets(group.packets());
                 existing.setBytes(group.bytes());
                 existing.setReferenceCount(group.referenceCount());
+                existing.setFailedRetryCount(0);
                 if ((existing.state() == GroupState.PENDING_ADD) ||
                         (existing.state() == GroupState.PENDING_ADD_RETRY)) {
                     log.trace("addOrUpdateGroupEntry: group entry {} in device {} moving from {} to ADDED",
@@ -969,6 +972,7 @@
                              + "happening for a non-existing entry in the map");
         }
 
+        // XXX if map is going to trigger event, is this one needed?
         if (event != null) {
             notifyDelegate(event);
         }
@@ -1082,9 +1086,10 @@
             return;
         }
 
-        log.warn("groupOperationFailed: group operation {} failed"
+        log.warn("groupOperationFailed: group operation {} failed in state {} "
                          + "for group {} in device {} with code {}",
                  operation.opType(),
+                 existing.state(),
                  existing.id(),
                  existing.deviceId(),
                  operation.failureCode());
@@ -1107,9 +1112,26 @@
                         existing.buckets());
             }
         }
+        if (operation.failureCode() == GroupOperation.GroupMsgErrorCode.INVALID_GROUP) {
+            existing.incrFailedRetryCount();
+            if (existing.failedRetryCount() < MAX_FAILED_ATTEMPTS) {
+                log.warn("Group {} programming failed {} of {} times in dev {}, "
+                        + "retrying ..", existing.id(),
+                         existing.failedRetryCount(), MAX_FAILED_ATTEMPTS,
+                         deviceId);
+                return;
+            }
+            log.warn("Group {} programming failed {} of {} times in dev {}, "
+                    + "removing group from store", existing.id(),
+                     existing.failedRetryCount(), MAX_FAILED_ATTEMPTS,
+                     deviceId);
+            // fall through to case
+        }
+
         switch (operation.opType()) {
             case ADD:
-                if (existing.state() == GroupState.PENDING_ADD) {
+                if (existing.state() == GroupState.PENDING_ADD
+                    || existing.state() == GroupState.PENDING_ADD_RETRY) {
                     notifyDelegate(new GroupEvent(Type.GROUP_ADD_FAILED, existing));
                     log.warn("groupOperationFailed: cleaningup "
                                      + "group {} from store in device {}....",
diff --git a/core/store/dist/src/test/java/org/onosproject/store/group/impl/DistributedGroupStoreTest.java b/core/store/dist/src/test/java/org/onosproject/store/group/impl/DistributedGroupStoreTest.java
index fd9b484..0a44359 100644
--- a/core/store/dist/src/test/java/org/onosproject/store/group/impl/DistributedGroupStoreTest.java
+++ b/core/store/dist/src/test/java/org/onosproject/store/group/impl/DistributedGroupStoreTest.java
@@ -36,6 +36,7 @@
 import org.onosproject.net.group.DefaultGroupDescription;
 import org.onosproject.net.group.DefaultGroupKey;
 import org.onosproject.net.group.Group;
+import org.onosproject.net.group.Group.GroupState;
 import org.onosproject.net.group.GroupBucket;
 import org.onosproject.net.group.GroupBuckets;
 import org.onosproject.net.group.GroupDescription;
@@ -44,6 +45,7 @@
 import org.onosproject.net.group.GroupOperation;
 import org.onosproject.net.group.GroupStore;
 import org.onosproject.net.group.GroupStoreDelegate;
+import org.onosproject.net.group.GroupOperation.GroupMsgErrorCode;
 import org.onosproject.store.cluster.messaging.ClusterCommunicationServiceAdapter;
 import org.onosproject.store.service.ConsistentMap;
 import org.onosproject.store.service.TestStorageService;
@@ -379,6 +381,85 @@
     }
 
     /**
+     * Tests group operation failed interface, with error codes for failures.
+     */
+    @Test
+    public void testGroupOperationFailedWithErrorCode() {
+        TestDelegate delegate = new TestDelegate();
+        groupStore.setDelegate(delegate);
+        groupStore.deviceInitialAuditCompleted(deviceId1, true);
+        groupStore.storeGroupDescription(groupDescription1);
+        groupStore.deviceInitialAuditCompleted(deviceId2, true);
+        groupStore.storeGroupDescription(groupDescription2);
+
+        List<GroupEvent> eventsAfterAdds = delegate.eventsSeen();
+        assertThat(eventsAfterAdds, hasSize(2));
+        eventsAfterAdds.forEach(event -> assertThat(event
+                .type(), is(GroupEvent.Type.GROUP_ADD_REQUESTED)));
+        delegate.resetEvents();
+
+        // test group exists
+        GroupOperation opAdd = GroupOperation
+                .createAddGroupOperation(groupId1, INDIRECT, buckets);
+        GroupOperation addFailedExists = GroupOperation
+                .createFailedGroupOperation(opAdd, GroupMsgErrorCode.GROUP_EXISTS);
+        groupStore.groupOperationFailed(deviceId1, addFailedExists);
+
+        List<GroupEvent> eventsAfterAddFailed = delegate.eventsSeen();
+        assertThat(eventsAfterAddFailed, hasSize(2));
+        assertThat(eventsAfterAddFailed.get(0).type(),
+                   is(GroupEvent.Type.GROUP_ADDED));
+        assertThat(eventsAfterAddFailed.get(1).type(),
+                   is(GroupEvent.Type.GROUP_ADDED));
+        Group g1 = groupStore.getGroup(deviceId1, groupId1);
+        assertEquals(0, g1.failedRetryCount());
+        delegate.resetEvents();
+
+        // test invalid group
+        Group g2 = groupStore.getGroup(deviceId2, groupId2);
+        assertEquals(0, g2.failedRetryCount());
+        assertEquals(GroupState.PENDING_ADD, g2.state());
+        GroupOperation opAdd1 = GroupOperation
+                .createAddGroupOperation(groupId2, INDIRECT, buckets);
+        GroupOperation addFailedInvalid = GroupOperation
+                .createFailedGroupOperation(opAdd1, GroupMsgErrorCode.INVALID_GROUP);
+
+        groupStore.groupOperationFailed(deviceId2, addFailedInvalid);
+        groupStore.pushGroupMetrics(deviceId2, ImmutableList.of());
+        List<GroupEvent> eventsAfterAddFailed1 = delegate.eventsSeen();
+        assertThat(eventsAfterAddFailed1, hasSize(1));
+        assertThat(eventsAfterAddFailed.get(0).type(),
+                   is(GroupEvent.Type.GROUP_ADD_REQUESTED));
+        g2 = groupStore.getGroup(deviceId2, groupId2);
+        assertEquals(1, g2.failedRetryCount());
+        assertEquals(GroupState.PENDING_ADD_RETRY, g2.state());
+        delegate.resetEvents();
+
+        groupStore.groupOperationFailed(deviceId2, addFailedInvalid);
+        groupStore.pushGroupMetrics(deviceId2, ImmutableList.of());
+        List<GroupEvent> eventsAfterAddFailed2 = delegate.eventsSeen();
+        assertThat(eventsAfterAddFailed2, hasSize(1));
+        assertThat(eventsAfterAddFailed.get(0).type(),
+                   is(GroupEvent.Type.GROUP_ADD_REQUESTED));
+        g2 = groupStore.getGroup(deviceId2, groupId2);
+        assertEquals(2, g2.failedRetryCount());
+        assertEquals(GroupState.PENDING_ADD_RETRY, g2.state());
+        delegate.resetEvents();
+
+        groupStore.groupOperationFailed(deviceId2, addFailedInvalid);
+        groupStore.pushGroupMetrics(deviceId2, ImmutableList.of());
+        List<GroupEvent> eventsAfterAddFailed3 = delegate.eventsSeen();
+        assertThat(eventsAfterAddFailed3, hasSize(2));
+        assertThat(eventsAfterAddFailed.get(0).type(),
+                   is(GroupEvent.Type.GROUP_ADD_FAILED));
+        assertThat(eventsAfterAddFailed.get(1).type(),
+                   is(GroupEvent.Type.GROUP_REMOVED));
+        g2 = groupStore.getGroup(deviceId2, groupId2);
+        assertEquals(null, g2);
+        delegate.resetEvents();
+    }
+
+    /**
      * Tests extraneous group operations.
      */
     @Test
diff --git a/web/api/src/test/java/org/onosproject/rest/resources/GroupsResourceTest.java b/web/api/src/test/java/org/onosproject/rest/resources/GroupsResourceTest.java
index 2cdb6ff..e13784d 100644
--- a/web/api/src/test/java/org/onosproject/rest/resources/GroupsResourceTest.java
+++ b/web/api/src/test/java/org/onosproject/rest/resources/GroupsResourceTest.java
@@ -189,6 +189,11 @@
         public GroupBuckets buckets() {
             return this.buckets;
         }
+
+        @Override
+        public int failedRetryCount() {
+            return 0;
+        }
     }
 
     /**