[ONOS-8049] Node Diagnosis Application

Node Diagnosis application, diagnosis the Controller node failures
and tries to recover it.

Diagnosis consists of App Diagnosis and Cluster Diagnosis.

App diagnosis checks App condition and Bundles.
Cluster Diagnosis checks Cluster condition.

NextPatch Plan:
1. Introducing more clis for Diagnosis Action Cli,SetretryCountOfRebbotsCli
2. Memory & file descriptors usage Ananlysis - identify abnormal and Recover

Change-Id: I591650fbacdcd6498684ccb6fbf8e38bd206250a
diff --git a/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/cli/RestartNodeCommand.java b/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/cli/RestartNodeCommand.java
new file mode 100644
index 0000000..b63e6d7
--- /dev/null
+++ b/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/cli/RestartNodeCommand.java
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2020-present Open Networking Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.onosproject.diagnosis.cli;
+
+import org.apache.karaf.shell.api.action.Command;
+import org.apache.karaf.shell.api.action.lifecycle.Service;
+import org.onosproject.cli.AbstractShellCommand;
+import org.osgi.service.component.annotations.Reference;
+import org.osgi.service.component.annotations.ReferenceCardinality;
+import org.apache.karaf.system.SystemService;
+
+@Service
+@Command(scope = "onos", name = "restart-node", description = "restart onos")
+public class RestartNodeCommand extends AbstractShellCommand {
+
+    @Reference(cardinality = ReferenceCardinality.MANDATORY)
+    private SystemService systemService;
+
+    @Override
+    protected void doExecute() {
+        systemService = get(SystemService.class);
+        try {
+            systemService.reboot("now", SystemService.Swipe.CACHE);
+        } catch (Exception e) {
+            log.error("error occured because of {} ", e.getMessage());
+        }
+    }
+
+}
diff --git a/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/cli/package-info.java b/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/cli/package-info.java
new file mode 100644
index 0000000..938bac6
--- /dev/null
+++ b/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/cli/package-info.java
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2020-present Open Networking Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ *  ONOS Node diagnosis cli.
+ */
+package org.onosproject.diagnosis.cli;
\ No newline at end of file
diff --git a/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/impl/NodeDiagnosisManager.java b/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/impl/NodeDiagnosisManager.java
new file mode 100644
index 0000000..1519e24
--- /dev/null
+++ b/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/impl/NodeDiagnosisManager.java
@@ -0,0 +1,376 @@
+/*
+ * Copyright 2020-present Open Networking Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.onosproject.diagnosis.impl;
+
+import org.osgi.service.component.annotations.Activate;
+import org.osgi.service.component.annotations.Component;
+import org.osgi.service.component.annotations.Modified;
+import org.osgi.service.component.annotations.Deactivate;
+import org.osgi.service.component.runtime.ServiceComponentRuntime;
+import org.osgi.service.component.annotations.Reference;
+import org.osgi.service.component.annotations.ReferenceCardinality;
+import org.osgi.service.component.runtime.dto.ComponentConfigurationDTO;
+import org.osgi.service.component.runtime.dto.ComponentDescriptionDTO;
+import org.osgi.service.component.ComponentContext;
+import org.apache.karaf.bundle.core.BundleService;
+import org.apache.karaf.system.SystemService;
+import org.onlab.osgi.DefaultServiceDirectory;
+import org.onosproject.cfg.ComponentConfigService;
+import org.onosproject.cluster.ClusterAdminService;
+import org.onosproject.cluster.ControllerNode;
+import org.onosproject.cluster.NodeId;
+import org.onosproject.store.cluster.messaging.ClusterCommunicationService;
+import org.onosproject.store.cluster.messaging.ClusterMessage;
+import org.onosproject.store.cluster.messaging.ClusterMessageHandler;
+import org.onosproject.store.cluster.messaging.MessageSubject;
+import org.osgi.framework.Bundle;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.FrameworkUtil;
+import org.osgi.framework.Constants;
+import org.osgi.framework.wiring.FrameworkWiring;
+import org.slf4j.Logger;
+
+import java.util.Dictionary;
+import java.time.Instant;
+import java.time.temporal.ChronoUnit;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import static com.google.common.collect.Lists.newArrayList;
+import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor;
+import static org.onlab.util.SharedExecutors.getPoolThreadExecutor;
+import static org.onlab.util.Tools.groupedThreads;
+import static org.slf4j.LoggerFactory.getLogger;
+
+import static org.onlab.util.Tools.getIntegerProperty;
+import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.INITIAL_POLL_DELAY_MINUTE;
+import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.POLL_FREQUENCY_MINUTE;
+import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_INITIAL_POLL_DELAY_MINUTE;
+import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_POLL_FREQUENCY_MINUTE;
+
+
+@Component(immediate = true,
+        property = {
+                INITIAL_POLL_DELAY_MINUTE + ":Integer=" + DEFAULT_INITIAL_POLL_DELAY_MINUTE,
+                POLL_FREQUENCY_MINUTE + ":Integer=" + DEFAULT_POLL_FREQUENCY_MINUTE,
+        })
+public class NodeDiagnosisManager {
+
+    private static final MessageSubject REBOOT_MSG = new MessageSubject("Node-diagnosis");
+
+    private static int initialPollDelayMinute = DEFAULT_INITIAL_POLL_DELAY_MINUTE;
+    private static int pollFrequencyMinute = DEFAULT_POLL_FREQUENCY_MINUTE;
+
+    private final Logger log = getLogger(getClass());
+    private ScheduledExecutorService metricsExecutor;
+
+    @Reference(cardinality = ReferenceCardinality.MANDATORY)
+    private ServiceComponentRuntime scrService;
+
+    @Reference(cardinality = ReferenceCardinality.MANDATORY)
+    private BundleService bundleService;
+
+    @Reference(cardinality = ReferenceCardinality.MANDATORY)
+    private SystemService systemService;
+
+    @Reference(cardinality = ReferenceCardinality.MANDATORY)
+    private ClusterCommunicationService communicationService;
+
+    @Reference(cardinality = ReferenceCardinality.MANDATORY)
+    protected ComponentConfigService cfgService;
+
+    private ScheduledFuture<?> clusterNodeDiagnosisFuture;
+
+    private BundleContext bundleContext;
+    private ClusterAdminService caService;
+    private NodeId localNodeId;
+    private Set<NodeId> nodeIds;
+
+    private static final long TIMEOUT = 3000;
+
+    @Activate
+    public void activate() {
+        cfgService.registerProperties(getClass());
+        bundleContext = FrameworkUtil.getBundle(this.getClass()).getBundleContext();
+        getNodeId();
+        scheduleAppDiagnosisPolling();
+        scheduleClusterNodeDiagnosisPolling();
+        communicationService.addSubscriber(REBOOT_MSG, new InternalSampleCollector(),
+                                           getPoolThreadExecutor());
+        log.info("Started");
+    }
+
+    @Deactivate
+    public void deactivate() {
+        cfgService.unregisterProperties(getClass(), false);
+        communicationService.removeSubscriber(REBOOT_MSG);
+        metricsExecutor.shutdownNow();
+        clusterNodeDiagnosisFuture.cancel(true);
+        log.info("Stopped");
+    }
+
+    @Modified
+    public void modified(ComponentContext context) {
+        readComponentConfiguration(context);
+        log.info("modified");
+    }
+
+    /**
+     * Extracts properties from the component configuration context.
+     *
+     * @param context the component context
+     */
+    private void readComponentConfiguration(ComponentContext context) {
+
+        Dictionary<?, ?> properties = context.getProperties();
+        boolean changed = false;
+
+        int newPollFrequency = getNewPollFrequency(properties);
+        if (newPollFrequency != pollFrequencyMinute) {
+            pollFrequencyMinute = newPollFrequency;
+            changed = true;
+        }
+
+        int newPollDelay = getNewPollDelay(properties);
+        if (newPollDelay != pollFrequencyMinute) {
+            initialPollDelayMinute = newPollDelay;
+            changed = true;
+        }
+        log.info("Node Diagnosis properties are:" +
+                         " initialPollDelayMinute: {}, pollFrequencyMinute: {}",
+                 initialPollDelayMinute, pollFrequencyMinute);
+        if (changed) {
+            //stops the old scheduled task
+            this.clusterNodeDiagnosisFuture.cancel(true);
+            //schedules new task at the new polling rate
+            log.info("New Scheduler started with,Node Diagnosis properties:" +
+                             " initialPollDelayMinute: {}, pollFrequencyMinute: {}",
+                     initialPollDelayMinute, pollFrequencyMinute);
+            scheduleClusterNodeDiagnosisPolling();
+        }
+    }
+
+    private int getNewPollFrequency(Dictionary<?, ?> properties) {
+        int newPollFrequency;
+        try {
+            newPollFrequency = getIntegerProperty(properties, POLL_FREQUENCY_MINUTE);
+        } catch (NumberFormatException | ClassCastException e) {
+            newPollFrequency = DEFAULT_POLL_FREQUENCY_MINUTE;
+        }
+        return newPollFrequency;
+    }
+
+    private int getNewPollDelay(Dictionary<?, ?> properties) {
+        int newPollDelay;
+        try {
+            newPollDelay = getIntegerProperty(properties, INITIAL_POLL_DELAY_MINUTE);
+        } catch (NumberFormatException | ClassCastException e) {
+            newPollDelay = DEFAULT_INITIAL_POLL_DELAY_MINUTE;
+        }
+        return newPollDelay;
+    }
+
+    private List<Bundle> getAllBundles() {
+        return Arrays.asList(bundleContext.getBundles());
+    }
+
+    private void getNodeId() {
+        caService = DefaultServiceDirectory.getService(ClusterAdminService.class);
+        if (Objects.isNull(caService)) {
+            return;
+        }
+        List<ControllerNode> controllerNodes = newArrayList(caService.getNodes());
+        nodeIds = controllerNodes
+                .stream()
+                .map(ControllerNode::id)
+                .collect(Collectors.toSet());
+
+        localNodeId = caService.getLocalNode().id();
+    }
+
+    private void scheduleAppDiagnosisPolling() {
+        metricsExecutor = newSingleThreadScheduledExecutor(
+                groupedThreads("Nodediagnosis/diagnosisThread",
+                               "Nodediagnosis-executor-%d", log));
+        metricsExecutor.scheduleAtFixedRate(this::appDiagnosis,
+                                            60,
+                                            30, TimeUnit.SECONDS);
+    }
+
+    private void scheduleClusterNodeDiagnosisPolling() {
+        clusterNodeDiagnosisFuture = metricsExecutor.scheduleAtFixedRate(this::clusterNodeDiagnosis,
+                                                                         initialPollDelayMinute,
+                                                                         pollFrequencyMinute, TimeUnit.MINUTES);
+    }
+
+    private void appDiagnosis() {
+        verifyBundles(null);
+        verifyApps();
+    }
+
+    private void verifyBundles(String bundleName) {
+
+        if (Objects.isNull(bundleContext)) {
+            return;
+        }
+        try {
+            FrameworkWiring wiring = bundleContext.getBundle(Constants.SYSTEM_BUNDLE_LOCATION)
+                    .adapt(FrameworkWiring.class);
+            if (Objects.isNull(wiring)) {
+                return;
+            }
+
+            boolean result;
+            List<Bundle> bundleList;
+            if (Objects.nonNull(bundleName)) {
+                log.info("bundle to be resolved and refreshed: {}", bundleName);
+                bundleList = this.getAllBundles().stream()
+                        .filter(bundle -> bundleService.getInfo(bundle).getName().equals(bundleName))
+                        .collect(Collectors.toList());
+            } else {
+                bundleList = this.getAllBundles().stream()
+                        .filter(bundle -> bundleService.getDiag(bundle).split("[\n|\r]").length > 1)
+                        .collect(Collectors.toList());
+            }
+            /**
+             * Example diags :
+             *  BundleName:onos-providers-openflow-flow,
+             *  Diag:Declarative Services
+             * ,number of lines of diag:1
+             * BundleName:onos-apps-faultmanagement-fmgui,
+             *  Diag:Declarative Services
+             *  org.onosproject.faultmanagement.alarms.gui.AlarmTableComponent (136)
+             *   missing references: uiExtensionService
+             * org.onosproject.faultmanagement.alarms.gui.AlarmTopovComponent (137)
+             *   missing references: uiExtensionService
+             *    number of lines of diag:5
+             */
+            this.getAllBundles().forEach(
+                    bundle -> {
+                        log.debug("Bundle service - BundleName:{}, Diag:{}, number of lines of diag:{}",
+                                  bundleService.getInfo(bundle).getName(),
+                                  bundleService.getDiag(bundle),
+                                  bundleService.getDiag(bundle).split("[\n|\r]").length);
+                    });
+
+
+            CompletableFuture<Boolean> completableBundles = CompletableFuture.supplyAsync(() -> {
+                Boolean isResolved = wiring.resolveBundles(bundleList);
+
+                wiring.refreshBundles(bundleList);
+                return isResolved;
+            });
+            result = completableBundles.get();
+
+            if (Objects.nonNull(bundleName)) {
+                log.info("bundle {} is in resolved State ? {}", bundleName, result ? "Yes" : "No");
+            } else {
+                log.info("All the  bundles are in resolved State ? {}", result ? "Yes" : "No");
+            }
+        } catch (InterruptedException | ExecutionException e) {
+            log.error("exception occurred because of", e);
+        } catch (Exception e) {
+            log.error("Exception occured in Verifying Bundles", e);
+        }
+    }
+
+    private void verifyApps() {
+        log.debug("verifyApps() method invoked");
+        List<ComponentDescriptionDTO> nonActiveComponents = getNonActiveComponents();
+
+        nonActiveComponents.forEach(component -> {
+            try {
+                scrService.enableComponent(component).timeout(TIMEOUT);
+            } catch (Exception e) {
+                throw new IllegalStateException("Unable to start component " + component.name, e);
+            }
+        });
+    }
+
+    private List<ComponentDescriptionDTO> getNonActiveComponents() {
+        List<ComponentDescriptionDTO> nonActiveComponents = new ArrayList<>();
+        for (ComponentDescriptionDTO component : scrService.getComponentDescriptionDTOs()) {
+            if (scrService.isComponentEnabled(component)) {
+                for (ComponentConfigurationDTO config : scrService.getComponentConfigurationDTOs(component)) {
+                    if (config.state != ComponentConfigurationDTO.ACTIVE) {
+                        nonActiveComponents.add(component);
+                        break;
+                    }
+                }
+            }
+        }
+        return nonActiveComponents;
+    }
+
+    private void clusterNodeDiagnosis() {
+        if (Objects.isNull(caService)) {
+            return;
+        }
+
+        List<ControllerNode> nodes = newArrayList(caService.getNodes());
+        Set<NodeId> activeNodes = nodes
+                .stream()
+                .filter(node -> caService.getState(node.id()) == ControllerNode.State.ACTIVE)
+                .filter(node -> caService.getLastUpdatedInstant(node.id()).until(Instant.now(), ChronoUnit.MINUTES) > 4)
+                .map(ControllerNode::id)
+                .collect(Collectors.toSet());
+        boolean isNodesActive = nodes
+                .stream().filter(node -> !(caService.getState(node.id()) == ControllerNode.State.INACTIVE))
+                .allMatch(node -> caService.getState(node.id()) == ControllerNode.State.ACTIVE);
+        if (Objects.nonNull(activeNodes) && !activeNodes.isEmpty()) {
+            multicastReboot(isNodesActive, activeNodes);
+        }
+    }
+
+
+    public void restartNode() {
+        try {
+            systemService.reboot("now", SystemService.Swipe.CACHE);
+        } catch (Exception e) {
+            log.error("error occured because of {} ", e.getMessage());
+        }
+    }
+
+    private void multicastReboot(boolean removeDb, Set<NodeId> nodeIds) {
+        String data = "Reboot:" + removeDb;
+        communicationService.multicast(data, REBOOT_MSG, String::getBytes, nodeIds);
+    }
+
+    private class InternalSampleCollector implements ClusterMessageHandler {
+        @Override
+        public void handle(ClusterMessage message) {
+            String reqMsg = new String(message.payload());
+            log.info("Cluster communication message subject{} and message {}",
+                     message.subject(), reqMsg);
+            boolean flag = Boolean.parseBoolean(reqMsg.split(":")[1].trim());
+            if (flag) {
+                System.setProperty("apache.karaf.removedb", "true");
+            }
+            if (message.subject().equals(REBOOT_MSG)) {
+                restartNode();
+            }
+        }
+    }
+}
diff --git a/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/impl/OsgiPropertyConstants.java b/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/impl/OsgiPropertyConstants.java
new file mode 100644
index 0000000..9fcdb5d
--- /dev/null
+++ b/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/impl/OsgiPropertyConstants.java
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2020-present Open Networking Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.onosproject.diagnosis.impl;
+
+public final class OsgiPropertyConstants {
+    private OsgiPropertyConstants() {
+    }
+
+    static final String INITIAL_POLL_DELAY_MINUTE = "initialPollDelayMinute";
+    static final int DEFAULT_INITIAL_POLL_DELAY_MINUTE = 5;
+
+    static final String POLL_FREQUENCY_MINUTE = "pollFrequencyMinute";
+    static final int DEFAULT_POLL_FREQUENCY_MINUTE = 1;
+
+}
diff --git a/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/impl/package-info.java b/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/impl/package-info.java
new file mode 100644
index 0000000..bd2d61b
--- /dev/null
+++ b/apps/node-diagnosis/src/main/java/org/onosproject/diagnosis/impl/package-info.java
@@ -0,0 +1,19 @@
+/*
+ * Copyright 2020-present Open Networking Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * dcsdndiag implementation.
+ */
+package org.onosproject.diagnosis.impl;
\ No newline at end of file