/*
 * Copyright 2020-present Open Networking Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onosproject.diagnosis.impl;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.osgi.service.component.annotations.Activate;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.Modified;
import org.osgi.service.component.annotations.Deactivate;
import org.osgi.service.component.runtime.ServiceComponentRuntime;
import org.osgi.service.component.annotations.Reference;
import org.osgi.service.component.annotations.ReferenceCardinality;
import org.osgi.service.component.runtime.dto.ComponentConfigurationDTO;
import org.osgi.service.component.runtime.dto.ComponentDescriptionDTO;
import org.osgi.service.component.ComponentContext;
import org.apache.karaf.bundle.core.BundleService;
import org.apache.karaf.system.SystemService;
import org.onlab.osgi.DefaultServiceDirectory;
import org.onosproject.cfg.ComponentConfigService;
import org.onosproject.cluster.ClusterAdminService;
import org.onosproject.cluster.ControllerNode;
import org.onosproject.cluster.NodeId;
import org.onosproject.store.cluster.messaging.ClusterCommunicationService;
import org.onosproject.store.cluster.messaging.ClusterMessage;
import org.onosproject.store.cluster.messaging.ClusterMessageHandler;
import org.onosproject.store.cluster.messaging.MessageSubject;
import org.osgi.framework.Bundle;
import org.osgi.framework.BundleContext;
import org.osgi.framework.FrameworkUtil;
import org.osgi.framework.Constants;
import org.osgi.framework.wiring.FrameworkWiring;
import org.slf4j.Logger;

import java.util.Dictionary;
import java.io.File;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.Collections;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.hyperic.sigar.Sigar;
import org.hyperic.sigar.ProcFd;
import java.util.concurrent.ExecutorService;

import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor;
import static org.onlab.util.SharedExecutors.getPoolThreadExecutor;
import static org.onlab.util.Tools.groupedThreads;
import static org.slf4j.LoggerFactory.getLogger;

import static org.onlab.util.Tools.getIntegerProperty;
import static org.onlab.util.Tools.isPropertyEnabled;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.INITIAL_POLL_DELAY_MINUTE;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.POLL_FREQUENCY_MINUTE;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_INITIAL_POLL_DELAY_MINUTE;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_POLL_FREQUENCY_MINUTE;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.REBOOT_RETRY_COUNT;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_REBOOT_RETRY_COUNT;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.INITIAL_CLUSTER_TIMEOUT_PERIOD;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_CLUSTER_TIMEOUT_PERIOD;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.INITIAL_DIAGNOSIS_ACTION;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_DIAGNOSIS_ACTION;

@Component(immediate = true,
        property = {
                INITIAL_POLL_DELAY_MINUTE + ":Integer=" + DEFAULT_INITIAL_POLL_DELAY_MINUTE,
                POLL_FREQUENCY_MINUTE + ":Integer=" + DEFAULT_POLL_FREQUENCY_MINUTE,
                REBOOT_RETRY_COUNT + ":Integer=" + DEFAULT_REBOOT_RETRY_COUNT,
                INITIAL_CLUSTER_TIMEOUT_PERIOD + ":Integer=" + DEFAULT_CLUSTER_TIMEOUT_PERIOD,
                INITIAL_DIAGNOSIS_ACTION + ":Boolean=" + DEFAULT_DIAGNOSIS_ACTION,
        })
public class NodeDiagnosisManager {

    private static final MessageSubject REBOOT_MSG = new MessageSubject("Node-diagnosis");

    private static int initialPollDelayMinute = DEFAULT_INITIAL_POLL_DELAY_MINUTE;
    private static int pollFrequencyMinute = DEFAULT_POLL_FREQUENCY_MINUTE;
    private static final File CFG_FILE = new File("../config/diag-info.json");
    private static final String REBOOT_NU = "rebootNu";
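    // Sketch of the expected diag-info.json contents, as written by updateDiagFile
    // below, assuming REBOOT_RETRY_COUNT resolves to "rebootRetryCount"
    // (values are examples only):
    // {
    //   "rebootRetryCount" : 10,
    //   "rebootNu" : 1
    // }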
    private static int initialClusterTimeoutPeriod = DEFAULT_CLUSTER_TIMEOUT_PERIOD;
    private static boolean initialDiagnosisAction = true;
    private static int rebootRetryCount = DEFAULT_REBOOT_RETRY_COUNT;
    private int rebootNu;

    private final Logger log = getLogger(getClass());
    private ScheduledExecutorService metricsExecutor;

    @Reference(cardinality = ReferenceCardinality.MANDATORY)
    private ServiceComponentRuntime scrService;

    @Reference(cardinality = ReferenceCardinality.MANDATORY)
    private BundleService bundleService;

    @Reference(cardinality = ReferenceCardinality.MANDATORY)
    private SystemService systemService;

    @Reference(cardinality = ReferenceCardinality.MANDATORY)
    private ClusterCommunicationService communicationService;

    @Reference(cardinality = ReferenceCardinality.MANDATORY)
    protected ComponentConfigService cfgService;

    private ScheduledFuture<?> clusterNodeDiagnosisFuture;

    private BundleContext bundleContext;
    private ClusterAdminService caService;
    private NodeId localNodeId;
    private Sigar sigar;
    private static Process getTcpProc;
    private static Process getUdpProc;
    private static final String[] CMD_FOR_NETSTAT_PID = {"/bin/sh", "-c",
            "ps -ef | grep netstat | grep -v grep | cut -c10-15 | tr -d \' \'"};
    private static final String[] CMD_FOR_PID = {"/bin/sh", "-c",
            "ps -ef | grep org.apache.karaf.main.Main | grep -v grep | cut -c10-15 | tr -d \' \'"};
    private static final String[] CMD_FOR_TOTAL_MEMORY = {"/bin/sh", "-c",
            "free -b | cut -d \' \' -f 5"};
    private static long memoryThreshold;
    private static final int FD_THRESHOLD = 50000;
    private static final int SOCKETS_THRESHOLD = 50000;
    private static final int DATA_BLOCK_1024 = 1024;
    private static final int MEMORY_START_IDX = 3;
    private static final String EXTRA_SPACE = "\\s+";
    private static long pid;

    private static final long TIMEOUT = 3000;

    @Activate
    public void activate() {
        cfgService.registerProperties(getClass());
        bundleContext = FrameworkUtil.getBundle(this.getClass()).getBundleContext();
        getNodeId();
        sigar = new Sigar();
        scheduleAppDiagnosisPolling();
        scheduleClusterNodeDiagnosisPolling();
        getPid();
        getMemoryThreshold();
        scheduleSdncMemoryDiagnosisPolling();
        scheduleSdncFileDescriptorDiagnosisPolling();
        communicationService.addSubscriber(REBOOT_MSG, new InternalSampleCollector(),
                                           getPoolThreadExecutor());
        rebootNu = fetchRebootNu(); // the reboot count is persisted to file to restrict the number of reboots
        rebootRetryCount = fetchRetryRebootCount(); // maximum number of reboot retries
        log.info("Started");
    }

    @Deactivate
    public void deactivate() {
        cfgService.unregisterProperties(getClass(), false);
        communicationService.removeSubscriber(REBOOT_MSG);
        metricsExecutor.shutdownNow();
        clusterNodeDiagnosisFuture.cancel(true);
        log.info("Stopped");
    }

    @Modified
    public void modified(ComponentContext context) {
        readComponentConfiguration(context);
        log.info("modified");
    }

    /**
     * Extracts properties from the component configuration context.
     *
     * @param context the component context
     */
    private void readComponentConfiguration(ComponentContext context) {

        Dictionary<?, ?> properties = context.getProperties();
        boolean changed = false;

        int newPollFrequency = getNewPollFrequency(properties);
        if (newPollFrequency != pollFrequencyMinute) {
            pollFrequencyMinute = newPollFrequency;
            changed = true;
        }

        int newPollDelay = getNewPollDelay(properties);
        if (newPollDelay != initialPollDelayMinute) {
            initialPollDelayMinute = newPollDelay;
            changed = true;
        }
        log.info("Node Diagnosis properties are:" +
                         " initialPollDelayMinute: {}, pollFrequencyMinute: {}",
                 initialPollDelayMinute, pollFrequencyMinute);
        if (changed) {
            // stop the old scheduled task
            this.clusterNodeDiagnosisFuture.cancel(true);
            // schedule a new task at the new polling rate
            log.info("New scheduler started with Node Diagnosis properties:" +
                             " initialPollDelayMinute: {}, pollFrequencyMinute: {}",
                     initialPollDelayMinute, pollFrequencyMinute);
            scheduleClusterNodeDiagnosisPolling();
        }

        int newRebootRetryCount = getNewRebootRetryCount(properties);
        updateDiagFile(rebootNu, newRebootRetryCount);
        initialDiagnosisAction = getNewDiagnosisAction(properties);
    }

    private int getNewPollFrequency(Dictionary<?, ?> properties) {
        int newPollFrequency;
        try {
            newPollFrequency = getIntegerProperty(properties, POLL_FREQUENCY_MINUTE);
        } catch (NumberFormatException | ClassCastException e) {
            newPollFrequency = DEFAULT_POLL_FREQUENCY_MINUTE;
        }
        return newPollFrequency;
    }

    private int getNewPollDelay(Dictionary<?, ?> properties) {
        int newPollDelay;
        try {
            newPollDelay = getIntegerProperty(properties, INITIAL_POLL_DELAY_MINUTE);
        } catch (NumberFormatException | ClassCastException e) {
            newPollDelay = DEFAULT_INITIAL_POLL_DELAY_MINUTE;
        }
        return newPollDelay;
    }

    private int getNewRebootRetryCount(Dictionary<?, ?> properties) {
        int newRebootRetryCount;
        try {
            newRebootRetryCount = getIntegerProperty(properties, REBOOT_RETRY_COUNT);
        } catch (NumberFormatException | ClassCastException e) {
            newRebootRetryCount = DEFAULT_REBOOT_RETRY_COUNT;
        }
        return newRebootRetryCount;
    }

    private boolean getNewDiagnosisAction(Dictionary<?, ?> properties) {
        boolean newDiagnosisAction;
        try {
            newDiagnosisAction = isPropertyEnabled(properties, INITIAL_DIAGNOSIS_ACTION);
        } catch (NumberFormatException | ClassCastException e) {
            newDiagnosisAction = DEFAULT_DIAGNOSIS_ACTION;
        }
        return newDiagnosisAction;
    }

    private List<Bundle> getAllBundles() {
        return Arrays.asList(bundleContext.getBundles());
    }

    private void getNodeId() {
        caService = DefaultServiceDirectory.getService(ClusterAdminService.class);
        if (Objects.isNull(caService)) {
            return;
        }

        localNodeId = caService.getLocalNode().id();
    }

    private void scheduleAppDiagnosisPolling() {
        metricsExecutor = newSingleThreadScheduledExecutor(
                groupedThreads("Nodediagnosis/diagnosisThread",
                               "Nodediagnosis-executor-%d", log));
        metricsExecutor.scheduleAtFixedRate(this::appDiagnosis,
                                            60,
                                            30, TimeUnit.SECONDS);
    }

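    /**
     * Schedules cluster-node diagnosis polling. The returned future is kept in
     * clusterNodeDiagnosisFuture so that modified() can cancel it and reschedule
     * at a new rate when the polling properties change.
     */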
    private void scheduleClusterNodeDiagnosisPolling() {
        clusterNodeDiagnosisFuture = metricsExecutor.scheduleAtFixedRate(this::clusterNodeDiagnosis,
                                                                         initialPollDelayMinute,
                                                                         pollFrequencyMinute, TimeUnit.MINUTES);
    }

    private void scheduleSdncMemoryDiagnosisPolling() {
        metricsExecutor.scheduleAtFixedRate(this::sdncMemoryDiagnosis,
                                            60,
                                            30, TimeUnit.SECONDS);
    }

    private void scheduleSdncFileDescriptorDiagnosisPolling() {
        metricsExecutor.scheduleAtFixedRate(this::sdncFileDescriptorDiagnosis,
                                            60,
                                            30, TimeUnit.SECONDS);
    }

    private void appDiagnosis() {
        verifyBundles(null);
        verifyApps();
    }

    private void verifyBundles(String bundleName) {

        if (Objects.isNull(bundleContext)) {
            return;
        }
        try {
            FrameworkWiring wiring = bundleContext.getBundle(Constants.SYSTEM_BUNDLE_LOCATION)
                    .adapt(FrameworkWiring.class);
            if (Objects.isNull(wiring)) {
                return;
            }

            boolean result;
            List<Bundle> bundleList;
            if (Objects.nonNull(bundleName)) {
                log.info("bundle to be resolved and refreshed: {}", bundleName);
                bundleList = this.getAllBundles().stream()
                        .filter(bundle -> bundleService.getInfo(bundle).getName().equals(bundleName))
                        .collect(Collectors.toList());
            } else {
                bundleList = this.getAllBundles().stream()
                        .filter(bundle -> bundleService.getDiag(bundle).split("[\n|\r]").length > 1)
                        .collect(Collectors.toList());
            }
            /*
             * Example diags:
             * BundleName: onos-providers-openflow-flow,
             * Diag: Declarative Services
             * , number of lines of diag: 1
             * BundleName: onos-apps-faultmanagement-fmgui,
             * Diag: Declarative Services
             * org.onosproject.faultmanagement.alarms.gui.AlarmTableComponent (136)
             *   missing references: uiExtensionService
             * org.onosproject.faultmanagement.alarms.gui.AlarmTopovComponent (137)
             *   missing references: uiExtensionService
             * number of lines of diag: 5
             */
            this.getAllBundles().forEach(
                    bundle -> {
                        log.debug("Bundle service - BundleName:{}, Diag:{}, number of lines of diag:{}",
                                  bundleService.getInfo(bundle).getName(),
                                  bundleService.getDiag(bundle),
                                  bundleService.getDiag(bundle).split("[\n|\r]").length);
                    });


            CompletableFuture<Boolean> completableBundles = CompletableFuture.supplyAsync(() -> {
                Boolean isResolved = wiring.resolveBundles(bundleList);

                wiring.refreshBundles(bundleList);
                return isResolved;
            });
            result = completableBundles.get();

            if (Objects.nonNull(bundleName)) {
                log.info("Is bundle {} in resolved state? {}", bundleName, result ? "Yes" : "No");
            } else {
                log.info("Are all bundles in resolved state? {}", result ? "Yes" : "No");
            }
        } catch (InterruptedException | ExecutionException e) {
            log.error("Exception occurred while resolving/refreshing bundles", e);
        } catch (Exception e) {
            log.error("Exception occurred while verifying bundles", e);
        }
    }

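    /**
     * Attempts to re-enable any components that are enabled in SCR but whose
     * configurations are not in the ACTIVE state.
     */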
    private void verifyApps() {
        log.debug("verifyApps() method invoked");
        List<ComponentDescriptionDTO> nonActiveComponents = getNonActiveComponents();

        nonActiveComponents.forEach(component -> {
            try {
                scrService.enableComponent(component).timeout(TIMEOUT);
            } catch (Exception e) {
                throw new IllegalStateException("Unable to start component " + component.name, e);
            }
        });
    }

    private List<ComponentDescriptionDTO> getNonActiveComponents() {
        List<ComponentDescriptionDTO> nonActiveComponents = new ArrayList<>();
        for (ComponentDescriptionDTO component : scrService.getComponentDescriptionDTOs()) {
            if (scrService.isComponentEnabled(component)) {
                for (ComponentConfigurationDTO config : scrService.getComponentConfigurationDTOs(component)) {
                    if (config.state != ComponentConfigurationDTO.ACTIVE) {
                        nonActiveComponents.add(component);
                        break;
                    }
                }
            }
        }
        return nonActiveComponents;
    }

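    /**
     * Reads the persisted reboot count from the diag-info.json file;
     * returns 0 if the file does not exist or has no rebootNu entry.
     */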
    private int fetchRebootNu() {
        int rebootNum = 0;
        if (!CFG_FILE.exists()) {
            log.debug("CFG file not found for reboot number");
            return rebootNum;
        }

        ObjectNode root;
        try {
            root = (ObjectNode) new ObjectMapper().readTree(CFG_FILE);
            if (Objects.nonNull(root.findValue(REBOOT_NU))) {
                rebootNum = root.findValue(REBOOT_NU).asInt();
            }
        } catch (IOException e) {
            log.error("fetchRebootNu: Exception occurred: {} for {}", e, CFG_FILE);
        }
        return rebootNum;
    }

    private int fetchRetryRebootCount() {
        int rebootCount = rebootRetryCount;
        if (!CFG_FILE.exists()) {
            log.debug("CFG file not found for reboot retry count");
            return rebootCount;
        }

        ObjectNode root;
        try {
            root = (ObjectNode) new ObjectMapper().readTree(CFG_FILE);
            if (Objects.nonNull(root.findValue(REBOOT_RETRY_COUNT))) {
                rebootCount = root.findValue(REBOOT_RETRY_COUNT).asInt();
            }
        } catch (IOException e) {
            log.error("fetchRetryRebootCount: Exception occurred: {} for {}", e, CFG_FILE);
        }
        return rebootCount;
    }

    private void resetRebootNu() {
        updateRebootNu(0);
    }

    private void updateRebootNu(int rebootnum) {
        updateDiagFile(rebootnum, rebootRetryCount);
    }

    private void updateDiagFile(int rebootnum, int defaultRebootcount) {
        ObjectMapper mapper = new ObjectMapper();
        Map<String, Integer> output = new HashMap<>();
        output.put(REBOOT_RETRY_COUNT, defaultRebootcount);
        output.put(REBOOT_NU, rebootnum);
        rebootNu = rebootnum;
        rebootRetryCount = defaultRebootcount;
        try {
            mapper.writerWithDefaultPrettyPrinter().writeValue(CFG_FILE, output);
        } catch (IOException e) {
            log.error("Failed to write diag info to {}", CFG_FILE, e);
        }
    }

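    /**
     * Checks the local cluster-node state. If the node has not been READY for
     * longer than the configured cluster timeout period, reboots the node
     * (at most rebootRetryCount times) when the diagnosis action is enabled.
     */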
    private void clusterNodeDiagnosis() {
        if (Objects.isNull(caService)) {
            return;
        }
        try {
            if (caService.getState(localNodeId).equals(ControllerNode.State.READY)) {
                if (rebootNu > 0) {
                    resetRebootNu();
                }
                return;
            }
            long lastUpdatedInstant = caService.getLastUpdatedInstant(localNodeId).until(Instant.now(),
                                                                                         ChronoUnit.MINUTES);
            if (lastUpdatedInstant <= initialClusterTimeoutPeriod) {
                return;
            }
            /*
             * If the diagnosis action is set to true, an onos reboot occurs when required.
             * If it is set to false, logs are left informing that an onos reboot is needed.
             */
            if (!initialDiagnosisAction) {
                log.info("onos halt is needed as cluster node status is: {} for timeout period: {}" +
                                 " for node {} with lastUpdatedInstant: {}," +
                                 " but onos is not rebooted as the diagnosis action is set to false",
                         caService.getState(localNodeId), initialClusterTimeoutPeriod, localNodeId,
                         lastUpdatedInstant);
                return;
            }
            log.info("onos halt is needed as cluster node status is: {} for timeout period: {}" +
                             " for node {} with lastUpdatedInstant: {}",
                     caService.getState(localNodeId), initialClusterTimeoutPeriod, localNodeId,
                     lastUpdatedInstant);
            if (rebootNu < rebootRetryCount) {
                updateRebootNu(rebootNu + 1);
                log.info("Halting. Number of halts: {}", rebootNu);
                multicastReboot(true, Collections.singleton(localNodeId));
            } else {
                log.info("Halting is ignored as the reboot retry limit has been reached");
            }
        } catch (Exception e) {
            log.error("Exception occurred in cluster node diagnosis", e);
        }
    }

    /**
     * Gets the memory threshold.
     * Obtains the total memory of the system where onos is running;
     * 80% of the total memory is taken as the threshold.
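     * For example, on a host with 8 GB of total memory the threshold works out to about 6.4 GB.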
     */
    private void getMemoryThreshold() {
        String memStr = "";
        try {
            String outputMem;
            Process getMemProc = Runtime.getRuntime().exec(CMD_FOR_TOTAL_MEMORY);
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(getMemProc.getInputStream()))) {
                while ((outputMem = bufferedReader.readLine()) != null) {
                    memStr += outputMem;
                }
            }

            memStr = memStr.replaceAll("\n", "");
            long totalMem = Long.parseLong(memStr);
            // taking 80% of total memory as the threshold
            memoryThreshold = (long) (totalMem * 0.80);
            log.trace("memoryThreshold {}", memoryThreshold);
        } catch (Exception e) {
            log.error("Exception occurred while computing the memory threshold", e);
        }
    }

    /**
     * Gets the pid of the onos service.
     */
    private void getPid() {
        String pidStr = "";
        try {
            String outputPid;
            Process getPidProc = Runtime.getRuntime().exec(CMD_FOR_PID);
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(getPidProc.getInputStream()))) {
                while ((outputPid = bufferedReader.readLine()) != null) {
                    pidStr += outputPid;
                }
            }

            pid = Long.parseLong(pidStr);
            log.trace("pid {}", pid);
        } catch (Exception e) {
            log.error("Exception occurred while getting pid", e);
        }
    }

    /**
     * Restarts onos if SDNC memory usage exceeds the memory threshold.
     */
    private void sdncMemoryDiagnosis() {
        // pid and memoryThreshold are primitives; zero means they were never resolved
        if (pid == 0 || memoryThreshold == 0) {
            return;
        }
        try {
            String[] getMemCmd = {
                    "/bin/sh",
                    "-c",
                    " ps -p " + pid + " -o rss"
            };

            String outputMem;
            String outputMemFinal = "";
            Process getMemProc = Runtime.getRuntime().exec(getMemCmd);
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(getMemProc.getInputStream()))) {
                while ((outputMem = bufferedReader.readLine()) != null) {
                    outputMemFinal += outputMem;
                }
            }
            // Ex: outputMemFinal -> " RSS1031496"
            outputMemFinal = outputMemFinal.replaceAll(EXTRA_SPACE, "");
            String memTotalStr = outputMemFinal.substring(MEMORY_START_IDX);
            if (memTotalStr.isEmpty()) {
                log.error("Total memory is empty");
                return;
            }
            long memTotal = Long.parseLong(memTotalStr);
            // ps reports RSS in 1024-byte blocks; convert to bytes
            memTotal *= DATA_BLOCK_1024;
            log.trace("memTotal {}", memTotal);

            if (memTotal > memoryThreshold) {
                log.info("onos halt is needed as memory has exceeded. " +
                                 "The threshold is {} and used memory is {} for node {}.",
                         memoryThreshold, memTotal, localNodeId);
                multicastReboot(true, Collections.singleton(localNodeId));
            }
        } catch (Exception e) {
            log.error("Exception at SDNC memory diagnosis", e);
        }
    }

    /**
     * Obtains the number of TCP socket descriptors held by the onos process.
     */
    private static class CallableTcpexecute implements Callable<Long> {
        public Long call() throws Exception {
            String[] cmdTcpFd = {"/bin/sh", "-c",
                    "netstat -anp 2>/dev/null | grep " + pid + "/java | grep tcp | wc -l"};
            getTcpProc = Runtime.getRuntime().exec(cmdTcpFd);
            if (Objects.isNull(getTcpProc)) {
                return 0L;
            }
            String outputTcp;
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(getTcpProc.getInputStream()))) {
                outputTcp = bufferedReader.readLine();
            }
            if (Objects.isNull(outputTcp)) {
                return 0L;
            }
            return Long.parseLong(outputTcp);
        }
    }

    /**
     * Obtains the number of UDP socket descriptors held by the onos process.
     */
    private static class CallableUdpexecute implements Callable<Long> {
        public Long call() throws Exception {
            String[] cmdUdpFd = {"/bin/sh", "-c",
                    "netstat -anp 2>/dev/null | grep " + pid + "/java | grep udp | wc -l"};
            getUdpProc = Runtime.getRuntime().exec(cmdUdpFd);
            if (Objects.isNull(getUdpProc)) {
                return 0L;
            }
            String outputUdp;
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(getUdpProc.getInputStream()))) {
                outputUdp = bufferedReader.readLine();
            }
            if (Objects.isNull(outputUdp)) {
                return 0L;
            }
            return Long.parseLong(outputUdp);
        }
    }

    /**
     * Restarts onos if the total number of socket descriptors exceeds the threshold.
     */
    private void socketDescriptorsDiagnosis() {
        ExecutorService executorService = Executors.newCachedThreadPool();
        Future<Long> futureTcp;
        Future<Long> futureUdp;
        futureTcp = executorService.submit(new CallableTcpexecute());
        futureUdp = executorService.submit(new CallableUdpexecute());
        try {
            long tcpSds = futureTcp.get(5, TimeUnit.SECONDS);
            long udpSds = futureUdp.get(5, TimeUnit.SECONDS);

            long totalSockets = tcpSds + udpSds;
            log.trace("total {}, tcp {}, udp {}", totalSockets, tcpSds, udpSds);
            if (totalSockets > SOCKETS_THRESHOLD) {
                log.info("onos halt is needed as socket descriptors have exceeded the " +
                                 "threshold limit for node {}", localNodeId);
                multicastReboot(true, Collections.singleton(localNodeId));
            }
        } catch (TimeoutException e) {
            log.error("Timeout exception at socket descriptors diagnosis", e);
            try {
                if (Objects.nonNull(getTcpProc)) {
                    getTcpProc.destroy();
                }
                if (Objects.nonNull(getUdpProc)) {
                    getUdpProc.destroy();
                }
            } catch (Exception ex) {
                log.error("Exception at destroying tcp/udp process", ex);
            }

            String outputPid;
            try {
                String pidStr = "";
                Process getPidProc = Runtime.getRuntime().exec(CMD_FOR_NETSTAT_PID);
                try (BufferedReader bufferedReader = new BufferedReader(
                        new InputStreamReader(getPidProc.getInputStream()))) {
                    while ((outputPid = bufferedReader.readLine()) != null) {
                        pidStr += outputPid;
                    }
                }
                if (!pidStr.equals("")) {
                    Runtime.getRuntime().exec("kill " + pidStr);
                }
            } catch (Exception ex) {
                log.error("Exception at killing netstat command", ex);
            }

            log.info("onos halt is needed as timeout occurred while finding the total number of " +
                             "socket descriptors for node {}", localNodeId);
            multicastReboot(true, Collections.singleton(localNodeId));
        } catch (Exception e) {
            log.error("Exception at socket descriptors diagnosis", e);
        } finally {
            futureTcp.cancel(true);
            futureUdp.cancel(true);
            // shut down the ad-hoc pool so its threads do not leak between runs
            executorService.shutdownNow();
        }
    }

    /**
     * Restarts onos if the total number of threads and file descriptors exceeds the threshold.
     */
    private void threadsAndFilesDescriptorDiagnosis() {
        // pid is a primitive; zero means it was never resolved
        if (pid == 0) {
            return;
        }
        try {
            ProcFd procFd = sigar.getProcFd(pid);
            long totalFd = procFd.getTotal();
            log.trace("total fds {}", totalFd);
            if (totalFd > FD_THRESHOLD) {
                log.info("onos halt is needed as the number of threads and file descriptors " +
                                 "has exceeded the threshold limit for node {}", localNodeId);
                multicastReboot(true, Collections.singleton(localNodeId));
            }
        } catch (Exception e) {
            log.error("Exception at SDNC file descriptor diagnosis", e);
        }
    }


    private void sdncFileDescriptorDiagnosis() {
        socketDescriptorsDiagnosis();
        threadsAndFilesDescriptorDiagnosis();
    }

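    /**
     * Reboots this node immediately through the Karaf system service,
     * wiping the Karaf cache.
     */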
    public void restartNode() {
        try {
            systemService.reboot("now", SystemService.Swipe.CACHE);
        } catch (Exception e) {
            log.error("Error occurred while rebooting: {} ", e.getMessage());
        }
    }

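    /**
     * Sends a reboot request to the given nodes over the cluster communication service.
     *
     * @param removeDb whether the Karaf database should be removed on reboot
     * @param nodeIds  the nodes to reboot
     */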
    private void multicastReboot(boolean removeDb, Set<NodeId> nodeIds) {
        String data = "Reboot:" + removeDb;
        communicationService.multicast(data, REBOOT_MSG, String::getBytes, nodeIds);
    }

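    /**
     * Handles incoming cluster messages; on a reboot request it sets the
     * apache.karaf.removedb flag when asked and then restarts the node.
     */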
    private class InternalSampleCollector implements ClusterMessageHandler {
        @Override
        public void handle(ClusterMessage message) {
            String reqMsg = new String(message.payload());
            log.info("Cluster communication message subject {} and message {}",
                     message.subject(), reqMsg);
            boolean flag = Boolean.parseBoolean(reqMsg.split(":")[1].trim());
            if (flag) {
                System.setProperty("apache.karaf.removedb", "true");
            }
            if (message.subject().equals(REBOOT_MSG)) {
                restartNode();
            }
        }
    }
}