/*
 * Copyright 2020-present Open Networking Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.onosproject.diagnosis.impl;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import org.osgi.service.component.annotations.Activate;
import org.osgi.service.component.annotations.Component;
import org.osgi.service.component.annotations.Modified;
import org.osgi.service.component.annotations.Deactivate;
import org.osgi.service.component.runtime.ServiceComponentRuntime;
import org.osgi.service.component.annotations.Reference;
import org.osgi.service.component.annotations.ReferenceCardinality;
import org.osgi.service.component.runtime.dto.ComponentConfigurationDTO;
import org.osgi.service.component.runtime.dto.ComponentDescriptionDTO;
import org.osgi.service.component.ComponentContext;
import org.apache.karaf.bundle.core.BundleService;
import org.apache.karaf.system.SystemService;
import org.onlab.osgi.DefaultServiceDirectory;
import org.onosproject.cfg.ComponentConfigService;
import org.onosproject.cluster.ClusterAdminService;
import org.onosproject.cluster.ControllerNode;
import org.onosproject.cluster.NodeId;
import org.onosproject.store.cluster.messaging.ClusterCommunicationService;
import org.onosproject.store.cluster.messaging.ClusterMessage;
import org.onosproject.store.cluster.messaging.ClusterMessageHandler;
import org.onosproject.store.cluster.messaging.MessageSubject;
import org.osgi.framework.Bundle;
import org.osgi.framework.BundleContext;
import org.osgi.framework.FrameworkUtil;
import org.osgi.framework.Constants;
import org.osgi.framework.wiring.FrameworkWiring;
import org.slf4j.Logger;

import java.util.Dictionary;
import java.io.File;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.Collections;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import org.hyperic.sigar.Sigar;
import org.hyperic.sigar.ProcFd;
import java.util.concurrent.ExecutorService;

import static java.util.concurrent.Executors.newSingleThreadScheduledExecutor;
import static org.onlab.util.SharedExecutors.getPoolThreadExecutor;
import static org.onlab.util.Tools.groupedThreads;
import static org.slf4j.LoggerFactory.getLogger;

import static org.onlab.util.Tools.getIntegerProperty;
import static org.onlab.util.Tools.isPropertyEnabled;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.INITIAL_POLL_DELAY_MINUTE;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.POLL_FREQUENCY_MINUTE;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_INITIAL_POLL_DELAY_MINUTE;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_POLL_FREQUENCY_MINUTE;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.REBOOT_RETRY_COUNT;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_REBOOT_RETRY_COUNT;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.INITIAL_CLUSTER_TIMEOUT_PERIOD;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_CLUSTER_TIMEOUT_PERIOD;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.INITIAL_DIAGNOSIS_ACTION;
import static org.onosproject.diagnosis.impl.OsgiPropertyConstants.DEFAULT_DIAGNOSIS_ACTION;

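/**
 * Self-diagnosis component for an ONOS node.
 * <p>
 * Periodically verifies OSGi bundle and component health, the local
 * cluster-node state, the memory footprint of the ONOS (Karaf) process and
 * its socket/file-descriptor usage, and triggers a node reboot (throttled by
 * a persisted retry count) when a check fails or a threshold is exceeded.
 */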
@Component(immediate = true,
        property = {
                INITIAL_POLL_DELAY_MINUTE + ":Integer=" + DEFAULT_INITIAL_POLL_DELAY_MINUTE,
                POLL_FREQUENCY_MINUTE + ":Integer=" + DEFAULT_POLL_FREQUENCY_MINUTE,
                REBOOT_RETRY_COUNT + ":Integer=" + DEFAULT_REBOOT_RETRY_COUNT,
                INITIAL_CLUSTER_TIMEOUT_PERIOD + ":Integer=" + DEFAULT_CLUSTER_TIMEOUT_PERIOD,
                INITIAL_DIAGNOSIS_ACTION + ":Boolean=" + DEFAULT_DIAGNOSIS_ACTION,
        })
public class NodeDiagnosisManager {

    private static final MessageSubject REBOOT_MSG = new MessageSubject("Node-diagnosis");

    private static int initialPollDelayMinute = DEFAULT_INITIAL_POLL_DELAY_MINUTE;
    private static int pollFrequencyMinute = DEFAULT_POLL_FREQUENCY_MINUTE;
    private static final File CFG_FILE = new File("../config/diag-info.json");
    private static final String REBOOT_NU = "rebootNu";
    private static int initialClusterTimeoutPeriod = DEFAULT_CLUSTER_TIMEOUT_PERIOD;
    private static boolean initialDiagnosisAction = true;
    private static int rebootRetryCount = DEFAULT_REBOOT_RETRY_COUNT;
    private int rebootNu;

    private final Logger log = getLogger(getClass());
    private ScheduledExecutorService metricsExecutor;

    @Reference(cardinality = ReferenceCardinality.MANDATORY)
    private ServiceComponentRuntime scrService;

    @Reference(cardinality = ReferenceCardinality.MANDATORY)
    private BundleService bundleService;

    @Reference(cardinality = ReferenceCardinality.MANDATORY)
    private SystemService systemService;

    @Reference(cardinality = ReferenceCardinality.MANDATORY)
    private ClusterCommunicationService communicationService;

    @Reference(cardinality = ReferenceCardinality.MANDATORY)
    protected ComponentConfigService cfgService;

    private ScheduledFuture<?> clusterNodeDiagnosisFuture;

    private BundleContext bundleContext;
    private ClusterAdminService caService;
    private NodeId localNodeId;
    private Sigar sigar;
    private static Process getTcpProc;
    private static Process getUdpProc;
    private static final String[] CMD_FOR_NETSTAT_PID = {"/bin/sh", "-c",
            "ps -ef | grep netstat | grep -v grep | cut -c10-15 | tr -d \' \'"};
    private static final String[] CMD_FOR_PID = {"/bin/sh", "-c",
            "ps -ef | grep org.apache.karaf.main.Main | grep -v grep | cut -c10-15 | tr -d \' \'"};
    private static final String[] CMD_FOR_TOTAL_MEMORY = {"/bin/sh", "-c",
            "free -b | cut -d \' \' -f 5"};
    private static long memoryThreshold;
    private static final int FD_THRESHOLD = 50000;
    private static final int SOCKETS_THRESHOLD = 50000;
    private static final int DATA_BLOCK_1024 = 1024;
    private static final int MEMORY_START_IDX = 3;
    private static final String EXTRA_SPACE = "\\s+";
    private static long pid;

    private static final long TIMEOUT = 3000;

    @Activate
    public void activate() {
        cfgService.registerProperties(getClass());
        bundleContext = FrameworkUtil.getBundle(this.getClass()).getBundleContext();
        getNodeId();
        sigar = new Sigar();
        scheduleAppDiagnosisPolling();
        scheduleClusterNodeDiagnosisPolling();
        getPid();
        getMemoryThreshold();
        scheduleSdncMemoryDiagnosisPolling();
        scheduleSdncFileDescriptorDiagnosisPolling();
        communicationService.addSubscriber(REBOOT_MSG, new InternalSampleCollector(),
                                           getPoolThreadExecutor());
        rebootNu = fetchRebootNu(); // the reboot count is persisted in a file so the number of reboots can be limited
        rebootRetryCount = fetchRetryRebootCount(); // maximum number of reboot retries
        log.info("Started");
    }

    @Deactivate
    public void deactivate() {
        cfgService.unregisterProperties(getClass(), false);
        communicationService.removeSubscriber(REBOOT_MSG);
        metricsExecutor.shutdownNow();
        clusterNodeDiagnosisFuture.cancel(true);
        log.info("Stopped");
    }

    @Modified
    public void modified(ComponentContext context) {
        readComponentConfiguration(context);
        log.info("modified");
    }

    /**
     * Extracts properties from the component configuration context.
     *
     * @param context the component context
     */
    private void readComponentConfiguration(ComponentContext context) {

        Dictionary<?, ?> properties = context.getProperties();
        boolean changed = false;

        int newPollFrequency = getNewPollFrequency(properties);
        if (newPollFrequency != pollFrequencyMinute) {
            pollFrequencyMinute = newPollFrequency;
            changed = true;
        }

        int newPollDelay = getNewPollDelay(properties);
        if (newPollDelay != initialPollDelayMinute) {
            initialPollDelayMinute = newPollDelay;
            changed = true;
        }
        log.info("Node Diagnosis properties are:" +
                         " initialPollDelayMinute: {}, pollFrequencyMinute: {}",
                 initialPollDelayMinute, pollFrequencyMinute);
        if (changed) {
            //stops the old scheduled task
            this.clusterNodeDiagnosisFuture.cancel(true);
            //schedules a new task at the new polling rate
            log.info("New scheduler started with Node Diagnosis properties:" +
                             " initialPollDelayMinute: {}, pollFrequencyMinute: {}",
                     initialPollDelayMinute, pollFrequencyMinute);
            scheduleClusterNodeDiagnosisPolling();
        }

        int newRebootRetryCount = getNewRebootRetryCount(properties);
        updateDiagFile(rebootNu, newRebootRetryCount);
        initialDiagnosisAction = getNewDiagnosisAction(properties);
    }

    private int getNewPollFrequency(Dictionary<?, ?> properties) {
        int newPollFrequency;
        try {
            newPollFrequency = getIntegerProperty(properties, POLL_FREQUENCY_MINUTE);
        } catch (NumberFormatException | ClassCastException e) {
            newPollFrequency = DEFAULT_POLL_FREQUENCY_MINUTE;
        }
        return newPollFrequency;
    }

    private int getNewPollDelay(Dictionary<?, ?> properties) {
        int newPollDelay;
        try {
            newPollDelay = getIntegerProperty(properties, INITIAL_POLL_DELAY_MINUTE);
        } catch (NumberFormatException | ClassCastException e) {
            newPollDelay = DEFAULT_INITIAL_POLL_DELAY_MINUTE;
        }
        return newPollDelay;
    }

    private int getNewRebootRetryCount(Dictionary<?, ?> properties) {
        int newRebootRetryCount;
        try {
            newRebootRetryCount = getIntegerProperty(properties, REBOOT_RETRY_COUNT);
        } catch (NumberFormatException | ClassCastException e) {
            newRebootRetryCount = DEFAULT_REBOOT_RETRY_COUNT;
        }
        return newRebootRetryCount;
    }

    private boolean getNewDiagnosisAction(Dictionary<?, ?> properties) {
        boolean newDiagnosisAction;
        try {
            newDiagnosisAction = isPropertyEnabled(properties, INITIAL_DIAGNOSIS_ACTION);
        } catch (NumberFormatException | ClassCastException e) {
            newDiagnosisAction = DEFAULT_DIAGNOSIS_ACTION;
        }
        return newDiagnosisAction;
    }

    private List<Bundle> getAllBundles() {
        return Arrays.asList(bundleContext.getBundles());
    }

    private void getNodeId() {
        caService = DefaultServiceDirectory.getService(ClusterAdminService.class);
        if (Objects.isNull(caService)) {
            return;
        }

        localNodeId = caService.getLocalNode().id();
    }

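    // Polling layout (derived from the schedule* methods below): app/bundle
    // diagnosis, SDNC memory diagnosis and descriptor diagnosis all run on the
    // shared single-threaded executor every 30 seconds after a 60-second initial
    // delay, while the cluster-node check runs every pollFrequencyMinute minutes
    // after an initial delay of initialPollDelayMinute minutes.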
    private void scheduleAppDiagnosisPolling() {
        metricsExecutor = newSingleThreadScheduledExecutor(
                groupedThreads("Nodediagnosis/diagnosisThread",
                               "Nodediagnosis-executor-%d", log));
        metricsExecutor.scheduleAtFixedRate(this::appDiagnosis,
                                            60,
                                            30, TimeUnit.SECONDS);
    }

    private void scheduleClusterNodeDiagnosisPolling() {
        clusterNodeDiagnosisFuture = metricsExecutor.scheduleAtFixedRate(this::clusterNodeDiagnosis,
                                                                         initialPollDelayMinute,
                                                                         pollFrequencyMinute, TimeUnit.MINUTES);
    }

    private void scheduleSdncMemoryDiagnosisPolling() {
        metricsExecutor.scheduleAtFixedRate(this::sdncMemoryDiagnosis,
                                            60,
                                            30, TimeUnit.SECONDS);

    }

    private void scheduleSdncFileDescriptorDiagnosisPolling() {
        metricsExecutor.scheduleAtFixedRate(this::sdncFileDescriptorDiagnosis,
                                            60,
                                            30, TimeUnit.SECONDS);
    }

    private void appDiagnosis() {
        verifyBundles(null);
        verifyApps();
    }

    private void verifyBundles(String bundleName) {

        if (Objects.isNull(bundleContext)) {
            return;
        }
        try {
            FrameworkWiring wiring = bundleContext.getBundle(Constants.SYSTEM_BUNDLE_LOCATION)
                    .adapt(FrameworkWiring.class);
            if (Objects.isNull(wiring)) {
                return;
            }

            boolean result;
            List<Bundle> bundleList;
            if (Objects.nonNull(bundleName)) {
                log.info("bundle to be resolved and refreshed: {}", bundleName);
                bundleList = this.getAllBundles().stream()
                        .filter(bundle -> bundleService.getInfo(bundle).getName().equals(bundleName))
                        .collect(Collectors.toList());
            } else {
                bundleList = this.getAllBundles().stream()
                        .filter(bundle -> bundleService.getDiag(bundle).split("[\n|\r]").length > 1)
                        .collect(Collectors.toList());
            }
            /**
             * Example diags:
             * BundleName:onos-providers-openflow-flow,
             * Diag:Declarative Services
             * ,number of lines of diag:1
             * BundleName:onos-apps-faultmanagement-fmgui,
             * Diag:Declarative Services
             * org.onosproject.faultmanagement.alarms.gui.AlarmTableComponent (136)
             * missing references: uiExtensionService
             * org.onosproject.faultmanagement.alarms.gui.AlarmTopovComponent (137)
             * missing references: uiExtensionService
             * number of lines of diag:5
             */
            this.getAllBundles().forEach(
                    bundle -> {
                        log.debug("Bundle service - BundleName:{}, Diag:{}, number of lines of diag:{}",
                                  bundleService.getInfo(bundle).getName(),
                                  bundleService.getDiag(bundle),
                                  bundleService.getDiag(bundle).split("[\n|\r]").length);
                    });

            CompletableFuture<Boolean> completableBundles = CompletableFuture.supplyAsync(() -> {
                Boolean isResolved = wiring.resolveBundles(bundleList);

                wiring.refreshBundles(bundleList);
                return isResolved;
            });
            result = completableBundles.get();

            if (Objects.nonNull(bundleName)) {
                log.info("bundle {} is in resolved state? {}", bundleName, result ? "Yes" : "No");
            } else {
                log.info("All the bundles are in resolved state? {}", result ? "Yes" : "No");
            }
        } catch (InterruptedException | ExecutionException e) {
            log.error("Exception occurred while resolving and refreshing bundles", e);
        } catch (Exception e) {
            log.error("Exception occurred in verifying bundles", e);
        }
    }

    private void verifyApps() {
        log.debug("verifyApps() method invoked");
        List<ComponentDescriptionDTO> nonActiveComponents = getNonActiveComponents();

        nonActiveComponents.forEach(component -> {
            try {
                scrService.enableComponent(component).timeout(TIMEOUT);
            } catch (Exception e) {
                throw new IllegalStateException("Unable to start component " + component.name, e);
            }
        });
    }

    private List<ComponentDescriptionDTO> getNonActiveComponents() {
        List<ComponentDescriptionDTO> nonActiveComponents = new ArrayList<>();
        for (ComponentDescriptionDTO component : scrService.getComponentDescriptionDTOs()) {
            if (scrService.isComponentEnabled(component)) {
                for (ComponentConfigurationDTO config : scrService.getComponentConfigurationDTOs(component)) {
                    if (config.state != ComponentConfigurationDTO.ACTIVE) {
                        nonActiveComponents.add(component);
                        break;
                    }
                }
            }
        }
        return nonActiveComponents;
    }

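    // Reboot throttling state is persisted in CFG_FILE so that it survives a
    // node restart. Illustrative contents only; the retry-count key is whatever
    // string OsgiPropertyConstants.REBOOT_RETRY_COUNT resolves to:
    //   {
    //     "rebootNu": 1,
    //     "rebootRetryCount": 10
    //   }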
    private int fetchRebootNu() {
        int rebootNum = 0;
        if (!CFG_FILE.exists()) {
            log.debug("CFG file not found for reboot number");
            return rebootNum;
        }

        ObjectNode root;
        try {
            root = (ObjectNode) new ObjectMapper().readTree(CFG_FILE);
            if (Objects.nonNull(root.findValue(REBOOT_NU))) {
                rebootNum = root.findValue(REBOOT_NU).asInt();
            }
        } catch (IOException e) {
            log.error("fetchRebootNu: exception occurred while reading {}", CFG_FILE, e);
        }
        return rebootNum;
    }

    private int fetchRetryRebootCount() {
        int rebootCount = rebootRetryCount;
        if (!CFG_FILE.exists()) {
            log.debug("CFG file not found for reboot retry count");
            return rebootCount;
        }

        ObjectNode root;
        try {
            root = (ObjectNode) new ObjectMapper().readTree(CFG_FILE);
            if (Objects.nonNull(root.findValue(REBOOT_RETRY_COUNT))) {
                rebootCount = root.findValue(REBOOT_RETRY_COUNT).asInt();
            }
        } catch (IOException e) {
            log.error("fetchRetryRebootCount: exception occurred while reading {}", CFG_FILE, e);
        }
        return rebootCount;
    }

    private void resetRebootNu() {
        updateRebootNu(0);
    }

    private void updateRebootNu(int rebootnum) {
        updateDiagFile(rebootnum, rebootRetryCount);
    }

    private void updateDiagFile(int rebootnum, int defaultRebootcount) {
        ObjectMapper mapper = new ObjectMapper();
        Map<String, Integer> output = new HashMap<>();
        output.put(REBOOT_RETRY_COUNT, defaultRebootcount);
        output.put(REBOOT_NU, rebootnum);
        rebootNu = rebootnum;
        rebootRetryCount = defaultRebootcount;
        try {
            mapper.writerWithDefaultPrettyPrinter().writeValue(CFG_FILE, output);
        } catch (IOException e) {
            log.error("updateDiagFile: exception occurred while writing {}", CFG_FILE, e);
        }
    }

    private void clusterNodeDiagnosis() {
        if (Objects.isNull(caService)) {
            return;
        }
        try {
            if (caService.getState(localNodeId).equals(ControllerNode.State.READY)) {
                if (rebootNu > 0) {
                    resetRebootNu();
                }
                return;
            }
            long lastUpdatedInstant = caService.getLastUpdatedInstant(localNodeId).until(Instant.now(),
                                                                                         ChronoUnit.MINUTES);
            if (lastUpdatedInstant <= initialClusterTimeoutPeriod) {
                return;
            }
            /**
             * If the diagnosis action is set to true, onos is rebooted when required.
             * If it is set to false, only a log is emitted stating that an onos reboot is needed.
             */
            if (!initialDiagnosisAction) {
                log.info("onos halt is needed as the cluster node status is: {} beyond the timeout period: {}" +
                                 " for node {} with lastUpdatedInstant: {}," +
                                 " but onos is not rebooted as the diagnosis action is set to false",
                         caService.getState(localNodeId), initialClusterTimeoutPeriod, localNodeId,
                         lastUpdatedInstant);
                return;
            }
            log.info("onos halt is needed as the cluster node status is: {} beyond the timeout period: {}" +
                             " for node {} with lastUpdatedInstant: {}",
                     caService.getState(localNodeId), initialClusterTimeoutPeriod, localNodeId, lastUpdatedInstant);
            if (rebootNu < rebootRetryCount) {
                updateRebootNu(rebootNu + 1);
                log.info("Halting. Number of halts: {}", rebootNu);
                multicastReboot(true, Collections.singleton(localNodeId));
            } else {
                log.info("Halting is skipped as the number of halts has crossed the retry limit");
            }
        } catch (Exception e) {
            log.error("Exception occurred in cluster node diagnosis", e);
        }
    }

    /**
     * Gets the memory threshold.
     * Obtains the total memory of the system where onos is running;
     * 80% of the total memory is taken as the threshold.
     */
    private void getMemoryThreshold() {
        String memStr = "";
        try {
            String outputMem;
            Process getMemProc = Runtime.getRuntime().exec(CMD_FOR_TOTAL_MEMORY);
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(getMemProc.getInputStream()))) {
                while ((outputMem = bufferedReader.readLine()) != null) {
                    memStr += outputMem;
                }
            }

            memStr = memStr.replaceAll("\n", "");
            long totalMem = Long.parseLong(memStr);
            //Taking 80% of total memory as threshold
            memoryThreshold = (long) (totalMem * 0.80);
            log.trace("memoryThreshold {}", memoryThreshold);
        } catch (Exception e) {
            log.error("Exception occurred while getting the memory threshold", e);
        }
    }

    /**
     * Gets the pid of the onos service.
     */
    private void getPid() {
        String pidStr = "";
        try {
            String outputPid;
            Process getPidProc = Runtime.getRuntime().exec(CMD_FOR_PID);
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(getPidProc.getInputStream()))) {
                while ((outputPid = bufferedReader.readLine()) != null) {
                    pidStr += outputPid;
                }
            }

            pid = Long.parseLong(pidStr);
            log.trace("pid {}", pid);
        } catch (Exception e) {
            log.error("Exception occurred while getting the pid", e);
        }
    }

    /**
     * Restarts onos if the sdnc memory usage exceeds the memory threshold.
     */
    private void sdncMemoryDiagnosis() {
        // pid and memoryThreshold remain 0 when their lookups in activate() failed
        if (pid == 0 || memoryThreshold == 0) {
            return;
        }
        try {
            String[] getMemCmd = {
                    "/bin/sh",
                    "-c",
                    " ps -p " + pid + " -o rss"
            };

            String outputMem;
            String outputMemFinal = "";
            Process getMemProc = Runtime.getRuntime().exec(getMemCmd);
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(getMemProc.getInputStream()))) {
                while ((outputMem = bufferedReader.readLine()) != null) {
                    outputMemFinal += outputMem;
                }
            }
            //Ex: outputMemFinal -> " RSS1031496"
            outputMemFinal = outputMemFinal.replaceAll(EXTRA_SPACE, "");
            String memTotalStr = outputMemFinal.substring(MEMORY_START_IDX);
            if (memTotalStr.isEmpty()) {
                log.error("Total memory is empty");
                return;
            }
            long memTotal = Long.parseLong(memTotalStr);
            memTotal *= DATA_BLOCK_1024;
            log.trace("memTotal {}", memTotal);

            if (memTotal > memoryThreshold) {
                log.info("onos halt is needed as memory usage has exceeded the threshold. " +
                                 "The threshold is {} and the used memory is {} for node {}.",
                         memoryThreshold, memTotal, localNodeId);
                multicastReboot(true, Collections.singleton(localNodeId));
            }
        } catch (Exception e) {
            log.error("Exception in sdnc memory diagnosis", e);
        }

    }

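    // The two callables below count open sockets by matching netstat lines that
    // end with "<pid>/java". Illustrative matched line only (the exact layout
    // varies by netstat version):
    //   tcp  0  0 172.17.0.2:9876  172.17.0.3:48424  ESTABLISHED  1234/java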
    /**
     * Obtains the number of tcp socket descriptors.
     */
    private static class CallableTcpexecute implements Callable<Long> {
        public Long call() throws Exception {
            String[] cmdTcpFd = {"/bin/sh", "-c",
                    "netstat -anp 2>/dev/null | grep " + pid + "/java | grep tcp | wc -l"};
            getTcpProc = Runtime.getRuntime().exec(cmdTcpFd);
            if (Objects.isNull(getTcpProc)) {
                return 0L;
            }
            String outputTcp;
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(getTcpProc.getInputStream()))) {
                outputTcp = bufferedReader.readLine();
            }
            if (Objects.isNull(outputTcp)) {
                return 0L;
            }
            return Long.parseLong(outputTcp);
        }
    }

    /**
     * Obtains the number of udp socket descriptors.
     */
    private static class CallableUdpexecute implements Callable<Long> {
        public Long call() throws Exception {
            String[] cmdUdpFd = {"/bin/sh", "-c",
                    "netstat -anp 2>/dev/null | grep " + pid + "/java | grep udp | wc -l"};
            getUdpProc = Runtime.getRuntime().exec(cmdUdpFd);
            if (Objects.isNull(getUdpProc)) {
                return 0L;
            }
            String outputUdp;
            try (BufferedReader bufferedReader = new BufferedReader(
                    new InputStreamReader(getUdpProc.getInputStream()))) {
                outputUdp = bufferedReader.readLine();
            }
            if (Objects.isNull(outputUdp)) {
                return 0L;
            }
            return Long.parseLong(outputUdp);
        }
    }

    /**
     * Restarts onos if the total number of socket descriptors exceeds the threshold.
     */
    private void socketDescriptorsDiagnosis() {
        ExecutorService executorService = Executors.newCachedThreadPool();
        Future<Long> futureTcp;
        Future<Long> futureUdp;
        futureTcp = executorService.submit(new CallableTcpexecute());
        futureUdp = executorService.submit(new CallableUdpexecute());
        try {
            long tcpSds = futureTcp.get(5, TimeUnit.SECONDS);
            long udpSds = futureUdp.get(5, TimeUnit.SECONDS);

            long totalSockets = tcpSds + udpSds;
            log.trace("total {}, tcp {}, udp {}", totalSockets, tcpSds, udpSds);
            if (totalSockets > SOCKETS_THRESHOLD) {
                log.info("onos halt is needed as the number of socket descriptors has exceeded the " +
                                 "threshold limit for node {}", localNodeId);
                multicastReboot(true, Collections.singleton(localNodeId));
            }
        } catch (TimeoutException e) {
            log.error("Timeout exception in socket descriptors diagnosis", e);
            try {
                if (Objects.nonNull(getTcpProc)) {
                    getTcpProc.destroy();
                }
                if (Objects.nonNull(getUdpProc)) {
                    getUdpProc.destroy();
                }
            } catch (Exception ex) {
                log.error("Exception while destroying the tcp/udp processes", ex);
            }

            String outputPid;
            try {
                String pidStr = "";
                Process getPidProc = Runtime.getRuntime().exec(CMD_FOR_NETSTAT_PID);
                try (BufferedReader bufferedReader = new BufferedReader(
                        new InputStreamReader(getPidProc.getInputStream()))) {
                    while ((outputPid = bufferedReader.readLine()) != null) {
                        pidStr += outputPid;
                    }
                }
                if (!pidStr.equals("")) {
                    Runtime.getRuntime().exec("kill " + pidStr);
                }
            } catch (Exception ex) {
                log.error("Exception while killing the netstat command", ex);
            }

            log.info("onos halt is needed as a timeout occurred while finding the total number of " +
                             "socket descriptors for node {}", localNodeId);
            multicastReboot(true, Collections.singleton(localNodeId));
        } catch (Exception e) {
            log.error("Exception in socket descriptors diagnosis", e);
        } finally {
            futureTcp.cancel(true);
            futureUdp.cancel(true);
            // release the ad-hoc pool created for this run
            executorService.shutdown();
        }
    }

    /**
     * Restarts onos if the total number of threads and file descriptors exceeds the threshold.
     */
    private void threadsAndFilesDescriptorDiagnosis() {
        // pid remains 0 when the lookup in activate() failed
        if (pid == 0) {
            return;
        }
        try {
            ProcFd procFd = sigar.getProcFd(pid);
            long totalFd = procFd.getTotal();
            log.trace("total fds {}", totalFd);
            if (totalFd > FD_THRESHOLD) {
                log.info("onos halt is needed as the number of threads and file descriptors " +
                                 "has exceeded the threshold limit for node {}", localNodeId);
                multicastReboot(true, Collections.singleton(localNodeId));
            }
        } catch (Exception e) {
            log.error("Exception in sdnc file descriptor diagnosis", e);
        }
    }


    private void sdncFileDescriptorDiagnosis() {
        socketDescriptorsDiagnosis();
        threadsAndFilesDescriptorDiagnosis();
    }

    public void restartNode() {
        try {
            // ask Karaf to reboot immediately, wiping cached data
            systemService.reboot("now", SystemService.Swipe.CACHE);
        } catch (Exception e) {
            log.error("error occurred because of {} ", e.getMessage());
        }
    }

    private void multicastReboot(boolean removeDb, Set<NodeId> nodeIds) {
        String data = "Reboot:" + removeDb;
        communicationService.multicast(data, REBOOT_MSG, String::getBytes, nodeIds);
    }

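    // Reboot requests are multicast as the plain string "Reboot:<removeDb>"
    // (e.g. "Reboot:true"); the handler below parses the flag after the colon.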
    private class InternalSampleCollector implements ClusterMessageHandler {
        @Override
        public void handle(ClusterMessage message) {
            String reqMsg = new String(message.payload());
            log.info("Cluster communication message subject {} and message {}",
                     message.subject(), reqMsg);
            boolean flag = Boolean.parseBoolean(reqMsg.split(":")[1].trim());
            if (flag) {
                System.setProperty("apache.karaf.removedb", "true");
            }
            if (message.subject().equals(REBOOT_MSG)) {
                restartNode();
            }
        }
    }
}