blob: 37e6be86a8a98a9b356b63e3671f58c760c05e41 [file] [log] [blame]
/*
 * Copyright 2015-present Open Networking Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16package org.onosproject.net.intent.impl;
17
Brian O'Connor3c58e962015-04-28 23:21:51 -070018import org.onosproject.cfg.ComponentConfigService;
19import org.onosproject.net.intent.IntentData;
20import org.onosproject.net.intent.IntentEvent;
21import org.onosproject.net.intent.IntentListener;
22import org.onosproject.net.intent.IntentService;
23import org.onosproject.net.intent.IntentStore;
Brian O'Connora6c9b5c2015-04-29 22:38:29 -070024import org.onosproject.net.intent.Key;
Pier Luigie6caf682017-01-26 15:25:09 -080025import org.onosproject.store.service.WallClockTimestamp;
Brian O'Connor3c58e962015-04-28 23:21:51 -070026import org.osgi.service.component.ComponentContext;
Ray Milkeyd84f89b2018-08-17 14:54:17 -070027import org.osgi.service.component.annotations.Activate;
28import org.osgi.service.component.annotations.Component;
29import org.osgi.service.component.annotations.Deactivate;
30import org.osgi.service.component.annotations.Modified;
31import org.osgi.service.component.annotations.Reference;
32import org.osgi.service.component.annotations.ReferenceCardinality;
Brian O'Connor3c58e962015-04-28 23:21:51 -070033import org.slf4j.Logger;
34
35import java.util.Dictionary;
36import java.util.Properties;
37import java.util.Timer;
38import java.util.TimerTask;
39import java.util.concurrent.ExecutorService;
40
41import static com.google.common.base.Strings.isNullOrEmpty;
42import static java.util.concurrent.Executors.newSingleThreadExecutor;
43import static org.onlab.util.Tools.get;
44import static org.onlab.util.Tools.groupedThreads;
Ray Milkeyd04e2272018-10-16 18:20:18 -070045import static org.onosproject.net.OsgiPropertyConstants.ICU_ENABLED;
46import static org.onosproject.net.OsgiPropertyConstants.ICU_ENABLED_DEFAULT;
47import static org.onosproject.net.OsgiPropertyConstants.ICU_PERIOD;
48import static org.onosproject.net.OsgiPropertyConstants.ICU_PERIOD_DEFAULT;
49import static org.onosproject.net.OsgiPropertyConstants.ICU_RETRY_THRESHOLD;
50import static org.onosproject.net.OsgiPropertyConstants.ICU_RETRY_THRESHOLD_DEFAULT;
Brian O'Connor3c58e962015-04-28 23:21:51 -070051import static org.slf4j.LoggerFactory.getLogger;
52
53/**
Brian O'Connora6c9b5c2015-04-29 22:38:29 -070054 * This component cleans up intents that have encountered errors or otherwise
55 * stalled during installation or withdrawal.
56 * <p>
57 * It periodically polls (based on configured period) for pending and CORRUPT
58 * intents from the store and retries. It also listens for CORRUPT event
59 * notifications, which signify errors in processing, and retries.
60 * </p>
Brian O'Connor3c58e962015-04-28 23:21:51 -070061 */
Ray Milkeyd04e2272018-10-16 18:20:18 -070062@Component(
63 immediate = true,
64 property = {
Ray Milkey2d7bca12018-10-17 14:51:52 -070065 ICU_ENABLED + ":Boolean=" + ICU_ENABLED_DEFAULT,
66 ICU_PERIOD + ":Integer=" + ICU_PERIOD_DEFAULT,
67 ICU_RETRY_THRESHOLD + ":Integer=" + ICU_RETRY_THRESHOLD_DEFAULT
Ray Milkeyd04e2272018-10-16 18:20:18 -070068 }
69)
Brian O'Connor3c58e962015-04-28 23:21:51 -070070public class IntentCleanup implements Runnable, IntentListener {
71
Brian O'Connorcdec4932015-04-30 16:16:47 -070072 private static final Logger log = getLogger(IntentCleanup.class);
Brian O'Connor3c58e962015-04-28 23:21:51 -070073
Pier Luigie6caf682017-01-26 15:25:09 -080074 // Logical timeout for stuck Intents in INSTALLING or WITHDRAWING. The unit is seconds
75 private static final int INSTALLING_WITHDRAWING_PERIOD = 120;
76
Ray Milkeyd04e2272018-10-16 18:20:18 -070077
78
Brian O'Connor3c58e962015-04-28 23:21:51 -070079
Ray Milkeyd84f89b2018-08-17 14:54:17 -070080 //@Property(name = "enabled", boolValue = true,
81 // label = "Enables/disables the intent cleanup component")
Ray Milkeyd04e2272018-10-16 18:20:18 -070082 private boolean enabled = ICU_ENABLED_DEFAULT;
Brian O'Connor5fcf6f52015-05-28 17:34:26 -070083
Ray Milkeyd84f89b2018-08-17 14:54:17 -070084 //@Property(name = "period", intValue = DEFAULT_PERIOD,
85 // label = "Frequency in ms between cleanup runs")
Ray Milkeyd04e2272018-10-16 18:20:18 -070086 protected int period = ICU_PERIOD_DEFAULT;
Brian O'Connora6c9b5c2015-04-29 22:38:29 -070087 private long periodMs;
Pier Luigie6caf682017-01-26 15:25:09 -080088 private long periodMsForStuck;
Brian O'Connor3c58e962015-04-28 23:21:51 -070089
Ray Milkeyd84f89b2018-08-17 14:54:17 -070090 //@Property(name = "retryThreshold", intValue = DEFAULT_THRESHOLD,
91 // label = "Number of times to retry CORRUPT intent without delay")
Ray Milkeyd04e2272018-10-16 18:20:18 -070092 protected int retryThreshold = ICU_RETRY_THRESHOLD_DEFAULT;
Brian O'Connor6d8e3172015-04-30 15:43:57 -070093
Ray Milkeyd84f89b2018-08-17 14:54:17 -070094 @Reference(cardinality = ReferenceCardinality.MANDATORY)
Brian O'Connor3c58e962015-04-28 23:21:51 -070095 protected IntentService service;
96
Ray Milkeyd84f89b2018-08-17 14:54:17 -070097 @Reference(cardinality = ReferenceCardinality.MANDATORY)
Brian O'Connor3c58e962015-04-28 23:21:51 -070098 protected IntentStore store;
99
Ray Milkeyd84f89b2018-08-17 14:54:17 -0700100 @Reference(cardinality = ReferenceCardinality.MANDATORY)
Brian O'Connor3c58e962015-04-28 23:21:51 -0700101 protected ComponentConfigService cfgService;
102
103 private ExecutorService executor;
104 private Timer timer;
105 private TimerTask timerTask;
106
107 @Activate
108 public void activate() {
109 cfgService.registerProperties(getClass());
HIGUCHI Yutad9e01052016-04-14 09:31:42 -0700110 executor = newSingleThreadExecutor(groupedThreads("onos/intent", "cleanup", log));
Brian O'Connor3c58e962015-04-28 23:21:51 -0700111 timer = new Timer("onos-intent-cleanup-timer");
112 service.addListener(this);
113 adjustRate();
114 log.info("Started");
115 }
116
117 @Deactivate
118 public void deactivate() {
119 cfgService.unregisterProperties(getClass(), false);
120 service.removeListener(this);
121 timer.cancel();
122 timerTask = null;
123 executor.shutdown();
124 log.info("Stopped");
125 }
126
127 @Modified
128 public void modified(ComponentContext context) {
129 Dictionary<?, ?> properties = context != null ? context.getProperties() : new Properties();
130
131 int newPeriod;
Brian O'Connor5fcf6f52015-05-28 17:34:26 -0700132 boolean newEnabled;
Brian O'Connor3c58e962015-04-28 23:21:51 -0700133 try {
134 String s = get(properties, "period");
135 newPeriod = isNullOrEmpty(s) ? period : Integer.parseInt(s.trim());
Brian O'Connor6d8e3172015-04-30 15:43:57 -0700136
137 s = get(properties, "retryThreshold");
Brian O'Connor5fcf6f52015-05-28 17:34:26 -0700138 retryThreshold = isNullOrEmpty(s) ? retryThreshold : Integer.parseInt(s.trim());
139
140 s = get(properties, "enabled");
141 newEnabled = isNullOrEmpty(s) ? enabled : Boolean.parseBoolean(s.trim());
Brian O'Connor3c58e962015-04-28 23:21:51 -0700142 } catch (NumberFormatException e) {
143 log.warn(e.getMessage());
144 newPeriod = period;
Brian O'Connor5fcf6f52015-05-28 17:34:26 -0700145 newEnabled = enabled;
Brian O'Connor3c58e962015-04-28 23:21:51 -0700146 }
147
148 // Any change in the following parameters implies hard restart
Pier Luigie6caf682017-01-26 15:25:09 -0800149 // We could further restrict only for values multiple of the period
150 // of the stuck intents
151 if (newPeriod != period || enabled != newEnabled || newPeriod <= INSTALLING_WITHDRAWING_PERIOD) {
Brian O'Connor3c58e962015-04-28 23:21:51 -0700152 period = newPeriod;
Brian O'Connor5fcf6f52015-05-28 17:34:26 -0700153 enabled = newEnabled;
Brian O'Connor3c58e962015-04-28 23:21:51 -0700154 adjustRate();
155 }
156
Brian O'Connor5fcf6f52015-05-28 17:34:26 -0700157 log.info("Settings: enabled={}, period={}, retryThreshold={}",
158 enabled, period, retryThreshold);
Brian O'Connor3c58e962015-04-28 23:21:51 -0700159 }
160
Brian O'Connoreba4e342015-04-30 22:50:13 -0700161 protected void adjustRate() {
Brian O'Connor3c58e962015-04-28 23:21:51 -0700162 if (timerTask != null) {
163 timerTask.cancel();
Brian O'Connor5fcf6f52015-05-28 17:34:26 -0700164 timerTask = null;
Brian O'Connor3c58e962015-04-28 23:21:51 -0700165 }
166
Brian O'Connor5fcf6f52015-05-28 17:34:26 -0700167 if (enabled) {
168 timerTask = new TimerTask() {
169 @Override
170 public void run() {
HIGUCHI Yutad9e01052016-04-14 09:31:42 -0700171 executor.execute(IntentCleanup.this);
Brian O'Connor5fcf6f52015-05-28 17:34:26 -0700172 }
173 };
Pier Luigie6caf682017-01-26 15:25:09 -0800174 // Convert to ms
Ray Milkey3717e602018-02-01 13:49:47 -0800175 periodMs = period * 1_000L;
176 periodMsForStuck = INSTALLING_WITHDRAWING_PERIOD * 1000L;
Pier Luigie6caf682017-01-26 15:25:09 -0800177 // Schedule the executions
Brian O'Connor5fcf6f52015-05-28 17:34:26 -0700178 timer.scheduleAtFixedRate(timerTask, periodMs, periodMs);
179 }
Brian O'Connor3c58e962015-04-28 23:21:51 -0700180 }
181
182
183 @Override
184 public void run() {
185 try {
186 cleanup();
187 } catch (Exception e) {
188 log.warn("Caught exception during Intent cleanup", e);
189 }
190 }
191
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700192 private void resubmitCorrupt(IntentData intentData, boolean checkThreshold) {
Brian O'Connor6d8e3172015-04-30 15:43:57 -0700193 if (checkThreshold && intentData.errorCount() >= retryThreshold) {
Brian O'Connor38224302016-08-02 22:03:01 -0700194 //FIXME trace or debug statement?
Brian O'Connor6d8e3172015-04-30 15:43:57 -0700195 return; // threshold met or exceeded
Brian O'Connor38224302016-08-02 22:03:01 -0700196 } // FIXME should we backoff here?
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700197
198 switch (intentData.request()) {
199 case INSTALL_REQ:
200 service.submit(intentData.intent());
201 break;
202 case WITHDRAW_REQ:
203 service.withdraw(intentData.intent());
204 break;
205 default:
Jonathan Hartaae93b22015-07-22 14:59:47 -0700206 log.warn("Trying to resubmit corrupt/failed intent {} in state {} with request {}",
Brian O'Connorb55d6e62015-06-01 15:25:53 -0700207 intentData.key(), intentData.state(), intentData.request());
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700208 break;
209 }
210 }
211
212 private void resubmitPendingRequest(IntentData intentData) {
Brian O'Connor38224302016-08-02 22:03:01 -0700213 // FIXME should we back off here?
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700214 switch (intentData.request()) {
215 case INSTALL_REQ:
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700216 case WITHDRAW_REQ:
Brian O'Connor105cf532016-04-19 13:07:38 -0700217 case PURGE_REQ:
jaegonkimcbe1c5e2018-05-20 15:11:18 +0900218 service.addPending(IntentData.copy(intentData, new WallClockTimestamp()));
Brian O'Connor105cf532016-04-19 13:07:38 -0700219 break;
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700220 default:
Brian O'Connorc90d1842015-10-01 15:48:00 -0700221 log.warn("Failed to resubmit pending intent {} in state {} with request {}",
Brian O'Connorb55d6e62015-06-01 15:25:53 -0700222 intentData.key(), intentData.state(), intentData.request());
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700223 break;
224 }
225 }
226
Brian O'Connor3c58e962015-04-28 23:21:51 -0700227 /**
Jonathan Hartaae93b22015-07-22 14:59:47 -0700228 * Iterates through corrupt, failed and pending intents and
229 * re-submit/withdraw appropriately.
Brian O'Connor3c58e962015-04-28 23:21:51 -0700230 */
231 private void cleanup() {
Pier Luigie6caf682017-01-26 15:25:09 -0800232 int corruptCount = 0, failedCount = 0, stuckCount = 0, pendingCount = 0, skipped = 0;
Jonathan Hartaae93b22015-07-22 14:59:47 -0700233
Brian O'Connorc590ebb2016-12-08 18:16:41 -0800234 // Check the pending map first, because the check of the current map
235 // will add items to the pending map.
236 for (IntentData intentData : store.getPendingData(true, periodMs)) {
Pier Luigi13b287f2017-01-10 15:07:52 -0800237 log.debug("Resubmit Pending Intent: key {}, state {}, request {}",
238 intentData.key(), intentData.state(), intentData.request());
Brian O'Connorc590ebb2016-12-08 18:16:41 -0800239 resubmitPendingRequest(intentData);
240 pendingCount++;
241 }
242
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700243 for (IntentData intentData : store.getIntentData(true, periodMs)) {
jaegonkimab7e59f2018-05-07 13:04:05 +0900244 IntentData pendingIntentData = store.getPendingData(intentData.key());
245 if (pendingIntentData != null) {
246 continue;
247 }
248
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700249 switch (intentData.state()) {
Jonathan Hartaae93b22015-07-22 14:59:47 -0700250 case FAILED:
Pier Luigi13b287f2017-01-10 15:07:52 -0800251 log.debug("Resubmit Failed Intent: key {}, state {}, request {}",
jaegonkimab7e59f2018-05-07 13:04:05 +0900252 intentData.key(), intentData.state(), intentData.request());
Jonathan Hartaae93b22015-07-22 14:59:47 -0700253 resubmitCorrupt(intentData, false);
254 failedCount++;
255 break;
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700256 case CORRUPT:
Pier Luigi13b287f2017-01-10 15:07:52 -0800257 log.debug("Resubmit Corrupt Intent: key {}, state {}, request {}",
jaegonkimab7e59f2018-05-07 13:04:05 +0900258 intentData.key(), intentData.state(), intentData.request());
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700259 resubmitCorrupt(intentData, false);
260 corruptCount++;
Brian O'Connoreba4e342015-04-30 22:50:13 -0700261 break;
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700262 case INSTALLING: //FALLTHROUGH
263 case WITHDRAWING:
Pier Luigie6caf682017-01-26 15:25:09 -0800264 // Instances can have different clocks and potentially we can have problems
265 // An Intent can be submitted again before the real period of the stuck intents
266 final WallClockTimestamp time = new WallClockTimestamp(
267 System.currentTimeMillis() - periodMsForStuck
268 );
269 if (intentData.version().isOlderThan(time)) {
270 resubmitPendingRequest(intentData);
271 stuckCount++;
272 } else {
273 skipped++;
274 }
Brian O'Connoreba4e342015-04-30 22:50:13 -0700275 break;
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700276 default:
277 //NOOP
278 break;
Brian O'Connor3c58e962015-04-28 23:21:51 -0700279 }
280 }
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700281
Jonathan Hart82efa692015-10-10 18:30:28 -0700282 if (corruptCount + failedCount + stuckCount + pendingCount > 0) {
283 log.debug("Intent cleanup ran and resubmitted {} corrupt, {} failed, {} stuck, and {} pending intents",
284 corruptCount, failedCount, stuckCount, pendingCount);
285 }
Pier Luigie6caf682017-01-26 15:25:09 -0800286 if (skipped > 0) {
287 log.debug("Intent cleanup skipped {} intents", skipped);
288 }
Brian O'Connor3c58e962015-04-28 23:21:51 -0700289 }
290
291 @Override
292 public void event(IntentEvent event) {
Brian O'Connor6d8e3172015-04-30 15:43:57 -0700293 // this is the fast path for CORRUPT intents, retry on event notification.
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700294 //TODO we might consider using the timer to back off for subsequent retries
Brian O'Connor5fcf6f52015-05-28 17:34:26 -0700295 if (enabled && event.type() == IntentEvent.Type.CORRUPT) {
Brian O'Connora6c9b5c2015-04-29 22:38:29 -0700296 Key key = event.subject().key();
297 if (store.isMaster(key)) {
298 IntentData data = store.getIntentData(event.subject().key());
299 resubmitCorrupt(data, true);
300 }
Brian O'Connor3c58e962015-04-28 23:21:51 -0700301 }
302 }
303}