Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 1 | /* |
Brian O'Connor | a09fe5b | 2017-08-03 21:12:30 -0700 | [diff] [blame] | 2 | * Copyright 2015-present Open Networking Foundation |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 3 | * |
| 4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | * you may not use this file except in compliance with the License. |
| 6 | * You may obtain a copy of the License at |
| 7 | * |
| 8 | * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | * |
| 10 | * Unless required by applicable law or agreed to in writing, software |
| 11 | * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | * See the License for the specific language governing permissions and |
| 14 | * limitations under the License. |
| 15 | */ |
| 16 | package org.onosproject.net.intent.impl; |
| 17 | |
| 18 | import org.apache.felix.scr.annotations.Activate; |
| 19 | import org.apache.felix.scr.annotations.Component; |
| 20 | import org.apache.felix.scr.annotations.Deactivate; |
| 21 | import org.apache.felix.scr.annotations.Modified; |
| 22 | import org.apache.felix.scr.annotations.Property; |
| 23 | import org.apache.felix.scr.annotations.Reference; |
| 24 | import org.apache.felix.scr.annotations.ReferenceCardinality; |
| 25 | import org.onosproject.cfg.ComponentConfigService; |
| 26 | import org.onosproject.net.intent.IntentData; |
| 27 | import org.onosproject.net.intent.IntentEvent; |
| 28 | import org.onosproject.net.intent.IntentListener; |
| 29 | import org.onosproject.net.intent.IntentService; |
| 30 | import org.onosproject.net.intent.IntentStore; |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 31 | import org.onosproject.net.intent.Key; |
Pier Luigi | e6caf68 | 2017-01-26 15:25:09 -0800 | [diff] [blame] | 32 | import org.onosproject.store.service.WallClockTimestamp; |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 33 | import org.osgi.service.component.ComponentContext; |
| 34 | import org.slf4j.Logger; |
| 35 | |
| 36 | import java.util.Dictionary; |
| 37 | import java.util.Properties; |
| 38 | import java.util.Timer; |
| 39 | import java.util.TimerTask; |
| 40 | import java.util.concurrent.ExecutorService; |
| 41 | |
| 42 | import static com.google.common.base.Strings.isNullOrEmpty; |
| 43 | import static java.util.concurrent.Executors.newSingleThreadExecutor; |
| 44 | import static org.onlab.util.Tools.get; |
| 45 | import static org.onlab.util.Tools.groupedThreads; |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 46 | import static org.slf4j.LoggerFactory.getLogger; |
| 47 | |
| 48 | /** |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 49 | * This component cleans up intents that have encountered errors or otherwise |
| 50 | * stalled during installation or withdrawal. |
| 51 | * <p> |
| 52 | * It periodically polls (based on configured period) for pending and CORRUPT |
| 53 | * intents from the store and retries. It also listens for CORRUPT event |
| 54 | * notifications, which signify errors in processing, and retries. |
| 55 | * </p> |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 56 | */ |
| 57 | @Component(immediate = true) |
| 58 | public class IntentCleanup implements Runnable, IntentListener { |
| 59 | |
Brian O'Connor | cdec493 | 2015-04-30 16:16:47 -0700 | [diff] [blame] | 60 | private static final Logger log = getLogger(IntentCleanup.class); |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 61 | |
Pier Luigi | e6caf68 | 2017-01-26 15:25:09 -0800 | [diff] [blame] | 62 | // Logical timeout for stuck Intents in INSTALLING or WITHDRAWING. The unit is seconds |
| 63 | private static final int INSTALLING_WITHDRAWING_PERIOD = 120; |
| 64 | |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 65 | private static final int DEFAULT_PERIOD = 5; //seconds |
Brian O'Connor | 6d8e317 | 2015-04-30 15:43:57 -0700 | [diff] [blame] | 66 | private static final int DEFAULT_THRESHOLD = 5; //tries |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 67 | |
Brian O'Connor | 5fcf6f5 | 2015-05-28 17:34:26 -0700 | [diff] [blame] | 68 | @Property(name = "enabled", boolValue = true, |
| 69 | label = "Enables/disables the intent cleanup component") |
| 70 | private boolean enabled = true; |
| 71 | |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 72 | @Property(name = "period", intValue = DEFAULT_PERIOD, |
| 73 | label = "Frequency in ms between cleanup runs") |
| 74 | protected int period = DEFAULT_PERIOD; |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 75 | private long periodMs; |
Pier Luigi | e6caf68 | 2017-01-26 15:25:09 -0800 | [diff] [blame] | 76 | private long periodMsForStuck; |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 77 | |
Brian O'Connor | 6d8e317 | 2015-04-30 15:43:57 -0700 | [diff] [blame] | 78 | @Property(name = "retryThreshold", intValue = DEFAULT_THRESHOLD, |
| 79 | label = "Number of times to retry CORRUPT intent without delay") |
Brian O'Connor | cdec493 | 2015-04-30 16:16:47 -0700 | [diff] [blame] | 80 | protected int retryThreshold = DEFAULT_THRESHOLD; |
Brian O'Connor | 6d8e317 | 2015-04-30 15:43:57 -0700 | [diff] [blame] | 81 | |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 82 | @Reference(cardinality = ReferenceCardinality.MANDATORY_UNARY) |
| 83 | protected IntentService service; |
| 84 | |
| 85 | @Reference(cardinality = ReferenceCardinality.MANDATORY_UNARY) |
| 86 | protected IntentStore store; |
| 87 | |
| 88 | @Reference(cardinality = ReferenceCardinality.MANDATORY_UNARY) |
| 89 | protected ComponentConfigService cfgService; |
| 90 | |
| 91 | private ExecutorService executor; |
| 92 | private Timer timer; |
| 93 | private TimerTask timerTask; |
| 94 | |
| 95 | @Activate |
| 96 | public void activate() { |
| 97 | cfgService.registerProperties(getClass()); |
HIGUCHI Yuta | d9e0105 | 2016-04-14 09:31:42 -0700 | [diff] [blame] | 98 | executor = newSingleThreadExecutor(groupedThreads("onos/intent", "cleanup", log)); |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 99 | timer = new Timer("onos-intent-cleanup-timer"); |
| 100 | service.addListener(this); |
| 101 | adjustRate(); |
| 102 | log.info("Started"); |
| 103 | } |
| 104 | |
| 105 | @Deactivate |
| 106 | public void deactivate() { |
| 107 | cfgService.unregisterProperties(getClass(), false); |
| 108 | service.removeListener(this); |
| 109 | timer.cancel(); |
| 110 | timerTask = null; |
| 111 | executor.shutdown(); |
| 112 | log.info("Stopped"); |
| 113 | } |
| 114 | |
| 115 | @Modified |
| 116 | public void modified(ComponentContext context) { |
| 117 | Dictionary<?, ?> properties = context != null ? context.getProperties() : new Properties(); |
| 118 | |
| 119 | int newPeriod; |
Brian O'Connor | 5fcf6f5 | 2015-05-28 17:34:26 -0700 | [diff] [blame] | 120 | boolean newEnabled; |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 121 | try { |
| 122 | String s = get(properties, "period"); |
| 123 | newPeriod = isNullOrEmpty(s) ? period : Integer.parseInt(s.trim()); |
Brian O'Connor | 6d8e317 | 2015-04-30 15:43:57 -0700 | [diff] [blame] | 124 | |
| 125 | s = get(properties, "retryThreshold"); |
Brian O'Connor | 5fcf6f5 | 2015-05-28 17:34:26 -0700 | [diff] [blame] | 126 | retryThreshold = isNullOrEmpty(s) ? retryThreshold : Integer.parseInt(s.trim()); |
| 127 | |
| 128 | s = get(properties, "enabled"); |
| 129 | newEnabled = isNullOrEmpty(s) ? enabled : Boolean.parseBoolean(s.trim()); |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 130 | } catch (NumberFormatException e) { |
| 131 | log.warn(e.getMessage()); |
| 132 | newPeriod = period; |
Brian O'Connor | 5fcf6f5 | 2015-05-28 17:34:26 -0700 | [diff] [blame] | 133 | newEnabled = enabled; |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 134 | } |
| 135 | |
| 136 | // Any change in the following parameters implies hard restart |
Pier Luigi | e6caf68 | 2017-01-26 15:25:09 -0800 | [diff] [blame] | 137 | // We could further restrict only for values multiple of the period |
| 138 | // of the stuck intents |
| 139 | if (newPeriod != period || enabled != newEnabled || newPeriod <= INSTALLING_WITHDRAWING_PERIOD) { |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 140 | period = newPeriod; |
Brian O'Connor | 5fcf6f5 | 2015-05-28 17:34:26 -0700 | [diff] [blame] | 141 | enabled = newEnabled; |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 142 | adjustRate(); |
| 143 | } |
| 144 | |
Brian O'Connor | 5fcf6f5 | 2015-05-28 17:34:26 -0700 | [diff] [blame] | 145 | log.info("Settings: enabled={}, period={}, retryThreshold={}", |
| 146 | enabled, period, retryThreshold); |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 147 | } |
| 148 | |
Brian O'Connor | eba4e34 | 2015-04-30 22:50:13 -0700 | [diff] [blame] | 149 | protected void adjustRate() { |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 150 | if (timerTask != null) { |
| 151 | timerTask.cancel(); |
Brian O'Connor | 5fcf6f5 | 2015-05-28 17:34:26 -0700 | [diff] [blame] | 152 | timerTask = null; |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 153 | } |
| 154 | |
Brian O'Connor | 5fcf6f5 | 2015-05-28 17:34:26 -0700 | [diff] [blame] | 155 | if (enabled) { |
| 156 | timerTask = new TimerTask() { |
| 157 | @Override |
| 158 | public void run() { |
HIGUCHI Yuta | d9e0105 | 2016-04-14 09:31:42 -0700 | [diff] [blame] | 159 | executor.execute(IntentCleanup.this); |
Brian O'Connor | 5fcf6f5 | 2015-05-28 17:34:26 -0700 | [diff] [blame] | 160 | } |
| 161 | }; |
Pier Luigi | e6caf68 | 2017-01-26 15:25:09 -0800 | [diff] [blame] | 162 | // Convert to ms |
| 163 | periodMs = period * 1_000; |
| 164 | periodMsForStuck = INSTALLING_WITHDRAWING_PERIOD * 1000; |
| 165 | // Schedule the executions |
Brian O'Connor | 5fcf6f5 | 2015-05-28 17:34:26 -0700 | [diff] [blame] | 166 | timer.scheduleAtFixedRate(timerTask, periodMs, periodMs); |
| 167 | } |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 168 | } |
| 169 | |
| 170 | |
| 171 | @Override |
| 172 | public void run() { |
| 173 | try { |
| 174 | cleanup(); |
| 175 | } catch (Exception e) { |
| 176 | log.warn("Caught exception during Intent cleanup", e); |
| 177 | } |
| 178 | } |
| 179 | |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 180 | private void resubmitCorrupt(IntentData intentData, boolean checkThreshold) { |
Brian O'Connor | 6d8e317 | 2015-04-30 15:43:57 -0700 | [diff] [blame] | 181 | if (checkThreshold && intentData.errorCount() >= retryThreshold) { |
Brian O'Connor | 3822430 | 2016-08-02 22:03:01 -0700 | [diff] [blame] | 182 | //FIXME trace or debug statement? |
Brian O'Connor | 6d8e317 | 2015-04-30 15:43:57 -0700 | [diff] [blame] | 183 | return; // threshold met or exceeded |
Brian O'Connor | 3822430 | 2016-08-02 22:03:01 -0700 | [diff] [blame] | 184 | } // FIXME should we backoff here? |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 185 | |
| 186 | switch (intentData.request()) { |
| 187 | case INSTALL_REQ: |
| 188 | service.submit(intentData.intent()); |
| 189 | break; |
| 190 | case WITHDRAW_REQ: |
| 191 | service.withdraw(intentData.intent()); |
| 192 | break; |
| 193 | default: |
Jonathan Hart | aae93b2 | 2015-07-22 14:59:47 -0700 | [diff] [blame] | 194 | log.warn("Trying to resubmit corrupt/failed intent {} in state {} with request {}", |
Brian O'Connor | b55d6e6 | 2015-06-01 15:25:53 -0700 | [diff] [blame] | 195 | intentData.key(), intentData.state(), intentData.request()); |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 196 | break; |
| 197 | } |
| 198 | } |
| 199 | |
| 200 | private void resubmitPendingRequest(IntentData intentData) { |
Brian O'Connor | 3822430 | 2016-08-02 22:03:01 -0700 | [diff] [blame] | 201 | // FIXME should we back off here? |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 202 | switch (intentData.request()) { |
| 203 | case INSTALL_REQ: |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 204 | case WITHDRAW_REQ: |
Brian O'Connor | 105cf53 | 2016-04-19 13:07:38 -0700 | [diff] [blame] | 205 | case PURGE_REQ: |
Brian O'Connor | 3822430 | 2016-08-02 22:03:01 -0700 | [diff] [blame] | 206 | service.addPending(intentData); |
Brian O'Connor | 105cf53 | 2016-04-19 13:07:38 -0700 | [diff] [blame] | 207 | break; |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 208 | default: |
Brian O'Connor | c90d184 | 2015-10-01 15:48:00 -0700 | [diff] [blame] | 209 | log.warn("Failed to resubmit pending intent {} in state {} with request {}", |
Brian O'Connor | b55d6e6 | 2015-06-01 15:25:53 -0700 | [diff] [blame] | 210 | intentData.key(), intentData.state(), intentData.request()); |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 211 | break; |
| 212 | } |
| 213 | } |
| 214 | |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 215 | /** |
Jonathan Hart | aae93b2 | 2015-07-22 14:59:47 -0700 | [diff] [blame] | 216 | * Iterates through corrupt, failed and pending intents and |
| 217 | * re-submit/withdraw appropriately. |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 218 | */ |
| 219 | private void cleanup() { |
Pier Luigi | e6caf68 | 2017-01-26 15:25:09 -0800 | [diff] [blame] | 220 | int corruptCount = 0, failedCount = 0, stuckCount = 0, pendingCount = 0, skipped = 0; |
Jonathan Hart | aae93b2 | 2015-07-22 14:59:47 -0700 | [diff] [blame] | 221 | |
Brian O'Connor | c590ebb | 2016-12-08 18:16:41 -0800 | [diff] [blame] | 222 | // Check the pending map first, because the check of the current map |
| 223 | // will add items to the pending map. |
| 224 | for (IntentData intentData : store.getPendingData(true, periodMs)) { |
Pier Luigi | 13b287f | 2017-01-10 15:07:52 -0800 | [diff] [blame] | 225 | log.debug("Resubmit Pending Intent: key {}, state {}, request {}", |
| 226 | intentData.key(), intentData.state(), intentData.request()); |
Brian O'Connor | c590ebb | 2016-12-08 18:16:41 -0800 | [diff] [blame] | 227 | resubmitPendingRequest(intentData); |
| 228 | pendingCount++; |
| 229 | } |
| 230 | |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 231 | for (IntentData intentData : store.getIntentData(true, periodMs)) { |
| 232 | switch (intentData.state()) { |
Jonathan Hart | aae93b2 | 2015-07-22 14:59:47 -0700 | [diff] [blame] | 233 | case FAILED: |
Pier Luigi | 13b287f | 2017-01-10 15:07:52 -0800 | [diff] [blame] | 234 | log.debug("Resubmit Failed Intent: key {}, state {}, request {}", |
| 235 | intentData.key(), intentData.state(), intentData.request()); |
Jonathan Hart | aae93b2 | 2015-07-22 14:59:47 -0700 | [diff] [blame] | 236 | resubmitCorrupt(intentData, false); |
| 237 | failedCount++; |
| 238 | break; |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 239 | case CORRUPT: |
Pier Luigi | 13b287f | 2017-01-10 15:07:52 -0800 | [diff] [blame] | 240 | log.debug("Resubmit Corrupt Intent: key {}, state {}, request {}", |
| 241 | intentData.key(), intentData.state(), intentData.request()); |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 242 | resubmitCorrupt(intentData, false); |
| 243 | corruptCount++; |
Brian O'Connor | eba4e34 | 2015-04-30 22:50:13 -0700 | [diff] [blame] | 244 | break; |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 245 | case INSTALLING: //FALLTHROUGH |
| 246 | case WITHDRAWING: |
Pier Luigi | e6caf68 | 2017-01-26 15:25:09 -0800 | [diff] [blame] | 247 | // Instances can have different clocks and potentially we can have problems |
| 248 | // An Intent can be submitted again before the real period of the stuck intents |
| 249 | final WallClockTimestamp time = new WallClockTimestamp( |
| 250 | System.currentTimeMillis() - periodMsForStuck |
| 251 | ); |
| 252 | if (intentData.version().isOlderThan(time)) { |
| 253 | resubmitPendingRequest(intentData); |
| 254 | stuckCount++; |
| 255 | } else { |
| 256 | skipped++; |
| 257 | } |
Brian O'Connor | eba4e34 | 2015-04-30 22:50:13 -0700 | [diff] [blame] | 258 | break; |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 259 | default: |
| 260 | //NOOP |
| 261 | break; |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 262 | } |
| 263 | } |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 264 | |
Jonathan Hart | 82efa69 | 2015-10-10 18:30:28 -0700 | [diff] [blame] | 265 | if (corruptCount + failedCount + stuckCount + pendingCount > 0) { |
| 266 | log.debug("Intent cleanup ran and resubmitted {} corrupt, {} failed, {} stuck, and {} pending intents", |
| 267 | corruptCount, failedCount, stuckCount, pendingCount); |
| 268 | } |
Pier Luigi | e6caf68 | 2017-01-26 15:25:09 -0800 | [diff] [blame] | 269 | if (skipped > 0) { |
| 270 | log.debug("Intent cleanup skipped {} intents", skipped); |
| 271 | } |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 272 | } |
| 273 | |
| 274 | @Override |
| 275 | public void event(IntentEvent event) { |
Brian O'Connor | 6d8e317 | 2015-04-30 15:43:57 -0700 | [diff] [blame] | 276 | // this is the fast path for CORRUPT intents, retry on event notification. |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 277 | //TODO we might consider using the timer to back off for subsequent retries |
Brian O'Connor | 5fcf6f5 | 2015-05-28 17:34:26 -0700 | [diff] [blame] | 278 | if (enabled && event.type() == IntentEvent.Type.CORRUPT) { |
Brian O'Connor | a6c9b5c | 2015-04-29 22:38:29 -0700 | [diff] [blame] | 279 | Key key = event.subject().key(); |
| 280 | if (store.isMaster(key)) { |
| 281 | IntentData data = store.getIntentData(event.subject().key()); |
| 282 | resubmitCorrupt(data, true); |
| 283 | } |
Brian O'Connor | 3c58e96 | 2015-04-28 23:21:51 -0700 | [diff] [blame] | 284 | } |
| 285 | } |
| 286 | } |