blob: d6a7eaeafcf17d0bde4b9fe01d74ece98bc84265 [file] [log] [blame]
/*
 * Copyright 2014-present Open Networking Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
Brian O'Connorabafb502014-12-02 22:26:20 -080016package org.onosproject.cluster.impl;
Ayaka Koshibe16609692014-09-23 12:46:15 -070017
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -080018import com.codahale.metrics.Timer;
19import com.codahale.metrics.Timer.Context;
Madan Jampanide003d92015-05-11 17:14:20 -070020import com.google.common.collect.Lists;
Claudine Chiudce08152016-03-09 18:19:28 +000021import com.google.common.collect.Sets;
Madan Jampanide003d92015-05-11 17:14:20 -070022import com.google.common.util.concurrent.Futures;
Yuta HIGUCHI6a462832014-11-23 23:56:03 -080023import org.onlab.metrics.MetricsService;
Victor Silvaf2b9d032016-09-19 19:43:20 -030024import org.onosproject.cfg.ComponentConfigService;
25import org.onosproject.cfg.ConfigProperty;
Brian O'Connorabafb502014-12-02 22:26:20 -080026import org.onosproject.cluster.ClusterService;
27import org.onosproject.cluster.ControllerNode;
28import org.onosproject.cluster.NodeId;
29import org.onosproject.cluster.RoleInfo;
30import org.onosproject.core.MetricsHelper;
Thomas Vachuska7a8de842016-03-07 20:56:35 -080031import org.onosproject.event.AbstractListenerManager;
Brian O'Connorabafb502014-12-02 22:26:20 -080032import org.onosproject.mastership.MastershipAdminService;
33import org.onosproject.mastership.MastershipEvent;
Jordan Halterman0a2bd452018-06-13 17:24:58 -070034import org.onosproject.mastership.MastershipInfo;
Brian O'Connorabafb502014-12-02 22:26:20 -080035import org.onosproject.mastership.MastershipListener;
36import org.onosproject.mastership.MastershipService;
37import org.onosproject.mastership.MastershipStore;
38import org.onosproject.mastership.MastershipStoreDelegate;
39import org.onosproject.mastership.MastershipTerm;
40import org.onosproject.mastership.MastershipTermService;
41import org.onosproject.net.DeviceId;
42import org.onosproject.net.MastershipRole;
Claudine Chiudce08152016-03-09 18:19:28 +000043import org.onosproject.net.region.Region;
44import org.onosproject.net.region.RegionService;
Jordan Halterman713830d2017-10-07 13:40:44 -070045import org.onosproject.upgrade.UpgradeEvent;
46import org.onosproject.upgrade.UpgradeEventListener;
47import org.onosproject.upgrade.UpgradeService;
Ray Milkeyd84f89b2018-08-17 14:54:17 -070048import org.osgi.service.component.annotations.Activate;
49import org.osgi.service.component.annotations.Component;
50import org.osgi.service.component.annotations.Deactivate;
51import org.osgi.service.component.annotations.Modified;
52import org.osgi.service.component.annotations.Reference;
53import org.osgi.service.component.annotations.ReferenceCardinality;
Ayaka Koshibe16609692014-09-23 12:46:15 -070054import org.slf4j.Logger;
55
Claudine Chiudce08152016-03-09 18:19:28 +000056import java.util.ArrayList;
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -080057import java.util.Collection;
Claudine Chiudce08152016-03-09 18:19:28 +000058import java.util.Collections;
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -080059import java.util.Comparator;
Thomas Vachuska12dfdc32014-11-29 16:03:12 -080060import java.util.HashMap;
61import java.util.HashSet;
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -080062import java.util.Iterator;
63import java.util.List;
Thomas Vachuska12dfdc32014-11-29 16:03:12 -080064import java.util.Map;
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -080065import java.util.Set;
Madan Jampanif7536ab2015-05-07 23:23:23 -070066import java.util.concurrent.CompletableFuture;
67
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -080068import static com.google.common.base.Preconditions.checkNotNull;
69import static com.google.common.collect.Lists.newArrayList;
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -080070import static java.util.concurrent.CompletableFuture.allOf;
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -080071import static org.onlab.metrics.MetricsUtil.startTimer;
72import static org.onlab.metrics.MetricsUtil.stopTimer;
Brian O'Connorabafb502014-12-02 22:26:20 -080073import static org.onosproject.net.MastershipRole.MASTER;
Changhoon Yoon541ef712015-05-23 17:18:34 +090074import static org.onosproject.security.AppGuard.checkPermission;
Thomas Vachuska7a8de842016-03-07 20:56:35 -080075import static org.onosproject.security.AppPermission.Type.CLUSTER_READ;
76import static org.onosproject.security.AppPermission.Type.CLUSTER_WRITE;
Thomas Vachuska42e8cce2015-07-29 19:25:18 -070077import static org.slf4j.LoggerFactory.getLogger;
Ray Milkeyd04e2272018-10-16 18:20:18 -070078import static org.onosproject.net.OsgiPropertyConstants.*;
Changhoon Yoonb856b812015-08-10 03:47:19 +090079
Changhoon Yoon541ef712015-05-23 17:18:34 +090080
/**
 * Component providing the node-device mastership service.
 */
Ray Milkeyd04e2272018-10-16 18:20:18 -070084@Component(
85 immediate = true,
86 service = {
87 MastershipService.class,
88 MastershipAdminService.class,
89 MastershipTermService.class,
90 MetricsHelper.class
91 },
92 property = {
Ray Milkey2d7bca12018-10-17 14:51:52 -070093 USE_REGION_FOR_BALANCE_ROLES + ":Boolean=" + USE_REGION_FOR_BALANCE_ROLES_DEFAULT,
94 REBALANCE_ROLES_ON_UPGRADE + ":Boolean=" + REBALANCE_ROLES_ON_UPGRADE_DEFAULT
Ray Milkeyd04e2272018-10-16 18:20:18 -070095 }
96)
Ayaka Koshibe3eed2b02014-09-23 13:28:05 -070097public class MastershipManager
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -080098 extends AbstractListenerManager<MastershipEvent, MastershipListener>
99 implements MastershipService, MastershipAdminService, MastershipTermService,
100 MetricsHelper {
Ayaka Koshibe16609692014-09-23 12:46:15 -0700101
102 private static final String NODE_ID_NULL = "Node ID cannot be null";
103 private static final String DEVICE_ID_NULL = "Device ID cannot be null";
104 private static final String ROLE_NULL = "Mastership role cannot be null";
105
106 private final Logger log = getLogger(getClass());
107
alshabib339a3d92014-09-26 17:54:32 -0700108 private final MastershipStoreDelegate delegate = new InternalDelegate();
Jordan Halterman713830d2017-10-07 13:40:44 -0700109 private final UpgradeEventListener upgradeEventListener = new InternalUpgradeEventListener();
Ayaka Koshibe16609692014-09-23 12:46:15 -0700110
Ray Milkeyd84f89b2018-08-17 14:54:17 -0700111 @Reference(cardinality = ReferenceCardinality.MANDATORY)
Ayaka Koshibe16609692014-09-23 12:46:15 -0700112 protected MastershipStore store;
113
Ray Milkeyd84f89b2018-08-17 14:54:17 -0700114 @Reference(cardinality = ReferenceCardinality.MANDATORY)
tom4a5d1712014-09-23 17:49:39 -0700115 protected ClusterService clusterService;
Ayaka Koshibe16609692014-09-23 12:46:15 -0700116
Ray Milkeyd84f89b2018-08-17 14:54:17 -0700117 @Reference(cardinality = ReferenceCardinality.MANDATORY)
Yuta HIGUCHI6a462832014-11-23 23:56:03 -0800118 protected MetricsService metricsService;
119
Ray Milkeyd84f89b2018-08-17 14:54:17 -0700120 @Reference(cardinality = ReferenceCardinality.MANDATORY)
Claudine Chiudce08152016-03-09 18:19:28 +0000121 protected RegionService regionService;
122
Ray Milkeyd84f89b2018-08-17 14:54:17 -0700123 @Reference(cardinality = ReferenceCardinality.MANDATORY)
Victor Silvaf2b9d032016-09-19 19:43:20 -0300124 protected ComponentConfigService cfgService;
125
Ray Milkeyd84f89b2018-08-17 14:54:17 -0700126 @Reference(cardinality = ReferenceCardinality.MANDATORY)
Jordan Halterman713830d2017-10-07 13:40:44 -0700127 protected UpgradeService upgradeService;
128
Madan Jampanic6e574f2015-05-29 13:41:52 -0700129 private NodeId localNodeId;
Yuta HIGUCHI6a462832014-11-23 23:56:03 -0800130 private Timer requestRoleTimer;
Victor Silvaf2b9d032016-09-19 19:43:20 -0300131
Thomas Vachuskaf566fa22018-10-30 14:03:36 -0700132 /** Use Regions for balancing roles. */
Ray Milkeyd04e2272018-10-16 18:20:18 -0700133 protected boolean useRegionForBalanceRoles = USE_REGION_FOR_BALANCE_ROLES_DEFAULT;
Jordan Halterman713830d2017-10-07 13:40:44 -0700134
Thomas Vachuskaf566fa22018-10-30 14:03:36 -0700135 /** Automatically rebalance roles following an upgrade. */
Ray Milkeyd04e2272018-10-16 18:20:18 -0700136 protected boolean rebalanceRolesOnUpgrade = REBALANCE_ROLES_ON_UPGRADE_DEFAULT;
Ayaka Koshibe3de43ca2014-09-26 16:40:23 -0700137
Ayaka Koshibe16609692014-09-23 12:46:15 -0700138 @Activate
139 public void activate() {
Victor Silvaf2b9d032016-09-19 19:43:20 -0300140 cfgService.registerProperties(getClass());
141 modified();
142
Yuta HIGUCHI6a462832014-11-23 23:56:03 -0800143 requestRoleTimer = createTimer("Mastership", "requestRole", "responseTime");
Madan Jampanic6e574f2015-05-29 13:41:52 -0700144 localNodeId = clusterService.getLocalNode().id();
Jordan Halterman61aeb352017-10-18 16:22:17 -0700145 upgradeService.addListener(upgradeEventListener);
Ayaka Koshibe16609692014-09-23 12:46:15 -0700146 eventDispatcher.addSink(MastershipEvent.class, listenerRegistry);
alshabib339a3d92014-09-26 17:54:32 -0700147 store.setDelegate(delegate);
Ayaka Koshibe16609692014-09-23 12:46:15 -0700148 log.info("Started");
149 }
150
Victor Silvaf2b9d032016-09-19 19:43:20 -0300151 @Modified
152 public void modified() {
153 Set<ConfigProperty> configProperties = cfgService.getProperties(getClass().getCanonicalName());
Ray Milkeyd84f89b2018-08-17 14:54:17 -0700154 if (configProperties != null) {
155 for (ConfigProperty property : configProperties) {
Thomas Vachuskaf566fa22018-10-30 14:03:36 -0700156 if (USE_REGION_FOR_BALANCE_ROLES.equals(property.name())) {
Ray Milkeyd84f89b2018-08-17 14:54:17 -0700157 useRegionForBalanceRoles = property.asBoolean();
Thomas Vachuskaf566fa22018-10-30 14:03:36 -0700158 } else if (REBALANCE_ROLES_ON_UPGRADE.equals(property.name())) {
159 rebalanceRolesOnUpgrade = property.asBoolean();
Ray Milkeyd84f89b2018-08-17 14:54:17 -0700160 }
Victor Silvaf2b9d032016-09-19 19:43:20 -0300161 }
162 }
163 }
164
Ayaka Koshibe16609692014-09-23 12:46:15 -0700165 @Deactivate
166 public void deactivate() {
167 eventDispatcher.removeSink(MastershipEvent.class);
Jordan Halterman61aeb352017-10-18 16:22:17 -0700168 upgradeService.removeListener(upgradeEventListener);
alshabib339a3d92014-09-26 17:54:32 -0700169 store.unsetDelegate(delegate);
Ayaka Koshibe16609692014-09-23 12:46:15 -0700170 log.info("Stopped");
Victor Silvaf2b9d032016-09-19 19:43:20 -0300171 cfgService.unregisterProperties(getClass(), false);
Ayaka Koshibe16609692014-09-23 12:46:15 -0700172 }
173
Ayaka Koshibe16609692014-09-23 12:46:15 -0700174 @Override
Madan Jampanide003d92015-05-11 17:14:20 -0700175 public CompletableFuture<Void> setRole(NodeId nodeId, DeviceId deviceId, MastershipRole role) {
Ayaka Koshibe16609692014-09-23 12:46:15 -0700176 checkNotNull(nodeId, NODE_ID_NULL);
177 checkNotNull(deviceId, DEVICE_ID_NULL);
178 checkNotNull(role, ROLE_NULL);
Ayaka Koshibed9f693e2014-09-29 18:04:54 -0700179
Madan Jampanif7536ab2015-05-07 23:23:23 -0700180 CompletableFuture<MastershipEvent> eventFuture = null;
Ayaka Koshibee60d4522014-10-28 15:07:00 -0700181
182 switch (role) {
183 case MASTER:
Madan Jampanif7536ab2015-05-07 23:23:23 -0700184 eventFuture = store.setMaster(nodeId, deviceId);
Ayaka Koshibee60d4522014-10-28 15:07:00 -0700185 break;
186 case STANDBY:
Madan Jampanif7536ab2015-05-07 23:23:23 -0700187 eventFuture = store.setStandby(nodeId, deviceId);
Ayaka Koshibee60d4522014-10-28 15:07:00 -0700188 break;
189 case NONE:
Madan Jampanif7536ab2015-05-07 23:23:23 -0700190 eventFuture = store.relinquishRole(nodeId, deviceId);
Ayaka Koshibee60d4522014-10-28 15:07:00 -0700191 break;
192 default:
193 log.info("Unknown role; ignoring");
Madan Jampanide003d92015-05-11 17:14:20 -0700194 return CompletableFuture.completedFuture(null);
Ayaka Koshibe971a38a2014-09-30 11:56:23 -0700195 }
Ayaka Koshibed9f693e2014-09-29 18:04:54 -0700196
Madan Jampanic6e574f2015-05-29 13:41:52 -0700197 return eventFuture.thenAccept(this::post)
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800198 .thenApply(v -> null);
Ayaka Koshibe16609692014-09-23 12:46:15 -0700199 }
200
201 @Override
tomb41d1ac2014-09-24 01:51:24 -0700202 public MastershipRole getLocalRole(DeviceId deviceId) {
Changhoon Yoonb856b812015-08-10 03:47:19 +0900203 checkPermission(CLUSTER_READ);
Changhoon Yoon541ef712015-05-23 17:18:34 +0900204
tomb41d1ac2014-09-24 01:51:24 -0700205 checkNotNull(deviceId, DEVICE_ID_NULL);
206 return store.getRole(clusterService.getLocalNode().id(), deviceId);
207 }
208
209 @Override
Madan Jampanic6e574f2015-05-29 13:41:52 -0700210 public CompletableFuture<Void> relinquishMastership(DeviceId deviceId) {
Changhoon Yoonb856b812015-08-10 03:47:19 +0900211 checkPermission(CLUSTER_WRITE);
Madan Jampanic6e574f2015-05-29 13:41:52 -0700212 return store.relinquishRole(localNodeId, deviceId)
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800213 .thenAccept(this::post)
214 .thenApply(v -> null);
tomb41d1ac2014-09-24 01:51:24 -0700215 }
216
217 @Override
Madan Jampanide003d92015-05-11 17:14:20 -0700218 public CompletableFuture<MastershipRole> requestRoleFor(DeviceId deviceId) {
Changhoon Yoonb856b812015-08-10 03:47:19 +0900219 checkPermission(CLUSTER_WRITE);
Changhoon Yoon541ef712015-05-23 17:18:34 +0900220
tomb41d1ac2014-09-24 01:51:24 -0700221 checkNotNull(deviceId, DEVICE_ID_NULL);
Yuta HIGUCHI6a462832014-11-23 23:56:03 -0800222 final Context timer = startTimer(requestRoleTimer);
Madan Jampanide003d92015-05-11 17:14:20 -0700223 return store.requestRole(deviceId).whenComplete((result, error) -> stopTimer(timer));
224
tomb41d1ac2014-09-24 01:51:24 -0700225 }
226
227 @Override
Ayaka Koshibe16609692014-09-23 12:46:15 -0700228 public NodeId getMasterFor(DeviceId deviceId) {
Changhoon Yoonb856b812015-08-10 03:47:19 +0900229 checkPermission(CLUSTER_READ);
Changhoon Yoon541ef712015-05-23 17:18:34 +0900230
Ayaka Koshibe16609692014-09-23 12:46:15 -0700231 checkNotNull(deviceId, DEVICE_ID_NULL);
232 return store.getMaster(deviceId);
233 }
234
235 @Override
236 public Set<DeviceId> getDevicesOf(NodeId nodeId) {
Changhoon Yoonb856b812015-08-10 03:47:19 +0900237 checkPermission(CLUSTER_READ);
Changhoon Yoon541ef712015-05-23 17:18:34 +0900238
Ayaka Koshibe16609692014-09-23 12:46:15 -0700239 checkNotNull(nodeId, NODE_ID_NULL);
240 return store.getDevices(nodeId);
241 }
242
Ayaka Koshibe45503ce2014-10-14 11:26:45 -0700243 @Override
Ayaka Koshibeabedb092014-10-20 17:01:31 -0700244 public RoleInfo getNodesFor(DeviceId deviceId) {
Changhoon Yoonb856b812015-08-10 03:47:19 +0900245 checkPermission(CLUSTER_READ);
Changhoon Yoon541ef712015-05-23 17:18:34 +0900246
Ayaka Koshibe45503ce2014-10-14 11:26:45 -0700247 checkNotNull(deviceId, DEVICE_ID_NULL);
248 return store.getNodes(deviceId);
249 }
Ayaka Koshibeb70d34b2014-09-25 15:43:01 -0700250
251 @Override
Jordan Halterman0a2bd452018-06-13 17:24:58 -0700252 public MastershipInfo getMastershipFor(DeviceId deviceId) {
253 checkPermission(CLUSTER_READ);
254 checkNotNull(deviceId, DEVICE_ID_NULL);
255 return store.getMastership(deviceId);
256 }
257
258 @Override
Yuta HIGUCHIbcac4992014-11-22 19:27:57 -0800259 public MastershipTerm getMastershipTerm(DeviceId deviceId) {
Heedo Kang4a47a302016-02-29 17:40:23 +0900260 checkPermission(CLUSTER_READ);
Yuta HIGUCHIbcac4992014-11-22 19:27:57 -0800261 return store.getTermFor(deviceId);
Ayaka Koshibeb70d34b2014-09-25 15:43:01 -0700262 }
263
Ayaka Koshibe16609692014-09-23 12:46:15 -0700264 @Override
Yuta HIGUCHIa22f69f2014-11-24 22:25:17 -0800265 public MetricsService metricsService() {
266 return metricsService;
267 }
Ayaka Koshibe16609692014-09-23 12:46:15 -0700268
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800269 @Override
270 public void balanceRoles() {
271 List<ControllerNode> nodes = newArrayList(clusterService.getNodes());
Thomas Vachuska12dfdc32014-11-29 16:03:12 -0800272 Map<ControllerNode, Set<DeviceId>> controllerDevices = new HashMap<>();
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800273 Set<DeviceId> orphanedDevices = Sets.newHashSet();
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800274 int deviceCount = 0;
275
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800276 // Create buckets reflecting current ownership; do this irrespective of
277 // whether the node is active.
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800278 for (ControllerNode node : nodes) {
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800279 Set<DeviceId> devicesOf = new HashSet<>(getDevicesOf(node.id()));
Thomas Vachuska7a8de842016-03-07 20:56:35 -0800280 if (clusterService.getState(node.id()).isActive()) {
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800281 log.info("Node {} has {} devices.", node.id(), devicesOf.size());
Thomas Vachuska12dfdc32014-11-29 16:03:12 -0800282 deviceCount += devicesOf.size();
283 controllerDevices.put(node, devicesOf);
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800284 } else if (!devicesOf.isEmpty()) {
285 log.warn("Inactive node {} has {} orphaned devices.", node.id(), devicesOf.size());
286 orphanedDevices.addAll(getDevicesOf(node.id()));
Thomas Vachuska12dfdc32014-11-29 16:03:12 -0800287 }
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800288 }
289
Claudine Chiudce08152016-03-09 18:19:28 +0000290 if (useRegionForBalanceRoles && balanceRolesUsingRegions(controllerDevices)) {
291 return;
292 }
293
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800294 List<CompletableFuture<Void>> balanceBucketsFutures = Lists.newLinkedList();
Claudine Chiudce08152016-03-09 18:19:28 +0000295
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800296 // First re-balance the buckets until they are roughly even.
297 balanceControllerNodes(controllerDevices, deviceCount, balanceBucketsFutures);
298
299 // Then attempt to distribute any orphaned devices among the buckets.
300 distributeOrphanedDevices(controllerDevices, orphanedDevices, balanceBucketsFutures);
301
302 CompletableFuture<Void> balanceRolesFuture =
303 allOf(balanceBucketsFutures.toArray(new CompletableFuture[balanceBucketsFutures.size()]));
Claudine Chiudce08152016-03-09 18:19:28 +0000304
305 Futures.getUnchecked(balanceRolesFuture);
306 }
307
pierventrecdcd91c2021-05-06 09:27:00 +0200308 @Override
309 public void demote(NodeId instance, DeviceId deviceId) {
310 checkNotNull(instance, NODE_ID_NULL);
311 checkNotNull(deviceId, DEVICE_ID_NULL);
312 checkPermission(CLUSTER_WRITE);
313
314 store.demote(instance, deviceId);
315 }
316
Claudine Chiudce08152016-03-09 18:19:28 +0000317 /**
318 * Balances the nodes specified in controllerDevices.
319 *
320 * @param controllerDevices controller nodes to devices map
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800321 * @param deviceCount number of devices mastered by controller nodes
322 * @param futures list of setRole futures for "moved" devices
Claudine Chiudce08152016-03-09 18:19:28 +0000323 */
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800324 private void balanceControllerNodes(Map<ControllerNode, Set<DeviceId>> controllerDevices,
325 int deviceCount,
326 List<CompletableFuture<Void>> futures) {
Thomas Vachuska12dfdc32014-11-29 16:03:12 -0800327 // Now re-balance the buckets until they are roughly even.
328 int rounds = controllerDevices.keySet().size();
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800329 for (int i = 0; i < rounds; i++) {
330 // Iterate over the buckets and find the smallest and the largest.
Thomas Vachuska12dfdc32014-11-29 16:03:12 -0800331 ControllerNode smallest = findBucket(true, controllerDevices);
332 ControllerNode largest = findBucket(false, controllerDevices);
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800333 futures.add(balanceBuckets(smallest, largest, controllerDevices, deviceCount));
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800334 }
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800335 }
336
337 /**
338 * Uses the set of orphaned devices to even out the load among the controllers.
339 *
340 * @param controllerDevices controller nodes to devices map
341 * @param orphanedDevices set of orphaned devices without an active master
342 * @param futures list of completable future to track the progress of the balancing operation
343 */
344 private void distributeOrphanedDevices(Map<ControllerNode, Set<DeviceId>> controllerDevices,
345 Set<DeviceId> orphanedDevices,
346 List<CompletableFuture<Void>> futures) {
347 // Now re-distribute the orphaned devices into buckets until they are roughly even.
348 while (!orphanedDevices.isEmpty()) {
349 // Iterate over the buckets and find the smallest bucket.
350 ControllerNode smallest = findBucket(true, controllerDevices);
351 changeMastership(smallest, controllerDevices.get(smallest),
352 orphanedDevices, 1, futures);
353 }
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800354 }
355
Claudine Chiudce08152016-03-09 18:19:28 +0000356 /**
357 * Finds node with the minimum/maximum devices from a list of nodes.
358 *
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800359 * @param min true: minimum, false: maximum
Claudine Chiudce08152016-03-09 18:19:28 +0000360 * @param controllerDevices controller nodes to devices map
361 * @return controller node with minimum/maximum devices
362 */
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800363
Thomas Vachuska12dfdc32014-11-29 16:03:12 -0800364 private ControllerNode findBucket(boolean min,
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800365 Map<ControllerNode, Set<DeviceId>> controllerDevices) {
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800366 int xSize = min ? Integer.MAX_VALUE : -1;
367 ControllerNode xNode = null;
Thomas Vachuska12dfdc32014-11-29 16:03:12 -0800368 for (ControllerNode node : controllerDevices.keySet()) {
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800369 int size = controllerDevices.get(node).size();
370 if ((min && size < xSize) || (!min && size > xSize)) {
371 xSize = size;
372 xNode = node;
373 }
374 }
375 return xNode;
376 }
377
Claudine Chiudce08152016-03-09 18:19:28 +0000378 /**
379 * Balance the node buckets by moving devices from largest to smallest node.
380 *
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800381 * @param smallest node that is master of the smallest number of devices
382 * @param largest node that is master of the largest number of devices
Claudine Chiudce08152016-03-09 18:19:28 +0000383 * @param controllerDevices controller nodes to devices map
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800384 * @param deviceCount number of devices mastered by controller nodes
Claudine Chiudce08152016-03-09 18:19:28 +0000385 * @return list of setRole futures for "moved" devices
386 */
Madan Jampanide003d92015-05-11 17:14:20 -0700387 private CompletableFuture<Void> balanceBuckets(ControllerNode smallest, ControllerNode largest,
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800388 Map<ControllerNode, Set<DeviceId>> controllerDevices,
389 int deviceCount) {
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800390 Collection<DeviceId> minBucket = controllerDevices.get(smallest);
391 Collection<DeviceId> maxBucket = controllerDevices.get(largest);
392 int bucketCount = controllerDevices.keySet().size();
393
394 int delta = (maxBucket.size() - minBucket.size()) / 2;
395 delta = Math.min(deviceCount / bucketCount, delta);
396
Madan Jampanide003d92015-05-11 17:14:20 -0700397 List<CompletableFuture<Void>> setRoleFutures = Lists.newLinkedList();
398
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800399 if (delta > 0) {
400 log.info("Attempting to move {} nodes from {} to {}...", delta,
401 largest.id(), smallest.id());
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800402 changeMastership(smallest, minBucket, maxBucket, delta, setRoleFutures);
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800403 }
Madan Jampanide003d92015-05-11 17:14:20 -0700404
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800405 return allOf(setRoleFutures.toArray(new CompletableFuture[setRoleFutures.size()]));
406 }
407
408 /**
409 * Changes mastership for the specified number of devices in the given source
410 * bucket to the specified node and ads those devices to the given target
411 * bucket. Also adds the futures for tracking the role reassignment progress.
412 *
413 * @param toNode target controller node
414 * @param toBucket target bucket
415 * @param fromBucket source bucket
416 * @param count number of devices
417 * @param futures futures for tracking operation progress
418 */
419 private void changeMastership(ControllerNode toNode, Collection<DeviceId> toBucket,
420 Collection<DeviceId> fromBucket, int count,
421 List<CompletableFuture<Void>> futures) {
422 int i = 0;
423 Iterator<DeviceId> it = fromBucket.iterator();
424 while (it.hasNext() && i < count) {
425 DeviceId deviceId = it.next();
426 log.info("Setting {} as the master for {}", toNode.id(), deviceId);
427 futures.add(setRole(toNode.id(), deviceId, MASTER));
428 toBucket.add(deviceId);
429 it.remove();
430 i++;
431 }
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800432 }
433
Claudine Chiudce08152016-03-09 18:19:28 +0000434 /**
435 * Balances the nodes considering Region information.
436 *
437 * @param allControllerDevices controller nodes to devices map
438 * @return true: nodes balanced; false: nodes not balanced
439 */
440 private boolean balanceRolesUsingRegions(Map<ControllerNode, Set<DeviceId>> allControllerDevices) {
441 Set<Region> regions = regionService.getRegions();
442 if (regions.isEmpty()) {
443 return false; // no balancing was done using regions.
444 }
445
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800446 // Handle nodes belonging to regions
Claudine Chiudce08152016-03-09 18:19:28 +0000447 Set<ControllerNode> nodesInRegions = Sets.newHashSet();
448 for (Region region : regions) {
449 Map<ControllerNode, Set<DeviceId>> activeRegionControllers =
450 balanceRolesInRegion(region, allControllerDevices);
451 nodesInRegions.addAll(activeRegionControllers.keySet());
452 }
453
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800454 // Handle nodes not belonging to any region
Claudine Chiudce08152016-03-09 18:19:28 +0000455 Set<ControllerNode> nodesNotInRegions = Sets.difference(allControllerDevices.keySet(), nodesInRegions);
456 if (!nodesNotInRegions.isEmpty()) {
457 int deviceCount = 0;
458 Map<ControllerNode, Set<DeviceId>> controllerDevicesNotInRegions = new HashMap<>();
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800459 for (ControllerNode controllerNode : nodesNotInRegions) {
Claudine Chiudce08152016-03-09 18:19:28 +0000460 controllerDevicesNotInRegions.put(controllerNode, allControllerDevices.get(controllerNode));
461 deviceCount += allControllerDevices.get(controllerNode).size();
462 }
463 // Now re-balance the buckets until they are roughly even.
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800464 List<CompletableFuture<Void>> balanceBucketsFutures = Lists.newArrayList();
465 balanceControllerNodes(controllerDevicesNotInRegions, deviceCount, balanceBucketsFutures);
Claudine Chiudce08152016-03-09 18:19:28 +0000466
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800467 CompletableFuture<Void> balanceRolesFuture = allOf(
Claudine Chiudce08152016-03-09 18:19:28 +0000468 balanceBucketsFutures.toArray(new CompletableFuture[balanceBucketsFutures.size()]));
469
470 Futures.getUnchecked(balanceRolesFuture);
471 }
472 return true; // balancing was done using regions.
473 }
474
475 /**
476 * Balances the nodes in specified region.
477 *
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800478 * @param region region in which nodes are to be balanced
Claudine Chiudce08152016-03-09 18:19:28 +0000479 * @param allControllerDevices controller nodes to devices map
480 * @return controller nodes that were balanced
481 */
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800482 private Map<ControllerNode, Set<DeviceId>>
483 balanceRolesInRegion(Region region,
484 Map<ControllerNode, Set<DeviceId>> allControllerDevices) {
Claudine Chiudce08152016-03-09 18:19:28 +0000485
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800486 // Retrieve all devices associated with specified region
Claudine Chiudce08152016-03-09 18:19:28 +0000487 Set<DeviceId> devicesInRegion = regionService.getRegionDevices(region.id());
488 log.info("Region {} has {} devices.", region.id(), devicesInRegion.size());
489 if (devicesInRegion.isEmpty()) {
490 return new HashMap<>(); // no devices in this region, so nothing to balance.
491 }
492
493 List<Set<NodeId>> mastersList = region.masters();
494 log.info("Region {} has {} sets of masters.", region.id(), mastersList.size());
495 if (mastersList.isEmpty()) {
496 // TODO handle devices that belong to a region, which has no masters defined
497 return new HashMap<>(); // for now just leave devices alone
498 }
499
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800500 // Get the region's preferred set of masters
Claudine Chiudce08152016-03-09 18:19:28 +0000501 Set<DeviceId> devicesInMasters = Sets.newHashSet();
502 Map<ControllerNode, Set<DeviceId>> regionalControllerDevices =
503 getRegionsPreferredMasters(region, devicesInMasters, allControllerDevices);
504
505 // Now re-balance the buckets until they are roughly even.
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800506 List<CompletableFuture<Void>> balanceBucketsFutures = Lists.newArrayList();
507 balanceControllerNodes(regionalControllerDevices, devicesInMasters.size(), balanceBucketsFutures);
Claudine Chiudce08152016-03-09 18:19:28 +0000508
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800509 // Handle devices that are not currently mastered by the master node set
Claudine Chiudce08152016-03-09 18:19:28 +0000510 Set<DeviceId> devicesNotMasteredWithControllers = Sets.difference(devicesInRegion, devicesInMasters);
511 if (!devicesNotMasteredWithControllers.isEmpty()) {
512 // active controllers in master node set are already balanced, just
513 // assign device mastership in sequence
514 List<ControllerNode> sorted = new ArrayList<>(regionalControllerDevices.keySet());
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800515 Collections.sort(sorted, Comparator.comparingInt(o -> (regionalControllerDevices.get(o)).size()));
Claudine Chiudce08152016-03-09 18:19:28 +0000516 int deviceIndex = 0;
517 for (DeviceId deviceId : devicesNotMasteredWithControllers) {
518 ControllerNode cnode = sorted.get(deviceIndex % sorted.size());
519 balanceBucketsFutures.add(setRole(cnode.id(), deviceId, MASTER));
520 regionalControllerDevices.get(cnode).add(deviceId);
521 deviceIndex++;
522 }
523 }
524
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800525 CompletableFuture<Void> balanceRolesFuture =
526 allOf(balanceBucketsFutures.toArray(new CompletableFuture[balanceBucketsFutures.size()]));
Claudine Chiudce08152016-03-09 18:19:28 +0000527
528 Futures.getUnchecked(balanceRolesFuture);
529
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800530 // Update the map before returning
Claudine Chiudce08152016-03-09 18:19:28 +0000531 regionalControllerDevices.forEach((controllerNode, deviceIds) -> {
532 regionalControllerDevices.put(controllerNode, new HashSet<>(getDevicesOf(controllerNode.id())));
533 });
534
535 return regionalControllerDevices;
536 }
537
538 /**
539 * Get region's preferred set of master nodes - the first master node set that has at
540 * least one active node.
541 *
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800542 * @param region region for which preferred set of master nodes is requested
543 * @param devicesInMasters device set to track devices in preferred set of master nodes
Claudine Chiudce08152016-03-09 18:19:28 +0000544 * @param allControllerDevices controller nodes to devices map
545 * @return region's preferred master nodes (and devices that use them as masters)
546 */
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800547 private Map<ControllerNode, Set<DeviceId>>
548 getRegionsPreferredMasters(Region region,
549 Set<DeviceId> devicesInMasters,
550 Map<ControllerNode, Set<DeviceId>> allControllerDevices) {
Claudine Chiudce08152016-03-09 18:19:28 +0000551 Map<ControllerNode, Set<DeviceId>> regionalControllerDevices = new HashMap<>();
552 int listIndex = 0;
Thomas Vachuskaf2e09cb2017-11-06 15:17:06 -0800553 for (Set<NodeId> masterSet : region.masters()) {
Claudine Chiudce08152016-03-09 18:19:28 +0000554 log.info("Region {} masters set {} has {} nodes.",
555 region.id(), listIndex, masterSet.size());
556 if (masterSet.isEmpty()) { // nothing on this level
557 listIndex++;
558 continue;
559 }
560 // Create buckets reflecting current ownership.
561 for (NodeId nodeId : masterSet) {
Michele Santuari6ebb36e2016-03-28 10:12:04 -0700562 if (clusterService.getState(nodeId).isActive()) {
Claudine Chiudce08152016-03-09 18:19:28 +0000563 ControllerNode controllerNode = clusterService.getNode(nodeId);
564 Set<DeviceId> devicesOf = new HashSet<>(allControllerDevices.get(controllerNode));
565 regionalControllerDevices.put(controllerNode, devicesOf);
566 devicesInMasters.addAll(devicesOf);
567 log.info("Active Node {} has {} devices.", nodeId, devicesOf.size());
568 }
569 }
570 if (!regionalControllerDevices.isEmpty()) {
571 break; // now have a set of >0 active controllers
572 }
573 listIndex++; // keep on looking
574 }
575 return regionalControllerDevices;
576 }
Thomas Vachuska1e68bdd2014-11-29 13:53:10 -0800577
alshabib339a3d92014-09-26 17:54:32 -0700578 public class InternalDelegate implements MastershipStoreDelegate {
alshabib339a3d92014-09-26 17:54:32 -0700579 @Override
580 public void notify(MastershipEvent event) {
Thomas Vachuska42e8cce2015-07-29 19:25:18 -0700581 post(event);
alshabib339a3d92014-09-26 17:54:32 -0700582 }
alshabib339a3d92014-09-26 17:54:32 -0700583 }
584
Jordan Halterman713830d2017-10-07 13:40:44 -0700585 private class InternalUpgradeEventListener implements UpgradeEventListener {
586 @Override
587 public void event(UpgradeEvent event) {
588 if (rebalanceRolesOnUpgrade &&
589 (event.type() == UpgradeEvent.Type.COMMITTED || event.type() == UpgradeEvent.Type.RESET)) {
590 balanceRoles();
591 }
592 }
593 }
594
Ayaka Koshibe16609692014-09-23 12:46:15 -0700595}