blob: 213d0f4d66c077e50685e65a97ab4b00fbbe9643 [file] [log] [blame]
/*
 * Copyright 2014 Open Networking Laboratory
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
Brian O'Connorabafb502014-12-02 22:26:20 -080016package org.onosproject.store.cluster.impl;
tom2d7c65f2014-09-23 01:09:35 -070017
Ayaka Koshibedd91b842015-03-02 14:48:47 -080018import static com.google.common.base.Preconditions.checkArgument;
19import static com.google.common.base.Preconditions.checkNotNull;
20import static org.onlab.util.Tools.groupedThreads;
21import static org.slf4j.LoggerFactory.getLogger;
22
23import java.io.File;
24import java.io.IOException;
25import java.net.InetAddress;
26import java.net.NetworkInterface;
27import java.net.SocketException;
28import java.util.Enumeration;
29import java.util.Map;
30import java.util.Set;
31import java.util.concurrent.ExecutorService;
32import java.util.concurrent.Executors;
33import java.util.concurrent.ScheduledExecutorService;
34import java.util.concurrent.TimeUnit;
35import java.util.stream.Collectors;
Yuta HIGUCHIb5df76d2014-09-27 20:54:00 -070036
tom2d7c65f2014-09-23 01:09:35 -070037import org.apache.felix.scr.annotations.Activate;
38import org.apache.felix.scr.annotations.Component;
39import org.apache.felix.scr.annotations.Deactivate;
tom2d7c65f2014-09-23 01:09:35 -070040import org.apache.felix.scr.annotations.Service;
Ayaka Koshibedd91b842015-03-02 14:48:47 -080041import org.onlab.netty.Endpoint;
42import org.onlab.netty.Message;
43import org.onlab.netty.MessageHandler;
44import org.onlab.netty.NettyMessagingService;
45import org.onlab.packet.IpAddress;
46import org.onlab.util.KryoNamespace;
Brian O'Connorabafb502014-12-02 22:26:20 -080047import org.onosproject.cluster.ClusterEvent;
48import org.onosproject.cluster.ClusterStore;
49import org.onosproject.cluster.ClusterStoreDelegate;
50import org.onosproject.cluster.ControllerNode;
51import org.onosproject.cluster.DefaultControllerNode;
Ayaka Koshibedd91b842015-03-02 14:48:47 -080052import org.onosproject.cluster.ControllerNode.State;
Brian O'Connorabafb502014-12-02 22:26:20 -080053import org.onosproject.cluster.NodeId;
Ayaka Koshibedd91b842015-03-02 14:48:47 -080054import org.onosproject.store.AbstractStore;
55import org.onosproject.store.serializers.KryoNamespaces;
56import org.onosproject.store.serializers.KryoSerializer;
57import org.slf4j.Logger;
tom2d7c65f2014-09-23 01:09:35 -070058
Ayaka Koshibedd91b842015-03-02 14:48:47 -080059import com.google.common.collect.ImmutableSet;
60import com.google.common.collect.Maps;
61import com.hazelcast.util.AddressUtil;
tom2d7c65f2014-09-23 01:09:35 -070062
tom2d7c65f2014-09-23 01:09:35 -070063@Component(immediate = true)
64@Service
/**
 * Distributed cluster nodes store that employs an accrual failure
 * detector to identify cluster member up/down status.
 */
tom0755a362014-09-24 11:54:43 -070069public class DistributedClusterStore
Ayaka Koshibedd91b842015-03-02 14:48:47 -080070 extends AbstractStore<ClusterEvent, ClusterStoreDelegate>
tomb41d1ac2014-09-24 01:51:24 -070071 implements ClusterStore {
tom2d7c65f2014-09-23 01:09:35 -070072
Ayaka Koshibedd91b842015-03-02 14:48:47 -080073 private final Logger log = getLogger(DistributedClusterStore.class);
tom2d7c65f2014-09-23 01:09:35 -070074
Ayaka Koshibedd91b842015-03-02 14:48:47 -080075 // TODO: make these configurable.
76 private static final int HEARTBEAT_FD_PORT = 2419;
77 private static final int HEARTBEAT_INTERVAL_MS = 100;
78 private static final int PHI_FAILURE_THRESHOLD = 10;
tom2d7c65f2014-09-23 01:09:35 -070079
Ayaka Koshibedd91b842015-03-02 14:48:47 -080080 private static final String CONFIG_DIR = "../config";
81 private static final String CLUSTER_DEFINITION_FILE = "cluster.json";
82 private static final String HEARTBEAT_MESSAGE = "onos-cluster-heartbeat";
Yuta HIGUCHId1a63e92014-12-02 13:14:28 -080083
Ayaka Koshibedd91b842015-03-02 14:48:47 -080084 private static final KryoSerializer SERIALIZER = new KryoSerializer() {
85 @Override
86 protected void setupKryoPool() {
87 serializerPool = KryoNamespace.newBuilder()
88 .register(KryoNamespaces.API)
89 .register(HeartbeatMessage.class)
90 .build()
91 .populate(1);
92 }
93 };
94
95 private static final String INSTANCE_ID_NULL = "Instance ID cannot be null";
96
97 private ClusterDefinition clusterDefinition;
98
99 private Set<ControllerNode> seedNodes;
100 private final Map<NodeId, ControllerNode> allNodes = Maps.newConcurrentMap();
101 private final Map<NodeId, State> nodeStates = Maps.newConcurrentMap();
102 private NettyMessagingService messagingService = new NettyMessagingService();
103 private ScheduledExecutorService heartBeatSender = Executors.newSingleThreadScheduledExecutor(
104 groupedThreads("onos/cluster/membership", "heartbeat-sender"));
105 private ExecutorService heartBeatMessageHandler = Executors.newSingleThreadExecutor(
106 groupedThreads("onos/cluster/membership", "heartbeat-receiver"));
107
108 private PhiAccrualFailureDetector failureDetector;
109
110 private ControllerNode localNode;
111
tom2d7c65f2014-09-23 01:09:35 -0700112 @Activate
113 public void activate() {
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800114 File clusterDefinitionFile = new File(CONFIG_DIR,
115 CLUSTER_DEFINITION_FILE);
tom2d7c65f2014-09-23 01:09:35 -0700116
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800117 try {
118 clusterDefinition = new ClusterDefinitionStore(
119 clusterDefinitionFile.getPath()).read();
120 seedNodes = ImmutableSet
121 .copyOf(clusterDefinition.getNodes())
122 .stream()
123 .map(nodeInfo -> new DefaultControllerNode(new NodeId(
124 nodeInfo.getId()), IpAddress.valueOf(nodeInfo
125 .getIp()), nodeInfo.getTcpPort()))
126 .collect(Collectors.toSet());
127 } catch (IOException e) {
128 throw new IllegalStateException(
129 "Failed to read cluster definition.", e);
130 }
tomb41d1ac2014-09-24 01:51:24 -0700131
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800132 seedNodes.forEach(node -> {
133 allNodes.put(node.id(), node);
134 nodeStates.put(node.id(), State.INACTIVE);
135 });
136
137 establishSelfIdentity();
138
139 messagingService = new NettyMessagingService(HEARTBEAT_FD_PORT);
140
141 try {
142 messagingService.activate();
143 } catch (InterruptedException e) {
144 Thread.currentThread().interrupt();
145 throw new IllegalStateException(
146 "Failed to cleanly initialize membership and"
147 + " failure detector communication channel.", e);
148 }
149 messagingService.registerHandler(HEARTBEAT_MESSAGE,
150 new HeartbeatMessageHandler(), heartBeatMessageHandler);
151
152 failureDetector = new PhiAccrualFailureDetector();
153
154 heartBeatSender.scheduleWithFixedDelay(this::heartbeat, 0,
155 HEARTBEAT_INTERVAL_MS, TimeUnit.MILLISECONDS);
tomb41d1ac2014-09-24 01:51:24 -0700156
157 log.info("Started");
158 }
159
tom2d7c65f2014-09-23 01:09:35 -0700160 @Deactivate
161 public void deactivate() {
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800162 try {
163 messagingService.deactivate();
164 } catch (Exception e) {
165 log.trace("Failed to cleanly shutdown cluster membership messaging", e);
166 }
167
168 heartBeatSender.shutdownNow();
169 heartBeatMessageHandler.shutdownNow();
170
tom2d7c65f2014-09-23 01:09:35 -0700171 log.info("Stopped");
172 }
173
174 @Override
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800175 public void setDelegate(ClusterStoreDelegate delegate) {
176 checkNotNull(delegate, "Delegate cannot be null");
177 this.delegate = delegate;
178 }
179
180 @Override
181 public void unsetDelegate(ClusterStoreDelegate delegate) {
182 this.delegate = null;
183 }
184
185 @Override
186 public boolean hasDelegate() {
187 return this.delegate != null;
188 }
189
190 @Override
tom2d7c65f2014-09-23 01:09:35 -0700191 public ControllerNode getLocalNode() {
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800192 return localNode;
tom2d7c65f2014-09-23 01:09:35 -0700193 }
194
195 @Override
196 public Set<ControllerNode> getNodes() {
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800197 return ImmutableSet.copyOf(allNodes.values());
tom2d7c65f2014-09-23 01:09:35 -0700198 }
199
200 @Override
201 public ControllerNode getNode(NodeId nodeId) {
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800202 checkNotNull(nodeId, INSTANCE_ID_NULL);
203 return allNodes.get(nodeId);
tom2d7c65f2014-09-23 01:09:35 -0700204 }
205
206 @Override
tomb41d1ac2014-09-24 01:51:24 -0700207 public State getState(NodeId nodeId) {
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800208 checkNotNull(nodeId, INSTANCE_ID_NULL);
209 return nodeStates.get(nodeId);
tomb41d1ac2014-09-24 01:51:24 -0700210 }
211
212 @Override
Pavlin Radoslavov444b5192014-10-28 10:45:19 -0700213 public ControllerNode addNode(NodeId nodeId, IpAddress ip, int tcpPort) {
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800214 checkNotNull(nodeId, INSTANCE_ID_NULL);
215 checkNotNull(ip, "IP address must not be null");
216 checkArgument(tcpPort > 5000, "Tcp port must be greater than 5000");
217 ControllerNode node = new DefaultControllerNode(nodeId, ip, tcpPort);
218 allNodes.put(node.id(), node);
219 nodeStates.put(nodeId, State.INACTIVE);
220 delegate.notify(new ClusterEvent(ClusterEvent.Type.INSTANCE_ADDED, node));
221 return node;
tomee49c372014-09-26 15:14:50 -0700222 }
223
224 @Override
tomb41d1ac2014-09-24 01:51:24 -0700225 public void removeNode(NodeId nodeId) {
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800226 checkNotNull(nodeId, INSTANCE_ID_NULL);
227 ControllerNode node = allNodes.remove(nodeId);
228 if (node != null) {
229 nodeStates.remove(nodeId);
230 delegate.notify(new ClusterEvent(ClusterEvent.Type.INSTANCE_REMOVED, node));
tomb41d1ac2014-09-24 01:51:24 -0700231 }
232 }
233
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800234 private void establishSelfIdentity() {
235 try {
236 IpAddress ip = findLocalIp();
237 localNode = new DefaultControllerNode(new NodeId(ip.toString()), ip);
238 allNodes.put(localNode.id(), localNode);
239 nodeStates.put(localNode.id(), State.ACTIVE);
240 log.info("Local Node: {}", localNode);
241 } catch (SocketException e) {
242 throw new IllegalStateException("Cannot determine local IP", e);
243 }
tom2d7c65f2014-09-23 01:09:35 -0700244 }
245
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800246 private void heartbeat() {
247 try {
248 Set<ControllerNode> peers = allNodes.values()
249 .stream()
250 .filter(node -> !(node.id().equals(localNode.id())))
251 .collect(Collectors.toSet());
252 byte[] hbMessagePayload = SERIALIZER.encode(new HeartbeatMessage(localNode, peers));
253 peers.forEach((node) -> {
254 heartbeatToPeer(hbMessagePayload, node);
255 State currentState = nodeStates.get(node.id());
256 double phi = failureDetector.phi(node.id());
257 if (phi >= PHI_FAILURE_THRESHOLD) {
258 if (currentState == State.ACTIVE) {
259 nodeStates.put(node.id(), State.INACTIVE);
260 notifyStateChange(node.id(), State.ACTIVE, State.INACTIVE);
261 }
262 } else {
263 if (currentState == State.INACTIVE) {
264 nodeStates.put(node.id(), State.ACTIVE);
265 notifyStateChange(node.id(), State.INACTIVE, State.ACTIVE);
266 }
267 }
268 });
269 } catch (Exception e) {
270 log.debug("Failed to send heartbeat", e);
271 }
tomb41d1ac2014-09-24 01:51:24 -0700272 }
273
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800274 private void notifyStateChange(NodeId nodeId, State oldState, State newState) {
275 ControllerNode node = allNodes.get(nodeId);
276 if (newState == State.ACTIVE) {
277 delegate.notify(new ClusterEvent(ClusterEvent.Type.INSTANCE_ACTIVATED, node));
278 } else {
279 delegate.notify(new ClusterEvent(ClusterEvent.Type.INSTANCE_DEACTIVATED, node));
280 }
tomb41d1ac2014-09-24 01:51:24 -0700281 }
282
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800283 private void heartbeatToPeer(byte[] messagePayload, ControllerNode peer) {
284 Endpoint remoteEp = new Endpoint(peer.ip(), HEARTBEAT_FD_PORT);
285 try {
286 messagingService.sendAsync(remoteEp, HEARTBEAT_MESSAGE, messagePayload);
287 } catch (IOException e) {
288 log.debug("Sending heartbeat to {} failed", remoteEp, e);
289 }
290 }
291
292 private IpAddress findLocalIp() throws SocketException {
293 Enumeration<NetworkInterface> interfaces =
294 NetworkInterface.getNetworkInterfaces();
295 while (interfaces.hasMoreElements()) {
296 NetworkInterface iface = interfaces.nextElement();
297 Enumeration<InetAddress> inetAddresses = iface.getInetAddresses();
298 while (inetAddresses.hasMoreElements()) {
299 IpAddress ip = IpAddress.valueOf(inetAddresses.nextElement());
300 if (AddressUtil.matchInterface(ip.toString(), clusterDefinition.getIpPrefix())) {
301 return ip;
302 }
303 }
304 }
305 throw new IllegalStateException("Unable to determine local ip");
306 }
307
308 private class HeartbeatMessageHandler implements MessageHandler {
tomb41d1ac2014-09-24 01:51:24 -0700309 @Override
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800310 public void handle(Message message) throws IOException {
311 HeartbeatMessage hb = SERIALIZER.decode(message.payload());
312 failureDetector.report(hb.source().id());
313 hb.knownPeers().forEach(node -> {
314 allNodes.put(node.id(), node);
315 });
tomb41d1ac2014-09-24 01:51:24 -0700316 }
tom2d7c65f2014-09-23 01:09:35 -0700317 }
Ayaka Koshibedd91b842015-03-02 14:48:47 -0800318
319 private static class HeartbeatMessage {
320 private ControllerNode source;
321 private Set<ControllerNode> knownPeers;
322
323 public HeartbeatMessage(ControllerNode source, Set<ControllerNode> members) {
324 this.source = source;
325 this.knownPeers = ImmutableSet.copyOf(members);
326 }
327
328 public ControllerNode source() {
329 return source;
330 }
331
332 public Set<ControllerNode> knownPeers() {
333 return knownPeers;
334 }
335 }
336
tom2d7c65f2014-09-23 01:09:35 -0700337}