Role management for the case where the controller does not hear back from
the registry-service causing the handshake to time out. Earlier we were
disconnecting the switch. With this patch, we will move to state Equal/Slave.
This behavior should be changed, once the underlying registry replies
to every mastership request (currently only replies to the controller that
wins mastership).
Also changed the timeouts:
- role-reply not received timeout to 3 secs
- handshake not completed timeout to 10 secs
Previously the timeouts were 10 and 60 secs respectively, which is too long
to figure out that something has gone wrong.
Change-Id: I58bf56f511992ce874407f5205c42a38cd6403ae
diff --git a/src/main/java/net/floodlightcontroller/core/internal/OFChannelHandler.java b/src/main/java/net/floodlightcontroller/core/internal/OFChannelHandler.java
index a69e933..d2e4156 100644
--- a/src/main/java/net/floodlightcontroller/core/internal/OFChannelHandler.java
+++ b/src/main/java/net/floodlightcontroller/core/internal/OFChannelHandler.java
@@ -84,7 +84,7 @@
private static final Logger log = LoggerFactory.getLogger(OFChannelHandler.class);
- private static final long DEFAULT_ROLE_TIMEOUT_MS = 10 * 1000; // 10 sec
+ private static final long DEFAULT_ROLE_TIMEOUT_MS = 3 * 1000; // 3 sec
private final Controller controller;
private final Counters counters;
private IOFSwitch sw;
@@ -1033,6 +1033,15 @@
}
@Override
+ public void handleTimedOutHandshake(OFChannelHandler h,
+ ChannelHandlerContext ctx) throws IOException {
+ log.info("Handshake timed out waiting to hear back from registry "
+ + "service. Moving to Role EQUAL for switch {}",
+ h.getSwitchInfoString());
+ setRoleAndStartDriverHandshake(h, Role.EQUAL);
+ }
+
+ @Override
void processOFFeaturesReply(OFChannelHandler h, OFFeaturesReply m)
throws IOException, SwitchStateException {
illegalMessageReceived(h, m);
@@ -1893,6 +1902,41 @@
h.getSwitchInfoString(), this.toString(), pendingRole);
throw new SwitchStateException(msg);
}
+
+ /**
+ * Handles switch handshake timeout.
+ * <p>
+ * If the handshake times-out while the switch is in any state other
+ * than WAIT_INITIAL_ROLE, then the switch is disconnected.
+ * <p>
+ * If the switch is in WAIT_INITIAL_ROLE state, and a pending role reply
+ * is not received, it would trigger the role reply timeout, which would
+ * be handled by handleTimedOutRoleReply (which would disconnect the
+ * switch).
+ * <p>
+ * If the switch is in WAIT_INITIAL_ROLE state, when the handshake
+ * timeout is triggered, then it's because we have not heard back from
+ * the registry service regarding switch mastership. In this case, we
+ * move to EQUAL (or SLAVE) state. See override for this method in
+ * WAIT_INITIAL_ROLE state.
+ * <p>
+ * XXX: This is required today as the registry service does not reply
+ * with role.slave to a mastership request, i.e it only replies to the
+ * controller that wins mastership. Once the registry API changes to
+ * reply to every request, we would not need to wait for a timeout to
+ * move to Role.EQUAL (or SLAVE).
+ *
+ * @param h the channel handler for this switch
+ * @param ctx the netty channel handler context for the channel 'h'
+ * @throws IOException
+ */
+ public void handleTimedOutHandshake(OFChannelHandler h,
+ ChannelHandlerContext ctx) throws IOException {
+ log.error("Disconnecting switch {}: failed to complete handshake",
+ h.getSwitchInfoString());
+ h.counters.switchDisconnectHandshakeTimeout.updateCounterWithFlush();
+ ctx.getChannel().close();
+ }
}
// *************************
@@ -1996,10 +2040,9 @@
counters.switchDisconnectReadTimeout.updateCounterWithFlush();
ctx.getChannel().close();
} else if (e.getCause() instanceof HandshakeTimeoutException) {
- log.error("Disconnecting switch {}: failed to complete handshake",
- getSwitchInfoString());
- counters.switchDisconnectHandshakeTimeout.updateCounterWithFlush();
- ctx.getChannel().close();
+ // handle timeout within state-machine - different actions taken
+ // depending on current state
+ state.handleTimedOutHandshake(this, ctx);
} else if (e.getCause() instanceof ClosedChannelException) {
log.debug("Channel for sw {} already closed", getSwitchInfoString());
} else if (e.getCause() instanceof IOException) {