Role management for the case where the controller does not hear back from
the registry-service causing the handshake to time out. Earlier we were
disconnecting the switch. With this patch, we will move to state Equal/Slave.
This behavior should be changed, once the underlying registry replies
to every mastership request (currently only replies to the controller that
wins mastership).
Also changed the timeouts:
- role-reply not received timeout to 3 secs
- handshake not completed timeout to 10 secs
Previously the timeouts were 10 and 60 secs respectively, which is too long
to figure out that something has gone wrong.
Change-Id: I58bf56f511992ce874407f5205c42a38cd6403ae
diff --git a/src/main/java/net/floodlightcontroller/core/internal/OFChannelHandler.java b/src/main/java/net/floodlightcontroller/core/internal/OFChannelHandler.java
index a69e933..d2e4156 100644
--- a/src/main/java/net/floodlightcontroller/core/internal/OFChannelHandler.java
+++ b/src/main/java/net/floodlightcontroller/core/internal/OFChannelHandler.java
@@ -84,7 +84,7 @@
private static final Logger log = LoggerFactory.getLogger(OFChannelHandler.class);
- private static final long DEFAULT_ROLE_TIMEOUT_MS = 10 * 1000; // 10 sec
+ private static final long DEFAULT_ROLE_TIMEOUT_MS = 3 * 1000; // 3 sec
private final Controller controller;
private final Counters counters;
private IOFSwitch sw;
@@ -1033,6 +1033,15 @@
}
@Override
+ public void handleTimedOutHandshake(OFChannelHandler h,
+ ChannelHandlerContext ctx) throws IOException {
+ log.info("Handshake timed out waiting to hear back from registry "
+ + "service. Moving to Role EQUAL for switch {}",
+ h.getSwitchInfoString());
+ setRoleAndStartDriverHandshake(h, Role.EQUAL);
+ }
+
+ @Override
void processOFFeaturesReply(OFChannelHandler h, OFFeaturesReply m)
throws IOException, SwitchStateException {
illegalMessageReceived(h, m);
@@ -1893,6 +1902,41 @@
h.getSwitchInfoString(), this.toString(), pendingRole);
throw new SwitchStateException(msg);
}
+
+ /**
+ * Handles switch handshake timeout.
+ * <p>
+ * If the handshake times-out while the switch is in any state other
+ * than WAIT_INITIAL_ROLE, then the switch is disconnected.
+ * <p>
+ * If the switch is in WAIT_INITIAL_ROLE state, and a pending role reply
+ * is not received, it would trigger the role reply timeout, which would
+ * be handled by handleTimedOutRoleReply (which would disconnect the
+ * switch).
+ * <p>
+ * If the switch is in WAIT_INITIAL_ROLE state, when the handshake
+ * timeout is triggered, then it's because we have not heard back from
+ * the registry service regarding switch mastership. In this case, we
+ * move to EQUAL (or SLAVE) state. See override for this method in
+ * WAIT_INITIAL_ROLE state.
+ * <p>
+ * XXX: This is required today as the registry service does not reply
+ * with role.slave to a mastership request, i.e it only replies to the
+ * controller that wins mastership. Once the registry API changes to
+ * reply to every request, we would not need to wait for a timeout to
+ * move to Role.EQUAL (or SLAVE).
+ *
+ * @param h the channel handler for this switch
+ * @param ctx the netty channel handler context for the channel 'h'
+ * @throws IOException
+ */
+ public void handleTimedOutHandshake(OFChannelHandler h,
+ ChannelHandlerContext ctx) throws IOException {
+ log.error("Disconnecting switch {}: failed to complete handshake",
+ h.getSwitchInfoString());
+ h.counters.switchDisconnectHandshakeTimeout.updateCounterWithFlush();
+ ctx.getChannel().close();
+ }
}
// *************************
@@ -1996,10 +2040,9 @@
counters.switchDisconnectReadTimeout.updateCounterWithFlush();
ctx.getChannel().close();
} else if (e.getCause() instanceof HandshakeTimeoutException) {
- log.error("Disconnecting switch {}: failed to complete handshake",
- getSwitchInfoString());
- counters.switchDisconnectHandshakeTimeout.updateCounterWithFlush();
- ctx.getChannel().close();
+ // handle timeout within state-machine - different actions taken
+ // depending on current state
+ state.handleTimedOutHandshake(this, ctx);
} else if (e.getCause() instanceof ClosedChannelException) {
log.debug("Channel for sw {} already closed", getSwitchInfoString());
} else if (e.getCause() instanceof IOException) {
diff --git a/src/main/java/net/floodlightcontroller/core/internal/OpenflowPipelineFactory.java b/src/main/java/net/floodlightcontroller/core/internal/OpenflowPipelineFactory.java
index 62938fe..c087a43 100644
--- a/src/main/java/net/floodlightcontroller/core/internal/OpenflowPipelineFactory.java
+++ b/src/main/java/net/floodlightcontroller/core/internal/OpenflowPipelineFactory.java
@@ -36,6 +36,7 @@
public class OpenflowPipelineFactory
implements ChannelPipelineFactory, ExternalResourceReleasable {
+ private static final long HANDSHAKE_TIMEOUT_SECS = 10;
protected Controller controller;
protected ThreadPoolExecutor pipelineExecutor;
protected Timer timer;
@@ -61,9 +62,8 @@
pipeline.addLast("ofmessageencoder", new OFMessageEncoder());
pipeline.addLast("idle", idleHandler);
pipeline.addLast("timeout", readTimeoutHandler);
- // XXX S ONOS: was 15 increased it to fix Issue #296
pipeline.addLast("handshaketimeout",
- new HandshakeTimeoutHandler(handler, timer, 60));
+ new HandshakeTimeoutHandler(handler, timer, HANDSHAKE_TIMEOUT_SECS));
if (pipelineExecutor != null)
pipeline.addLast("pipelineExecutor",
new ExecutionHandler(pipelineExecutor));
diff --git a/src/test/java/net/floodlightcontroller/core/internal/OFChannelHandlerTest.java b/src/test/java/net/floodlightcontroller/core/internal/OFChannelHandlerTest.java
index 7fc3002..1dc2706 100644
--- a/src/test/java/net/floodlightcontroller/core/internal/OFChannelHandlerTest.java
+++ b/src/test/java/net/floodlightcontroller/core/internal/OFChannelHandlerTest.java
@@ -519,8 +519,6 @@
assertEquals(OFChannelHandler.ChannelState.WAIT_INITIAL_ROLE,
handler.getStateForTesting());
- // build the stats reply
- OFStatsReply sr = createDescriptionStatsReply();
OFMessage rr = getRoleReply(xid, Role.SLAVE);
setupMessageEvent(Collections.<OFMessage>singletonList(rr));
@@ -528,8 +526,6 @@
reset(controller);
reset(swImplBase);
- expect(controller.getOFSwitchInstance((OFDescStatsReply)sr, ofVersion))
- .andReturn(swImplBase).anyTimes();
expect(controller.getThreadPoolService())
.andReturn(threadPool).anyTimes();
expect(controller.getDebugCounter())
@@ -564,6 +560,55 @@
}
/**
+ * In WAIT_INITIAL_ROLE state the controller is waiting to hear back from
+ * the registry service. If it gets mastership it sends a role-request to
+ * the switch and if it gets the reply it moves to state MASTER.
+ * <p>
+ * However if it does not hear from the registry service, the handshake
+ * process should timeout, and the controller should move to state EQUAL.
+ * This test checks this behavior.
+ *
+ * @throws Exception
+ */
+ @Test
+ public void moveToRoleEqualFromHandshakeTimeout() throws Exception {
+ moveToWaitInitialRole();
+ int xid = 2000;
+ resetChannel();
+ reset(controller);
+ reset(swImplBase);
+ // simulate that we never heard from registry service and so the
+ // handshake times out -- this results in a HandshakeTimeoutException
+ // called on the channel.
+ ExceptionEvent eeMock = createMock(ExceptionEvent.class);
+ expect(eeMock.getCause()).andReturn(new HandshakeTimeoutException())
+ .atLeastOnce();
+ // controller should move to role EQUAL via the driver handshake
+ swImplBase.setRole(Role.EQUAL);
+ expectLastCall().once();
+ swImplBase.startDriverHandshake();
+ expectLastCall().once();
+ // assume nothing-to-do in driver handshake by returning true
+ // immediately
+ expect(swImplBase.isDriverHandshakeComplete())
+ .andReturn(true).once();
+ expect(swImplBase.getRole()).andReturn(Role.EQUAL).once();
+ expect(swImplBase.getId())
+ .andReturn(1L).anyTimes();
+ expect(controller.addActivatedEqualSwitch(1L, swImplBase))
+ .andReturn(true).once();
+
+ replay(channel);
+ replay(controller);
+ replay(swImplBase);
+ replay(eeMock);
+ handler.exceptionCaught(ctx, eeMock);
+ assertEquals(OFChannelHandler.ChannelState.EQUAL,
+ handler.getStateForTesting());
+ }
+
+
+ /**
* Move the channel from scratch to WAIT_INITIAL_ROLE state,
* then move the channel to EQUAL state based on the switch Role.
* This test basically test the switch with role support.