Update RAMCloud start-up procedure
This patch enables RAMCloud coordinator failover and
server failover if the number of replicas is properly configured.
- Update onos.sh RAMCloud start-up procedure
- Use ZooKeeper to manage cluster membership
- Specify cluster name, num replicas,...
- WARN: ramcloud.coordinator.ip is now a configuration to specify listen address
- Specify ZooKeeper address/ClusterName in RAMCloud client config (conf/ramcloud.conf)
- Note: conf/ramcloud.conf ramcloud.coordinatorIp, etc. is no longer used for connection
- Enable failure detector, which is required for fail-over feature
- Undocumented maintenance option "deldb" to remove:
- RAMCloud coordination info in ZK
- Note: ZooKeeper must be running in order to execute "./onos.sh rc-c deldb"
- BackupData file created by RAMCloud Storage Server
- Update RAMCloud Java bindings
- Expose API to specify RAMCloud ClusterName
- Expose ClientException
- Note: You need to run ./build-ramcloud-java-bindings.sh to update Java bindings
- Utility function to check if ZooKeeper is running
- Utility function to wait for ZooKeeper to start
- Modified kill-processes() to try killing process gracefully first then KILL
- Specify absolute path to pgrep/pkill
- Added "rc" to manipulate both server and coordinator
Change-Id: Ia8a282f81bf82810a4d000883611ce12f504bd16
diff --git a/conf/onos_node.conf b/conf/onos_node.conf
index 07716fd..75f9aa0 100644
--- a/conf/onos_node.conf
+++ b/conf/onos_node.conf
@@ -19,6 +19,9 @@
# myid will be assigned incrementally according to order of list
#zookeeper.hosts = onosdev1,onosdev2,onosdev3,onosdev4
+# RAMCloud clusterName (ONOS-RC by default)
+#ramcloud.clusterName = ONOS-RC
+
# Protocol used by RAMCloud coordinator (fast+udp by default)
#ramcloud.coordinator.protocol = fast+udp
@@ -28,6 +31,10 @@
# Port number of RAMCloud coordinator (12246 by default)
#ramcloud.coordinator.port = 12246
+# RAMCloud coordinator option deadServerTimeout (1000 by ONOS default)
+# Increase this value if servers are failing due to false-positive failure detection.
+#ramcloud.coordinator.deadServerTimeout = 1000
+
# Protocol used by RAMCloud server (fast+udp by default)
#ramcloud.server.protocol = fast+udp
@@ -38,13 +45,23 @@
#ramcloud.server.port = 12242
# RAMCloud server option masterServiceThreads (5 by default)
-#ramcloud.masterServiceThreads = 5
+#ramcloud.server.masterServiceThreads = 5
# RAMCloud server option logCleanerThreads (1 by default)
-#ramcloud.logCleanerThreads = 1
+#ramcloud.server.logCleanerThreads = 1
-# RAMCloud server option detectFailures [0=disabled] (0 by default for ONOS development)
-#ramcloud.detectFailures = 0
+# RAMCloud server option detectFailures [0=disabled] (1 by default)
+#ramcloud.server.detectFailures = 1
+
+# RAMCloud server option replicas (0 by default)
+# This value must be the same cluster wide.
+# On N node cluster, maximum value will be N-1 replicas
+#ramcloud.server.replicas = 0
+
+# RAMCloud server option file (/var/tmp/ramclouddata/backup.`hostname`.log by default)
+# Note: Cannot be placed in ONOS_HOME if it is on a VirtualBox shared folder.
+# File specified must be open()-able with O_DIRECT | O_SYNC options.
+#ramcloud.server.file = /var/tmp/ramclouddata/backup.${ONOS_HOST_NAME}.log
# Port number of Hazelcast (5701 by default)
#hazelcast.host.port = 5701
diff --git a/onos.sh b/onos.sh
index b54acc3..57d24db 100755
--- a/onos.sh
+++ b/onos.sh
@@ -166,7 +166,7 @@
\$ ${scriptname} {zk|rc-coord|rc-server|core} {start|stop|restart|status}
Control specific ONOS-related process"
- echo "${usage}"
+ echo "${usage}"
}
function rotate-log {
@@ -191,7 +191,11 @@
fi
for p in ${pids}; do
if [ x$p != "x" ]; then
- kill -KILL $p
+ (
+ # Ask process with SIGTERM first, if that did not kill the process
+ # wait 1s and if process still exist, force process to be killed.
+ kill -TERM $p && kill -0 $p && sleep 1 && kill -0 $p && kill -KILL $p
+ ) 2> /dev/null
echo "Killed existing process (pid: $p)"
fi
done
@@ -340,9 +344,12 @@
echo -n "Creating ${RAMCLOUD_CONF} ... "
local temp_rc=`begin-conf-creation ${RAMCLOUD_CONF}`
-
- echo "ramcloud.coordinatorIp=${RC_COORD_PROTOCOL}:host=${RC_COORD_IP}" > ${temp_rc}
- echo "ramcloud.coordinatorPort=port=${RC_COORD_PORT}" >> ${temp_rc}
+
+ local rc_cluster_name=$(read-conf ${ONOS_CONF} ramcloud.clusterName "ONOS-RC")
+
+ # TODO make ZooKeeper address configurable.
+ echo "ramcloud.locator=zk:localhost:2181" > ${temp_rc}
+ echo "ramcloud.clusterName=${rc_cluster_name}" >> ${temp_rc}
end-conf-creation ${RAMCLOUD_CONF}
@@ -440,6 +447,35 @@
${ZK_HOME}/bin/zkServer.sh status
}
+
+function check-zk {
+  # Probe ZooKeeper through status-zk, discarding everything it prints.
+  # This relies on the ZK status script being the last command in status-zk,
+  # so that status-zk's exit status is the status script's exit status.
+  if status-zk &> /dev/null; then
+    return 0
+  fi
+  return 1
+}
+
+# wait-zk-or-die {retries}
+#   Probe ZooKeeper, retrying up to {retries} more times (default 1) at 1s
+#   intervals; return 0 once it is up, else print a message and exit 1.
+function wait-zk-or-die {
+  local retries=${1:-1}
+  while true; do
+    if check-zk; then
+      return 0
+    fi
+    # Out of retries: give up now instead of sleeping before the exit.
+    (( retries > 0 )) || break
+    sleep 1
+    (( retries -= 1 ))
+  done
+  echo "ZooKeeper is not running."
+  exit 1
+}
+
############################################
@@ -466,11 +502,6 @@
fi
}
-function deldb {
-# TODO: implement
- return
-}
-
### Functions related to RAMCloud coordinator
function rc-coord-addr {
@@ -484,7 +515,6 @@
function rc-coord {
case "$1" in
start)
- deldb
stop-coord
start-coord
;;
@@ -499,6 +529,10 @@
stop)
stop-coord
;;
+ deldb)
+ stop-backend
+ del-coord-info
+ ;;
stat*) # <- status
local n=`pgrep -f obj.${RAMCLOUD_BRANCH}/coordinator | wc -l`
echo "$n RAMCloud coordinator running"
@@ -510,6 +544,8 @@
}
function start-coord {
+ wait-zk-or-die 1
+
if [ ! -d ${LOGDIR} ]; then
mkdir -p ${LOGDIR}
fi
@@ -519,27 +555,92 @@
local coord_addr=`rc-coord-addr`
+ # TODO Configuration for ZK address, port
+ local zk_addr="localhost:2181"
+ # RAMCloud cluster name
+ local rc_cluster_name=$(read-conf ${ONOS_CONF} ramcloud.clusterName "ONOS-RC")
+ # RAMCloud option deadServerTimeout
+ # (note RC default is 250ms, setting relaxed ONOS default to 1000ms)
+ local rc_coord_deadServerTimeout=$(read-conf ${ONOS_CONF} ramcloud.coordinator.deadServerTimeout 1000)
+
+ # NOTE RAMCloud document suggests to use -L to specify listen address:port,
+ # but actual RAMCloud code only uses -C argument now.
+ # (FYI: -C is documented to be deprecated in the document)
+
+ local coord_args="-C ${coord_addr}"
+ coord_args="${coord_args} --externalStorage zk:${zk_addr}"
+ coord_args="${coord_args} --clusterName ${rc_cluster_name}"
+ coord_args="${coord_args} --deadServerTimeout ${rc_coord_deadServerTimeout}"
+
+ # Read environment variables if set
+ coord_args="${coord_args} ${RC_COORDINATOR_OPTS}"
+
+ if [ "${ONOS_HOST_ROLE}" == "single-node" ]; then
+ # Note: Following reset is required, since RC restart is considered node failure,
+ # and tries recovery of server, which will never succeed after restart.
+ echo "Role configured to single-node mode. RAMCloud cluster will be reset on each start-up."
+ coord_args="${coord_args} --reset"
+ fi
+
# Run ramcloud
echo -n "Starting RAMCloud coordinator ... "
- ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/coordinator -L ${coord_addr} > $RAMCLOUD_COORD_LOG 2>&1 &
+ ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/coordinator ${coord_args} > $RAMCLOUD_COORD_LOG 2>&1 &
echo "STARTED"
}
+# Delete the RAMCloud coordination info kept in ZooKeeper by briefly running
+# a coordinator with --reset and then killing it.
+# Exits the script (via wait-zk-or-die) if ZooKeeper is not running.
+function del-coord-info {
+  wait-zk-or-die 1
+
+  if [ ! -d ${LOGDIR} ]; then
+    mkdir -p ${LOGDIR}
+  fi
+  if [ -f $RAMCLOUD_COORD_LOG ]; then
+    rotate-log $RAMCLOUD_COORD_LOG
+  fi
+
+  local coord_addr=`rc-coord-addr`
+
+  # TODO Configuration for ZK address, port
+  local zk_addr="localhost:2181"
+  # RAMCloud cluster name
+  local rc_cluster_name=$(read-conf ${ONOS_CONF} ramcloud.clusterName "ONOS-RC")
+
+  # NOTE RAMCloud document suggests to use -L to specify listen address:port,
+  # but actual RAMCloud code only uses -C argument now.
+  # (FYI: -C is documented to be deprecated in the document)
+
+  local coord_args="-C ${coord_addr}"
+  coord_args="${coord_args} --externalStorage zk:${zk_addr}"
+  coord_args="${coord_args} --clusterName ${rc_cluster_name}"
+
+  # Note: --reset will reset ZK stored info and start running as a coordinator.
+  echo -n "Deleting RAMCloud cluster coordination info ... "
+  ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/coordinator ${coord_args} --reset &> $RAMCLOUD_COORD_LOG &
+
+  # TODO Assuming 1sec is enough. To be sure monitor log?
+  sleep 1
+  # Silently kill coordinator (output is discarded below; pkill -q is not portable)
+  (pkill -f ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/coordinator &> /dev/null)
+
+  echo "DONE"
+}
function stop-coord {
- kill-processes "RAMCloud coordinator" `pgrep -f obj.${RAMCLOUD_BRANCH}/coordinator`
+ kill-processes "RAMCloud coordinator" `pgrep -f ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/coordinator`
}
### Functions related to RAMCloud server
function rc-server {
case "$1" in
start)
- deldb
stop-server
start-server
;;
startifdown)
- local n=`pgrep -f obj.${RAMCLOUD_BRANCH}/server | wc -l`
+ local n=`pgrep -f ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/server | wc -l`
if [ $n == 0 ]; then
start-server
else
@@ -549,11 +650,12 @@
stop)
stop-server
;;
-# deldb)
-# deldb
-# ;;
+ deldb)
+ stop-server
+ del-server-backup
+ ;;
stat*) # <- status
- n=`pgrep -f obj.${RAMCLOUD_BRANCH}/server | wc -l`
+ n=`pgrep -f ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/server | wc -l`
echo "$n RAMCloud server running"
;;
*)
@@ -563,6 +665,8 @@
}
function start-server {
+ wait-zk-or-die 1
+
if [ ! -d ${LOGDIR} ]; then
mkdir -p ${LOGDIR}
fi
@@ -573,18 +677,48 @@
local coord_addr=`rc-coord-addr`
local server_addr=`rc-server-addr`
- local masterServiceThreads=$(read-conf ${ONOS_CONF} ramcloud.masterServiceThreads 5)
- local logCleanerThreads=$(read-conf ${ONOS_CONF} ramcloud.logCleanerThreads 1)
- local detectFailures=$(read-conf ${ONOS_CONF} ramcloud.detectFailures 0)
+ local masterServiceThreads=$(read-conf ${ONOS_CONF} ramcloud.server.masterServiceThreads 5)
+ local logCleanerThreads=$(read-conf ${ONOS_CONF} ramcloud.server.logCleanerThreads 1)
+ local detectFailures=$(read-conf ${ONOS_CONF} ramcloud.server.detectFailures 1)
+
+ # TODO Configuration for ZK address, port
+ local zk_addr="localhost:2181"
+ # RAMCloud cluster name
+ local rc_cluster_name=$(read-conf ${ONOS_CONF} ramcloud.clusterName "ONOS-RC")
+ # replication factor (-r) config
+ local rc_replicas=$(read-conf ${ONOS_CONF} ramcloud.server.replicas 0)
+ # backup file path (-f) config
+ local rc_datafile=$(read-conf ${ONOS_CONF} ramcloud.server.file "/var/tmp/ramclouddata/backup.${ONOS_HOST_NAME}.log")
+ mkdir -p `dirname ${rc_datafile}`
+
+ local server_args="-L ${server_addr}"
+ server_args="${server_args} --externalStorage zk:${zk_addr}"
+ server_args="${server_args} --clusterName ${rc_cluster_name}"
+ server_args="${server_args} --masterServiceThreads ${masterServiceThreads}"
+ server_args="${server_args} --logCleanerThreads ${logCleanerThreads}"
+ server_args="${server_args} --detectFailures ${detectFailures}"
+ server_args="${server_args} --replicas ${rc_replicas}"
+ server_args="${server_args} --file ${rc_datafile}"
+
+ # Read environment variables if set
+ server_args="${server_args} ${RC_SERVER_OPTS}"
# Run ramcloud
echo -n "Starting RAMCloud server ... "
- ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/server -M -L ${server_addr} -C ${coord_addr} --masterServiceThreads ${masterServiceThreads} --logCleanerThreads ${logCleanerThreads} --detectFailures ${detectFailures} > $RAMCLOUD_SERVER_LOG 2>&1 &
+ ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/server ${server_args} > $RAMCLOUD_SERVER_LOG 2>&1 &
echo "STARTED"
}
+function del-server-backup {
+  # Remove the RAMCloud backup file. TODO: ask for confirmation, since data can be lost
+  echo -n "Removing RAMCloud backup server data ... "
+  local rc_datafile=$(read-conf ${ONOS_CONF} ramcloud.server.file "/var/tmp/ramclouddata/backup.${ONOS_HOST_NAME}.log")
+  rm -f -- "${rc_datafile}"
+  echo "DONE"
+}
+
function stop-server {
- kill-processes "RAMCloud server" `pgrep -f obj.${RAMCLOUD_BRANCH}/server`
+ kill-processes "RAMCloud server" `pgrep -f ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/server`
}
############################################
@@ -655,7 +789,7 @@
# Need to cd ONOS_HOME. onos.properties currently specify hazelcast config path relative to CWD
cd ${ONOS_HOME}
- echo -n "Starting ONOS controller ..."
+ echo -n "Starting ONOS controller ... "
java ${JVM_OPTS} -Dlogback.configurationFile=${ONOS_LOGBACK} -cp ${JAVA_CP} ${MAIN_CLASS} -cf ${ONOS_PROPS} > ${LOGDIR}/${LOGBASE}.stdout 2>${LOGDIR}/${LOGBASE}.stderr &
# We need to wait a bit to find out whether starting the ONOS process succeeded
@@ -790,6 +924,10 @@
rc-s*) # <- rc-server
rc-server $2
;;
+ rc)
+ rc-coord $2
+ rc-server $2
+ ;;
core)
onos $2
;;
diff --git a/src/main/cpp/edu_stanford_ramcloud_JRamCloud.cc b/src/main/cpp/edu_stanford_ramcloud_JRamCloud.cc
index f6cfea7..4b2ecd4 100644
--- a/src/main/cpp/edu_stanford_ramcloud_JRamCloud.cc
+++ b/src/main/cpp/edu_stanford_ramcloud_JRamCloud.cc
@@ -21,6 +21,7 @@
* - Inner classes in JRamCloud.java should be moved out to be a separate
* stand alone class, to eliminate workaround 00024 signature in
* C methods.
+ * - Define and support some of ClientException sub-classes.
*
*/
@@ -223,22 +224,27 @@
} catch (InvalidObjectException& e) { \
createException(env, jRamCloud, "InvalidObjectException"); \
return _returnValue; \
+ } catch (ClientException& e) { \
+ createException(env, jRamCloud, "ClientException"); \
+ return _returnValue; \
}
/*
* Class: edu_stanford_ramcloud_JRamCloud
* Method: connect
- * Signature: (Ljava/lang/String;)J
+ * Signature: (Ljava/lang/String;Ljava/lang/String;)J
*/
JNIEXPORT jlong
JNICALL Java_edu_stanford_ramcloud_JRamCloud_connect(JNIEnv *env,
jclass jRamCloud,
- jstring coordinatorLocator)
+ jstring coordinatorLocator,
+ jstring clusterName)
{
JStringGetter locator(env, coordinatorLocator);
+ JStringGetter cluster(env, clusterName);
RamCloud* ramcloud = NULL;
try {
- ramcloud = new RamCloud(locator.string);
+ ramcloud = new RamCloud(locator.string, cluster.string);
} EXCEPTION_CATCHER((jlong)(NULL));
return reinterpret_cast<jlong>(ramcloud);
}
diff --git a/src/main/java/edu/stanford/ramcloud/JRamCloud.java b/src/main/java/edu/stanford/ramcloud/JRamCloud.java
index 2bfd145..d7a1f63 100644
--- a/src/main/java/edu/stanford/ramcloud/JRamCloud.java
+++ b/src/main/java/edu/stanford/ramcloud/JRamCloud.java
@@ -278,9 +278,15 @@
* underlying RamCloud C++ object.
*/
public
+ JRamCloud(String coordinatorLocator, String clusterName)
+ {
+ ramcloudObjectPointer = connect(coordinatorLocator, clusterName);
+ }
+
+ public
JRamCloud(String coordinatorLocator)
{
- ramcloudObjectPointer = connect(coordinatorLocator);
+ this(coordinatorLocator, "main");
}
/**
@@ -389,7 +395,7 @@
return write(tableId, key.getBytes(StandardCharsets.UTF_8), value, rules);
}
- private static native long connect(String coordinatorLocator);
+ private static native long connect(String coordinatorLocator, String clusterName);
private static native void disconnect(long ramcloudObjectPointer);
public native long createTable(String name);
@@ -453,6 +459,13 @@
}
}
+ // TODO Define and support some of ClientException sub-classes.
+ public static class ClientException extends Exception {
+ public ClientException(String message) {
+ super(message);
+ }
+ }
+
public static void tableEnumeratorTest(JRamCloud ramcloud) {
long startTime = 0;
for (int x = 0 ; x < 2 ; x ++){
diff --git a/src/main/java/net/onrc/onos/core/datastore/ramcloud/RCClient.java b/src/main/java/net/onrc/onos/core/datastore/ramcloud/RCClient.java
index c651e5f..a2d23a5 100644
--- a/src/main/java/net/onrc/onos/core/datastore/ramcloud/RCClient.java
+++ b/src/main/java/net/onrc/onos/core/datastore/ramcloud/RCClient.java
@@ -38,8 +38,12 @@
import edu.stanford.ramcloud.JRamCloud.TableEnumerator2;
public class RCClient implements IKVClient {
+
private static final Logger log = LoggerFactory.getLogger(RCClient.class);
+ private static final String DEFAULT_LOCATOR = "zk:localhost:2181";
+ private static final String DEFAULT_CLUSTERNAME = "ONOS-RC";
+
private static final String DB_CONFIG_FILE = "conf/ramcloud.conf";
public static final Configuration CONFIG = getConfiguration();
@@ -74,7 +78,7 @@
private static final ThreadLocal<JRamCloud> TLS_RC_CLIENT = new ThreadLocal<JRamCloud>() {
@Override
protected JRamCloud initialValue() {
- return new JRamCloud(getCoordinatorUrl(CONFIG));
+ return new JRamCloud(getLocator(CONFIG), getClusterName(CONFIG));
}
};
@@ -116,13 +120,37 @@
}
}
- public static String getCoordinatorUrl(final Configuration configuration) {
- final String coordinatorIp = configuration.getString("ramcloud.coordinatorIp", "fast+udp:host=127.0.0.1");
- final String coordinatorPort = configuration.getString("ramcloud.coordinatorPort", "port=12246");
+ public static String getLocator(final Configuration configuration) {
+
+ final String locator = configuration.getString("ramcloud.locator");
+ if (locator != null) {
+ return locator;
+ }
+
+ // TODO Stop reading obsolete coordinatorIp, etc. once we're ready.
+ final String coordinatorIp = configuration.getString("ramcloud.coordinatorIp");
+ if (coordinatorIp == null) {
+ return DEFAULT_LOCATOR;
+ }
+
+ final String coordinatorPort = configuration.getString("ramcloud.coordinatorPort");
+ if (coordinatorPort == null) {
+ return DEFAULT_LOCATOR;
+ }
+
final String coordinatorURL = coordinatorIp + "," + coordinatorPort;
return coordinatorURL;
}
+ public static String getClusterName(final Configuration configuration) {
+ final String clusterName = configuration.getString("ramcloud.clusterName");
+ if (clusterName != null) {
+ return clusterName;
+ }
+
+ return DEFAULT_CLUSTERNAME;
+ }
+
@Override
public IMultiEntryOperation createOp(IKVTableID tableId, byte[] key, byte[] value) {
return RCMultiEntryOperation.create(tableId, key, value);