Update RAMCloud start-up procedure
This patch enables RAMCloud coordinator failover and
server failover if the number of replicas is properly configured.
- Update onos.sh RAMCloud start-up procedure
- Use ZooKeeper to manage cluster members
- Specify cluster name, num replicas,...
- WARN: ramcloud.coordinator.ip is now a configuration to specify listen address
- Specify ZooKeeper address/ClusterName in RAMCloud client config (conf/ramcloud.conf)
- Note: conf/ramcloud.conf ramcloud.coordinatorIp, etc. are no longer used for connection
- Enable failure detector, which is required for fail-over feature
- Undocumented maintenance option "deldb" to remove:
- RAMCloud coordination info in ZK
- Note: ZooKeeper must be running in order to execute "./onos.sh rc-c deldb"
- BackupData file created by RAMCloud Storage Server
- Update RAMCloud Java bindings
- Expose API to specify RAMCloud ClusterName
- Expose ClientException
- Note: You need to run ./build-ramcloud-java-bindings.sh to update Java bindings
- Utility function to check if ZooKeeper is running
- Utility function to wait for ZooKeeper to start
- Modified kill-processes() to try killing the process gracefully (SIGTERM) first, then with SIGKILL
- Specify absolute path to pgrep/pkill
- Added "rc" to manipulate both server and coordinator
Change-Id: Ia8a282f81bf82810a4d000883611ce12f504bd16
diff --git a/onos.sh b/onos.sh
index b54acc3..57d24db 100755
--- a/onos.sh
+++ b/onos.sh
@@ -166,7 +166,7 @@
\$ ${scriptname} {zk|rc-coord|rc-server|core} {start|stop|restart|status}
Control specific ONOS-related process"
- echo "${usage}"
+ echo "${usage}"
}
function rotate-log {
@@ -191,7 +191,11 @@
fi
for p in ${pids}; do
if [ x$p != "x" ]; then
- kill -KILL $p
+ (
+ # Send SIGTERM first; if that did not kill the process,
+ # wait 1s and, if the process still exists, force it to be killed with SIGKILL.
+ kill -TERM $p && kill -0 $p && sleep 1 && kill -0 $p && kill -KILL $p
+ ) 2> /dev/null
echo "Killed existing process (pid: $p)"
fi
done
@@ -340,9 +344,12 @@
echo -n "Creating ${RAMCLOUD_CONF} ... "
local temp_rc=`begin-conf-creation ${RAMCLOUD_CONF}`
-
- echo "ramcloud.coordinatorIp=${RC_COORD_PROTOCOL}:host=${RC_COORD_IP}" > ${temp_rc}
- echo "ramcloud.coordinatorPort=port=${RC_COORD_PORT}" >> ${temp_rc}
+
+ local rc_cluster_name=$(read-conf ${ONOS_CONF} ramcloud.clusterName "ONOS-RC")
+
+ # TODO make ZooKeeper address configurable.
+ echo "ramcloud.locator=zk:localhost:2181" > ${temp_rc}
+ echo "ramcloud.clusterName=${rc_cluster_name}" >> ${temp_rc}
end-conf-creation ${RAMCLOUD_CONF}
@@ -440,6 +447,35 @@
${ZK_HOME}/bin/zkServer.sh status
}
+
+function check-zk {
+ # assumption here is that ZK status script is the last command in status-zk.
+ status-zk &> /dev/null
+ local zk_status=$?
+ if [ "$zk_status" -ne 0 ]; then
+ return 1;
+ fi
+ return 0
+}
+
+# wait-zk-or-die {timeout-sec}
+function wait-zk-or-die {
+ local retries=${1:-1}
+ # do-while retries >= 0
+ while true; do
+ check-zk
+ local zk_status=$?
+ if [ "$zk_status" -eq 0 ]; then
+ return 0
+ fi
+ sleep 1;
+ ((retries -= 1))
+ (( retries >= 0 )) || break
+ done
+ echo "ZooKeeper is not running."
+ exit 1
+}
+
############################################
@@ -466,11 +502,6 @@
fi
}
-function deldb {
-# TODO: implement
- return
-}
-
### Functions related to RAMCloud coordinator
function rc-coord-addr {
@@ -484,7 +515,6 @@
function rc-coord {
case "$1" in
start)
- deldb
stop-coord
start-coord
;;
@@ -499,6 +529,10 @@
stop)
stop-coord
;;
+ deldb)
+ stop-backend
+ del-coord-info
+ ;;
stat*) # <- status
local n=`pgrep -f obj.${RAMCLOUD_BRANCH}/coordinator | wc -l`
echo "$n RAMCloud coordinator running"
@@ -510,6 +544,8 @@
}
function start-coord {
+ wait-zk-or-die 1
+
if [ ! -d ${LOGDIR} ]; then
mkdir -p ${LOGDIR}
fi
@@ -519,27 +555,92 @@
local coord_addr=`rc-coord-addr`
+ # TODO Configuration for ZK address, port
+ local zk_addr="localhost:2181"
+ # RAMCloud cluster name
+ local rc_cluster_name=$(read-conf ${ONOS_CONF} ramcloud.clusterName "ONOS-RC")
+ # RAMCloud option deadServerTimeout
+ # (note RC default is 250ms, setting relaxed ONOS default to 1000ms)
+ local rc_coord_deadServerTimeout=$(read-conf ${ONOS_CONF} ramcloud.coordinator.deadServerTimeout 1000)
+
+ # NOTE RAMCloud document suggests to use -L to specify listen address:port,
+ # but actual RAMCloud code only uses -C argument now.
+ # (FYI: -C is documented to be deprecated in the document)
+
+ local coord_args="-C ${coord_addr}"
+ coord_args="${coord_args} --externalStorage zk:${zk_addr}"
+ coord_args="${coord_args} --clusterName ${rc_cluster_name}"
+ coord_args="${coord_args} --deadServerTimeout ${rc_coord_deadServerTimeout}"
+
+ # Read environment variables if set
+ coord_args="${coord_args} ${RC_COORDINATOR_OPTS}"
+
+ if [ "${ONOS_HOST_ROLE}" == "single-node" ]; then
+ # Note: Following reset is required, since RC restart is considered node failure,
+ # and tries recovery of server, which will never succeed after restart.
+ echo "Role configured to single-node mode. RAMCloud cluster will be reset on each start-up."
+ coord_args="${coord_args} --reset"
+ fi
+
# Run ramcloud
echo -n "Starting RAMCloud coordinator ... "
- ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/coordinator -L ${coord_addr} > $RAMCLOUD_COORD_LOG 2>&1 &
+ ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/coordinator ${coord_args} > $RAMCLOUD_COORD_LOG 2>&1 &
echo "STARTED"
}
+function del-coord-info {
+ wait-zk-or-die 1
+
+ if [ ! -d ${LOGDIR} ]; then
+ mkdir -p ${LOGDIR}
+ fi
+ if [ -f $RAMCLOUD_COORD_LOG ]; then
+ rotate-log $RAMCLOUD_COORD_LOG
+ fi
+
+ local coord_addr=`rc-coord-addr`
+
+ # TODO Configuration for ZK address, port
+ local zk_addr="localhost:2181"
+ # RAMCloud cluster name
+ local rc_cluster_name=$(read-conf ${ONOS_CONF} ramcloud.clusterName "ONOS-RC")
+ # RAMCloud option deadServerTimeout
+ # (note RC default is 250ms, setting relaxed ONOS default to 1000ms)
+ local rc_coord_deadServerTimeout=$(read-conf ${ONOS_CONF} ramcloud.coordinator.deadServerTimeout 1000)
+
+ # NOTE RAMCloud document suggests to use -L to specify listen address:port,
+ # but actual RAMCloud code only uses -C argument now.
+ # (FYI: -C is documented to be deprecated in the document)
+
+ local coord_args="-C ${coord_addr}"
+ coord_args="${coord_args} --externalStorage zk:${zk_addr}"
+ coord_args="${coord_args} --clusterName ${rc_cluster_name}"
+
+ # Note: --reset will reset ZK stored info and start running as a coordinator.
+ echo -n "Deleting RAMCloud cluster coordination info ... "
+ ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/coordinator ${coord_args} --reset &> $RAMCLOUD_COORD_LOG &
+
+ # TODO Assuming 1sec is enough. To be sure monitor log?
+ sleep 1
+ # Silently kill coordinator
+ (pkill -q -f ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/coordinator &> /dev/null)
+
+ echo "DONE"
+}
function stop-coord {
- kill-processes "RAMCloud coordinator" `pgrep -f obj.${RAMCLOUD_BRANCH}/coordinator`
+ kill-processes "RAMCloud coordinator" `pgrep -f ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/coordinator`
}
### Functions related to RAMCloud server
function rc-server {
case "$1" in
start)
- deldb
stop-server
start-server
;;
startifdown)
- local n=`pgrep -f obj.${RAMCLOUD_BRANCH}/server | wc -l`
+ local n=`pgrep -f ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/server | wc -l`
if [ $n == 0 ]; then
start-server
else
@@ -549,11 +650,12 @@
stop)
stop-server
;;
-# deldb)
-# deldb
-# ;;
+ deldb)
+ stop-server
+ del-server-backup
+ ;;
stat*) # <- status
- n=`pgrep -f obj.${RAMCLOUD_BRANCH}/server | wc -l`
+ n=`pgrep -f ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/server | wc -l`
echo "$n RAMCloud server running"
;;
*)
@@ -563,6 +665,8 @@
}
function start-server {
+ wait-zk-or-die 1
+
if [ ! -d ${LOGDIR} ]; then
mkdir -p ${LOGDIR}
fi
@@ -573,18 +677,48 @@
local coord_addr=`rc-coord-addr`
local server_addr=`rc-server-addr`
- local masterServiceThreads=$(read-conf ${ONOS_CONF} ramcloud.masterServiceThreads 5)
- local logCleanerThreads=$(read-conf ${ONOS_CONF} ramcloud.logCleanerThreads 1)
- local detectFailures=$(read-conf ${ONOS_CONF} ramcloud.detectFailures 0)
+ local masterServiceThreads=$(read-conf ${ONOS_CONF} ramcloud.server.masterServiceThreads 5)
+ local logCleanerThreads=$(read-conf ${ONOS_CONF} ramcloud.server.logCleanerThreads 1)
+ local detectFailures=$(read-conf ${ONOS_CONF} ramcloud.server.detectFailures 1)
+
+ # TODO Configuration for ZK address, port
+ local zk_addr="localhost:2181"
+ # RAMCloud cluster name
+ local rc_cluster_name=$(read-conf ${ONOS_CONF} ramcloud.clusterName "ONOS-RC")
+ # replication factor (-r) config
+ local rc_replicas=$(read-conf ${ONOS_CONF} ramcloud.server.replicas 0)
+ # backup file path (-f) config
+ local rc_datafile=$(read-conf ${ONOS_CONF} ramcloud.server.file "/var/tmp/ramclouddata/backup.${ONOS_HOST_NAME}.log")
+ mkdir -p `dirname ${rc_datafile}`
+
+ local server_args="-L ${server_addr}"
+ server_args="${server_args} --externalStorage zk:${zk_addr}"
+ server_args="${server_args} --clusterName ${rc_cluster_name}"
+ server_args="${server_args} --masterServiceThreads ${masterServiceThreads}"
+ server_args="${server_args} --logCleanerThreads ${logCleanerThreads}"
+ server_args="${server_args} --detectFailures ${detectFailures}"
+ server_args="${server_args} --replicas ${rc_replicas}"
+ server_args="${server_args} --file ${rc_datafile}"
+
+ # Read environment variables if set
+ server_args="${server_args} ${RC_SERVER_OPTS}"
# Run ramcloud
echo -n "Starting RAMCloud server ... "
- ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/server -M -L ${server_addr} -C ${coord_addr} --masterServiceThreads ${masterServiceThreads} --logCleanerThreads ${logCleanerThreads} --detectFailures ${detectFailures} > $RAMCLOUD_SERVER_LOG 2>&1 &
+ ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/server ${server_args} > $RAMCLOUD_SERVER_LOG 2>&1 &
echo "STARTED"
}
+function del-server-backup {
+ # TODO might want confirmation, since data can be lost
+ echo -n "Removing RAMCloud backup server data ... "
+ local rc_datafile=$(read-conf ${ONOS_CONF} ramcloud.server.file "/var/tmp/ramclouddata/backup.${ONOS_HOST_NAME}.log")
+ rm -f ${rc_datafile}
+ echo "DONE"
+}
+
function stop-server {
- kill-processes "RAMCloud server" `pgrep -f obj.${RAMCLOUD_BRANCH}/server`
+ kill-processes "RAMCloud server" `pgrep -f ${RAMCLOUD_HOME}/obj.${RAMCLOUD_BRANCH}/server`
}
############################################
@@ -655,7 +789,7 @@
# Need to cd ONOS_HOME. onos.properties currently specify hazelcast config path relative to CWD
cd ${ONOS_HOME}
- echo -n "Starting ONOS controller ..."
+ echo -n "Starting ONOS controller ... "
java ${JVM_OPTS} -Dlogback.configurationFile=${ONOS_LOGBACK} -cp ${JAVA_CP} ${MAIN_CLASS} -cf ${ONOS_PROPS} > ${LOGDIR}/${LOGBASE}.stdout 2>${LOGDIR}/${LOGBASE}.stderr &
# We need to wait a bit to find out whether starting the ONOS process succeeded
@@ -790,6 +924,10 @@
rc-s*) # <- rc-server
rc-server $2
;;
+ rc)
+ rc-coord $2
+ rc-server $2
+ ;;
core)
onos $2
;;