Repository: accumulo Updated Branches: refs/heads/master edc080c83 -> 998a31cdc
ACCUMULO-3569 initial pass at integrating auto-restarts. Updating methods to be more flexible. Needed a few more tweaks. Fixing to work on all processes. Updating for logging changes. Project: http://git-wip-us.apache.org/repos/asf/accumulo/repo Commit: http://git-wip-us.apache.org/repos/asf/accumulo/commit/05853827 Tree: http://git-wip-us.apache.org/repos/asf/accumulo/tree/05853827 Diff: http://git-wip-us.apache.org/repos/asf/accumulo/diff/05853827 Branch: refs/heads/master Commit: 058538270223c2fe6f82e8c3a39baac73ede1669 Parents: edc080c Author: John Vines <vi...@apache.org> Authored: Fri Feb 6 15:58:27 2015 -0500 Committer: John Vines <vi...@apache.org> Committed: Tue Apr 7 11:17:10 2015 -0400 ---------------------------------------------------------------------- assemble/bin/accumulo_watcher.sh | 127 +++++++++++++++++++++++++++ assemble/bin/start-server.sh | 9 +- assemble/conf/templates/accumulo-env.sh | 14 +++ 3 files changed, 148 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/accumulo/blob/05853827/assemble/bin/accumulo_watcher.sh ---------------------------------------------------------------------- diff --git a/assemble/bin/accumulo_watcher.sh b/assemble/bin/accumulo_watcher.sh new file mode 100755 index 0000000..9a3dc9f --- /dev/null +++ b/assemble/bin/accumulo_watcher.sh @@ -0,0 +1,127 @@ +#! /usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +LOGHOST=$1 +shift +process=$1 + +SOURCE="${BASH_SOURCE[0]}" +while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink + bin="$( cd -P "$( dirname "$SOURCE" )" && pwd )" + SOURCE="$(readlink "$SOURCE")" + [[ $SOURCE != /* ]] && SOURCE="$bin/$SOURCE" # if $SOURCE was a relative symlink, we need to resolve it relative to the path where the symlink file was located +done +bin="$( cd -P "$( dirname "$SOURCE" )" && pwd )" +# Stop: Resolve Script Directory + +. "${bin}"/config.sh + +ERRFILE=${ACCUMULO_LOG_DIR}/${process}_${LOGHOST}.err +OUTFILE=${ACCUMULO_LOG_DIR}/${process}_${LOGHOST}.out +DEBUGLOG=${ACCUMULO_LOG_DIR}/${process}_$(hostname).debug.log +export COMMAND="${bin}/accumulo \"\$@\"" + +logger -s "starting process $process at $(date)" +stopRunning="" +while [ -z "$stopRunning" ]; +do + eval $COMMAND 2> $ERRFILE + exit=$? 
+ unset cause + if [ "$exit" -eq 0 ]; then + potentialStopRunning="Clean Exit" + elif [ "$exit" -eq 1 ]; then + potentialStopRunning="Unexpected error" + elif [ "$exit" -eq 130 ]; then + stopRunning="Control C detected, exiting" + elif [ "$exit" -eq 143 ]; then + stopRunning="Process terminated, exiting" + elif [ "$exit" -eq 137 ]; then + potentialStopRunning="Process killed, exiting" + fi + if [ -z "$stopRunning" ]; then + stopRunning=$potentialStopRunning; + + if [ $exit -eq 1 ]; then + source="exit code" + cause="Unexpected Exception" + elif tail -n50 $OUTFILE | grep "java.lang.OutOfMemoryError:" > /dev/null; then + source="logs" + cause="Out of memory exception" + elif [ "$process" = "tserver" ]; then + if tail -n50 $DEBUGLOG | grep "ERROR: Lost tablet server lock (reason =" > /dev/null ; then + source="logs" + cause="ZKLock lost" + fi + elif [ "$process" = "master" ]; then + if tail -n50 $DEBUGLOG | grep "ERROR: Master lock in zookeeper lost (reason =" > /dev/null ; then + source="logs" + cause="ZKLock lost" + fi + elif [ "$process" = "gc" ]; then + if tail -n50 $DEBUGLOG | grep "FATAL: GC lock in zookeeper lost (reason =" > /dev/null ; then + source="logs" + cause="ZKLock lost" + fi + elif [ "$process" = "monitor" ]; then + if tail -n50 $DEBUGLOG | grep "ERROR: Monitor lock in zookeeper lost (reason =" > /dev/null ; then + source="logs" + cause="ZKLock lost" + fi + elif [ $exit -ne 0 ]; then + source="exit code" + cause="Unknown error" + fi + case $cause in + #Unknown exit code + "Unknown error") + #window doesn't matter when retries = 0 + RETRIES=0 + ;; + + "Unexpected Exception") + WINDOW=$UNEXPECTED_TIMESPAN + RETRIES=$UNEXPECTED_RETRIES + ;; + + "Out of memory exception") + WINDOW=$OOM_TIMESPAN + RETRIES=$OOM_RETRIES + ;; + + "ZKLock lost") + WINDOW=$ZKLOCK_TIMESPAN + RETRIES=$ZKLOCK_RETRIES + ;; + esac + + if [ -n "$cause" ]; then + stopRunning="" + declare -i attempts + attempts="`jobs | grep "reason$cause" | wc -l`+1" + if [ "$RETRIES" -le $attempts ]; 
then + stopRunning="$process encountered $cause in $source with exit code $exit- quitting ($attempts/$RETRIES in $WINDOW seconds)" + # kill all sleeps now + for list in `jobs | cut -b 2-2`; do kill %$list; done + else + logger -s "$process encountered $cause in $source with exit code $exit- retrying ($attempts/$RETRIES in $WINDOW seconds)" + eval "(sleep $WINDOW ; echo "reason$cause" >> /dev/null) &" + fi + fi + fi +done +logger -s $stopRunning http://git-wip-us.apache.org/repos/asf/accumulo/blob/05853827/assemble/bin/start-server.sh ---------------------------------------------------------------------- diff --git a/assemble/bin/start-server.sh b/assemble/bin/start-server.sh index 1ed73de..c52bd64 100755 --- a/assemble/bin/start-server.sh +++ b/assemble/bin/start-server.sh @@ -72,11 +72,16 @@ fi if [[ -z "$PID" ]]; then echo "Starting $LONGNAME on $HOST" + COMMAND="${bin}/accumulo" + if [ "${ACCUMULO_WATCHER}" = "true" ]; then + COMMAND="${bin}/accumulo_watcher.sh ${LOGHOST}" + fi + if [[ $HOST == localhost || $HOST == "$(hostname -f)" || $HOST = "$IP" ]]; then - "${bin}/accumulo" "${SERVICE}" --address "${ADDRESS}" >"${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.out" 2>"${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.err" & + "$COMMAND" "${SERVICE}" --address "${ADDRESS}" >"${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.out" 2>"${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.err" & MAX_FILES_OPEN=$(ulimit -n) else - $SSH "$HOST" "bash -c 'exec nohup ${bin}/accumulo ${SERVICE} --address ${ADDRESS} >${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.out 2>${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.err' &" + $SSH "$HOST" "bash -c 'exec nohup $COMMAND ${SERVICE} --address ${ADDRESS} >${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.out 2>${ACCUMULO_LOG_DIR}/${SERVICE}_${LOGHOST}.err' &" MAX_FILES_OPEN=$($SSH "$HOST" "/usr/bin/env bash -c 'ulimit -n'") fi http://git-wip-us.apache.org/repos/asf/accumulo/blob/05853827/assemble/conf/templates/accumulo-env.sh 
---------------------------------------------------------------------- diff --git a/assemble/conf/templates/accumulo-env.sh b/assemble/conf/templates/accumulo-env.sh index a21702b..5be96c0 100644 --- a/assemble/conf/templates/accumulo-env.sh +++ b/assemble/conf/templates/accumulo-env.sh @@ -61,3 +61,17 @@ export ACCUMULO_KILL_CMD='kill -9 %p' # Should the monitor bind to all network interfaces -- default: false # export ACCUMULO_MONITOR_BIND_ALL="true" + +# Should process be automatically restarted +# export ACCUMULO_WATCHER="true" + +# What settings should we use for the watcher, if enabled +export UNEXPECTED_TIMESPAN="3600" +export UNEXPECTED_RETRIES="2" + +export OOM_TIMESPAN="3600" +export OOM_RETRIES="5" + +export ZKLOCK_TIMESPAN="600" +export ZKLOCK_RETRIES="5" +