#!/bin/sh
# Supervisor script: runs <program> and, as long as it keeps exiting
# unsuccessfully, restarts it after <retry_timeout> seconds.
#
# Usage: $0 <retry_timeout> <program> [args...]
RETRY_TIMEOUT=$1

# Lower this shell's own OOM-killer score so the supervisor is less likely
# to be killed than the program it watches. Prefer oom_score_adj when the
# kernel exposes it and fall back to oom_adj otherwise. OOM_FILE records
# which interface is in use so the child's score can be reset later.
if [ -e "/proc/$$/oom_score_adj" ]; then
  OOM_FILE=oom_score_adj
  echo -500 >"/proc/$$/$OOM_FILE"
else
  OOM_FILE=oom_adj
  echo -17 >"/proc/$$/$OOM_FILE"
fi

# Drop <retry_timeout> from the positional parameters. Only shift when there
# is something to shift: POSIX lets a non-interactive shell abort on
# "shift" with $# = 0, which would skip the usage message below.
if [ "$#" -gt 0 ]; then
  shift
fi
if [ -z "$RETRY_TIMEOUT" ] || [ -z "$*" ]; then
  echo "Usage: $0 <retry_timeout> <program> [args...]" >&2
  exit 1
fi

#######################################
# Print the name of the program that is really being run, skipping over
# leading helper commands and the arguments they consume.
# Arguments: the full command line (program and args).
# Outputs:   the first word that is not a recognised helper (or one of its
#            arguments); if every word is consumed, the very first word.
#######################################
progname() {
  local first="$1"
  local skip
  while [ "$#" -gt 0 ]; do
    case "$1" in
      # Each helper consumes itself plus a fixed number of arguments.
      alivemonitor) skip=5 ;;
      babysit | startpid | deathrattle) skip=2 ;;
      *)
        printf '%s\n' "$1"
        return
        ;;
    esac
    # Clamp the shift count: shifting more words than remain fails without
    # shifting (endless loop) or aborts the shell, depending on the sh.
    if [ "$skip" -gt "$#" ]; then
      skip=$#
    fi
    shift "$skip"
  done
  printf '%s\n' "$first"
}

#######################################
# Run a command in the background, reset its OOM score, and wait for it.
# Globals:   OOM_FILE (read) - name of the /proc/<pid>/ OOM control file.
# Arguments: the command and its arguments.
# Returns:   the exit status of the command (as reported by wait).
#######################################
run() {
  local pid
  "$@" &
  pid=$!
  # Write 0 to the child's OOM control file so it does not keep the score
  # it inherited from this shell. Skip the write when the file is not there
  # (child already exited, or OOM_FILE unset) to avoid a spurious error.
  if [ -n "$OOM_FILE" ] && [ -e "/proc/$pid/$OOM_FILE" ]; then
    echo 0 >"/proc/$pid/$OOM_FILE"
  fi
  wait "$pid"
}

#######################################
# Print the "program failed, will retry" notice.
# Globals: name, RV, RETRY_TIMEOUT (read)
# Outputs: one line on stdout; callers redirect it where needed.
#######################################
error() {
  # printf '%s' keeps backslashes in the values literal; echo's handling of
  # them differs between sh implementations.
  printf "Error: '%s' exited with code %s. Retry in %s secs.\n" \
    "$name" "$RV" "$RETRY_TIMEOUT"
}

# Run until the program manages to exit successfully.
name=$(progname "$@")
# The most recent failure notice is also written to this file (overwritten
# on every failure).
# NOTE(review): this is a predictable name in /tmp; if the script runs with
# elevated privileges a pre-planted symlink could redirect the write. Path
# kept unchanged in case other tooling reads it - confirm before moving.
TMP_LOG=/tmp/babysit_errors_$(basename "$name")
until run "$@"; do
  RV=$?
  if [ "$RV" = 143 ]; then
    # SIGTERM means someone asked for it to die.
    # We can't treat SIGKILL the same way because it might have been
    # caused by the kernel OOM killer, in which case we definitely want to
    # restart the task.
    echo "SIGTERM: '$name' was killed explicitly. Not retrying." >&2
    exit "$RV"
  fi
  error >&2
  error >"$TMP_LOG"
  sleep "$RETRY_TIMEOUT" || sleep 1 # just in case $RETRY_TIMEOUT is invalid
done
echo "'$name' exited successfully. Not restarting." >&2
exit 0