| /* |
| * Copyright 2012-2014 Google Inc. All rights reserved. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| #include <assert.h> |
| #include <errno.h> |
| #include <fcntl.h> |
| #include <limits.h> |
| #include <signal.h> |
| #include <stdio.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <sys/stat.h> |
| #include <sys/types.h> |
| #include <sys/wait.h> |
| #include <time.h> |
| #include <unistd.h> |
| |
| |
// Number of the most recent signal caught by sighandler(); 0 until one
// arrives.
static volatile sig_atomic_t got_signal = 0;

// Path of the stamp file whose modification time is watched.
const char *keepalive_file;

// Modification time of keepalive_file recorded at the last detected change.
time_t old_time;

// PID of the process that is being monitored (and killed on timeout).
pid_t p_pid;
| |
| |
| void sighandler(int sig) { |
| got_signal = sig; |
| } |
| |
// Print the command-line synopsis, followed by a description of every
// argument and of the check cycle, to stderr.
//   name: program name shown in the "Usage:" line.
static void usage(char *name) {
  // Everything after the synopsis line; contains no conversion specifiers
  // and is emitted verbatim through "%s".
  static const char details[] =
      " <keepalive_file> <first_check> <incr_checks>\n"
      " <timeout> <command> [args...]\n"
      " -S <prekill_signal> try this signal (numeric) before SIGKILL\n"
      " -T <prekill_timeout> wait time (secs) after prekill_signal\n"
      " <keepalive_file> name of the stamp file to monitor\n"
      " <first_check> time (secs) before first check\n"
      " <incr_checks> time (secs) before subsequent checks\n"
      " <timeout> time (secs) before killing process\n"
      " <command> [args...] the command to kill upon timeout\n"
      "\n"
      " The keepalive logic runs in cycles. A cycle begins and ends\n"
      " with a successful check of the <keepalive_file>, i.e., it was\n"
      " touched since the last cycle. The first check starts\n"
      " <first_check> secs after the cycle begins. Incremental checks\n"
      " are done at <incr_checks> intervals, until <keepalive_file>\n"
      " was found to be updated or <timeout> is reached. In the\n"
      " former case, the cycle restarts, while in the latter\n"
      " (timeout) case, the process is restarted and the cycle starts\n"
      " again.\n\n";

  fprintf(stderr,
          "Usage: %s [-S <prekill_signal> [-T <prekill_timeout>]]\n%s",
          name, details);
}
| |
// Convert a string holding a time in seconds (e.g. "1.5") into whole
// milliseconds, rounded to the nearest millisecond.
//
// Returns 0 when str is NULL, empty, not entirely a number, not finite,
// or too large for a long long. Every caller in this file rejects values
// <= 0, so 0 doubles as the "invalid input" result. Negative inputs are
// returned as negative millisecond counts.
long long parse_to_msec(const char *str) {
  char *end = NULL;
  double secs, msec;

  if (str == NULL) return 0;

  errno = 0;
  secs = strtod(str, &end);
  // No digits consumed, trailing garbage, or overflow/underflow.
  if (end == str || *end != '\0' || errno == ERANGE) return 0;

  msec = secs * 1000.0;
  // Written as a negated conjunction so that NaN (for which every
  // comparison is false) is rejected along with out-of-range values;
  // converting such a double to long long would be undefined behaviour.
  if (!(msec > -(double)LLONG_MAX && msec < (double)LLONG_MAX)) return 0;

  // Round half away from zero instead of truncating: products such as
  // 1.005 * 1000 can land just below the intended integer.
  return (long long)(msec < 0 ? msec - 0.5 : msec + 0.5);
}
| |
// Return the current reading of the monotonic clock in milliseconds.
// If the clock cannot be read, print an error and exit with status 1.
long long now(void) {
  struct timespec tp;

  if (clock_gettime(CLOCK_MONOTONIC, &tp) != 0) {
    perror("alivemonitor: clock_gettime failed.");
    exit(1);
  }
  // Widen tv_sec before multiplying so the product is computed in
  // long long even where time_t is a 32-bit type.
  return (long long)tp.tv_sec * 1000 + tp.tv_nsec / 1000000;
}
| |
// Result of one sleep_check_alive() call.
enum Aliveness {
  ERROR = -1,     // system error, abort
  ALIVE = 0,      // the keepalive file changed: alive!
  NO_CHANGE = 1,  // no change in alive status
  EXITED = 2,     // the monitored parent exited
};
| |
| // Sleep a given amount of time, while continuously checking on the parent. |
| // Return codes: |
| // EXITED: parent exited |
| // NO_CHANGE: no change in alive status |
| // ALIVE: alive! |
| // ERROR: system error, abort |
| enum Aliveness sleep_check_alive(long long stime) { |
| struct stat fst; |
| long long n = now(), endtime = n + stime; |
| |
| while (n < endtime) { |
| int s = endtime - n; |
| usleep(s * 1000); |
| |
| if (got_signal) |
| break; |
| |
| // check on the parent |
| assert(p_pid > 0); |
| if (kill(p_pid, 0) == -1) { |
| if (errno == ESRCH) { |
| fprintf(stderr, "alivemonitor: parent pid %d exited.\n", p_pid); |
| return EXITED; |
| } else { |
| perror("alivemonitor: kill(p_pid, 0) failed"); |
| return ERROR; |
| } |
| } |
| n = now(); |
| } |
| |
| memset(&fst, 0, sizeof(fst)); |
| if (stat(keepalive_file, &fst) != 0) { |
| perror("alivemonitor: stat failed"); |
| return ERROR; |
| } |
| |
| if (fst.st_mtime == old_time) { |
| return NO_CHANGE; |
| } |
| |
| // alive! |
| old_time = fst.st_mtime; |
| return ALIVE; |
| } |
| |
// Report a fatal error as "<argv0>: <msg>" on stderr and terminate the
// process with exit status 99. Never returns.
void die(const char *argv0, const char *msg) {
  enum { DIE_EXIT_STATUS = 99 };

  fprintf(stderr, "%s: %s\n", argv0, msg);
  exit(DIE_EXIT_STATUS);
}
| |
| int main(int argc, char *const *argv) { |
| int fd; |
| long long timeout, first_check, incr_check, next_check; |
| struct stat fst; |
| mode_t old_mask; |
| pid_t pid; |
| long long start_time; |
| char *keepalive_name; |
| int prekill_signal = 0; |
| long long prekill_timeout = 1000; |
| |
| if (argc < 6) { |
| usage(basename(argv[0])); |
| return 99; |
| } |
| |
| signal(SIGTERM, sighandler); |
| signal(SIGHUP, sighandler); |
| |
| // GNU getopt() will helpfully try to grab options from the [args...] |
| // section unless we set this. We want those options to be set aside |
| // for the subprogram, not for us. |
| int opt; |
| while ((opt = getopt(argc, argv, "+?S:T:")) > 0) { |
| switch (opt) { |
| case 'S': |
| prekill_signal = atoi(optarg); |
| if (prekill_signal <= 0) die(argv[0], "invalid signal number provided"); |
| break; |
| case 'T': |
| prekill_timeout = parse_to_msec(optarg); |
| if (prekill_timeout <= 0) die(argv[0], "prekill timeout must be > 0"); |
| break; |
| case '?': |
| usage(basename(argv[0])); |
| return 99; |
| } |
| } |
| |
| // <keepalive_file> <first_check> <incr_checks> <timeout> <command> |
| keepalive_file = argv[optind]; |
| keepalive_name = basename(keepalive_file); |
| first_check = parse_to_msec(argv[optind + 1]); |
| incr_check = parse_to_msec(argv[optind + 2]); |
| timeout = parse_to_msec(argv[optind + 3]); |
| |
| if (first_check <= 0) die(argv[0], "first_check must be > 0"); |
| if (incr_check <= 0) die(argv[0], "incr_check must be > 0"); |
| if (timeout <= 0) die(argv[0], "timeout must be > 0"); |
| if (first_check > timeout) die(argv[0], "first_check must be <= timeout"); |
| |
| // create the keepalive file if it doesn't already exist |
| memset(&fst, 0, sizeof(fst)); |
| if (stat(keepalive_file, &fst) != 0) { |
| old_mask = umask(0000); |
| fd = creat(keepalive_file, 0666); |
| if (fd < 0) { |
| perror("alivemonitor: creat failed"); |
| return 99; |
| } |
| // Revert the umask to default so that the child doesn't |
| // inherit the changed value. |
| umask(old_mask); |
| close(fd); |
| } |
| old_time = fst.st_mtime; |
| |
| fprintf(stderr, "alivemonitor: Start monitoring '%s' with timeout=%lldms, " |
| "first_check=%lldms, incr_check=%lldms\n", |
| keepalive_file, timeout, first_check, incr_check); |
| |
| // create a new process group with pgid=pid |
| if (setpgid(0, 0)) { |
| perror("alivemonitor: setpgid failed"); |
| return 99; |
| } |
| |
| // spawn the child process |
| p_pid = getpid(); |
| pid = fork(); |
| if (pid == -1) { |
| perror("alivemonitor: fork failed"); |
| return 99; |
| } else if (pid > 0) { // parent |
| execvp(argv[optind + 4], argv + optind + 4); |
| perror("alivemonitor: execv failed"); |
| return 99; |
| } |
| |
| // from here: child |
| |
| while (1) { |
| start_time = now(); |
| |
| // sleep until first check |
| switch (sleep_check_alive(first_check)) { |
| case EXITED: return 0; |
| case ERROR: goto kill_it; |
| case ALIVE: goto not_dead; |
| case NO_CHANGE: break; // fall through and enter the inner loop |
| } |
| |
| // no sign of life yet, run the increments |
| long long time_passed = now() - start_time; |
| int cnt = 1; |
| while (1) { |
| if (got_signal) { |
| fprintf(stderr, "alivemonitor(%s): signal %d received, killing.\n", |
| keepalive_name, got_signal); |
| goto kill_it; |
| } else if (time_passed >= timeout) { |
| fprintf(stderr, "alivemonitor(%s): Timeout!\n", keepalive_name); |
| goto kill_it; |
| } |
| fprintf(stderr, "alivemonitor(%s): %d-No sign of life @ %lld/%lld ms\n", |
| keepalive_name, cnt++, time_passed, timeout); |
| next_check = timeout - time_passed; |
| if (incr_check < next_check) next_check = incr_check; |
| switch (sleep_check_alive(next_check)) { |
| case EXITED: return 0; |
| case ERROR: goto kill_it; |
| case NO_CHANGE: break; // do nothing |
| case ALIVE: |
| fprintf(stderr, "alivemonitor(%s): it's alive after all!\n", |
| keepalive_name); |
| goto not_dead; |
| } |
| time_passed = now() - start_time; |
| } |
| not_dead: |
| continue; |
| } |
| |
| kill_it: |
| fprintf(stderr, "alivemonitor(%s): kill parent process group %d\n", |
| keepalive_name, p_pid); |
| assert(p_pid > 0); |
| if (prekill_signal) { |
| // Send prekill signal only to the parent process (which might kill the |
| // rest of its group politely) |
| long long prekill_start = now(); |
| if (kill(p_pid, prekill_signal)) { |
| if (errno != ESRCH) perror("alivemonitor: prekill failed"); |
| } else { |
| do { |
| if (kill(p_pid, 0)) { |
| if (errno != ESRCH) perror("alivemonitor: prekill(0) failed"); |
| break; |
| } |
| usleep(100*1000); |
| } while (now() - prekill_start < prekill_timeout); |
| } |
| } |
| |
| // Send kill signal to whole process group. |
| if (kill(-p_pid, SIGKILL)) |
| perror("alivemonitor: killing parent process group failed"); |
| |
| // NOTE: Code after this point will not run since we just killed ourselves |
| |
| return 98; |
| } |