#!/bin/sh
# Saves a gzip-compressed core dump of a crashing process.
#
# Positional arguments:
#   $1 - pid of the dying process
#   $2 - signal that is killing it
#   $3 - name of its executable
# The core image itself is read from stdin further down in this script.
pid=$1
signal=$2
exe=$3

# TODO(kedong): need to specify different size based on platform.

# If we have a disk available, dump the core there and don't overwrite older
# ones. Since the only usable partition of decent size is /var/media, we just
# create a "cores" folder in there.
# If no disk is available, use flash and overwrite previous cores (due to size
# limits).
hdd_media_dir="/var/media"
hdd_cores_dir="${hdd_media_dir}/cores"
flash_cores_dir="/user"
flash_cores_last_core_link="${flash_cores_dir}/last-core"
if { is-storage-box || is-spacecast; } && [ -w "$hdd_media_dir" ]; then
  # Disk case: one file per pid/executable pair.
  mkdir -p "$hdd_cores_dir"
  outdir="$hdd_cores_dir"
  out="${hdd_cores_dir}/core-${pid}-${exe}.gz"
else
  # Flash case: a single fixed file name, so each dump replaces the last one.
  outdir="$flash_cores_dir"
  out="${flash_cores_dir}/core.gz"
fi

# Send all output to kernel log. We don't want to use the 'logger' program
# here in case there's something wrong with syslogd (eg. it's the one core
# dumping).
exec >/dev/kmsg 2>&1

# Writes one log line, prefixed with "coredump:", to stderr.
# Arguments: the words of the message; they are joined with single spaces.
log() {
  # Prepend the prefix, then emit everything through a quoted "$*" so the
  # message is printed verbatim: an unquoted $@ would glob-expand characters
  # such as '*' and collapse runs of whitespace inside an argument.
  set -- "coredump:" "$@"
  printf '%s\n' "$*" >&2
}

# Marker unique to this run (it embeds this shell's pid). It is handed to gzip
# as its --suffix argument so that pgrep/pkill -f can find exactly that gzip.
uniqid=coredump-$$-coredump

# Watchdog action: if a process whose command line contains $uniqid is still
# running, log a warning, set the kernel hung-task timeout to 120 seconds and
# kill that process. Returns non-zero without side effects when nothing
# matches.
alarm() {
  # An empty pattern would make pgrep/pkill -f match every process.
  [ -n "$uniqid" ] || return 1
  pgrep -f "$uniqid" >/dev/null || return
  log core took too long to save, quitting to avoid panic, core will be truncated
  sysctl -w kernel.hung_task_timeout_secs=120
  pkill -f "$uniqid"
}

log "pid $pid ($exe) dying on signal $signal"

if [ ! -e "/tmp/DEBUG" ]; then
  # Cores are only saved while the /tmp/DEBUG marker exists.
  log "skipping coredump for process '$exe'"
elif [ ! -w "$outdir" ] || { [ -e "$out" ] && [ ! -w "$out" ]; }; then
  # Either the directory is read-only or an existing file cannot be replaced.
  log "can't dump core: $out is not writable."
elif pgrep -x coredump-delay >/dev/null; then
  # A coredump-delay process started by an earlier dump is still running.
  log "coredump-delay not yet reached; not saving."
else
  # read and compress the core dump from stdin, limiting max output size
  log "dumping core in '$out'"
  # siege coredumps can take a long time and we need them, increase the hung
  # task timeout in the kernel and wait for longer to accommodate this
  if [ "$exe" = "siege" ]; then
    coredump-delay 900 &
    sysctl -w kernel.hung_task_timeout_secs=900
    (sleep 840; alarm) &
  else
    coredump-delay 600 &
    # abort after 100 seconds. bruno kernel will panic with hung_task at 120
    # seconds
    (sleep 100; alarm) &
  fi
  # --suffix is a trick to be able to pkill exactly this gzip
  # dd caps the output at 163840 blocks of 1024 bytes, i.e. at most 160 MiB.
  gzip --fast -c --suffix "$uniqid" | dd bs=1024 count=163840 2>/dev/null >"$out"
  upload-logs-now
  # Put the hung-task timeout back to 120 seconds now that the dump is done.
  sysctl -w kernel.hung_task_timeout_secs=120
  # if we dumped to HDD, create a symbolic link at the alternate place in flash
  if [ "$outdir" = "$hdd_cores_dir" ]; then
    ln -s -f "$out" "$flash_cores_last_core_link"
  fi
  log "finished dumping core for pid $pid ($exe)"
fi