Fix OOM reporting

This commit is contained in:
Michael Vines 2018-09-08 17:46:43 -07:00 committed by Grimes
parent 5afcdcbbe6
commit 7029e4395c
4 changed files with 39 additions and 20 deletions

View File

@ -45,12 +45,6 @@ if [[ -d $SNAP ]]; then # Running inside a Linux Snap?
# 0700 # 0700
mkdir -p "$SNAP_DATA"/{drone,leader,validator} mkdir -p "$SNAP_DATA"/{drone,leader,validator}
SOLANA_METRICS_CONFIG="$(snapctl get metrics-config)"
SOLANA_DEFAULT_METRICS_RATE="$(snapctl get default-metrics-rate)"
export SOLANA_DEFAULT_METRICS_RATE
SOLANA_CUDA="$(snapctl get enable-cuda)"
RUST_LOG="$(snapctl get rust-log)"
elif [[ -n $USE_SNAP ]]; then # Use the Linux Snap binaries elif [[ -n $USE_SNAP ]]; then # Use the Linux Snap binaries
solana_program() { solana_program() {
declare program="$1" declare program="$1"

View File

@ -3,19 +3,21 @@
# Reports Linux OOM Killer activity # Reports Linux OOM Killer activity
# #
here=$(dirname "$0") cd "$(dirname "$0")"
# shellcheck source=scripts/oom-score-adj.sh
source "$here"/oom-score-adj.sh
if [[ $(uname) != Linux ]]; then # shellcheck source=scripts/oom-score-adj.sh
exit 0 source oom-score-adj.sh
fi
# shellcheck source=scripts/configure-metrics.sh
source configure-metrics.sh
[[ $(uname) = Linux ]] || exit 0
syslog=/var/log/syslog syslog=/var/log/syslog
if [[ ! -r $syslog ]]; then [[ -r $syslog ]] || {
echo Unable to read $syslog echo Unable to read $syslog
exit 0 exit 1
fi }
# Adjust OOM score to reduce the chance that this script will be killed # Adjust OOM score to reduce the chance that this script will be killed
# during an Out of Memory event since the purpose of this script is to # during an Out of Memory event since the purpose of this script is to
@ -24,9 +26,10 @@ oom_score_adj "self" -500
while read -r victim; do while read -r victim; do
echo "Out of memory event detected, $victim killed" echo "Out of memory event detected, $victim killed"
"$here"/metrics-write-datapoint.sh "oom-killer,victim=$victim killed=1" ./metrics-write-datapoint.sh "oom-killer,victim=$victim,hostname=$HOSTNAME killed=1"
done < <( \ done < <( \
tail --follow=name --retry -n0 $syslog \ tail --follow=name --retry -n0 $syslog \
| sed --unbuffered -n 's/^.* Out of memory: Kill process [1-9][0-9]* (\([^)]*\)) .*/\1/p' \ | sed --unbuffered -n 's/^.* Out of memory: Kill process [1-9][0-9]* (\([^)]*\)) .*/\1/p' \
) )
exit 1 exit 1

22
scripts/snap-config-to-env.sh Executable file
View File

@ -0,0 +1,22 @@
#!/bin/bash
#
# Launches a program with the snap runtime configuration made available to it
# as environment variables.
#
# Snap daemons have no access to the environment, so runtime configuration is
# stored with |snap set solana ...| instead.  When running inside a Linux Snap,
# each option is read back with |snapctl get| and exported under its
# environment variable name.  Outside of a snap the environment is left
# untouched.  In both cases the specified program is then exec'd in place of
# this script.
#
# Usage: snap-config-to-env.sh program [args...]
#

# Exports the environment variable named by $1, set to whatever
# |snapctl get $2| prints
export_snap_option() {
  declare env_name="$1"
  declare option="$2"
  declare value

  value="$(snapctl get "$option")"
  export "$env_name=$value"
}

if [[ -d $SNAP ]]; then # Running inside a Linux Snap?
  export_snap_option RUST_LOG rust-log
  export_snap_option SOLANA_CUDA enable-cuda
  export_snap_option SOLANA_DEFAULT_METRICS_RATE default-metrics-rate
  export_snap_option SOLANA_METRICS_CONFIG metrics-config
fi

exec "$@"

View File

@ -63,25 +63,25 @@ apps:
- home - home
daemon-validator: daemon-validator:
daemon: simple daemon: simple
command: multinode-demo/validator.sh command: scripts/snap-config-to-env.sh $SNAP/multinode-demo/validator.sh
plugs: plugs:
- network - network
- network-bind - network-bind
daemon-leader: daemon-leader:
daemon: simple daemon: simple
command: multinode-demo/leader.sh command: scripts/snap-config-to-env.sh $SNAP/multinode-demo/leader.sh
plugs: plugs:
- network - network
- network-bind - network-bind
daemon-drone: daemon-drone:
daemon: simple daemon: simple
command: multinode-demo/drone.sh command: scripts/snap-config-to-env.sh $SNAP/multinode-demo/drone.sh
plugs: plugs:
- network - network
- network-bind - network-bind
daemon-oom-monitor: daemon-oom-monitor:
daemon: simple daemon: simple
command: scripts/oom-monitor.sh command: scripts/snap-config-to-env.sh $SNAP/scripts/oom-monitor.sh
plugs: plugs:
- network - network