From 7029e4395ce78a8848ad965501242de401ca1283 Mon Sep 17 00:00:00 2001 From: Michael Vines Date: Sat, 8 Sep 2018 17:46:43 -0700 Subject: [PATCH] Fix OOM reporting --- multinode-demo/common.sh | 6 ------ scripts/oom-monitor.sh | 23 +++++++++++++---------- scripts/snap-config-to-env.sh | 22 ++++++++++++++++++++++ snap/snapcraft.yaml | 8 ++++---- 4 files changed, 39 insertions(+), 20 deletions(-) create mode 100755 scripts/snap-config-to-env.sh diff --git a/multinode-demo/common.sh b/multinode-demo/common.sh index 938c8a34d..514c82325 100644 --- a/multinode-demo/common.sh +++ b/multinode-demo/common.sh @@ -45,12 +45,6 @@ if [[ -d $SNAP ]]; then # Running inside a Linux Snap? # 0700 mkdir -p "$SNAP_DATA"/{drone,leader,validator} - SOLANA_METRICS_CONFIG="$(snapctl get metrics-config)" - SOLANA_DEFAULT_METRICS_RATE="$(snapctl get default-metrics-rate)" - export SOLANA_DEFAULT_METRICS_RATE - SOLANA_CUDA="$(snapctl get enable-cuda)" - RUST_LOG="$(snapctl get rust-log)" - elif [[ -n $USE_SNAP ]]; then # Use the Linux Snap binaries solana_program() { declare program="$1" diff --git a/scripts/oom-monitor.sh b/scripts/oom-monitor.sh index a3e663f66..eb6835fa5 100755 --- a/scripts/oom-monitor.sh +++ b/scripts/oom-monitor.sh @@ -3,19 +3,21 @@ # Reports Linux OOM Killer activity # -here=$(dirname "$0") -# shellcheck source=scripts/oom-score-adj.sh -source "$here"/oom-score-adj.sh +cd "$(dirname "$0")" -if [[ $(uname) != Linux ]]; then - exit 0 -fi +# shellcheck source=scripts/oom-score-adj.sh +source oom-score-adj.sh + +# shellcheck source=scripts/configure-metrics.sh +source configure-metrics.sh + +[[ $(uname) = Linux ]] || exit 0 syslog=/var/log/syslog -if [[ ! -r $syslog ]]; then +[[ -r $syslog ]] || { echo Unable to read $syslog - exit 0 -fi + exit 1 +} # Adjust OOM score to reduce the chance that this script will be killed # during an Out of Memory event since the purpose of this script is to @@ -24,9 +26,10 @@ oom_score_adj "self" -500 while read -r victim; do echo "Out of memory event detected, $victim killed" - "$here"/metrics-write-datapoint.sh "oom-killer,victim=$victim killed=1" + ./metrics-write-datapoint.sh "oom-killer,victim=$victim,hostname=$HOSTNAME killed=1" done < <( \ tail --follow=name --retry -n0 $syslog \ | sed --unbuffered -n 's/^.* Out of memory: Kill process [1-9][0-9]* (\([^)]*\)) .*/\1/p' \ ) + exit 1 diff --git a/scripts/snap-config-to-env.sh b/scripts/snap-config-to-env.sh new file mode 100755 index 000000000..883c4d4c4 --- /dev/null +++ b/scripts/snap-config-to-env.sh @@ -0,0 +1,22 @@ +#!/bin/bash +# +# Snap daemons have no access to the environment so |snap set solana ...| is +# used to set runtime configuration. +# +# This script exports the snap runtime configuration options back as +# environment variables before invoking the specified program +# + +if [[ -d $SNAP ]]; then # Running inside a Linux Snap? + RUST_LOG="$(snapctl get rust-log)" + SOLANA_CUDA="$(snapctl get enable-cuda)" + SOLANA_DEFAULT_METRICS_RATE="$(snapctl get default-metrics-rate)" + SOLANA_METRICS_CONFIG="$(snapctl get metrics-config)" + + export RUST_LOG + export SOLANA_CUDA + export SOLANA_DEFAULT_METRICS_RATE + export SOLANA_METRICS_CONFIG +fi + +exec "$@" diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml index e48af18c1..fab22db7b 100644 --- a/snap/snapcraft.yaml +++ b/snap/snapcraft.yaml @@ -63,25 +63,25 @@ apps: - home daemon-validator: daemon: simple - command: multinode-demo/validator.sh + command: scripts/snap-config-to-env.sh $SNAP/multinode-demo/validator.sh plugs: - network - network-bind daemon-leader: daemon: simple - command: multinode-demo/leader.sh + command: scripts/snap-config-to-env.sh $SNAP/multinode-demo/leader.sh plugs: - network - network-bind daemon-drone: daemon: simple - command: multinode-demo/drone.sh + command: scripts/snap-config-to-env.sh $SNAP/multinode-demo/drone.sh plugs: - network - network-bind daemon-oom-monitor: daemon: simple - command: scripts/oom-monitor.sh + command: scripts/snap-config-to-env.sh $SNAP/scripts/oom-monitor.sh plugs: - network