Add open file descriptor monitoring (#5655)

This commit is contained in:
Michael Vines 2019-08-26 15:17:19 -07:00 committed by GitHub
parent 6979a17674
commit 81bb208a62
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 196 additions and 18 deletions

View File

@ -15,8 +15,8 @@
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 851,
"iteration": 1565991401072,
"id": 883,
"iteration": 1566852798488,
"links": [
{
"asDropdown": true,
@ -2516,7 +2516,7 @@
"x": 12,
"y": 24
},
"id": 23,
"id": 61,
"interval": null,
"links": [],
"mappingType": 1,
@ -2569,7 +2569,7 @@
],
"orderByTime": "ASC",
"policy": "default",
"query": "SELECT sum(\"one\") FROM \"$testnet\".\"autogen\".\"panic\" WHERE $timeFilter",
"query": "SELECT SUM(\"points_lost\") FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter\n",
"rawQuery": true,
"refId": "A",
"resultFormat": "table",
@ -2591,7 +2591,7 @@
}
],
"thresholds": "",
"title": "Total Panics",
"title": "Lost Datapoints",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@ -2840,7 +2840,7 @@
"datasource": "$datasource",
"fill": 1,
"gridPos": {
"h": 6,
"h": 3,
"w": 8,
"x": 0,
"y": 26
@ -2852,7 +2852,7 @@
"current": false,
"max": false,
"min": false,
"rightSide": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
@ -2888,7 +2888,7 @@
"hide": false,
"orderByTime": "ASC",
"policy": "default",
"query": "SELECT MEAN(\"points_written\") as \"Mean points written\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
"query": "SELECT MEAN(\"points_written\") as \"mean\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
"rawQuery": true,
"refId": "B",
"resultFormat": "time_series",
@ -2925,7 +2925,7 @@
],
"orderByTime": "ASC",
"policy": "default",
"query": "SELECT MAX(\"points_written\") as \"Max points written\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
"query": "SELECT MAX(\"points_written\") as \"max\" FROM \"$testnet\".\"autogen\".\"metrics\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
"rawQuery": true,
"refId": "A",
"resultFormat": "time_series",
@ -3263,6 +3263,162 @@
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"gridPos": {
"h": 3,
"w": 8,
"x": 0,
"y": 29
},
"id": 62,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"null"
],
"type": "fill"
}
],
"hide": false,
"orderByTime": "ASC",
"policy": "default",
"query": "SELECT MEAN(\"count\") as \"mean\" FROM \"$testnet\".\"autogen\".\"open-files\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
"rawQuery": true,
"refId": "B",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
},
{
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"null"
],
"type": "fill"
}
],
"orderByTime": "ASC",
"policy": "default",
"query": "SELECT MAX(\"count\") as \"max\" FROM \"$testnet\".\"autogen\".\"open-files\" WHERE $timeFilter GROUP BY time(5s) fill(null)\n",
"rawQuery": true,
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Open Files per node",
"tooltip": {
"shared": true,
"sort": 1,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 0,
"format": "short",
"label": "",
"logBase": 1,
"max": null,
"min": "0.2",
"show": true
},
{
"decimals": null,
"format": "short",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"columns": [],
"datasource": "$datasource",
@ -8173,10 +8329,6 @@
},
{
"allValue": ".*",
"current": {
"text": "All",
"value": "$__all"
},
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
@ -8228,5 +8380,5 @@
"timezone": "",
"title": "Testnet Monitor (edge)",
"uid": "testnet-edge",
"version": 3
"version": 1
}

View File

@ -752,7 +752,7 @@ stopNode() {
PS4=\"$PS4\"
set -x
! tmux list-sessions || tmux kill-session
for pid in solana/{net-stats,oom-monitor}.pid; do
for pid in solana/{net-stats,fd-monitor,oom-monitor}.pid; do
pgid=\$(ps opgid= \$(cat \$pid) | tr -d '[:space:]')
if [[ -n \$pgid ]]; then
sudo kill -- -\$pgid

View File

@ -50,9 +50,13 @@ skip)
esac
(
sudo scripts/oom-monitor.sh
sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh
) > oom-monitor.log 2>&1 &
echo $! > oom-monitor.pid
scripts/fd-monitor.sh > fd-monitor.log 2>&1 &
echo $! > fd-monitor.pid
scripts/net-stats.sh > net-stats.log 2>&1 &
echo $! > net-stats.pid
! tmux list-sessions || tmux kill-session

View File

@ -93,6 +93,8 @@ local|tar|skip)
sudo SOLANA_METRICS_CONFIG="$SOLANA_METRICS_CONFIG" scripts/oom-monitor.sh
) > oom-monitor.log 2>&1 &
echo $! > oom-monitor.pid
scripts/fd-monitor.sh > fd-monitor.log 2>&1 &
echo $! > fd-monitor.pid
scripts/net-stats.sh > net-stats.log 2>&1 &
echo $! > net-stats.pid

20
scripts/fd-monitor.sh Executable file
View File

@ -0,0 +1,20 @@
#!/usr/bin/env bash
#
# Reports open file descriptors for the current user
#
# Every 10 seconds, counts the lines that `lsof` prints for the current user
# and submits that number as the `count` field of an `open-files` datapoint,
# tagged with this machine's hostname, via metrics-write-datapoint.sh.
#
# Exits 0 immediately on non-Linux hosts.
#
set -e

[[ $(uname) == Linux ]] || exit 0

# Without lsof the pipeline in the loop below would still succeed (its exit
# status is that of `wc`), so a count of 0 would be reported forever.  Fail
# loudly instead of publishing misleading data.
if ! command -v lsof > /dev/null; then
  echo "Error: lsof not found, unable to monitor open files" >&2
  exit 1
fi

# Run from the directory containing this script so the sibling scripts
# referenced below resolve.
cd "$(dirname "$0")"

# shellcheck source=scripts/configure-metrics.sh
source configure-metrics.sh

while true; do
  # Number of lsof output lines for this user
  count=$(lsof -u "$UID" | wc -l)
  ./metrics-write-datapoint.sh "open-files,hostname=$HOSTNAME count=$count"
  sleep 10
done

# Not reached unless the loop above terminates unexpectedly
exit 1

View File

@ -22,5 +22,5 @@ if [[ -n $INFLUX_HOST ]]; then
fi
echo "${host}/write?db=${INFLUX_DATABASE}&u=${INFLUX_USERNAME}&p=${INFLUX_PASSWORD}" \
| xargs curl --max-time 5 -XPOST --data-binary "$point"
| xargs curl --max-time 5 --silent --show-error -XPOST --data-binary "$point"
exit 0