From 506b1eeb02182e45fbcb9b363ae61c62123f3ee8 Mon Sep 17 00:00:00 2001 From: Kirill Fedoseev Date: Sun, 8 May 2022 13:13:13 +0200 Subject: [PATCH] Add alert for offline validators --- config.schema.json | 7 ++++ config.yml | 2 ++ grafana/dashboards/amb.json | 6 ++-- grafana/dashboards/xdai.json | 6 ++-- monitor/alerts/alert_manager.go | 7 ++++ monitor/alerts/db_alerts_provider.go | 49 ++++++++++++++++++++++++++++ monitor/alerts/metrics.go | 9 +++++ prometheus/alertmanager.yml | 14 ++++++++ prometheus/rules.yml | 6 ++++ prometheus/templates/explorer.tmpl | 20 ++++++++++++ prometheus/templates/slack.tmpl | 11 +++++++ 11 files changed, 131 insertions(+), 6 deletions(-) diff --git a/config.schema.json b/config.schema.json index ceacaec..b85cb39 100644 --- a/config.schema.json +++ b/config.schema.json @@ -105,6 +105,13 @@ }, "stuck_erc_to_native_message_confirmation": { "$ref": "#/$defs/alert_config" + }, + "last_validator_activity": { + "type": [ + "object", + "null" + ], + "additionalProperties": false } }, "additionalProperties": false diff --git a/config.yml b/config.yml index 121dcbe..1841475 100644 --- a/config.yml +++ b/config.yml @@ -87,6 +87,7 @@ bridges: unknown_erc_to_native_message_confirmation: unknown_erc_to_native_message_execution: stuck_erc_to_native_message_confirmation: + last_validator_activity: xdai-amb: home: chain: xdai @@ -117,6 +118,7 @@ bridges: stuck_information_request: failed_information_request: different_information_signatures: + last_validator_activity: test-amb: home: chain: sokol diff --git a/grafana/dashboards/amb.json b/grafana/dashboards/amb.json index 2be7ef8..34f935a 100644 --- a/grafana/dashboards/amb.json +++ b/grafana/dashboards/amb.json @@ -21,7 +21,7 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "iteration": 1649742040975, + "iteration": 1652004548107, "links": [], "panels": [ { @@ -1586,7 +1586,7 @@ "group": [], "metricColumn": "none", "rawQuery": true, - "rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator, count(v.removed_log_id) < count(*) as enabled,\n(\nSELECT extract(epoch from now() - max(bt.timestamp)) FROM signed_messages s\nJOIN logs l ON s.log_id = l.id\nJOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\nWHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as since_last_active,\n(\nSELECT count(*) FROM signed_messages s WHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as total_confirmations\nFROM bridge_validators v\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1\n", + "rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator,\n count(v.removed_log_id) < count(*) as enabled,\n extract(epoch from now() - max(va.last_active)) as since_last_active,\n coalesce(max(va.total_confirmations), 0) as total_confirmations\nFROM bridge_validators v\n LEFT JOIN (SELECT s.signer, max(bt.timestamp) as last_active, count(*) as total_confirmations\n FROM signed_messages s\n JOIN logs l ON s.log_id = l.id\n JOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\n WHERE s.bridge_id = '$bridge'\n GROUP BY s.signer) va ON va.signer = v.address\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1", "refId": "A", "select": [ [ @@ -1792,5 +1792,5 @@ "timezone": "", "title": "AMB", "uid": "rxl6GONnk", - "version": 1 + "version": 2 } \ No newline at end of file diff --git a/grafana/dashboards/xdai.json b/grafana/dashboards/xdai.json index 085039d..306a92e 100644 --- a/grafana/dashboards/xdai.json +++ b/grafana/dashboards/xdai.json @@ -21,7 +21,7 @@ "editable": true, "gnetId": null, "graphTooltip": 0, - "iteration": 1650550279410, + "iteration": 1652004394458, "links": [], "panels": [ { @@ -1153,7 +1153,7 @@ "group": [], "metricColumn": "none", "rawQuery": true, - "rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator, count(v.removed_log_id) < count(*) as enabled,\n(\nSELECT extract(epoch from now() - max(bt.timestamp)) FROM signed_messages s\nJOIN logs l ON s.log_id = l.id\nJOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\nWHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as since_last_active,\n(\nSELECT count(*) FROM signed_messages s WHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as total_confirmations\nFROM bridge_validators v\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1\n", + "rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator,\n count(v.removed_log_id) < count(*) as enabled,\n extract(epoch from now() - max(va.last_active)) as since_last_active,\n coalesce(max(va.total_confirmations), 0) as total_confirmations\nFROM bridge_validators v\n LEFT JOIN (SELECT s.signer, max(bt.timestamp) as last_active, count(*) as total_confirmations\n FROM signed_messages s\n JOIN logs l ON s.log_id = l.id\n JOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\n WHERE s.bridge_id = '$bridge'\n GROUP BY s.signer) va ON va.signer = v.address\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1", "refId": "A", "select": [ [ @@ -1359,5 +1359,5 @@ "timezone": "", "title": "XDAI", "uid": "h48F4hIa2", - "version": 3 + "version": 4 } \ No newline at end of file diff --git a/monitor/alerts/alert_manager.go b/monitor/alerts/alert_manager.go index abb6753..9659066 100644 --- a/monitor/alerts/alert_manager.go +++ b/monitor/alerts/alert_manager.go @@ -104,6 +104,13 @@ func NewAlertManager(logger logging.Logger, db *db.DB, cfg *config.BridgeConfig) Func: provider.FindStuckErcToNativeMessages, Metric: NewAlertStuckErcToNativeMessageConfirmation(cfg.ID), } + case "last_validator_activity": + jobs[name] = &Job{ + Interval: time.Minute * 10, + Timeout: time.Second * 20, + Func: provider.FindLastValidatorActivity, + Metric: NewAlertLastValidatorActivity(cfg.ID), + } default: return nil, fmt.Errorf("unknown alert type %q", name) } diff --git a/monitor/alerts/db_alerts_provider.go b/monitor/alerts/db_alerts_provider.go index 03f5a53..57e10f0 100644 --- a/monitor/alerts/db_alerts_provider.go +++ b/monitor/alerts/db_alerts_provider.go @@ -742,3 +742,52 @@ func (p *DBAlertsProvider) FindStuckErcToNativeMessages(ctx context.Context, par } return alerts, nil } + +type LastValidatorActivity struct { + ChainID string `db:"chain_id"` + Address common.Address `db:"address"` + Age time.Duration `db:"age"` +} + +func (c *LastValidatorActivity) AlertValues() AlertValues { + return AlertValues{ + Labels: map[string]string{ + "chain_id": c.ChainID, + "address": c.Address.String(), + }, + Value: float64(c.Age), + } +} + +func (p *DBAlertsProvider) FindLastValidatorActivity(ctx context.Context, params *AlertJobParams) ([]AlertValues, error) { + query := ` + SELECT $2 as chain_id, + v.address as address, + EXTRACT(EPOCH FROM now() - max(coalesce( + va.last_active, + (SELECT max(bt.timestamp) + FROM logs l + JOIN block_timestamps bt + ON l.chain_id = bt.chain_id AND l.block_number = bt.block_number + WHERE v.log_id = l.id))))::int as age + FROM bridge_validators v + LEFT JOIN (SELECT s.signer, max(bt.timestamp) as last_active + FROM signed_messages s + JOIN logs l ON s.log_id = l.id + JOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number + WHERE s.bridge_id = $1 + GROUP BY s.signer) va ON va.signer = v.address + WHERE v.bridge_id = $1 + GROUP BY v.address + HAVING count(v.removed_log_id) < count(*)` + res := make([]LastValidatorActivity, 0, 5) + err := p.db.SelectContext(ctx, &res, query, params.Bridge, params.HomeChainID) + if err != nil { + return nil, fmt.Errorf("can't select alerts: %w", err) + } + alerts := make([]AlertValues, len(res)) + for i := range res { + alerts[i] = res[i].AlertValues() + } + return alerts, nil +} diff --git a/monitor/alerts/metrics.go b/monitor/alerts/metrics.go index 5106600..6f17bd1 100644 --- a/monitor/alerts/metrics.go +++ b/monitor/alerts/metrics.go @@ -114,4 +114,13 @@ var ( ConstLabels: prometheus.Labels{"bridge_id": bridge}, }, []string{"chain_id", "block_number", "tx_hash", "msg_hash", "count", "sender", "receiver", "value"}) } + NewAlertLastValidatorActivity = func(bridge string) *prometheus.GaugeVec { + return promauto.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "alert", + Subsystem: "monitor", + Name: "last_validator_activity", + Help: "Shows time passed since last successfully recorded action from the validator.", + ConstLabels: prometheus.Labels{"bridge_id": bridge}, + }, []string{"chain_id", "address"}) + } ) diff --git a/prometheus/alertmanager.yml b/prometheus/alertmanager.yml index 532a4c8..389bbea 100644 --- a/prometheus/alertmanager.yml +++ b/prometheus/alertmanager.yml @@ -137,6 +137,16 @@ receivers: - type: button text: 'Silence :no_bell:' url: '{{ template "__alert_silence_link" . }}' + - name: slack-validator-offline + slack_configs: + - send_resolved: true + channel: '#amb-alerts' + title: '{{ template "slack.validator_offline.title" . }}' + text: '{{ template "slack.validator_offline.text" . }}' + actions: + - type: button + text: 'Silence :no_bell:' + url: '{{ template "__alert_silence_link" . }}' - name: slack-dm slack_configs: - send_resolved: true @@ -200,6 +210,10 @@ route: group_by: [ "..." ] matchers: - alertname = UnknownErcToNativeMessageExecution + - receiver: slack-validator-offline + group_by: [ "..." ] + matchers: + - alertname = ValidatorOffline - receiver: slack-stuck-contract group_by: [ "..." ] matchers: diff --git a/prometheus/rules.yml b/prometheus/rules.yml index 20d1aff..888608c 100644 --- a/prometheus/rules.yml +++ b/prometheus/rules.yml @@ -71,6 +71,12 @@ groups: expr: max_over_time(alert_monitor_unknown_erc_to_native_message_execution[5m]) > 0 annotations: age: '{{ humanizeDuration $value }}' + - name: ValidatorOffline + rules: + - alert: ValidatorOffline + expr: max_over_time(alert_monitor_last_validator_activity[5m]) > 43200 + annotations: + age: '{{ humanizeDuration $value }}' - name: StuckContractProgress rules: - alert: StuckContractProgress diff --git a/prometheus/templates/explorer.tmpl b/prometheus/templates/explorer.tmpl index 631343d..686a90f 100644 --- a/prometheus/templates/explorer.tmpl +++ b/prometheus/templates/explorer.tmpl @@ -17,3 +17,23 @@ https://blockscout.com/xdai/mainnet/tx/{{ .tx_hash }} {{ .tx_hash }} {{- end -}} {{- end }} + +{{ define "explorer.address.link" -}} +{{- if eq .chain_id "1" -}} +https://etherscan.io/address/{{ .address }} +{{- else if eq .chain_id "4" -}} +https://rinkeby.etherscan.io/address/{{ .address }} +{{- else if eq .chain_id "42" -}} +https://kovan.etherscan.io/address/{{ .address }} +{{- else if eq .chain_id "56" -}} +https://bscscan.com/address/{{ .address }} +{{- else if eq .chain_id "77" -}} +https://blockscout.com/poa/sokol/address/{{ .address }} +{{- else if eq .chain_id "99" -}} +https://blockscout.com/poa/core/address/{{ .address }} +{{- else if eq .chain_id "100" -}} +https://blockscout.com/xdai/mainnet/address/{{ .address }} +{{- else -}} +{{ .address }} +{{- end -}} +{{- end }} diff --git a/prometheus/templates/slack.tmpl b/prometheus/templates/slack.tmpl index a830c54..347e5f9 100644 --- a/prometheus/templates/slack.tmpl +++ b/prometheus/templates/slack.tmpl @@ -163,3 +163,14 @@ Monitoring of contract is stuck *Chain ID:* {{ .CommonLabels.chain_id }} *Address:* {{ .CommonLabels.address }} {{- end }} + +{{ define "slack.validator_offline.title" -}} +Bridge validator stopped producing successful confirmations +{{- end }} +{{ define "slack.validator_offline.text" -}} +*Bridge:* {{ .CommonLabels.bridge_id }} +*Chain ID:* {{ .CommonLabels.chain_id }} +*Address:* {{ .CommonLabels.address }} +*Time since last recorded action:* {{ .CommonAnnotations.age }} +*Validator:* {{ template "explorer.address.link" .CommonLabels }} +{{- end }}