Add alert for offline validators

This commit is contained in:
Kirill Fedoseev 2022-05-08 13:13:13 +02:00
parent 9068aec25c
commit 506b1eeb02
11 changed files with 131 additions and 6 deletions

View File

@ -105,6 +105,13 @@
},
"stuck_erc_to_native_message_confirmation": {
"$ref": "#/$defs/alert_config"
},
"last_validator_activity": {
"type": [
"object",
"null"
],
"additionalProperties": false
}
},
"additionalProperties": false

View File

@ -87,6 +87,7 @@ bridges:
unknown_erc_to_native_message_confirmation:
unknown_erc_to_native_message_execution:
stuck_erc_to_native_message_confirmation:
last_validator_activity:
xdai-amb:
home:
chain: xdai
@ -117,6 +118,7 @@ bridges:
stuck_information_request:
failed_information_request:
different_information_signatures:
last_validator_activity:
test-amb:
home:
chain: sokol

View File

@ -21,7 +21,7 @@
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"iteration": 1649742040975,
"iteration": 1652004548107,
"links": [],
"panels": [
{
@ -1586,7 +1586,7 @@
"group": [],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator, count(v.removed_log_id) < count(*) as enabled,\n(\nSELECT extract(epoch from now() - max(bt.timestamp)) FROM signed_messages s\nJOIN logs l ON s.log_id = l.id\nJOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\nWHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as since_last_active,\n(\nSELECT count(*) FROM signed_messages s WHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as total_confirmations\nFROM bridge_validators v\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1\n",
"rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator,\n count(v.removed_log_id) < count(*) as enabled,\n extract(epoch from now() - max(va.last_active)) as since_last_active,\n coalesce(max(va.total_confirmations), 0) as total_confirmations\nFROM bridge_validators v\n LEFT JOIN (SELECT s.signer, max(bt.timestamp) as last_active, count(*) as total_confirmations\n FROM signed_messages s\n JOIN logs l ON s.log_id = l.id\n JOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\n WHERE s.bridge_id = '$bridge'\n GROUP BY s.signer) va ON va.signer = v.address\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1",
"refId": "A",
"select": [
[
@ -1792,5 +1792,5 @@
"timezone": "",
"title": "AMB",
"uid": "rxl6GONnk",
"version": 1
"version": 2
}

View File

@ -21,7 +21,7 @@
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"iteration": 1650550279410,
"iteration": 1652004394458,
"links": [],
"panels": [
{
@ -1153,7 +1153,7 @@
"group": [],
"metricColumn": "none",
"rawQuery": true,
"rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator, count(v.removed_log_id) < count(*) as enabled,\n(\nSELECT extract(epoch from now() - max(bt.timestamp)) FROM signed_messages s\nJOIN logs l ON s.log_id = l.id\nJOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\nWHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as since_last_active,\n(\nSELECT count(*) FROM signed_messages s WHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as total_confirmations\nFROM bridge_validators v\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1\n",
"rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator,\n count(v.removed_log_id) < count(*) as enabled,\n extract(epoch from now() - max(va.last_active)) as since_last_active,\n coalesce(max(va.total_confirmations), 0) as total_confirmations\nFROM bridge_validators v\n LEFT JOIN (SELECT s.signer, max(bt.timestamp) as last_active, count(*) as total_confirmations\n FROM signed_messages s\n JOIN logs l ON s.log_id = l.id\n JOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\n WHERE s.bridge_id = '$bridge'\n GROUP BY s.signer) va ON va.signer = v.address\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1",
"refId": "A",
"select": [
[
@ -1359,5 +1359,5 @@
"timezone": "",
"title": "XDAI",
"uid": "h48F4hIa2",
"version": 3
"version": 4
}

View File

@ -104,6 +104,13 @@ func NewAlertManager(logger logging.Logger, db *db.DB, cfg *config.BridgeConfig)
Func: provider.FindStuckErcToNativeMessages,
Metric: NewAlertStuckErcToNativeMessageConfirmation(cfg.ID),
}
case "last_validator_activity":
jobs[name] = &Job{
Interval: time.Minute * 10,
Timeout: time.Second * 20,
Func: provider.FindLastValidatorActivity,
Metric: NewAlertLastValidatorActivity(cfg.ID),
}
default:
return nil, fmt.Errorf("unknown alert type %q", name)
}

View File

@ -742,3 +742,52 @@ func (p *DBAlertsProvider) FindStuckErcToNativeMessages(ctx context.Context, par
}
return alerts, nil
}
// LastValidatorActivity is one row produced by the query in
// FindLastValidatorActivity: a validator address together with the time
// elapsed since its last recorded activity.
type LastValidatorActivity struct {
// ChainID is filled from the chain_id column and is exported as the
// "chain_id" alert label.
ChainID string `db:"chain_id"`
// Address is filled from the address column and is exported as the
// "address" alert label.
Address common.Address `db:"address"`
// Age is filled from the age column. The query computes that column as
// EXTRACT(EPOCH FROM ...)::int, so the stored number is a count of whole
// seconds rather than a nanosecond-based time.Duration value.
Age time.Duration `db:"age"`
}
// AlertValues converts the row into an AlertValues record: the chain id and
// the validator address become labels, and the raw numeric Age becomes the
// sample value.
func (c *LastValidatorActivity) AlertValues() AlertValues {
	labels := make(map[string]string, 2)
	labels["chain_id"] = c.ChainID
	labels["address"] = c.Address.String()

	return AlertValues{Labels: labels, Value: float64(c.Age)}
}
// FindLastValidatorActivity returns one AlertValues entry per validator
// address of the bridge given in params.Bridge.
//
// For every address the query reports, as whole seconds, the time elapsed
// since the newest block timestamp among the messages signed by that address
// on this bridge. When no signed message is joined, it falls back to the block
// timestamp of the log referenced by the validator row itself. Only addresses
// that have at least one row with a NULL removed_log_id are kept.
// params.HomeChainID is passed through unchanged as the chain_id column.
//
// A non-nil error wraps the failure reported by the database call.
func (p *DBAlertsProvider) FindLastValidatorActivity(ctx context.Context, params *AlertJobParams) ([]AlertValues, error) {
	const lastActivityQuery = `
SELECT $2 as chain_id,
v.address as address,
EXTRACT(EPOCH FROM now() - max(coalesce(
va.last_active,
(SELECT max(bt.timestamp)
FROM logs l
JOIN block_timestamps bt
ON l.chain_id = bt.chain_id AND l.block_number = bt.block_number
WHERE v.log_id = l.id))))::int as age
FROM bridge_validators v
LEFT JOIN (SELECT s.signer, max(bt.timestamp) as last_active
FROM signed_messages s
JOIN logs l ON s.log_id = l.id
JOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number
WHERE s.bridge_id = $1
GROUP BY s.signer) va ON va.signer = v.address
WHERE v.bridge_id = $1
GROUP BY v.address
HAVING count(v.removed_log_id) < count(*)`

	rows := make([]LastValidatorActivity, 0, 5)
	if err := p.db.SelectContext(ctx, &rows, lastActivityQuery, params.Bridge, params.HomeChainID); err != nil {
		return nil, fmt.Errorf("can't select alerts: %w", err)
	}

	// Index into rows so that the pointer-receiver method is called on the
	// slice element itself rather than on a per-iteration copy.
	alerts := make([]AlertValues, 0, len(rows))
	for i := range rows {
		alerts = append(alerts, rows[i].AlertValues())
	}
	return alerts, nil
}

View File

@ -114,4 +114,13 @@ var (
ConstLabels: prometheus.Labels{"bridge_id": bridge},
}, []string{"chain_id", "block_number", "tx_hash", "msg_hash", "count", "sender", "receiver", "value"})
}
// NewAlertLastValidatorActivity builds the gauge vector named
// alert_monitor_last_validator_activity for the given bridge. The bridge id
// is attached as a constant label; chain_id and address are the variable
// labels.
NewAlertLastValidatorActivity = func(bridge string) *prometheus.GaugeVec {
	opts := prometheus.GaugeOpts{
		Namespace:   "alert",
		Subsystem:   "monitor",
		Name:        "last_validator_activity",
		Help:        "Shows time passed since last successfully recorded action from the validator.",
		ConstLabels: prometheus.Labels{"bridge_id": bridge},
	}
	labelNames := []string{"chain_id", "address"}

	return promauto.NewGaugeVec(opts, labelNames)
}
)

View File

@ -137,6 +137,16 @@ receivers:
- type: button
text: 'Silence :no_bell:'
url: '{{ template "__alert_silence_link" . }}'
- name: slack-validator-offline
slack_configs:
- send_resolved: true
channel: '#amb-alerts'
title: '{{ template "slack.validator_offline.title" . }}'
text: '{{ template "slack.validator_offline.text" . }}'
actions:
- type: button
text: 'Silence :no_bell:'
url: '{{ template "__alert_silence_link" . }}'
- name: slack-dm
slack_configs:
- send_resolved: true
@ -200,6 +210,10 @@ route:
group_by: [ "..." ]
matchers:
- alertname = UnknownErcToNativeMessageExecution
- receiver: slack-validator-offline
group_by: [ "..." ]
matchers:
- alertname = ValidatorOffline
- receiver: slack-stuck-contract
group_by: [ "..." ]
matchers:

View File

@ -71,6 +71,12 @@ groups:
expr: max_over_time(alert_monitor_unknown_erc_to_native_message_execution[5m]) > 0
annotations:
age: '{{ humanizeDuration $value }}'
- name: ValidatorOffline
rules:
- alert: ValidatorOffline
expr: max_over_time(alert_monitor_last_validator_activity[5m]) > 43200
annotations:
age: '{{ humanizeDuration $value }}'
- name: StuckContractProgress
rules:
- alert: StuckContractProgress

View File

@ -17,3 +17,23 @@ https://blockscout.com/xdai/mainnet/tx/{{ .tx_hash }}
{{ .tx_hash }}
{{- end -}}
{{- end }}
{{ define "explorer.address.link" -}}
{{- if eq .chain_id "1" -}}
https://etherscan.io/address/{{ .address }}
{{- else if eq .chain_id "4" -}}
https://rinkeby.etherscan.io/address/{{ .address }}
{{- else if eq .chain_id "42" -}}
https://kovan.etherscan.io/address/{{ .address }}
{{- else if eq .chain_id "56" -}}
https://bscscan.com/address/{{ .address }}
{{- else if eq .chain_id "77" -}}
https://blockscout.com/poa/sokol/address/{{ .address }}
{{- else if eq .chain_id "99" -}}
https://blockscout.com/poa/core/address/{{ .address }}
{{- else if eq .chain_id "100" -}}
https://blockscout.com/xdai/mainnet/address/{{ .address }}
{{- else -}}
{{ .address }}
{{- end -}}
{{- end }}

View File

@ -163,3 +163,14 @@ Monitoring of contract is stuck
*Chain ID:* {{ .CommonLabels.chain_id }}
*Address:* {{ .CommonLabels.address }}
{{- end }}
{{ define "slack.validator_offline.title" -}}
Bridge validator stopped producing successful confirmations
{{- end }}
{{ define "slack.validator_offline.text" -}}
*Bridge:* {{ .CommonLabels.bridge_id }}
*Chain ID:* {{ .CommonLabels.chain_id }}
*Address:* {{ .CommonLabels.address }}
*Time since last recorded action:* {{ .CommonAnnotations.age }}
*Validator:* {{ template "explorer.address.link" .CommonLabels }}
{{- end }}