Add alert for offline validators
This commit is contained in:
parent
9068aec25c
commit
506b1eeb02
|
@ -105,6 +105,13 @@
|
||||||
},
|
},
|
||||||
"stuck_erc_to_native_message_confirmation": {
|
"stuck_erc_to_native_message_confirmation": {
|
||||||
"$ref": "#/$defs/alert_config"
|
"$ref": "#/$defs/alert_config"
|
||||||
|
},
|
||||||
|
"last_validator_activity": {
|
||||||
|
"type": [
|
||||||
|
"object",
|
||||||
|
"null"
|
||||||
|
],
|
||||||
|
"additionalProperties": false
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"additionalProperties": false
|
"additionalProperties": false
|
||||||
|
|
|
@ -87,6 +87,7 @@ bridges:
|
||||||
unknown_erc_to_native_message_confirmation:
|
unknown_erc_to_native_message_confirmation:
|
||||||
unknown_erc_to_native_message_execution:
|
unknown_erc_to_native_message_execution:
|
||||||
stuck_erc_to_native_message_confirmation:
|
stuck_erc_to_native_message_confirmation:
|
||||||
|
last_validator_activity:
|
||||||
xdai-amb:
|
xdai-amb:
|
||||||
home:
|
home:
|
||||||
chain: xdai
|
chain: xdai
|
||||||
|
@ -117,6 +118,7 @@ bridges:
|
||||||
stuck_information_request:
|
stuck_information_request:
|
||||||
failed_information_request:
|
failed_information_request:
|
||||||
different_information_signatures:
|
different_information_signatures:
|
||||||
|
last_validator_activity:
|
||||||
test-amb:
|
test-amb:
|
||||||
home:
|
home:
|
||||||
chain: sokol
|
chain: sokol
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
"editable": true,
|
"editable": true,
|
||||||
"gnetId": null,
|
"gnetId": null,
|
||||||
"graphTooltip": 0,
|
"graphTooltip": 0,
|
||||||
"iteration": 1649742040975,
|
"iteration": 1652004548107,
|
||||||
"links": [],
|
"links": [],
|
||||||
"panels": [
|
"panels": [
|
||||||
{
|
{
|
||||||
|
@ -1586,7 +1586,7 @@
|
||||||
"group": [],
|
"group": [],
|
||||||
"metricColumn": "none",
|
"metricColumn": "none",
|
||||||
"rawQuery": true,
|
"rawQuery": true,
|
||||||
"rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator, count(v.removed_log_id) < count(*) as enabled,\n(\nSELECT extract(epoch from now() - max(bt.timestamp)) FROM signed_messages s\nJOIN logs l ON s.log_id = l.id\nJOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\nWHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as since_last_active,\n(\nSELECT count(*) FROM signed_messages s WHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as total_confirmations\nFROM bridge_validators v\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1\n",
|
"rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator,\n count(v.removed_log_id) < count(*) as enabled,\n extract(epoch from now() - max(va.last_active)) as since_last_active,\n coalesce(max(va.total_confirmations), 0) as total_confirmations\nFROM bridge_validators v\n LEFT JOIN (SELECT s.signer, max(bt.timestamp) as last_active, count(*) as total_confirmations\n FROM signed_messages s\n JOIN logs l ON s.log_id = l.id\n JOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\n WHERE s.bridge_id = '$bridge'\n GROUP BY s.signer) va ON va.signer = v.address\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"select": [
|
"select": [
|
||||||
[
|
[
|
||||||
|
@ -1792,5 +1792,5 @@
|
||||||
"timezone": "",
|
"timezone": "",
|
||||||
"title": "AMB",
|
"title": "AMB",
|
||||||
"uid": "rxl6GONnk",
|
"uid": "rxl6GONnk",
|
||||||
"version": 1
|
"version": 2
|
||||||
}
|
}
|
|
@ -21,7 +21,7 @@
|
||||||
"editable": true,
|
"editable": true,
|
||||||
"gnetId": null,
|
"gnetId": null,
|
||||||
"graphTooltip": 0,
|
"graphTooltip": 0,
|
||||||
"iteration": 1650550279410,
|
"iteration": 1652004394458,
|
||||||
"links": [],
|
"links": [],
|
||||||
"panels": [
|
"panels": [
|
||||||
{
|
{
|
||||||
|
@ -1153,7 +1153,7 @@
|
||||||
"group": [],
|
"group": [],
|
||||||
"metricColumn": "none",
|
"metricColumn": "none",
|
||||||
"rawQuery": true,
|
"rawQuery": true,
|
||||||
"rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator, count(v.removed_log_id) < count(*) as enabled,\n(\nSELECT extract(epoch from now() - max(bt.timestamp)) FROM signed_messages s\nJOIN logs l ON s.log_id = l.id\nJOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\nWHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as since_last_active,\n(\nSELECT count(*) FROM signed_messages s WHERE s.bridge_id = '$bridge' AND s.signer = v.address\n) as total_confirmations\nFROM bridge_validators v\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1\n",
|
"rawSql": "SELECT concat('0x', encode(v.address, 'hex')) as validator,\n count(v.removed_log_id) < count(*) as enabled,\n extract(epoch from now() - max(va.last_active)) as since_last_active,\n coalesce(max(va.total_confirmations), 0) as total_confirmations\nFROM bridge_validators v\n LEFT JOIN (SELECT s.signer, max(bt.timestamp) as last_active, count(*) as total_confirmations\n FROM signed_messages s\n JOIN logs l ON s.log_id = l.id\n JOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number\n WHERE s.bridge_id = '$bridge'\n GROUP BY s.signer) va ON va.signer = v.address\nWHERE v.bridge_id = '$bridge'\nGROUP BY v.address\nORDER BY 2 DESC, 1",
|
||||||
"refId": "A",
|
"refId": "A",
|
||||||
"select": [
|
"select": [
|
||||||
[
|
[
|
||||||
|
@ -1359,5 +1359,5 @@
|
||||||
"timezone": "",
|
"timezone": "",
|
||||||
"title": "XDAI",
|
"title": "XDAI",
|
||||||
"uid": "h48F4hIa2",
|
"uid": "h48F4hIa2",
|
||||||
"version": 3
|
"version": 4
|
||||||
}
|
}
|
|
@ -104,6 +104,13 @@ func NewAlertManager(logger logging.Logger, db *db.DB, cfg *config.BridgeConfig)
|
||||||
Func: provider.FindStuckErcToNativeMessages,
|
Func: provider.FindStuckErcToNativeMessages,
|
||||||
Metric: NewAlertStuckErcToNativeMessageConfirmation(cfg.ID),
|
Metric: NewAlertStuckErcToNativeMessageConfirmation(cfg.ID),
|
||||||
}
|
}
|
||||||
|
case "last_validator_activity":
|
||||||
|
jobs[name] = &Job{
|
||||||
|
Interval: time.Minute * 10,
|
||||||
|
Timeout: time.Second * 20,
|
||||||
|
Func: provider.FindLastValidatorActivity,
|
||||||
|
Metric: NewAlertLastValidatorActivity(cfg.ID),
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
return nil, fmt.Errorf("unknown alert type %q", name)
|
return nil, fmt.Errorf("unknown alert type %q", name)
|
||||||
}
|
}
|
||||||
|
|
|
@ -742,3 +742,52 @@ func (p *DBAlertsProvider) FindStuckErcToNativeMessages(ctx context.Context, par
|
||||||
}
|
}
|
||||||
return alerts, nil
|
return alerts, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LastValidatorActivity is one result row of the query issued by
// FindLastValidatorActivity: a validator address together with the time
// elapsed since its most recent recorded activity.
type LastValidatorActivity struct {
|
||||||
|
// ChainID is read from the "chain_id" column; the query fills it from its
// second bind parameter rather than from a table.
ChainID string `db:"chain_id"`
|
||||||
|
// Address is the validator address, read from the "address" column.
Address common.Address `db:"address"`
|
||||||
|
// Age is read from the "age" column, which the query computes as
// EXTRACT(EPOCH FROM ...)::int, i.e. a whole number of SECONDS.
// NOTE(review): time.Duration nominally counts nanoseconds, so this value
// must not be printed or compared as a Duration; it is only ever converted
// to float64 as-is by AlertValues.
Age time.Duration `db:"age"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// AlertValues converts the row into the generic AlertValues form: the chain id
// and the string form of the validator address become labels, and Age becomes
// the numeric value.
func (c *LastValidatorActivity) AlertValues() AlertValues {
|
||||||
|
return AlertValues{
|
||||||
|
Labels: map[string]string{
|
||||||
|
"chain_id": c.ChainID,
|
||||||
|
"address": c.Address.String(),
|
||||||
|
},
|
||||||
|
// Age holds the raw integer produced by the SQL query (seconds), so the
// resulting value is a number of seconds, not nanoseconds.
Value: float64(c.Age),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// FindLastValidatorActivity reports, for every validator address of the bridge
// params.Bridge, how many seconds have passed since its last recorded activity.
//
// The query groups bridge_validators rows by address and keeps only addresses
// with at least one row whose removed_log_id is NULL (count(removed_log_id)
// counts non-NULL values only) - presumably validators that are still part of
// the set; confirm against the schema.
//
// The activity timestamp is the latest block timestamp among the signer's
// signed_messages rows for this bridge. When the signer has none, the LEFT JOIN
// yields NULL and coalesce falls back to the block timestamp of the log
// referenced by the validator row's log_id.
//
// The chain_id column is simply the second bind parameter, params.HomeChainID.
//
// It returns one AlertValues per selected row, or a wrapped error if the
// select fails.
func (p *DBAlertsProvider) FindLastValidatorActivity(ctx context.Context, params *AlertJobParams) ([]AlertValues, error) {
|
||||||
|
query := `
|
||||||
|
SELECT $2 as chain_id,
|
||||||
|
v.address as address,
|
||||||
|
EXTRACT(EPOCH FROM now() - max(coalesce(
|
||||||
|
va.last_active,
|
||||||
|
(SELECT max(bt.timestamp)
|
||||||
|
FROM logs l
|
||||||
|
JOIN block_timestamps bt
|
||||||
|
ON l.chain_id = bt.chain_id AND l.block_number = bt.block_number
|
||||||
|
WHERE v.log_id = l.id))))::int as age
|
||||||
|
FROM bridge_validators v
|
||||||
|
LEFT JOIN (SELECT s.signer, max(bt.timestamp) as last_active
|
||||||
|
FROM signed_messages s
|
||||||
|
JOIN logs l ON s.log_id = l.id
|
||||||
|
JOIN block_timestamps bt ON bt.chain_id = l.chain_id AND bt.block_number = l.block_number
|
||||||
|
WHERE s.bridge_id = $1
|
||||||
|
GROUP BY s.signer) va ON va.signer = v.address
|
||||||
|
WHERE v.bridge_id = $1
|
||||||
|
GROUP BY v.address
|
||||||
|
HAVING count(v.removed_log_id) < count(*)`
|
||||||
|
// Start with an empty slice and a small initial capacity; SelectContext
// appends one element per returned row.
res := make([]LastValidatorActivity, 0, 5)
|
||||||
|
// Bind order: $1 = bridge id, $2 = home chain id.
err := p.db.SelectContext(ctx, &res, query, params.Bridge, params.HomeChainID)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("can't select alerts: %w", err)
|
||||||
|
}
|
||||||
|
// Convert each typed row into the generic alert representation.
alerts := make([]AlertValues, len(res))
|
||||||
|
for i := range res {
|
||||||
|
alerts[i] = res[i].AlertValues()
|
||||||
|
}
|
||||||
|
return alerts, nil
|
||||||
|
}
|
||||||
|
|
|
@ -114,4 +114,13 @@ var (
|
||||||
ConstLabels: prometheus.Labels{"bridge_id": bridge},
|
ConstLabels: prometheus.Labels{"bridge_id": bridge},
|
||||||
}, []string{"chain_id", "block_number", "tx_hash", "msg_hash", "count", "sender", "receiver", "value"})
|
}, []string{"chain_id", "block_number", "tx_hash", "msg_hash", "count", "sender", "receiver", "value"})
|
||||||
}
|
}
|
||||||
|
// NewAlertLastValidatorActivity builds the gauge vector for the
// "last_validator_activity" alert of the given bridge. The bridge id is
// attached as a constant "bridge_id" label; "chain_id" and "address" are the
// per-sample labels. With namespace "alert" and subsystem "monitor" the
// exported metric name is alert_monitor_last_validator_activity.
NewAlertLastValidatorActivity = func(bridge string) *prometheus.GaugeVec {
|
||||||
|
return promauto.NewGaugeVec(prometheus.GaugeOpts{
|
||||||
|
Namespace: "alert",
|
||||||
|
Subsystem: "monitor",
|
||||||
|
Name: "last_validator_activity",
|
||||||
|
Help: "Shows time passed since last successfully recorded action from the validator.",
|
||||||
|
ConstLabels: prometheus.Labels{"bridge_id": bridge},
|
||||||
|
}, []string{"chain_id", "address"})
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
|
@ -137,6 +137,16 @@ receivers:
|
||||||
- type: button
|
- type: button
|
||||||
text: 'Silence :no_bell:'
|
text: 'Silence :no_bell:'
|
||||||
url: '{{ template "__alert_silence_link" . }}'
|
url: '{{ template "__alert_silence_link" . }}'
|
||||||
|
- name: slack-validator-offline
|
||||||
|
slack_configs:
|
||||||
|
- send_resolved: true
|
||||||
|
channel: '#amb-alerts'
|
||||||
|
title: '{{ template "slack.validator_offline.title" . }}'
|
||||||
|
text: '{{ template "slack.validator_offline.text" . }}'
|
||||||
|
actions:
|
||||||
|
- type: button
|
||||||
|
text: 'Silence :no_bell:'
|
||||||
|
url: '{{ template "__alert_silence_link" . }}'
|
||||||
- name: slack-dm
|
- name: slack-dm
|
||||||
slack_configs:
|
slack_configs:
|
||||||
- send_resolved: true
|
- send_resolved: true
|
||||||
|
@ -200,6 +210,10 @@ route:
|
||||||
group_by: [ "..." ]
|
group_by: [ "..." ]
|
||||||
matchers:
|
matchers:
|
||||||
- alertname = UnknownErcToNativeMessageExecution
|
- alertname = UnknownErcToNativeMessageExecution
|
||||||
|
- receiver: slack-validator-offline
|
||||||
|
group_by: [ "..." ]
|
||||||
|
matchers:
|
||||||
|
- alertname = ValidatorOffline
|
||||||
- receiver: slack-stuck-contract
|
- receiver: slack-stuck-contract
|
||||||
group_by: [ "..." ]
|
group_by: [ "..." ]
|
||||||
matchers:
|
matchers:
|
||||||
|
|
|
@ -71,6 +71,12 @@ groups:
|
||||||
expr: max_over_time(alert_monitor_unknown_erc_to_native_message_execution[5m]) > 0
|
expr: max_over_time(alert_monitor_unknown_erc_to_native_message_execution[5m]) > 0
|
||||||
annotations:
|
annotations:
|
||||||
age: '{{ humanizeDuration $value }}'
|
age: '{{ humanizeDuration $value }}'
|
||||||
|
# Fires when a validator has shown no recorded activity for too long.
- name: ValidatorOffline
|
||||||
|
rules:
|
||||||
|
- alert: ValidatorOffline
|
||||||
|
# The gauge value is the validator's inactivity age in seconds, so the
# threshold 43200 corresponds to 12 hours. max_over_time takes the largest
# sample seen within the last 5 minutes.
expr: max_over_time(alert_monitor_last_validator_activity[5m]) > 43200
|
||||||
|
annotations:
|
||||||
|
# Human-readable form of the age in seconds; read by the Slack text
# template as .CommonAnnotations.age.
age: '{{ humanizeDuration $value }}'
|
||||||
- name: StuckContractProgress
|
- name: StuckContractProgress
|
||||||
rules:
|
rules:
|
||||||
- alert: StuckContractProgress
|
- alert: StuckContractProgress
|
||||||
|
|
|
@ -17,3 +17,23 @@ https://blockscout.com/xdai/mainnet/tx/{{ .tx_hash }}
|
||||||
{{ .tx_hash }}
|
{{ .tx_hash }}
|
||||||
{{- end -}}
|
{{- end -}}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
{{/* explorer.address.link renders a block-explorer URL for .address, chosen by the string value of .chain_id (1, 4, 42, 56, 77, 99 or 100). For any other chain id it prints the bare address. The context must expose .chain_id and .address, e.g. an alert label set. */}}
{{ define "explorer.address.link" -}}
|
||||||
|
{{- if eq .chain_id "1" -}}
|
||||||
|
https://etherscan.io/address/{{ .address }}
|
||||||
|
{{- else if eq .chain_id "4" -}}
|
||||||
|
https://rinkeby.etherscan.io/address/{{ .address }}
|
||||||
|
{{- else if eq .chain_id "42" -}}
|
||||||
|
https://kovan.etherscan.io/address/{{ .address }}
|
||||||
|
{{- else if eq .chain_id "56" -}}
|
||||||
|
https://bscscan.com/address/{{ .address }}
|
||||||
|
{{- else if eq .chain_id "77" -}}
|
||||||
|
https://blockscout.com/poa/sokol/address/{{ .address }}
|
||||||
|
{{- else if eq .chain_id "99" -}}
|
||||||
|
https://blockscout.com/poa/core/address/{{ .address }}
|
||||||
|
{{- else if eq .chain_id "100" -}}
|
||||||
|
https://blockscout.com/xdai/mainnet/address/{{ .address }}
|
||||||
|
{{- else -}}
|
||||||
|
{{ .address }}
|
||||||
|
{{- end -}}
|
||||||
|
{{- end }}
|
||||||
|
|
|
@ -163,3 +163,14 @@ Monitoring of contract is stuck
|
||||||
*Chain ID:* {{ .CommonLabels.chain_id }}
|
*Chain ID:* {{ .CommonLabels.chain_id }}
|
||||||
*Address:* {{ .CommonLabels.address }}
|
*Address:* {{ .CommonLabels.address }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
|
{{/* slack.validator_offline.title is the fixed Slack message title used for the ValidatorOffline alert. */}}
{{ define "slack.validator_offline.title" -}}
|
||||||
|
Bridge validator stopped producing successful confirmations
|
||||||
|
{{- end }}
|
||||||
|
{{/* slack.validator_offline.text is the Slack message body for the ValidatorOffline alert. It prints the bridge id, chain id and validator address from the common labels, the humanized inactivity age from the "age" annotation, and an explorer link built by the explorer.address.link template from the same labels. */}}
{{ define "slack.validator_offline.text" -}}
|
||||||
|
*Bridge:* {{ .CommonLabels.bridge_id }}
|
||||||
|
*Chain ID:* {{ .CommonLabels.chain_id }}
|
||||||
|
*Address:* {{ .CommonLabels.address }}
|
||||||
|
*Time since last recorded action:* {{ .CommonAnnotations.age }}
|
||||||
|
*Validator:* {{ template "explorer.address.link" .CommonLabels }}
|
||||||
|
{{- end }}
|
||||||
|
|
Loading…
Reference in New Issue