Metrics for window repair (#2106)
* Metrics for window repair - Also increase max repair length
* Fix vote counters, and add repair window graph
* Update per node graphs
* Revert max repair length change
This commit is contained in:
parent
2238725d1c
commit
9243bc58db
|
@ -2083,6 +2083,7 @@ dependencies = [
|
|||
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde 1.0.82 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_derive 1.0.82 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"solana-metrics 0.11.0",
|
||||
"solana-sdk 0.11.0",
|
||||
]
|
||||
|
||||
|
|
|
@ -15,8 +15,8 @@
|
|||
"editable": true,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": 74,
|
||||
"iteration": 1544477352265,
|
||||
"id": 79,
|
||||
"iteration": 1544546712840,
|
||||
"links": [
|
||||
{
|
||||
"asDropdown": true,
|
||||
|
@ -458,7 +458,7 @@
|
|||
"hide": false,
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"query": "SELECT sum(\"count\") FROM \"$testnet\".\"autogen\".\"counter-cluster_info-vote-count\" WHERE $timeFilter \n",
|
||||
"query": "SELECT sum(\"count\") FROM \"$testnet\".\"autogen\".\"vote-native\" WHERE $timeFilter \n",
|
||||
"rawQuery": true,
|
||||
"refId": "A",
|
||||
"resultFormat": "table",
|
||||
|
@ -530,44 +530,6 @@
|
|||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"null"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"measurement": "counter-cluster_info-vote-count",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "autogen",
|
||||
"query": "SELECT sum(\"count\") AS \" \" FROM \"$testnet\".\"autogen\".\"counter-cluster_info-vote-count\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)\n\n",
|
||||
"rawQuery": true,
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"count"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "sum"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": []
|
||||
},
|
||||
{
|
||||
"groupBy": [
|
||||
{
|
||||
|
@ -585,7 +547,7 @@
|
|||
],
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"query": "SELECT sum(\"count\") AS \" \" FROM \"$testnet\".\"autogen\".\"counter-validator-vote_sent\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"query": "SELECT sum(\"count\") AS \"total\" FROM \"$testnet\".\"autogen\".\"vote-native\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"rawQuery": true,
|
||||
"refId": "B",
|
||||
"resultFormat": "time_series",
|
||||
|
@ -622,9 +584,9 @@
|
|||
],
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"query": "SELECT sum(\"count\") AS \" \" FROM \"$testnet\".\"autogen\".\"counter-vote_stage-leader_sent_vote\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)\n\n\n",
|
||||
"query": "SELECT sum(\"count\") AS \" \" FROM \"$testnet\".\"autogen\".\"counter-validator-vote_sent\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"rawQuery": true,
|
||||
"refId": "C",
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
|
@ -4995,7 +4957,7 @@
|
|||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "UDP Net Stats (validators)",
|
||||
"title": "UDP Net Stats ($hostid)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
|
@ -5089,7 +5051,7 @@
|
|||
"measurement": "counter-cluster_info-vote-count",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "autogen",
|
||||
"query": "SELECT sum(\"count\") AS \"retransmit\" FROM \"$testnet\".\"autogen\".\"retransmit-stage\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"query": "SELECT sum(\"count\") AS \"retransmit\" FROM \"$testnet\".\"autogen\".\"retransmit-stage\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"rawQuery": true,
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
|
@ -5127,7 +5089,7 @@
|
|||
"measurement": "counter-cluster_info-vote-count",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "autogen",
|
||||
"query": "SELECT sum(\"count\") AS \"replicate\" FROM \"$testnet\".\"autogen\".\"replicate-stage\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"query": "SELECT sum(\"count\") AS \"replicate\" FROM \"$testnet\".\"autogen\".\"replicate-stage\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"rawQuery": true,
|
||||
"refId": "B",
|
||||
"resultFormat": "time_series",
|
||||
|
@ -5165,7 +5127,7 @@
|
|||
"measurement": "counter-cluster_info-vote-count",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "autogen",
|
||||
"query": "SELECT sum(\"count\") AS \"retransmit_q\" FROM \"$testnet\".\"autogen\".\"retransmit-queue\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"query": "SELECT sum(\"count\") AS \"retransmit_q\" FROM \"$testnet\".\"autogen\".\"retransmit-queue\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"rawQuery": true,
|
||||
"refId": "C",
|
||||
"resultFormat": "time_series",
|
||||
|
@ -5203,7 +5165,7 @@
|
|||
"measurement": "counter-cluster_info-vote-count",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "autogen",
|
||||
"query": "SELECT sum(\"count\") AS \"recv_window\" FROM \"$testnet\".\"autogen\".\"recv-window\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"query": "SELECT sum(\"count\") AS \"recv_window\" FROM \"$testnet\".\"autogen\".\"recv-window\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"rawQuery": true,
|
||||
"refId": "D",
|
||||
"resultFormat": "time_series",
|
||||
|
@ -5227,7 +5189,7 @@
|
|||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Channel Pressure (validator)",
|
||||
"title": "Channel Pressure ($hostid)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
|
@ -5320,7 +5282,7 @@
|
|||
"measurement": "counter-cluster_info-vote-count",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "autogen",
|
||||
"query": "SELECT last(\"consumed\") AS \"validator\" FROM \"$testnet\".\"autogen\".\"window-stage\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"query": "SELECT last(\"consumed\") AS \"validator\" FROM \"$testnet\".\"autogen\".\"window-stage\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)",
|
||||
"rawQuery": true,
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
|
@ -5382,7 +5344,201 @@
|
|||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Leader broadcast vs Validator consume",
|
||||
"title": "Leader broadcast vs Validator consume ($hostid)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
"value_type": "individual"
|
||||
},
|
||||
"type": "graph",
|
||||
"xaxis": {
|
||||
"buckets": null,
|
||||
"mode": "time",
|
||||
"name": null,
|
||||
"show": true,
|
||||
"values": []
|
||||
},
|
||||
"yaxes": [
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
},
|
||||
{
|
||||
"format": "short",
|
||||
"label": null,
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": null,
|
||||
"show": true
|
||||
}
|
||||
],
|
||||
"yaxis": {
|
||||
"align": false,
|
||||
"alignLevel": null
|
||||
}
|
||||
},
|
||||
{
|
||||
"aliasColors": {
|
||||
"cluster-info.repair": "#ba43a9",
|
||||
"window-service.receive": "#b7dbab",
|
||||
"window-stage.consumed": "#5195ce"
|
||||
},
|
||||
"bars": false,
|
||||
"dashLength": 10,
|
||||
"dashes": false,
|
||||
"datasource": "Solana Metrics (read-only)",
|
||||
"fill": 1,
|
||||
"gridPos": {
|
||||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 71
|
||||
},
|
||||
"id": 42,
|
||||
"legend": {
|
||||
"alignAsTable": false,
|
||||
"avg": false,
|
||||
"current": false,
|
||||
"max": false,
|
||||
"min": false,
|
||||
"show": true,
|
||||
"total": false,
|
||||
"values": false
|
||||
},
|
||||
"lines": false,
|
||||
"linewidth": 1,
|
||||
"links": [],
|
||||
"nullPointMode": "connected",
|
||||
"percentage": false,
|
||||
"pointradius": 1,
|
||||
"points": true,
|
||||
"renderer": "flot",
|
||||
"seriesOverrides": [],
|
||||
"spaceLength": 10,
|
||||
"stack": false,
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"null"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"measurement": "counter-cluster_info-vote-count",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "autogen",
|
||||
"query": "SELECT last(\"last-recv\") AS \"receive\" FROM \"$testnet\".\"autogen\".\"window-service\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)",
|
||||
"rawQuery": true,
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"count"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "sum"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": []
|
||||
},
|
||||
{
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"null"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"query": "SELECT last(\"consumed\") AS \"consumed\" FROM \"$testnet\".\"autogen\".\"window-stage\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)",
|
||||
"rawQuery": true,
|
||||
"refId": "B",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"value"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": []
|
||||
},
|
||||
{
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"$__interval"
|
||||
],
|
||||
"type": "time"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"null"
|
||||
],
|
||||
"type": "fill"
|
||||
}
|
||||
],
|
||||
"orderByTime": "ASC",
|
||||
"policy": "default",
|
||||
"query": "SELECT last(\"repair-ix\") AS \"repair\" FROM \"$testnet\".\"autogen\".\"cluster-info\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)",
|
||||
"rawQuery": true,
|
||||
"refId": "C",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"value"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": []
|
||||
}
|
||||
],
|
||||
"thresholds": [],
|
||||
"timeFrom": null,
|
||||
"timeShift": null,
|
||||
"title": "Receive/Consume/Repair ($hostid)",
|
||||
"tooltip": {
|
||||
"shared": true,
|
||||
"sort": 0,
|
||||
|
@ -5425,7 +5581,7 @@
|
|||
"h": 1,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 71
|
||||
"y": 76
|
||||
},
|
||||
"id": 40,
|
||||
"panels": [],
|
||||
|
@ -5443,7 +5599,7 @@
|
|||
"h": 5,
|
||||
"w": 12,
|
||||
"x": 0,
|
||||
"y": 72
|
||||
"y": 77
|
||||
},
|
||||
"id": 41,
|
||||
"legend": {
|
||||
|
@ -5575,6 +5731,25 @@
|
|||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"datasource": "Solana Metrics (read-only)",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "HostID",
|
||||
"multi": false,
|
||||
"name": "hostid",
|
||||
"options": [],
|
||||
"query": "SELECT DISTINCT(\"host_id\") FROM \"$testnet\".\"autogen\".\"counter-bank-process_transactions-txs\" ",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tags": [],
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
}
|
||||
]
|
||||
},
|
||||
|
@ -5610,5 +5785,5 @@
|
|||
"timezone": "",
|
||||
"title": "Testnet Monitor (edge)",
|
||||
"uid": "testnet-edge",
|
||||
"version": 112
|
||||
"version": 113
|
||||
}
|
|
@ -12,6 +12,7 @@ env_logger = "0.6.0"
|
|||
log = "0.4.2"
|
||||
serde = "1.0.82"
|
||||
serde_derive = "1.0.82"
|
||||
solana-metrics = { path = "../../../metrics", version = "0.11.0" }
|
||||
solana-sdk = { path = "../../../sdk", version = "0.11.0" }
|
||||
|
||||
[lib]
|
||||
|
|
|
@ -5,10 +5,12 @@ extern crate bincode;
|
|||
extern crate env_logger;
|
||||
#[macro_use]
|
||||
extern crate log;
|
||||
extern crate solana_metrics;
|
||||
#[macro_use]
|
||||
extern crate solana_sdk;
|
||||
|
||||
use bincode::deserialize;
|
||||
use solana_metrics::{influxdb, submit};
|
||||
use solana_sdk::account::KeyedAccount;
|
||||
use solana_sdk::native_program::ProgramError;
|
||||
use solana_sdk::pubkey::Pubkey;
|
||||
|
@ -62,6 +64,11 @@ fn entrypoint(
|
|||
Err(ProgramError::InvalidArgument)?;
|
||||
}
|
||||
debug!("{:?} by {}", vote, keyed_accounts[0].signer_key().unwrap());
|
||||
submit(
|
||||
influxdb::Point::new("vote-native")
|
||||
.add_field("count", influxdb::Value::Integer(1))
|
||||
.to_owned(),
|
||||
);
|
||||
|
||||
let mut vote_state = VoteProgram::deserialize(&keyed_accounts[0].account.userdata)?;
|
||||
|
||||
|
|
|
@ -30,6 +30,7 @@ use bincode::{deserialize, serialize};
|
|||
use log::Level;
|
||||
use rand::{thread_rng, Rng};
|
||||
use rayon::prelude::*;
|
||||
use solana_metrics::{influxdb, submit};
|
||||
use solana_sdk::hash::Hash;
|
||||
use solana_sdk::pubkey::Pubkey;
|
||||
use solana_sdk::signature::{Keypair, KeypairUtil, Signable, Signature};
|
||||
|
@ -520,6 +521,13 @@ impl ClusterInfo {
|
|||
let addr = valid[n].gossip; // send the request to the peer's gossip port
|
||||
let req = Protocol::RequestWindowIndex(self.my_data().clone(), ix);
|
||||
let out = serialize(&req)?;
|
||||
|
||||
submit(
|
||||
influxdb::Point::new("cluster-info")
|
||||
.add_field("repair-ix", influxdb::Value::Integer(ix as i64))
|
||||
.to_owned(),
|
||||
);
|
||||
|
||||
Ok((addr, out))
|
||||
}
|
||||
fn new_pull_requests(&mut self) -> Vec<(SocketAddr, Protocol)> {
|
||||
|
|
|
@ -91,6 +91,12 @@ fn recv_window(
|
|||
(p.index()?, p.meta.size)
|
||||
};
|
||||
|
||||
submit(
|
||||
influxdb::Point::new("window-service")
|
||||
.add_field("last-recv", influxdb::Value::Integer(pix as i64))
|
||||
.to_owned(),
|
||||
);
|
||||
|
||||
pixs.push(pix);
|
||||
|
||||
trace!("{} window pix: {} size: {}", id, pix, meta_size);
|
||||
|
|
Loading…
Reference in New Issue