Metrics for window repair (#2106)

* Metrics for window repair

- Also increase max repair length

* fix vote counters and add repair window graph

* update per-node graphs

* revert max repair length change
This commit is contained in:
Pankaj Garg 2018-12-11 15:43:41 -08:00 committed by GitHub
parent 2238725d1c
commit 9243bc58db
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 254 additions and 56 deletions

1
Cargo.lock generated
View File

@ -2083,6 +2083,7 @@ dependencies = [
"log 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.82 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.82 (registry+https://github.com/rust-lang/crates.io-index)",
"solana-metrics 0.11.0",
"solana-sdk 0.11.0",
]

View File

@ -15,8 +15,8 @@
"editable": true,
"gnetId": null,
"graphTooltip": 0,
"id": 74,
"iteration": 1544477352265,
"id": 79,
"iteration": 1544546712840,
"links": [
{
"asDropdown": true,
@ -458,7 +458,7 @@
"hide": false,
"orderByTime": "ASC",
"policy": "default",
"query": "SELECT sum(\"count\") FROM \"$testnet\".\"autogen\".\"counter-cluster_info-vote-count\" WHERE $timeFilter \n",
"query": "SELECT sum(\"count\") FROM \"$testnet\".\"autogen\".\"vote-native\" WHERE $timeFilter \n",
"rawQuery": true,
"refId": "A",
"resultFormat": "table",
@ -530,44 +530,6 @@
"stack": false,
"steppedLine": false,
"targets": [
{
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"null"
],
"type": "fill"
}
],
"measurement": "counter-cluster_info-vote-count",
"orderByTime": "ASC",
"policy": "autogen",
"query": "SELECT sum(\"count\") AS \" \" FROM \"$testnet\".\"autogen\".\"counter-cluster_info-vote-count\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)\n\n",
"rawQuery": true,
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"count"
],
"type": "field"
},
{
"params": [],
"type": "sum"
}
]
],
"tags": []
},
{
"groupBy": [
{
@ -585,7 +547,7 @@
],
"orderByTime": "ASC",
"policy": "default",
"query": "SELECT sum(\"count\") AS \" \" FROM \"$testnet\".\"autogen\".\"counter-validator-vote_sent\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
"query": "SELECT sum(\"count\") AS \"total\" FROM \"$testnet\".\"autogen\".\"vote-native\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
"rawQuery": true,
"refId": "B",
"resultFormat": "time_series",
@ -622,9 +584,9 @@
],
"orderByTime": "ASC",
"policy": "default",
"query": "SELECT sum(\"count\") AS \" \" FROM \"$testnet\".\"autogen\".\"counter-vote_stage-leader_sent_vote\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)\n\n\n",
"query": "SELECT sum(\"count\") AS \" \" FROM \"$testnet\".\"autogen\".\"counter-validator-vote_sent\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
"rawQuery": true,
"refId": "C",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
@ -4995,7 +4957,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "UDP Net Stats (validators)",
"title": "UDP Net Stats ($hostid)",
"tooltip": {
"shared": true,
"sort": 0,
@ -5089,7 +5051,7 @@
"measurement": "counter-cluster_info-vote-count",
"orderByTime": "ASC",
"policy": "autogen",
"query": "SELECT sum(\"count\") AS \"retransmit\" FROM \"$testnet\".\"autogen\".\"retransmit-stage\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
"query": "SELECT sum(\"count\") AS \"retransmit\" FROM \"$testnet\".\"autogen\".\"retransmit-stage\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)",
"rawQuery": true,
"refId": "A",
"resultFormat": "time_series",
@ -5127,7 +5089,7 @@
"measurement": "counter-cluster_info-vote-count",
"orderByTime": "ASC",
"policy": "autogen",
"query": "SELECT sum(\"count\") AS \"replicate\" FROM \"$testnet\".\"autogen\".\"replicate-stage\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
"query": "SELECT sum(\"count\") AS \"replicate\" FROM \"$testnet\".\"autogen\".\"replicate-stage\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)",
"rawQuery": true,
"refId": "B",
"resultFormat": "time_series",
@ -5165,7 +5127,7 @@
"measurement": "counter-cluster_info-vote-count",
"orderByTime": "ASC",
"policy": "autogen",
"query": "SELECT sum(\"count\") AS \"retransmit_q\" FROM \"$testnet\".\"autogen\".\"retransmit-queue\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
"query": "SELECT sum(\"count\") AS \"retransmit_q\" FROM \"$testnet\".\"autogen\".\"retransmit-queue\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)",
"rawQuery": true,
"refId": "C",
"resultFormat": "time_series",
@ -5203,7 +5165,7 @@
"measurement": "counter-cluster_info-vote-count",
"orderByTime": "ASC",
"policy": "autogen",
"query": "SELECT sum(\"count\") AS \"recv_window\" FROM \"$testnet\".\"autogen\".\"recv-window\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
"query": "SELECT sum(\"count\") AS \"recv_window\" FROM \"$testnet\".\"autogen\".\"recv-window\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)",
"rawQuery": true,
"refId": "D",
"resultFormat": "time_series",
@ -5227,7 +5189,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Channel Pressure (validator)",
"title": "Channel Pressure ($hostid)",
"tooltip": {
"shared": true,
"sort": 0,
@ -5320,7 +5282,7 @@
"measurement": "counter-cluster_info-vote-count",
"orderByTime": "ASC",
"policy": "autogen",
"query": "SELECT last(\"consumed\") AS \"validator\" FROM \"$testnet\".\"autogen\".\"window-stage\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)",
"query": "SELECT last(\"consumed\") AS \"validator\" FROM \"$testnet\".\"autogen\".\"window-stage\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval) FILL(0)",
"rawQuery": true,
"refId": "A",
"resultFormat": "time_series",
@ -5382,7 +5344,201 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Leader broadcast vs Validator consume",
"title": "Leader broadcast vs Validator consume ($hostid)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {
"cluster-info.repair": "#ba43a9",
"window-service.receive": "#b7dbab",
"window-stage.consumed": "#5195ce"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Solana Metrics (read-only)",
"fill": 1,
"gridPos": {
"h": 5,
"w": 12,
"x": 0,
"y": 71
},
"id": 42,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": false,
"linewidth": 1,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 1,
"points": true,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"null"
],
"type": "fill"
}
],
"measurement": "counter-cluster_info-vote-count",
"orderByTime": "ASC",
"policy": "autogen",
"query": "SELECT last(\"last-recv\") AS \"receive\" FROM \"$testnet\".\"autogen\".\"window-service\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)",
"rawQuery": true,
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"count"
],
"type": "field"
},
{
"params": [],
"type": "sum"
}
]
],
"tags": []
},
{
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"null"
],
"type": "fill"
}
],
"orderByTime": "ASC",
"policy": "default",
"query": "SELECT last(\"consumed\") AS \"consumed\" FROM \"$testnet\".\"autogen\".\"window-stage\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)",
"rawQuery": true,
"refId": "B",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
},
{
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"null"
],
"type": "fill"
}
],
"orderByTime": "ASC",
"policy": "default",
"query": "SELECT last(\"repair-ix\") AS \"repair\" FROM \"$testnet\".\"autogen\".\"cluster-info\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)",
"rawQuery": true,
"refId": "C",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"value"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Receive/Consume/Repair ($hostid)",
"tooltip": {
"shared": true,
"sort": 0,
@ -5425,7 +5581,7 @@
"h": 1,
"w": 24,
"x": 0,
"y": 71
"y": 76
},
"id": 40,
"panels": [],
@ -5443,7 +5599,7 @@
"h": 5,
"w": 12,
"x": 0,
"y": 72
"y": 77
},
"id": 41,
"legend": {
@ -5575,6 +5731,25 @@
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"datasource": "Solana Metrics (read-only)",
"hide": 0,
"includeAll": false,
"label": "HostID",
"multi": false,
"name": "hostid",
"options": [],
"query": "SELECT DISTINCT(\"host_id\") FROM \"$testnet\".\"autogen\".\"counter-bank-process_transactions-txs\" ",
"refresh": 2,
"regex": "",
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
@ -5610,5 +5785,5 @@
"timezone": "",
"title": "Testnet Monitor (edge)",
"uid": "testnet-edge",
"version": 112
}
"version": 113
}

View File

@ -12,6 +12,7 @@ env_logger = "0.6.0"
log = "0.4.2"
serde = "1.0.82"
serde_derive = "1.0.82"
solana-metrics = { path = "../../../metrics", version = "0.11.0" }
solana-sdk = { path = "../../../sdk", version = "0.11.0" }
[lib]

View File

@ -5,10 +5,12 @@ extern crate bincode;
extern crate env_logger;
#[macro_use]
extern crate log;
extern crate solana_metrics;
#[macro_use]
extern crate solana_sdk;
use bincode::deserialize;
use solana_metrics::{influxdb, submit};
use solana_sdk::account::KeyedAccount;
use solana_sdk::native_program::ProgramError;
use solana_sdk::pubkey::Pubkey;
@ -62,6 +64,11 @@ fn entrypoint(
Err(ProgramError::InvalidArgument)?;
}
debug!("{:?} by {}", vote, keyed_accounts[0].signer_key().unwrap());
submit(
influxdb::Point::new("vote-native")
.add_field("count", influxdb::Value::Integer(1))
.to_owned(),
);
let mut vote_state = VoteProgram::deserialize(&keyed_accounts[0].account.userdata)?;

View File

@ -30,6 +30,7 @@ use bincode::{deserialize, serialize};
use log::Level;
use rand::{thread_rng, Rng};
use rayon::prelude::*;
use solana_metrics::{influxdb, submit};
use solana_sdk::hash::Hash;
use solana_sdk::pubkey::Pubkey;
use solana_sdk::signature::{Keypair, KeypairUtil, Signable, Signature};
@ -520,6 +521,13 @@ impl ClusterInfo {
let addr = valid[n].gossip; // send the request to the peer's gossip port
let req = Protocol::RequestWindowIndex(self.my_data().clone(), ix);
let out = serialize(&req)?;
submit(
influxdb::Point::new("cluster-info")
.add_field("repair-ix", influxdb::Value::Integer(ix as i64))
.to_owned(),
);
Ok((addr, out))
}
fn new_pull_requests(&mut self) -> Vec<(SocketAddr, Protocol)> {

View File

@ -91,6 +91,12 @@ fn recv_window(
(p.index()?, p.meta.size)
};
submit(
influxdb::Point::new("window-service")
.add_field("last-recv", influxdb::Value::Integer(pix as i64))
.to_owned(),
);
pixs.push(pix);
trace!("{} window pix: {} size: {}", id, pix, meta_size);