liquidator: Collect and report persistent errors (#672)
This commit is contained in:
parent
1f55d549a6
commit
4eca3eb9f0
|
@ -213,7 +213,7 @@ async fn main() -> anyhow::Result<()> {
|
||||||
mango_group,
|
mango_group,
|
||||||
get_multiple_accounts_count: cli.get_multiple_accounts_count,
|
get_multiple_accounts_count: cli.get_multiple_accounts_count,
|
||||||
parallel_rpc_requests: cli.parallel_rpc_requests,
|
parallel_rpc_requests: cli.parallel_rpc_requests,
|
||||||
snapshot_interval: std::time::Duration::from_secs(cli.snapshot_interval_secs),
|
snapshot_interval: Duration::from_secs(cli.snapshot_interval_secs),
|
||||||
min_slot: first_websocket_slot + 10,
|
min_slot: first_websocket_slot + 10,
|
||||||
},
|
},
|
||||||
mango_oracles,
|
mango_oracles,
|
||||||
|
@ -289,16 +289,27 @@ async fn main() -> anyhow::Result<()> {
|
||||||
token_swap_info: token_swap_info_updater.clone(),
|
token_swap_info: token_swap_info_updater.clone(),
|
||||||
liq_errors: ErrorTracking {
|
liq_errors: ErrorTracking {
|
||||||
skip_threshold: 5,
|
skip_threshold: 5,
|
||||||
skip_duration: std::time::Duration::from_secs(120),
|
skip_duration: Duration::from_secs(120),
|
||||||
reset_duration: std::time::Duration::from_secs(360),
|
|
||||||
..ErrorTracking::default()
|
..ErrorTracking::default()
|
||||||
},
|
},
|
||||||
tcs_errors: ErrorTracking {
|
tcs_collection_hard_errors: ErrorTracking {
|
||||||
skip_threshold: 2,
|
skip_threshold: 2,
|
||||||
skip_duration: std::time::Duration::from_secs(120),
|
skip_duration: Duration::from_secs(120),
|
||||||
reset_duration: std::time::Duration::from_secs(360),
|
|
||||||
..ErrorTracking::default()
|
..ErrorTracking::default()
|
||||||
},
|
},
|
||||||
|
tcs_collection_partial_errors: ErrorTracking {
|
||||||
|
skip_threshold: 2,
|
||||||
|
skip_duration: Duration::from_secs(120),
|
||||||
|
..ErrorTracking::default()
|
||||||
|
},
|
||||||
|
tcs_execution_errors: ErrorTracking {
|
||||||
|
skip_threshold: 2,
|
||||||
|
skip_duration: Duration::from_secs(120),
|
||||||
|
..ErrorTracking::default()
|
||||||
|
},
|
||||||
|
persistent_error_report_interval: Duration::from_secs(300),
|
||||||
|
persistent_error_min_duration: Duration::from_secs(300),
|
||||||
|
last_persistent_error_report: Instant::now(),
|
||||||
});
|
});
|
||||||
|
|
||||||
let (liquidation_trigger_sender, liquidation_trigger_receiver) =
|
let (liquidation_trigger_sender, liquidation_trigger_receiver) =
|
||||||
|
@ -442,6 +453,8 @@ async fn main() -> anyhow::Result<()> {
|
||||||
state.health_check_accounts = vec![];
|
state.health_check_accounts = vec![];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
liquidation.log_persistent_errors();
|
||||||
|
|
||||||
let liquidated = liquidation
|
let liquidated = liquidation
|
||||||
.maybe_liquidate_one_and_rebalance(account_addresses.iter())
|
.maybe_liquidate_one_and_rebalance(account_addresses.iter())
|
||||||
.await
|
.await
|
||||||
|
@ -464,6 +477,7 @@ async fn main() -> anyhow::Result<()> {
|
||||||
let shared_state = shared_state.clone();
|
let shared_state = shared_state.clone();
|
||||||
async move {
|
async move {
|
||||||
loop {
|
loop {
|
||||||
|
interval.tick().await;
|
||||||
if !shared_state.read().unwrap().one_snapshot_done {
|
if !shared_state.read().unwrap().one_snapshot_done {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -475,6 +489,7 @@ async fn main() -> anyhow::Result<()> {
|
||||||
.copied()
|
.copied()
|
||||||
.collect_vec();
|
.collect_vec();
|
||||||
for token_index in token_indexes {
|
for token_index in token_indexes {
|
||||||
|
min_delay.tick().await;
|
||||||
match token_swap_info_updater.update_one(token_index).await {
|
match token_swap_info_updater.update_one(token_index).await {
|
||||||
Ok(()) => {}
|
Ok(()) => {}
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
|
@ -484,11 +499,8 @@ async fn main() -> anyhow::Result<()> {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
min_delay.tick().await;
|
|
||||||
}
|
}
|
||||||
token_swap_info_updater.log_all();
|
token_swap_info_updater.log_all();
|
||||||
|
|
||||||
interval.tick().await;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -532,17 +544,17 @@ struct SharedState {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct AccountErrorState {
|
struct AccountErrorState {
|
||||||
|
pub messages: Vec<String>,
|
||||||
pub count: u64,
|
pub count: u64,
|
||||||
pub last_at: std::time::Instant,
|
pub last_at: Instant,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct ErrorTracking {
|
struct ErrorTracking {
|
||||||
accounts: HashMap<Pubkey, AccountErrorState>,
|
accounts: HashMap<Pubkey, AccountErrorState>,
|
||||||
skip_threshold: u64,
|
skip_threshold: u64,
|
||||||
skip_duration: std::time::Duration,
|
skip_duration: Duration,
|
||||||
reset_duration: std::time::Duration,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ErrorTracking {
|
impl ErrorTracking {
|
||||||
|
@ -560,21 +572,42 @@ impl ErrorTracking {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn record_error(&mut self, pubkey: &Pubkey, now: Instant) {
|
pub fn record_error(&mut self, pubkey: &Pubkey, now: Instant, message: String) {
|
||||||
let error_entry = self.accounts.entry(*pubkey).or_insert(AccountErrorState {
|
let error_entry = self.accounts.entry(*pubkey).or_insert(AccountErrorState {
|
||||||
|
messages: Vec::with_capacity(1),
|
||||||
count: 0,
|
count: 0,
|
||||||
last_at: now,
|
last_at: now,
|
||||||
});
|
});
|
||||||
if now.duration_since(error_entry.last_at) > self.reset_duration {
|
|
||||||
error_entry.count = 0;
|
|
||||||
}
|
|
||||||
error_entry.count += 1;
|
error_entry.count += 1;
|
||||||
error_entry.last_at = now;
|
error_entry.last_at = now;
|
||||||
|
if !error_entry.messages.contains(&message) {
|
||||||
|
error_entry.messages.push(message);
|
||||||
|
}
|
||||||
|
if error_entry.messages.len() > 5 {
|
||||||
|
error_entry.messages.remove(0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn clear_errors(&mut self, pubkey: &Pubkey) {
|
pub fn clear_errors(&mut self, pubkey: &Pubkey) {
|
||||||
self.accounts.remove(pubkey);
|
self.accounts.remove(pubkey);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[instrument(skip_all, fields(%error_type))]
|
||||||
|
#[allow(unused_variables)]
|
||||||
|
pub fn log_persistent_errors(&self, error_type: &str, min_duration: Duration) {
|
||||||
|
let now = Instant::now();
|
||||||
|
for (pubkey, errors) in self.accounts.iter() {
|
||||||
|
if now.duration_since(errors.last_at) < min_duration {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
info!(
|
||||||
|
%pubkey,
|
||||||
|
count = errors.count,
|
||||||
|
messages = ?errors.messages,
|
||||||
|
"has persistent errors",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct LiquidationState {
|
struct LiquidationState {
|
||||||
|
@ -584,8 +617,16 @@ struct LiquidationState {
|
||||||
token_swap_info: Arc<token_swap_info::TokenSwapInfoUpdater>,
|
token_swap_info: Arc<token_swap_info::TokenSwapInfoUpdater>,
|
||||||
liquidation_config: liquidate::Config,
|
liquidation_config: liquidate::Config,
|
||||||
trigger_tcs_config: trigger_tcs::Config,
|
trigger_tcs_config: trigger_tcs::Config,
|
||||||
|
|
||||||
liq_errors: ErrorTracking,
|
liq_errors: ErrorTracking,
|
||||||
tcs_errors: ErrorTracking,
|
/// Errors that suggest we maybe should skip trying to collect tcs for that pubkey
|
||||||
|
tcs_collection_hard_errors: ErrorTracking,
|
||||||
|
/// Recording errors when some tcs have errors during collection but others don't
|
||||||
|
tcs_collection_partial_errors: ErrorTracking,
|
||||||
|
tcs_execution_errors: ErrorTracking,
|
||||||
|
persistent_error_report_interval: Duration,
|
||||||
|
last_persistent_error_report: Instant,
|
||||||
|
persistent_error_min_duration: Duration,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl LiquidationState {
|
impl LiquidationState {
|
||||||
|
@ -623,14 +664,15 @@ impl LiquidationState {
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn maybe_liquidate_and_log_error(&mut self, pubkey: &Pubkey) -> anyhow::Result<bool> {
|
async fn maybe_liquidate_and_log_error(&mut self, pubkey: &Pubkey) -> anyhow::Result<bool> {
|
||||||
let now = std::time::Instant::now();
|
let now = Instant::now();
|
||||||
let error_tracking = &mut self.liq_errors;
|
let error_tracking = &mut self.liq_errors;
|
||||||
|
|
||||||
// Skip a pubkey if there've been too many errors recently
|
// Skip a pubkey if there've been too many errors recently
|
||||||
if let Some(error_entry) = error_tracking.had_too_many_errors(pubkey, now) {
|
if let Some(error_entry) = error_tracking.had_too_many_errors(pubkey, now) {
|
||||||
trace!(
|
trace!(
|
||||||
"skip checking account {pubkey}, had {} errors recently",
|
%pubkey,
|
||||||
error_entry.count
|
error_entry.count,
|
||||||
|
"skip checking account for liquidation, had errors recently",
|
||||||
);
|
);
|
||||||
return Ok(false);
|
return Ok(false);
|
||||||
}
|
}
|
||||||
|
@ -645,7 +687,7 @@ impl LiquidationState {
|
||||||
|
|
||||||
if let Err(err) = result.as_ref() {
|
if let Err(err) = result.as_ref() {
|
||||||
// Keep track of pubkeys that had errors
|
// Keep track of pubkeys that had errors
|
||||||
error_tracking.record_error(pubkey, now);
|
error_tracking.record_error(pubkey, now, err.to_string());
|
||||||
|
|
||||||
// Not all errors need to be raised to the user's attention.
|
// Not all errors need to be raised to the user's attention.
|
||||||
let mut is_error = true;
|
let mut is_error = true;
|
||||||
|
@ -680,7 +722,7 @@ impl LiquidationState {
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let accounts = accounts_iter.collect::<Vec<&Pubkey>>();
|
let accounts = accounts_iter.collect::<Vec<&Pubkey>>();
|
||||||
|
|
||||||
let now = std::time::Instant::now();
|
let now = Instant::now();
|
||||||
let now_ts: u64 = std::time::SystemTime::now()
|
let now_ts: u64 = std::time::SystemTime::now()
|
||||||
.duration_since(std::time::UNIX_EPOCH)?
|
.duration_since(std::time::UNIX_EPOCH)?
|
||||||
.as_secs()
|
.as_secs()
|
||||||
|
@ -689,16 +731,49 @@ impl LiquidationState {
|
||||||
// Find interesting (pubkey, tcsid, volume)
|
// Find interesting (pubkey, tcsid, volume)
|
||||||
let mut interesting_tcs = Vec::with_capacity(accounts.len());
|
let mut interesting_tcs = Vec::with_capacity(accounts.len());
|
||||||
for pubkey in accounts.iter() {
|
for pubkey in accounts.iter() {
|
||||||
if let Ok(mut v) = trigger_tcs::find_interesting_tcs_for_account(
|
if let Some(error_entry) = self
|
||||||
|
.tcs_collection_hard_errors
|
||||||
|
.had_too_many_errors(pubkey, now)
|
||||||
|
{
|
||||||
|
trace!(
|
||||||
|
%pubkey,
|
||||||
|
error_entry.count,
|
||||||
|
"skip checking account for tcs, had errors recently",
|
||||||
|
);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
match trigger_tcs::find_interesting_tcs_for_account(
|
||||||
pubkey,
|
pubkey,
|
||||||
&self.mango_client,
|
&self.mango_client,
|
||||||
&self.account_fetcher,
|
&self.account_fetcher,
|
||||||
&self.token_swap_info,
|
&self.token_swap_info,
|
||||||
&self.tcs_errors,
|
|
||||||
now,
|
|
||||||
now_ts,
|
now_ts,
|
||||||
) {
|
) {
|
||||||
interesting_tcs.append(&mut v);
|
Ok(v) => {
|
||||||
|
self.tcs_collection_hard_errors.clear_errors(pubkey);
|
||||||
|
if v.is_empty() {
|
||||||
|
self.tcs_collection_partial_errors.clear_errors(pubkey);
|
||||||
|
self.tcs_execution_errors.clear_errors(pubkey);
|
||||||
|
} else if v.iter().all(|it| it.is_ok()) {
|
||||||
|
self.tcs_collection_partial_errors.clear_errors(pubkey);
|
||||||
|
} else {
|
||||||
|
for it in v.iter() {
|
||||||
|
if let Err(e) = it {
|
||||||
|
self.tcs_collection_partial_errors.record_error(
|
||||||
|
pubkey,
|
||||||
|
now,
|
||||||
|
e.to_string(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
interesting_tcs.extend(v.iter().filter_map(|it| it.as_ref().ok()));
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
self.tcs_collection_hard_errors
|
||||||
|
.record_error(pubkey, now, e.to_string());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if interesting_tcs.is_empty() {
|
if interesting_tcs.is_empty() {
|
||||||
|
@ -750,8 +825,8 @@ impl LiquidationState {
|
||||||
pubkey: &Pubkey,
|
pubkey: &Pubkey,
|
||||||
tcs_id: u64,
|
tcs_id: u64,
|
||||||
) -> anyhow::Result<bool> {
|
) -> anyhow::Result<bool> {
|
||||||
let now = std::time::Instant::now();
|
let now = Instant::now();
|
||||||
let error_tracking = &mut self.tcs_errors;
|
let error_tracking = &mut self.tcs_execution_errors;
|
||||||
|
|
||||||
// Skip a pubkey if there've been too many errors recently
|
// Skip a pubkey if there've been too many errors recently
|
||||||
if let Some(error_entry) = error_tracking.had_too_many_errors(pubkey, now) {
|
if let Some(error_entry) = error_tracking.had_too_many_errors(pubkey, now) {
|
||||||
|
@ -774,7 +849,7 @@ impl LiquidationState {
|
||||||
|
|
||||||
if let Err(err) = result.as_ref() {
|
if let Err(err) = result.as_ref() {
|
||||||
// Keep track of pubkeys that had errors
|
// Keep track of pubkeys that had errors
|
||||||
error_tracking.record_error(pubkey, now);
|
error_tracking.record_error(pubkey, now, err.to_string());
|
||||||
|
|
||||||
// Not all errors need to be raised to the user's attention.
|
// Not all errors need to be raised to the user's attention.
|
||||||
let mut is_error = true;
|
let mut is_error = true;
|
||||||
|
@ -803,10 +878,30 @@ impl LiquidationState {
|
||||||
|
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn log_persistent_errors(&mut self) {
|
||||||
|
let now = Instant::now();
|
||||||
|
if now.duration_since(self.last_persistent_error_report)
|
||||||
|
< self.persistent_error_report_interval
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
self.last_persistent_error_report = now;
|
||||||
|
|
||||||
|
let min_duration = self.persistent_error_min_duration;
|
||||||
|
self.liq_errors
|
||||||
|
.log_persistent_errors("liquidation", min_duration);
|
||||||
|
self.tcs_execution_errors
|
||||||
|
.log_persistent_errors("tcs execution", min_duration);
|
||||||
|
self.tcs_collection_hard_errors
|
||||||
|
.log_persistent_errors("tcs collection hard", min_duration);
|
||||||
|
self.tcs_collection_partial_errors
|
||||||
|
.log_persistent_errors("tcs collection partial", min_duration);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn start_chain_data_metrics(chain: Arc<RwLock<chain_data::ChainData>>, metrics: &metrics::Metrics) {
|
fn start_chain_data_metrics(chain: Arc<RwLock<chain_data::ChainData>>, metrics: &metrics::Metrics) {
|
||||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(600));
|
let mut interval = tokio::time::interval(Duration::from_secs(600));
|
||||||
|
|
||||||
let mut metric_slots_count = metrics.register_u64("chain_data_slots_count".into());
|
let mut metric_slots_count = metrics.register_u64("chain_data_slots_count".into());
|
||||||
let mut metric_accounts_count = metrics.register_u64("chain_data_accounts_count".into());
|
let mut metric_accounts_count = metrics.register_u64("chain_data_accounts_count".into());
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
use std::time::{Duration, Instant};
|
use std::time::Duration;
|
||||||
|
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use mango_v4::{
|
use mango_v4::{
|
||||||
|
@ -10,7 +10,7 @@ use mango_v4_client::{chain_data, health_cache, JupiterSwapMode, MangoClient, Ma
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use {anyhow::Context, fixed::types::I80F48, solana_sdk::pubkey::Pubkey};
|
use {anyhow::Context, fixed::types::I80F48, solana_sdk::pubkey::Pubkey};
|
||||||
|
|
||||||
use crate::{token_swap_info, util, ErrorTracking};
|
use crate::{token_swap_info, util};
|
||||||
|
|
||||||
// The liqee health ratio to aim for when executing tcs orders that are bigger
|
// The liqee health ratio to aim for when executing tcs orders that are bigger
|
||||||
// than the liqee can support.
|
// than the liqee can support.
|
||||||
|
@ -267,7 +267,7 @@ async fn execute_token_conditional_swap(
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
#[instrument(skip_all, fields(%pubkey, tcs_id))]
|
#[instrument(skip_all, fields(%pubkey, %tcs_id))]
|
||||||
pub async fn remove_expired_token_conditional_swap(
|
pub async fn remove_expired_token_conditional_swap(
|
||||||
mango_client: &MangoClient,
|
mango_client: &MangoClient,
|
||||||
pubkey: &Pubkey,
|
pubkey: &Pubkey,
|
||||||
|
@ -380,13 +380,8 @@ pub fn find_interesting_tcs_for_account(
|
||||||
mango_client: &MangoClient,
|
mango_client: &MangoClient,
|
||||||
account_fetcher: &chain_data::AccountFetcher,
|
account_fetcher: &chain_data::AccountFetcher,
|
||||||
token_swap_info: &token_swap_info::TokenSwapInfoUpdater,
|
token_swap_info: &token_swap_info::TokenSwapInfoUpdater,
|
||||||
error_tracking: &ErrorTracking,
|
|
||||||
now: Instant,
|
|
||||||
now_ts: u64,
|
now_ts: u64,
|
||||||
) -> anyhow::Result<Vec<(Pubkey, u64, u64)>> {
|
) -> anyhow::Result<Vec<anyhow::Result<(Pubkey, u64, u64)>>> {
|
||||||
if error_tracking.had_too_many_errors(pubkey, now).is_some() {
|
|
||||||
anyhow::bail!("too many errors");
|
|
||||||
}
|
|
||||||
let liqee = account_fetcher.fetch_mango_account(pubkey)?;
|
let liqee = account_fetcher.fetch_mango_account(pubkey)?;
|
||||||
|
|
||||||
let interesting_tcs = liqee.active_token_conditional_swaps().filter_map(|tcs| {
|
let interesting_tcs = liqee.active_token_conditional_swaps().filter_map(|tcs| {
|
||||||
|
@ -397,12 +392,12 @@ pub fn find_interesting_tcs_for_account(
|
||||||
token_swap_info,
|
token_swap_info,
|
||||||
now_ts,
|
now_ts,
|
||||||
) {
|
) {
|
||||||
Ok(false) | Err(_) => None,
|
|
||||||
Ok(true) => {
|
Ok(true) => {
|
||||||
let volume =
|
let volume_result = tcs_max_volume(&liqee, mango_client, account_fetcher, tcs);
|
||||||
tcs_max_volume(&liqee, mango_client, account_fetcher, tcs).unwrap_or(1);
|
Some(volume_result.map(|v| (*pubkey, tcs.id, v)))
|
||||||
Some((*pubkey, tcs.id, volume))
|
|
||||||
}
|
}
|
||||||
|
Ok(false) => None,
|
||||||
|
Err(e) => Some(Err(e)),
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
Ok(interesting_tcs.collect_vec())
|
Ok(interesting_tcs.collect_vec())
|
||||||
|
|
Loading…
Reference in New Issue