liquidator: Collect and report persistent errors (#672)
This commit is contained in:
parent
1f55d549a6
commit
4eca3eb9f0
|
@ -213,7 +213,7 @@ async fn main() -> anyhow::Result<()> {
|
|||
mango_group,
|
||||
get_multiple_accounts_count: cli.get_multiple_accounts_count,
|
||||
parallel_rpc_requests: cli.parallel_rpc_requests,
|
||||
snapshot_interval: std::time::Duration::from_secs(cli.snapshot_interval_secs),
|
||||
snapshot_interval: Duration::from_secs(cli.snapshot_interval_secs),
|
||||
min_slot: first_websocket_slot + 10,
|
||||
},
|
||||
mango_oracles,
|
||||
|
@ -289,16 +289,27 @@ async fn main() -> anyhow::Result<()> {
|
|||
token_swap_info: token_swap_info_updater.clone(),
|
||||
liq_errors: ErrorTracking {
|
||||
skip_threshold: 5,
|
||||
skip_duration: std::time::Duration::from_secs(120),
|
||||
reset_duration: std::time::Duration::from_secs(360),
|
||||
skip_duration: Duration::from_secs(120),
|
||||
..ErrorTracking::default()
|
||||
},
|
||||
tcs_errors: ErrorTracking {
|
||||
tcs_collection_hard_errors: ErrorTracking {
|
||||
skip_threshold: 2,
|
||||
skip_duration: std::time::Duration::from_secs(120),
|
||||
reset_duration: std::time::Duration::from_secs(360),
|
||||
skip_duration: Duration::from_secs(120),
|
||||
..ErrorTracking::default()
|
||||
},
|
||||
tcs_collection_partial_errors: ErrorTracking {
|
||||
skip_threshold: 2,
|
||||
skip_duration: Duration::from_secs(120),
|
||||
..ErrorTracking::default()
|
||||
},
|
||||
tcs_execution_errors: ErrorTracking {
|
||||
skip_threshold: 2,
|
||||
skip_duration: Duration::from_secs(120),
|
||||
..ErrorTracking::default()
|
||||
},
|
||||
persistent_error_report_interval: Duration::from_secs(300),
|
||||
persistent_error_min_duration: Duration::from_secs(300),
|
||||
last_persistent_error_report: Instant::now(),
|
||||
});
|
||||
|
||||
let (liquidation_trigger_sender, liquidation_trigger_receiver) =
|
||||
|
@ -442,6 +453,8 @@ async fn main() -> anyhow::Result<()> {
|
|||
state.health_check_accounts = vec![];
|
||||
}
|
||||
|
||||
liquidation.log_persistent_errors();
|
||||
|
||||
let liquidated = liquidation
|
||||
.maybe_liquidate_one_and_rebalance(account_addresses.iter())
|
||||
.await
|
||||
|
@ -464,6 +477,7 @@ async fn main() -> anyhow::Result<()> {
|
|||
let shared_state = shared_state.clone();
|
||||
async move {
|
||||
loop {
|
||||
interval.tick().await;
|
||||
if !shared_state.read().unwrap().one_snapshot_done {
|
||||
continue;
|
||||
}
|
||||
|
@ -475,6 +489,7 @@ async fn main() -> anyhow::Result<()> {
|
|||
.copied()
|
||||
.collect_vec();
|
||||
for token_index in token_indexes {
|
||||
min_delay.tick().await;
|
||||
match token_swap_info_updater.update_one(token_index).await {
|
||||
Ok(()) => {}
|
||||
Err(err) => {
|
||||
|
@ -484,11 +499,8 @@ async fn main() -> anyhow::Result<()> {
|
|||
);
|
||||
}
|
||||
}
|
||||
min_delay.tick().await;
|
||||
}
|
||||
token_swap_info_updater.log_all();
|
||||
|
||||
interval.tick().await;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
@ -532,17 +544,17 @@ struct SharedState {
|
|||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct AccountErrorState {
|
||||
struct AccountErrorState {
|
||||
pub messages: Vec<String>,
|
||||
pub count: u64,
|
||||
pub last_at: std::time::Instant,
|
||||
pub last_at: Instant,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct ErrorTracking {
|
||||
struct ErrorTracking {
|
||||
accounts: HashMap<Pubkey, AccountErrorState>,
|
||||
skip_threshold: u64,
|
||||
skip_duration: std::time::Duration,
|
||||
reset_duration: std::time::Duration,
|
||||
skip_duration: Duration,
|
||||
}
|
||||
|
||||
impl ErrorTracking {
|
||||
|
@ -560,21 +572,42 @@ impl ErrorTracking {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn record_error(&mut self, pubkey: &Pubkey, now: Instant) {
|
||||
pub fn record_error(&mut self, pubkey: &Pubkey, now: Instant, message: String) {
|
||||
let error_entry = self.accounts.entry(*pubkey).or_insert(AccountErrorState {
|
||||
messages: Vec::with_capacity(1),
|
||||
count: 0,
|
||||
last_at: now,
|
||||
});
|
||||
if now.duration_since(error_entry.last_at) > self.reset_duration {
|
||||
error_entry.count = 0;
|
||||
}
|
||||
error_entry.count += 1;
|
||||
error_entry.last_at = now;
|
||||
if !error_entry.messages.contains(&message) {
|
||||
error_entry.messages.push(message);
|
||||
}
|
||||
if error_entry.messages.len() > 5 {
|
||||
error_entry.messages.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn clear_errors(&mut self, pubkey: &Pubkey) {
|
||||
self.accounts.remove(pubkey);
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(%error_type))]
|
||||
#[allow(unused_variables)]
|
||||
pub fn log_persistent_errors(&self, error_type: &str, min_duration: Duration) {
|
||||
let now = Instant::now();
|
||||
for (pubkey, errors) in self.accounts.iter() {
|
||||
if now.duration_since(errors.last_at) < min_duration {
|
||||
continue;
|
||||
}
|
||||
info!(
|
||||
%pubkey,
|
||||
count = errors.count,
|
||||
messages = ?errors.messages,
|
||||
"has persistent errors",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct LiquidationState {
|
||||
|
@ -584,8 +617,16 @@ struct LiquidationState {
|
|||
token_swap_info: Arc<token_swap_info::TokenSwapInfoUpdater>,
|
||||
liquidation_config: liquidate::Config,
|
||||
trigger_tcs_config: trigger_tcs::Config,
|
||||
|
||||
liq_errors: ErrorTracking,
|
||||
tcs_errors: ErrorTracking,
|
||||
/// Errors that suggest we maybe should skip trying to collect tcs for that pubkey
|
||||
tcs_collection_hard_errors: ErrorTracking,
|
||||
/// Recording errors when some tcs have errors during collection but others don't
|
||||
tcs_collection_partial_errors: ErrorTracking,
|
||||
tcs_execution_errors: ErrorTracking,
|
||||
persistent_error_report_interval: Duration,
|
||||
last_persistent_error_report: Instant,
|
||||
persistent_error_min_duration: Duration,
|
||||
}
|
||||
|
||||
impl LiquidationState {
|
||||
|
@ -623,14 +664,15 @@ impl LiquidationState {
|
|||
}
|
||||
|
||||
async fn maybe_liquidate_and_log_error(&mut self, pubkey: &Pubkey) -> anyhow::Result<bool> {
|
||||
let now = std::time::Instant::now();
|
||||
let now = Instant::now();
|
||||
let error_tracking = &mut self.liq_errors;
|
||||
|
||||
// Skip a pubkey if there've been too many errors recently
|
||||
if let Some(error_entry) = error_tracking.had_too_many_errors(pubkey, now) {
|
||||
trace!(
|
||||
"skip checking account {pubkey}, had {} errors recently",
|
||||
error_entry.count
|
||||
%pubkey,
|
||||
error_entry.count,
|
||||
"skip checking account for liquidation, had errors recently",
|
||||
);
|
||||
return Ok(false);
|
||||
}
|
||||
|
@ -645,7 +687,7 @@ impl LiquidationState {
|
|||
|
||||
if let Err(err) = result.as_ref() {
|
||||
// Keep track of pubkeys that had errors
|
||||
error_tracking.record_error(pubkey, now);
|
||||
error_tracking.record_error(pubkey, now, err.to_string());
|
||||
|
||||
// Not all errors need to be raised to the user's attention.
|
||||
let mut is_error = true;
|
||||
|
@ -680,7 +722,7 @@ impl LiquidationState {
|
|||
) -> anyhow::Result<()> {
|
||||
let accounts = accounts_iter.collect::<Vec<&Pubkey>>();
|
||||
|
||||
let now = std::time::Instant::now();
|
||||
let now = Instant::now();
|
||||
let now_ts: u64 = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)?
|
||||
.as_secs()
|
||||
|
@ -689,16 +731,49 @@ impl LiquidationState {
|
|||
// Find interesting (pubkey, tcsid, volume)
|
||||
let mut interesting_tcs = Vec::with_capacity(accounts.len());
|
||||
for pubkey in accounts.iter() {
|
||||
if let Ok(mut v) = trigger_tcs::find_interesting_tcs_for_account(
|
||||
if let Some(error_entry) = self
|
||||
.tcs_collection_hard_errors
|
||||
.had_too_many_errors(pubkey, now)
|
||||
{
|
||||
trace!(
|
||||
%pubkey,
|
||||
error_entry.count,
|
||||
"skip checking account for tcs, had errors recently",
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
match trigger_tcs::find_interesting_tcs_for_account(
|
||||
pubkey,
|
||||
&self.mango_client,
|
||||
&self.account_fetcher,
|
||||
&self.token_swap_info,
|
||||
&self.tcs_errors,
|
||||
now,
|
||||
now_ts,
|
||||
) {
|
||||
interesting_tcs.append(&mut v);
|
||||
Ok(v) => {
|
||||
self.tcs_collection_hard_errors.clear_errors(pubkey);
|
||||
if v.is_empty() {
|
||||
self.tcs_collection_partial_errors.clear_errors(pubkey);
|
||||
self.tcs_execution_errors.clear_errors(pubkey);
|
||||
} else if v.iter().all(|it| it.is_ok()) {
|
||||
self.tcs_collection_partial_errors.clear_errors(pubkey);
|
||||
} else {
|
||||
for it in v.iter() {
|
||||
if let Err(e) = it {
|
||||
self.tcs_collection_partial_errors.record_error(
|
||||
pubkey,
|
||||
now,
|
||||
e.to_string(),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
interesting_tcs.extend(v.iter().filter_map(|it| it.as_ref().ok()));
|
||||
}
|
||||
Err(e) => {
|
||||
self.tcs_collection_hard_errors
|
||||
.record_error(pubkey, now, e.to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
if interesting_tcs.is_empty() {
|
||||
|
@ -750,8 +825,8 @@ impl LiquidationState {
|
|||
pubkey: &Pubkey,
|
||||
tcs_id: u64,
|
||||
) -> anyhow::Result<bool> {
|
||||
let now = std::time::Instant::now();
|
||||
let error_tracking = &mut self.tcs_errors;
|
||||
let now = Instant::now();
|
||||
let error_tracking = &mut self.tcs_execution_errors;
|
||||
|
||||
// Skip a pubkey if there've been too many errors recently
|
||||
if let Some(error_entry) = error_tracking.had_too_many_errors(pubkey, now) {
|
||||
|
@ -774,7 +849,7 @@ impl LiquidationState {
|
|||
|
||||
if let Err(err) = result.as_ref() {
|
||||
// Keep track of pubkeys that had errors
|
||||
error_tracking.record_error(pubkey, now);
|
||||
error_tracking.record_error(pubkey, now, err.to_string());
|
||||
|
||||
// Not all errors need to be raised to the user's attention.
|
||||
let mut is_error = true;
|
||||
|
@ -803,10 +878,30 @@ impl LiquidationState {
|
|||
|
||||
result
|
||||
}
|
||||
|
||||
fn log_persistent_errors(&mut self) {
|
||||
let now = Instant::now();
|
||||
if now.duration_since(self.last_persistent_error_report)
|
||||
< self.persistent_error_report_interval
|
||||
{
|
||||
return;
|
||||
}
|
||||
self.last_persistent_error_report = now;
|
||||
|
||||
let min_duration = self.persistent_error_min_duration;
|
||||
self.liq_errors
|
||||
.log_persistent_errors("liquidation", min_duration);
|
||||
self.tcs_execution_errors
|
||||
.log_persistent_errors("tcs execution", min_duration);
|
||||
self.tcs_collection_hard_errors
|
||||
.log_persistent_errors("tcs collection hard", min_duration);
|
||||
self.tcs_collection_partial_errors
|
||||
.log_persistent_errors("tcs collection partial", min_duration);
|
||||
}
|
||||
}
|
||||
|
||||
fn start_chain_data_metrics(chain: Arc<RwLock<chain_data::ChainData>>, metrics: &metrics::Metrics) {
|
||||
let mut interval = tokio::time::interval(std::time::Duration::from_secs(600));
|
||||
let mut interval = tokio::time::interval(Duration::from_secs(600));
|
||||
|
||||
let mut metric_slots_count = metrics.register_u64("chain_data_slots_count".into());
|
||||
let mut metric_accounts_count = metrics.register_u64("chain_data_accounts_count".into());
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
use std::time::{Duration, Instant};
|
||||
use std::time::Duration;
|
||||
|
||||
use itertools::Itertools;
|
||||
use mango_v4::{
|
||||
|
@ -10,7 +10,7 @@ use mango_v4_client::{chain_data, health_cache, JupiterSwapMode, MangoClient, Ma
|
|||
use tracing::*;
|
||||
use {anyhow::Context, fixed::types::I80F48, solana_sdk::pubkey::Pubkey};
|
||||
|
||||
use crate::{token_swap_info, util, ErrorTracking};
|
||||
use crate::{token_swap_info, util};
|
||||
|
||||
// The liqee health ratio to aim for when executing tcs orders that are bigger
|
||||
// than the liqee can support.
|
||||
|
@ -267,7 +267,7 @@ async fn execute_token_conditional_swap(
|
|||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[instrument(skip_all, fields(%pubkey, tcs_id))]
|
||||
#[instrument(skip_all, fields(%pubkey, %tcs_id))]
|
||||
pub async fn remove_expired_token_conditional_swap(
|
||||
mango_client: &MangoClient,
|
||||
pubkey: &Pubkey,
|
||||
|
@ -380,13 +380,8 @@ pub fn find_interesting_tcs_for_account(
|
|||
mango_client: &MangoClient,
|
||||
account_fetcher: &chain_data::AccountFetcher,
|
||||
token_swap_info: &token_swap_info::TokenSwapInfoUpdater,
|
||||
error_tracking: &ErrorTracking,
|
||||
now: Instant,
|
||||
now_ts: u64,
|
||||
) -> anyhow::Result<Vec<(Pubkey, u64, u64)>> {
|
||||
if error_tracking.had_too_many_errors(pubkey, now).is_some() {
|
||||
anyhow::bail!("too many errors");
|
||||
}
|
||||
) -> anyhow::Result<Vec<anyhow::Result<(Pubkey, u64, u64)>>> {
|
||||
let liqee = account_fetcher.fetch_mango_account(pubkey)?;
|
||||
|
||||
let interesting_tcs = liqee.active_token_conditional_swaps().filter_map(|tcs| {
|
||||
|
@ -397,12 +392,12 @@ pub fn find_interesting_tcs_for_account(
|
|||
token_swap_info,
|
||||
now_ts,
|
||||
) {
|
||||
Ok(false) | Err(_) => None,
|
||||
Ok(true) => {
|
||||
let volume =
|
||||
tcs_max_volume(&liqee, mango_client, account_fetcher, tcs).unwrap_or(1);
|
||||
Some((*pubkey, tcs.id, volume))
|
||||
let volume_result = tcs_max_volume(&liqee, mango_client, account_fetcher, tcs);
|
||||
Some(volume_result.map(|v| (*pubkey, tcs.id, v)))
|
||||
}
|
||||
Ok(false) => None,
|
||||
Err(e) => Some(Err(e)),
|
||||
}
|
||||
});
|
||||
Ok(interesting_tcs.collect_vec())
|
||||
|
|
Loading…
Reference in New Issue