Serge/liquidator split tcs and liquidation (#914)

liquidator: split TCS triggering and liquidation job

Concurrent execution of candidate lookup and tx building/sending
- Also added a health assertion IX to protect the liqor in multi-liquidation scenarios
- And a timeout for Jupiter v6 queries (to avoid blocking liquidation because of slow TCS)
Serge Farny 2024-03-20 15:25:52 +01:00 committed by GitHub
parent 769f940a66
commit f54bb6f0b0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 1046 additions and 393 deletions
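The refactor boils down to a scanner/worker split: scanning jobs record candidates in shared state and send a wake-up signal over a channel, while a small pool of workers claims candidates and builds/sends the transactions. Below is a minimal, self-contained sketch of that pattern using plain `u64` candidates and hypothetical names, not the crate's actual types.

```rust
// Minimal sketch of the scanner/worker split (hypothetical names, not the
// crate's actual types): a scanner inserts candidates into shared state and
// sends a wake-up signal; each worker claims one candidate per signal.
use std::collections::HashSet;
use std::sync::{Arc, RwLock};
use std::time::Duration;

#[tokio::main]
async fn main() {
    let candidates = Arc::new(RwLock::new(HashSet::<u64>::new()));
    let (trigger_tx, trigger_rx) = async_channel::unbounded::<()>();

    // Worker pool (the real code sizes this with --max-parallel-operations).
    for worker_id in 0..2u64 {
        let candidates = candidates.clone();
        let trigger_rx = trigger_rx.clone();
        tokio::spawn(async move {
            while trigger_rx.recv().await.is_ok() {
                // Claim one candidate under the lock, then work on it outside.
                let claimed = {
                    let mut set = candidates.write().unwrap();
                    let first = set.iter().next().copied();
                    if let Some(c) = first {
                        set.remove(&c);
                    }
                    first
                };
                if let Some(c) = claimed {
                    println!("worker {worker_id} processing candidate {c}");
                }
            }
        });
    }

    // Scanner side: record a candidate and wake one worker.
    if candidates.write().unwrap().insert(42) {
        trigger_tx.send(()).await.unwrap();
    }
    tokio::time::sleep(Duration::from_millis(50)).await;
}
```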

Cargo.lock generated
View File

@@ -3540,6 +3540,7 @@ dependencies = [
 "futures-core",
 "futures-util",
 "hdrhistogram",
+"indexmap 2.0.0",
 "itertools",
 "jemallocator",
 "jsonrpc-core 18.0.0 (registry+https://github.com/rust-lang/crates.io-index)",

View File

@@ -49,4 +49,5 @@ tokio-stream = { version = "0.1.9"}
 tokio-tungstenite = "0.16.1"
 tracing = "0.1"
 regex = "1.9.5"
 hdrhistogram = "7.5.4"
+indexmap = "2.0.0"

View File

@@ -136,6 +136,12 @@ pub struct Cli {
     #[clap(long, env, value_enum, default_value = "true")]
     pub(crate) take_tcs: BoolArg,

+    #[clap(long, env, default_value = "30")]
+    pub(crate) tcs_refresh_timeout_secs: u64,
+
+    #[clap(long, env, default_value = "1000")]
+    pub(crate) tcs_check_interval_ms: u64,
+
     /// profit margin at which to take tcs orders
     #[clap(long, env, default_value = "0.0005")]
     pub(crate) tcs_profit_fraction: f64,
@@ -178,6 +184,10 @@ pub struct Cli {
     #[clap(long, env, default_value = "https://quote-api.jup.ag/v6")]
     pub(crate) jupiter_v6_url: String,

+    /// override the jupiter http request timeout
+    #[clap(long, env, default_value = "30")]
+    pub(crate) jupiter_timeout_secs: u64,
+
     /// provide a jupiter token, currently only for jup v6
     #[clap(long, env, default_value = "")]
     pub(crate) jupiter_token: String,
@@ -191,6 +201,12 @@ pub struct Cli {
     #[clap(long, env, value_enum, default_value = "true")]
     pub(crate) telemetry: BoolArg,

+    /// if liquidation is enabled
+    ///
+    /// might be used to run an instance of liquidator dedicated to TCS and another one for liquidation
+    #[clap(long, env, value_enum, default_value = "true")]
+    pub(crate) liquidation_enabled: BoolArg,
+
     /// liquidation refresh timeout in secs
     #[clap(long, env, default_value = "30")]
     pub(crate) liquidation_refresh_timeout_secs: u8,
@@ -216,4 +232,8 @@ pub struct Cli {
     /// how long should it wait before logging an oracle error again (for the same token)
     #[clap(long, env, default_value = "30")]
     pub(crate) skip_oracle_error_in_logs_duration_secs: u64,
+
+    /// max number of liquidation/tcs to do concurrently
+    #[clap(long, env, default_value = "5")]
+    pub(crate) max_parallel_operations: u64,
 }
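For reference, a stripped-down sketch of how flags like these behave with clap's derive API (a simplified field set; assumes clap with the `derive` and `env` features and omits the crate's `BoolArg` wrapper):

```rust
// Simplified sketch (not the real Cli struct): each flag can be set on the
// command line or via an environment variable, with the shown default.
use clap::Parser;

#[derive(Parser, Debug)]
struct Cli {
    /// how often the TCS scanner runs, in milliseconds
    #[clap(long, env, default_value = "1000")]
    tcs_check_interval_ms: u64,

    /// override the jupiter http request timeout, in seconds
    #[clap(long, env, default_value = "30")]
    jupiter_timeout_secs: u64,

    /// max number of liquidation/tcs operations to run concurrently
    #[clap(long, env, default_value = "5")]
    max_parallel_operations: u64,
}

fn main() {
    // e.g. `MAX_PARALLEL_OPERATIONS=8 liquidator --tcs-check-interval-ms 500`
    let cli = Cli::parse();
    println!("{cli:?}");
}
```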

View File

@@ -9,6 +9,7 @@ use mango_v4_client::{chain_data, MangoClient, PreparedInstructions};
 use solana_sdk::signature::Signature;

 use futures::{stream, StreamExt, TryStreamExt};
+use mango_v4::accounts_ix::HealthCheckKind::MaintRatio;
 use rand::seq::SliceRandom;
 use tracing::*;
 use {anyhow::Context, fixed::types::I80F48, solana_sdk::pubkey::Pubkey};
@@ -260,7 +261,22 @@ impl<'a> LiquidateHelper<'a> {
             )
             .await
             .context("creating perp_liq_base_or_positive_pnl_instruction")?;
         liq_ixs.cu = liq_ixs.cu.max(self.config.compute_limit_for_liq_ix);
+
+        let liqor = &self.client.mango_account().await?;
+        liq_ixs.append(
+            self.client
+                .health_check_instruction(
+                    liqor,
+                    self.config.min_health_ratio,
+                    vec![],
+                    vec![*perp_market_index],
+                    MaintRatio,
+                )
+                .await?,
+        );
+
         let txsig = self
             .client
             .send_and_confirm_owner_tx(liq_ixs.to_instructions())
@@ -501,6 +517,20 @@ impl<'a> LiquidateHelper<'a> {
             .await
             .context("creating liq_token_with_token ix")?;
         liq_ixs.cu = liq_ixs.cu.max(self.config.compute_limit_for_liq_ix);
+
+        let liqor = self.client.mango_account().await?;
+        liq_ixs.append(
+            self.client
+                .health_check_instruction(
+                    &liqor,
+                    self.config.min_health_ratio,
+                    vec![asset_token_index, liab_token_index],
+                    vec![],
+                    MaintRatio,
+                )
+                .await?,
+        );
+
         let txsig = self
             .client
             .send_and_confirm_owner_tx(liq_ixs.to_instructions())
@@ -651,14 +681,11 @@ impl<'a> LiquidateHelper<'a> {
 }

 #[allow(clippy::too_many_arguments)]
-pub async fn maybe_liquidate_account(
+pub async fn can_liquidate_account(
     mango_client: &MangoClient,
     account_fetcher: &chain_data::AccountFetcher,
     pubkey: &Pubkey,
-    config: &Config,
 ) -> anyhow::Result<bool> {
-    let liqor_min_health_ratio = I80F48::from_num(config.min_health_ratio);
-
     let account = account_fetcher.fetch_mango_account(pubkey)?;
     let health_cache = mango_client
         .health_cache(&account)
@@ -675,6 +702,18 @@ pub async fn maybe_liquidate_account(
         "possible candidate",
     );

+    Ok(true)
+}
+
+#[allow(clippy::too_many_arguments)]
+pub async fn maybe_liquidate_account(
+    mango_client: &MangoClient,
+    account_fetcher: &chain_data::AccountFetcher,
+    pubkey: &Pubkey,
+    config: &Config,
+) -> anyhow::Result<bool> {
+    let liqor_min_health_ratio = I80F48::from_num(config.min_health_ratio);
+
     // Fetch a fresh account and re-compute
     // This is -- unfortunately -- needed because the websocket streams seem to not
     // be great at providing timely updates to the account data.
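The appended HealthCheck instruction is what makes concurrent liquidation safe for the liqor: if another worker's transaction has already consumed the liqor's spare health, this transaction aborts instead of dropping the liqor below its configured minimum. An illustrative, off-chain simplification of the assertion it enforces:

```rust
// Illustrative only: mirrors the intent of the on-chain HealthCheck ix with
// HealthCheckKind::MaintRatio -- abort when the liqor's maintenance health
// ratio would end up below the configured minimum.
fn assert_min_maint_ratio(maint_ratio_after: f64, min_health_ratio: f64) -> Result<(), String> {
    if maint_ratio_after < min_health_ratio {
        Err(format!(
            "health check failed: maint ratio {maint_ratio_after:.2} < min {min_health_ratio:.2}"
        ))
    } else {
        Ok(())
    }
}

fn main() {
    // Two workers liquidate concurrently with min_health_ratio = 50:
    // the first tx leaves enough margin, the second would not and fails.
    assert!(assert_min_maint_ratio(62.0, 50.0).is_ok());
    assert!(assert_min_maint_ratio(41.5, 50.0).is_err());
}
```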

View File

@ -0,0 +1,238 @@
use crate::cli_args::Cli;
use crate::metrics::Metrics;
use crate::unwrappable_oracle_error::UnwrappableOracleError;
use crate::{liquidate, LiqErrorType, SharedState};
use anchor_lang::prelude::Pubkey;
use itertools::Itertools;
use mango_v4::state::TokenIndex;
use mango_v4_client::error_tracking::ErrorTracking;
use mango_v4_client::{chain_data, MangoClient, MangoClientError};
use std::sync::{Arc, RwLock};
use std::time::{Duration, Instant};
use tokio::task::JoinHandle;
use tracing::{error, trace, warn};
#[derive(Clone)]
pub struct LiquidationState {
pub mango_client: Arc<MangoClient>,
pub account_fetcher: Arc<chain_data::AccountFetcher>,
pub liquidation_config: liquidate::Config,
pub errors: Arc<RwLock<ErrorTracking<Pubkey, LiqErrorType>>>,
pub oracle_errors: Arc<RwLock<ErrorTracking<TokenIndex, LiqErrorType>>>,
}
impl LiquidationState {
async fn find_candidates(
&mut self,
accounts_iter: impl Iterator<Item = &Pubkey>,
action: impl Fn(Pubkey) -> anyhow::Result<()>,
) -> anyhow::Result<u64> {
let mut found_counter = 0u64;
use rand::seq::SliceRandom;
let mut accounts = accounts_iter.collect::<Vec<&Pubkey>>();
{
let mut rng = rand::thread_rng();
accounts.shuffle(&mut rng);
}
for pubkey in accounts {
if self.should_skip_execution(pubkey) {
continue;
}
let result =
liquidate::can_liquidate_account(&self.mango_client, &self.account_fetcher, pubkey)
.await;
self.log_or_ignore_error(&result, pubkey);
if result.unwrap_or(false) {
action(*pubkey)?;
found_counter = found_counter + 1;
}
}
Ok(found_counter)
}
fn should_skip_execution(&mut self, pubkey: &Pubkey) -> bool {
let now = Instant::now();
let error_tracking = &mut self.errors;
// Skip a pubkey if there've been too many errors recently
if let Some(error_entry) =
error_tracking
.read()
.unwrap()
.had_too_many_errors(LiqErrorType::Liq, pubkey, now)
{
trace!(
%pubkey,
error_entry.count,
"skip checking account for liquidation, had errors recently",
);
return true;
}
false
}
fn log_or_ignore_error<T>(&mut self, result: &anyhow::Result<T>, pubkey: &Pubkey) {
let error_tracking = &mut self.errors;
if let Err(err) = result.as_ref() {
if let Some((ti, ti_name)) = err.try_unwrap_oracle_error() {
if self
.oracle_errors
.read()
.unwrap()
.had_too_many_errors(LiqErrorType::Liq, &ti, Instant::now())
.is_none()
{
warn!(
"{:?} recording oracle error for token {} {}",
chrono::offset::Utc::now(),
ti_name,
ti
);
}
self.oracle_errors
.write()
.unwrap()
.record(LiqErrorType::Liq, &ti, err.to_string());
return;
}
// Keep track of pubkeys that had errors
error_tracking
.write()
.unwrap()
.record(LiqErrorType::Liq, pubkey, err.to_string());
// Not all errors need to be raised to the user's attention.
let mut is_error = true;
// Simulation errors due to liqee precondition failures on the liquidation instructions
// will commonly happen if our liquidator is late or if there are chain forks.
match err.downcast_ref::<MangoClientError>() {
Some(MangoClientError::SendTransactionPreflightFailure { logs, .. }) => {
if logs.iter().any(|line| {
line.contains("HealthMustBeNegative") || line.contains("IsNotBankrupt")
}) {
is_error = false;
}
}
_ => {}
};
if is_error {
error!("liquidating account {}: {:?}", pubkey, err);
} else {
trace!("liquidating account {}: {:?}", pubkey, err);
}
} else {
error_tracking
.write()
.unwrap()
.clear(LiqErrorType::Liq, pubkey);
}
}
pub async fn maybe_liquidate_and_log_error(&mut self, pubkey: &Pubkey) -> anyhow::Result<bool> {
if self.should_skip_execution(pubkey) {
return Ok(false);
}
let result = liquidate::maybe_liquidate_account(
&self.mango_client,
&self.account_fetcher,
pubkey,
&self.liquidation_config,
)
.await;
self.log_or_ignore_error(&result, pubkey);
return result;
}
}
pub fn spawn_liquidation_job(
cli: &Cli,
shared_state: &Arc<RwLock<SharedState>>,
tx_trigger_sender: async_channel::Sender<()>,
mut liquidation: Box<LiquidationState>,
metrics: &Metrics,
) -> JoinHandle<()> {
tokio::spawn({
let mut interval =
mango_v4_client::delay_interval(Duration::from_millis(cli.check_interval_ms));
let mut metric_liquidation_check = metrics.register_latency("liquidation_check".into());
let mut metric_liquidation_start_end =
metrics.register_latency("liquidation_start_end".into());
let mut liquidation_start_time = None;
let shared_state = shared_state.clone();
async move {
loop {
interval.tick().await;
let account_addresses = {
let mut state = shared_state.write().unwrap();
if !state.one_snapshot_done {
// discard first latency info as it will skew data too much
state.oldest_chain_event_reception_time = None;
continue;
}
if state.oldest_chain_event_reception_time.is_none()
&& liquidation_start_time.is_none()
{
// no new update, skip computing
continue;
}
state.mango_accounts.iter().cloned().collect_vec()
};
liquidation.errors.write().unwrap().update();
liquidation.oracle_errors.write().unwrap().update();
if liquidation_start_time.is_none() {
liquidation_start_time = Some(Instant::now());
}
let found_candidates = liquidation
.find_candidates(account_addresses.iter(), |p| {
if shared_state
.write()
.unwrap()
.liquidation_candidates_accounts
.insert(p)
{
tx_trigger_sender.try_send(())?;
}
Ok(())
})
.await
.unwrap();
if found_candidates > 0 {
tracing::debug!("found {} candidates for liquidation", found_candidates);
}
let mut state = shared_state.write().unwrap();
let reception_time = state.oldest_chain_event_reception_time.unwrap();
let current_time = Instant::now();
state.oldest_chain_event_reception_time = None;
metric_liquidation_check.push(current_time - reception_time);
metric_liquidation_start_end.push(current_time - liquidation_start_time.unwrap());
liquidation_start_time = None;
}
}
})
}
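The `should_skip_execution` / `log_or_ignore_error` pair above leans on the shared `ErrorTracking` type from `mango_v4_client`. A rough sketch of the idea behind it (a hypothetical simplified type, not the real API):

```rust
// Hypothetical, simplified version of the "skip noisy accounts" bookkeeping:
// after `skip_threshold` errors, an account is ignored for `skip_duration`.
use std::collections::HashMap;
use std::time::{Duration, Instant};

struct SimpleErrorTracking {
    skip_threshold: u64,
    skip_duration: Duration,
    // key -> (error count, time of last error)
    errors: HashMap<String, (u64, Instant)>,
}

impl SimpleErrorTracking {
    fn record(&mut self, key: &str) {
        let entry = self.errors.entry(key.to_string()).or_insert((0, Instant::now()));
        entry.0 += 1;
        entry.1 = Instant::now();
    }

    fn had_too_many_errors(&self, key: &str, now: Instant) -> bool {
        self.errors.get(key).map_or(false, |(count, last)| {
            *count >= self.skip_threshold && now.duration_since(*last) < self.skip_duration
        })
    }

    fn clear(&mut self, key: &str) {
        self.errors.remove(key);
    }
}

fn main() {
    let mut tracking = SimpleErrorTracking {
        skip_threshold: 2,
        skip_duration: Duration::from_secs(120),
        errors: HashMap::new(),
    };
    tracking.record("account-A");
    tracking.record("account-A");
    assert!(tracking.had_too_many_errors("account-A", Instant::now()));
    tracking.clear("account-A");
    assert!(!tracking.had_too_many_errors("account-A", Instant::now()));
}
```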

View File

@@ -4,33 +4,40 @@ use std::sync::{Arc, RwLock};
 use std::time::{Duration, Instant};

 use anchor_client::Cluster;
+use anyhow::Context;
 use clap::Parser;
+use futures_util::StreamExt;
 use mango_v4::state::{PerpMarketIndex, TokenIndex};
+use mango_v4_client::AsyncChannelSendUnlessFull;
 use mango_v4_client::{
     account_update_stream, chain_data, error_tracking::ErrorTracking, keypair_from_cli,
-    snapshot_source, websocket_source, Client, MangoClient, MangoClientError, MangoGroupContext,
+    snapshot_source, websocket_source, Client, MangoClient, MangoGroupContext,
     TransactionBuilderConfig,
 };

+use crate::cli_args::{BoolArg, Cli, CliDotenv};
+use crate::liquidation_state::LiquidationState;
+use crate::rebalance::Rebalancer;
+use crate::tcs_state::TcsState;
+use crate::token_swap_info::TokenSwapInfoUpdater;
 use itertools::Itertools;
 use solana_sdk::commitment_config::CommitmentConfig;
 use solana_sdk::pubkey::Pubkey;
 use solana_sdk::signer::Signer;
+use tokio::task::JoinHandle;
 use tracing::*;

 pub mod cli_args;
 pub mod liquidate;
+mod liquidation_state;
 pub mod metrics;
 pub mod rebalance;
+mod tcs_state;
 pub mod telemetry;
 pub mod token_swap_info;
 pub mod trigger_tcs;
+mod tx_sender;
 mod unwrappable_oracle_error;
 pub mod util;

-use crate::unwrappable_oracle_error::UnwrappableOracleError;
 use crate::util::{is_mango_account, is_mint_info, is_perp_market};
// jemalloc seems to be better at keeping the memory footprint reasonable over // jemalloc seems to be better at keeping the memory footprint reasonable over
@ -69,7 +76,7 @@ async fn main() -> anyhow::Result<()> {
// Client setup // Client setup
// //
let liqor_owner = Arc::new(keypair_from_cli(&cli.liqor_owner)); let liqor_owner = Arc::new(keypair_from_cli(&cli.liqor_owner));
let rpc_url = cli.rpc_url; let rpc_url = cli.rpc_url.clone();
let ws_url = rpc_url.replace("https", "wss"); let ws_url = rpc_url.replace("https", "wss");
let rpc_timeout = Duration::from_secs(10); let rpc_timeout = Duration::from_secs(10);
let cluster = Cluster::Custom(rpc_url.clone(), ws_url.clone()); let cluster = Cluster::Custom(rpc_url.clone(), ws_url.clone());
@ -79,8 +86,9 @@ async fn main() -> anyhow::Result<()> {
.commitment(commitment) .commitment(commitment)
.fee_payer(Some(liqor_owner.clone())) .fee_payer(Some(liqor_owner.clone()))
.timeout(rpc_timeout) .timeout(rpc_timeout)
.jupiter_v6_url(cli.jupiter_v6_url) .jupiter_timeout(Duration::from_secs(cli.jupiter_timeout_secs))
.jupiter_token(cli.jupiter_token) .jupiter_v6_url(cli.jupiter_v6_url.clone())
.jupiter_token(cli.jupiter_token.clone())
.transaction_builder_config( .transaction_builder_config(
TransactionBuilderConfig::builder() TransactionBuilderConfig::builder()
.priority_fee_provider(prio_provider) .priority_fee_provider(prio_provider)
@ -89,7 +97,7 @@ async fn main() -> anyhow::Result<()> {
.build() .build()
.unwrap(), .unwrap(),
) )
.override_send_transaction_urls(cli.override_send_transaction_url) .override_send_transaction_urls(cli.override_send_transaction_url.clone())
.build() .build()
.unwrap(); .unwrap();
@ -207,17 +215,18 @@ async fn main() -> anyhow::Result<()> {
compute_limit_for_liq_ix: cli.compute_limit_for_liquidation, compute_limit_for_liq_ix: cli.compute_limit_for_liquidation,
max_cu_per_transaction: 1_000_000, max_cu_per_transaction: 1_000_000,
refresh_timeout: Duration::from_secs(cli.liquidation_refresh_timeout_secs as u64), refresh_timeout: Duration::from_secs(cli.liquidation_refresh_timeout_secs as u64),
only_allowed_tokens: cli_args::cli_to_hashset::<TokenIndex>(cli.only_allow_tokens), only_allowed_tokens: cli_args::cli_to_hashset::<TokenIndex>(cli.only_allow_tokens.clone()),
forbidden_tokens: cli_args::cli_to_hashset::<TokenIndex>(cli.forbidden_tokens), forbidden_tokens: cli_args::cli_to_hashset::<TokenIndex>(cli.forbidden_tokens.clone()),
only_allowed_perp_markets: cli_args::cli_to_hashset::<PerpMarketIndex>( only_allowed_perp_markets: cli_args::cli_to_hashset::<PerpMarketIndex>(
cli.liquidation_only_allow_perp_markets, cli.liquidation_only_allow_perp_markets.clone(),
), ),
forbidden_perp_markets: cli_args::cli_to_hashset::<PerpMarketIndex>( forbidden_perp_markets: cli_args::cli_to_hashset::<PerpMarketIndex>(
cli.liquidation_forbidden_perp_markets, cli.liquidation_forbidden_perp_markets.clone(),
), ),
}; };
let tcs_config = trigger_tcs::Config { let tcs_config = trigger_tcs::Config {
refresh_timeout: Duration::from_secs(cli.tcs_refresh_timeout_secs),
min_health_ratio: cli.min_health_ratio, min_health_ratio: cli.min_health_ratio,
max_trigger_quote_amount: (cli.tcs_max_trigger_amount * 1e6) as u64, max_trigger_quote_amount: (cli.tcs_max_trigger_amount * 1e6) as u64,
compute_limit_for_trigger: cli.compute_limit_for_tcs, compute_limit_for_trigger: cli.compute_limit_for_tcs,
@ -234,17 +243,19 @@ async fn main() -> anyhow::Result<()> {
forbidden_tokens: liq_config.forbidden_tokens.clone(), forbidden_tokens: liq_config.forbidden_tokens.clone(),
}; };
let mut rebalance_interval = tokio::time::interval(Duration::from_secs(30));
let (rebalance_trigger_sender, rebalance_trigger_receiver) = async_channel::bounded::<()>(1); let (rebalance_trigger_sender, rebalance_trigger_receiver) = async_channel::bounded::<()>(1);
let (tx_tcs_trigger_sender, tx_tcs_trigger_receiver) = async_channel::unbounded::<()>();
let (tx_liq_trigger_sender, tx_liq_trigger_receiver) = async_channel::unbounded::<()>();
let rebalance_config = rebalance::Config { let rebalance_config = rebalance::Config {
enabled: cli.rebalance == BoolArg::True, enabled: cli.rebalance == BoolArg::True,
slippage_bps: cli.rebalance_slippage_bps, slippage_bps: cli.rebalance_slippage_bps,
borrow_settle_excess: (1f64 + cli.rebalance_borrow_settle_excess).max(1f64), borrow_settle_excess: (1f64 + cli.rebalance_borrow_settle_excess).max(1f64),
refresh_timeout: Duration::from_secs(cli.rebalance_refresh_timeout_secs), refresh_timeout: Duration::from_secs(cli.rebalance_refresh_timeout_secs),
jupiter_version: cli.jupiter_version.into(), jupiter_version: cli.jupiter_version.into(),
skip_tokens: cli.rebalance_skip_tokens.unwrap_or_default(), skip_tokens: cli.rebalance_skip_tokens.clone().unwrap_or(Vec::new()),
alternate_jupiter_route_tokens: cli alternate_jupiter_route_tokens: cli
.rebalance_alternate_jupiter_route_tokens .rebalance_alternate_jupiter_route_tokens
.clone()
.unwrap_or_default(), .unwrap_or_default(),
allow_withdraws: signer_is_owner, allow_withdraws: signer_is_owner,
}; };
@ -257,23 +268,39 @@ async fn main() -> anyhow::Result<()> {
config: rebalance_config, config: rebalance_config,
}); });
let mut liquidation = Box::new(LiquidationState { let liquidation = Box::new(LiquidationState {
mango_client: mango_client.clone(),
account_fetcher: account_fetcher.clone(),
liquidation_config: liq_config,
errors: Arc::new(RwLock::new(
ErrorTracking::builder()
.skip_threshold(2)
.skip_threshold_for_type(LiqErrorType::Liq, 5)
.skip_duration(Duration::from_secs(120))
.build()?,
)),
oracle_errors: Arc::new(RwLock::new(
ErrorTracking::builder()
.skip_threshold(1)
.skip_duration(Duration::from_secs(
cli.skip_oracle_error_in_logs_duration_secs,
))
.build()?,
)),
});
let tcs = Box::new(TcsState {
mango_client: mango_client.clone(), mango_client: mango_client.clone(),
account_fetcher, account_fetcher,
liquidation_config: liq_config,
trigger_tcs_config: tcs_config, trigger_tcs_config: tcs_config,
token_swap_info: token_swap_info_updater.clone(), token_swap_info: token_swap_info_updater.clone(),
errors: ErrorTracking::builder() errors: Arc::new(RwLock::new(
.skip_threshold(2) ErrorTracking::builder()
.skip_threshold_for_type(LiqErrorType::Liq, 5) .skip_threshold(2)
.skip_duration(Duration::from_secs(120)) .skip_threshold_for_type(LiqErrorType::Liq, 5)
.build()?, .skip_duration(Duration::from_secs(120))
oracle_errors: ErrorTracking::builder() .build()?,
.skip_threshold(1) )),
.skip_duration(Duration::from_secs(
cli.skip_oracle_error_in_logs_duration_secs,
))
.build()?,
}); });
info!("main loop"); info!("main loop");
@ -374,126 +401,83 @@ async fn main() -> anyhow::Result<()> {
} }
}); });
let mut optional_jobs = vec![];
// Could be refactored to only start the below jobs when the first snapshot is done. // Could be refactored to only start the below jobs when the first snapshot is done.
// But need to take care to abort if the above job aborts beforehand. // But need to take care to abort if the above job aborts beforehand.
if cli.rebalance == BoolArg::True {
let rebalance_job =
spawn_rebalance_job(&shared_state, rebalance_trigger_receiver, rebalancer);
optional_jobs.push(rebalance_job);
}
let rebalance_job = tokio::spawn({ if cli.liquidation_enabled == BoolArg::True {
let shared_state = shared_state.clone(); let liquidation_job = liquidation_state::spawn_liquidation_job(
async move { &cli,
loop { &shared_state,
tokio::select! { tx_liq_trigger_sender.clone(),
_ = rebalance_interval.tick() => {} liquidation.clone(),
_ = rebalance_trigger_receiver.recv() => {} &metrics,
} );
if !shared_state.read().unwrap().one_snapshot_done { optional_jobs.push(liquidation_job);
continue; }
}
if let Err(err) = rebalancer.zero_all_non_quote().await {
error!("failed to rebalance liqor: {:?}", err);
// Workaround: We really need a sequence enforcer in the liquidator since we don't want to if cli.take_tcs == BoolArg::True {
// accidentally send a similar tx again when we incorrectly believe an earlier one got forked let tcs_job = tcs_state::spawn_tcs_job(
// off. For now, hard sleep on error to avoid the most frequent error cases. &cli,
tokio::time::sleep(Duration::from_secs(10)).await; &shared_state,
} tx_tcs_trigger_sender.clone(),
} tcs.clone(),
} &metrics,
}); );
optional_jobs.push(tcs_job);
}
let liquidation_job = tokio::spawn({ if cli.liquidation_enabled == BoolArg::True || cli.take_tcs == BoolArg::True {
let mut interval = let mut tx_sender_jobs = tx_sender::spawn_tx_senders_job(
mango_v4_client::delay_interval(Duration::from_millis(cli.check_interval_ms)); cli.max_parallel_operations,
let mut metric_liquidation_check = metrics.register_latency("liquidation_check".into()); cli.liquidation_enabled == BoolArg::True,
let mut metric_liquidation_start_end = tx_liq_trigger_receiver,
metrics.register_latency("liquidation_start_end".into()); tx_tcs_trigger_receiver,
tx_tcs_trigger_sender,
rebalance_trigger_sender,
shared_state.clone(),
liquidation,
tcs,
);
optional_jobs.append(&mut tx_sender_jobs);
}
let mut liquidation_start_time = None; if cli.telemetry == BoolArg::True {
let mut tcs_start_time = None; optional_jobs.push(spawn_telemetry_job(&cli, mango_client.clone()));
}
let shared_state = shared_state.clone(); let token_swap_info_job =
async move { spawn_token_swap_refresh_job(&cli, shared_state, token_swap_info_updater);
loop { let check_changes_for_abort_job = spawn_context_change_watchdog_job(mango_client.clone());
interval.tick().await;
let account_addresses = { let mut jobs: futures::stream::FuturesUnordered<_> =
let mut state = shared_state.write().unwrap(); vec![data_job, token_swap_info_job, check_changes_for_abort_job]
if !state.one_snapshot_done { .into_iter()
// discard first latency info as it will skew data too much .chain(optional_jobs)
state.oldest_chain_event_reception_time = None; .chain(prio_jobs.into_iter())
continue; .collect();
} jobs.next().await;
if state.oldest_chain_event_reception_time.is_none()
&& liquidation_start_time.is_none()
{
// no new update, skip computing
continue;
}
state.mango_accounts.iter().cloned().collect_vec() error!("a critical job aborted, exiting");
}; Ok(())
}
liquidation.errors.update(); fn spawn_token_swap_refresh_job(
liquidation.oracle_errors.update(); cli: &Cli,
shared_state: Arc<RwLock<SharedState>>,
if liquidation_start_time.is_none() { token_swap_info_updater: Arc<TokenSwapInfoUpdater>,
liquidation_start_time = Some(Instant::now()); ) -> JoinHandle<()> {
} tokio::spawn({
let liquidated = liquidation
.maybe_liquidate_one(account_addresses.iter())
.await;
if !liquidated {
// This will be incorrect if we liquidate the last checked account
// (We will wait for next full run, skewing latency metrics)
// Probability is very low, might not need to be fixed
let mut state = shared_state.write().unwrap();
let reception_time = state.oldest_chain_event_reception_time.unwrap();
let current_time = Instant::now();
state.oldest_chain_event_reception_time = None;
metric_liquidation_check.push(current_time - reception_time);
metric_liquidation_start_end
.push(current_time - liquidation_start_time.unwrap());
liquidation_start_time = None;
}
let mut took_tcs = false;
if !liquidated && cli.take_tcs == BoolArg::True {
tcs_start_time = Some(tcs_start_time.unwrap_or(Instant::now()));
took_tcs = liquidation
.maybe_take_token_conditional_swap(account_addresses.iter())
.await
.unwrap_or_else(|err| {
error!("error during maybe_take_token_conditional_swap: {err}");
false
});
if !took_tcs {
let current_time = Instant::now();
let mut metric_tcs_start_end =
metrics.register_latency("tcs_start_end".into());
metric_tcs_start_end.push(current_time - tcs_start_time.unwrap());
tcs_start_time = None;
}
}
if liquidated || took_tcs {
rebalance_trigger_sender.send_unless_full(()).unwrap();
}
}
}
});
let token_swap_info_job = tokio::spawn({
let mut interval = mango_v4_client::delay_interval(Duration::from_secs( let mut interval = mango_v4_client::delay_interval(Duration::from_secs(
cli.token_swap_refresh_interval_secs, cli.token_swap_refresh_interval_secs,
)); ));
let mut startup_wait = mango_v4_client::delay_interval(Duration::from_secs(1)); let mut startup_wait = mango_v4_client::delay_interval(Duration::from_secs(1));
let shared_state = shared_state.clone();
async move { async move {
loop { loop {
if !shared_state.read().unwrap().one_snapshot_done { if !shared_state.read().unwrap().one_snapshot_done {
@ -517,41 +501,56 @@ async fn main() -> anyhow::Result<()> {
token_swap_info_updater.log_all(); token_swap_info_updater.log_all();
} }
} }
}); })
}
let check_changes_for_abort_job = fn spawn_context_change_watchdog_job(mango_client: Arc<MangoClient>) -> JoinHandle<()> {
tokio::spawn(MangoClient::loop_check_for_context_changes_and_abort( tokio::spawn(MangoClient::loop_check_for_context_changes_and_abort(
mango_client.clone(), mango_client,
Duration::from_secs(300), Duration::from_secs(300),
)); ))
}
if cli.telemetry == BoolArg::True { fn spawn_telemetry_job(cli: &Cli, mango_client: Arc<MangoClient>) -> JoinHandle<()> {
tokio::spawn(telemetry::report_regularly( tokio::spawn(telemetry::report_regularly(
mango_client, mango_client,
cli.min_health_ratio, cli.min_health_ratio,
)); ))
} }
use cli_args::{BoolArg, Cli, CliDotenv}; fn spawn_rebalance_job(
use futures::StreamExt; shared_state: &Arc<RwLock<SharedState>>,
let mut jobs: futures::stream::FuturesUnordered<_> = vec![ rebalance_trigger_receiver: async_channel::Receiver<()>,
data_job, rebalancer: Arc<Rebalancer>,
rebalance_job, ) -> JoinHandle<()> {
liquidation_job, let mut rebalance_interval = tokio::time::interval(Duration::from_secs(30));
token_swap_info_job,
check_changes_for_abort_job,
]
.into_iter()
.chain(prio_jobs.into_iter())
.collect();
jobs.next().await;
error!("a critical job aborted, exiting"); tokio::spawn({
Ok(()) let shared_state = shared_state.clone();
async move {
loop {
tokio::select! {
_ = rebalance_interval.tick() => {}
_ = rebalance_trigger_receiver.recv() => {}
}
if !shared_state.read().unwrap().one_snapshot_done {
continue;
}
if let Err(err) = rebalancer.zero_all_non_quote().await {
error!("failed to rebalance liqor: {:?}", err);
// Workaround: We really need a sequence enforcer in the liquidator since we don't want to
// accidentally send a similar tx again when we incorrectly believe an earlier one got forked
// off. For now, hard sleep on error to avoid the most frequent error cases.
tokio::time::sleep(Duration::from_secs(10)).await;
}
}
}
})
} }
 #[derive(Default)]
-struct SharedState {
+pub struct SharedState {
     /// Addresses of the MangoAccounts belonging to the mango program.
     /// Needed to check health of them all when the cache updates.
     mango_accounts: HashSet<Pubkey>,
@@ -561,6 +560,18 @@ struct SharedState {
     /// Oldest chain event not processed yet
     oldest_chain_event_reception_time: Option<Instant>,

+    /// Liquidation candidates (locally identified as liquidatable)
+    liquidation_candidates_accounts: indexmap::set::IndexSet<Pubkey>,
+
+    /// Interesting TCS that should be triggered
+    interesting_tcs: indexmap::set::IndexSet<(Pubkey, u64, u64)>,
+
+    /// Liquidation currently being processed by a worker
+    processing_liquidation: HashSet<Pubkey>,
+
+    // TCS currently being processed by a worker
+    processing_tcs: HashSet<(Pubkey, u64, u64)>,
 }
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@ -584,218 +595,6 @@ impl std::fmt::Display for LiqErrorType {
} }
} }
struct LiquidationState {
mango_client: Arc<MangoClient>,
account_fetcher: Arc<chain_data::AccountFetcher>,
token_swap_info: Arc<token_swap_info::TokenSwapInfoUpdater>,
liquidation_config: liquidate::Config,
trigger_tcs_config: trigger_tcs::Config,
errors: ErrorTracking<Pubkey, LiqErrorType>,
oracle_errors: ErrorTracking<TokenIndex, LiqErrorType>,
}
impl LiquidationState {
async fn maybe_liquidate_one<'b>(
&mut self,
accounts_iter: impl Iterator<Item = &'b Pubkey>,
) -> bool {
use rand::seq::SliceRandom;
let mut accounts = accounts_iter.collect::<Vec<&Pubkey>>();
{
let mut rng = rand::thread_rng();
accounts.shuffle(&mut rng);
}
for pubkey in accounts {
if self
.maybe_liquidate_and_log_error(pubkey)
.await
.unwrap_or(false)
{
return true;
}
}
false
}
async fn maybe_liquidate_and_log_error(&mut self, pubkey: &Pubkey) -> anyhow::Result<bool> {
let now = Instant::now();
let error_tracking = &mut self.errors;
// Skip a pubkey if there've been too many errors recently
if let Some(error_entry) =
error_tracking.had_too_many_errors(LiqErrorType::Liq, pubkey, now)
{
trace!(
%pubkey,
error_entry.count,
"skip checking account for liquidation, had errors recently",
);
return Ok(false);
}
let result = liquidate::maybe_liquidate_account(
&self.mango_client,
&self.account_fetcher,
pubkey,
&self.liquidation_config,
)
.await;
if let Err(err) = result.as_ref() {
if let Some((ti, ti_name)) = err.try_unwrap_oracle_error() {
if self
.oracle_errors
.had_too_many_errors(LiqErrorType::Liq, &ti, Instant::now())
.is_none()
{
warn!(
"{:?} recording oracle error for token {} {}",
chrono::offset::Utc::now(),
ti_name,
ti
);
}
self.oracle_errors
.record(LiqErrorType::Liq, &ti, err.to_string());
return result;
}
// Keep track of pubkeys that had errors
error_tracking.record(LiqErrorType::Liq, pubkey, err.to_string());
// Not all errors need to be raised to the user's attention.
let mut is_error = true;
// Simulation errors due to liqee precondition failures on the liquidation instructions
// will commonly happen if our liquidator is late or if there are chain forks.
match err.downcast_ref::<MangoClientError>() {
Some(MangoClientError::SendTransactionPreflightFailure { logs, .. }) => {
if logs.iter().any(|line| {
line.contains("HealthMustBeNegative") || line.contains("IsNotBankrupt")
}) {
is_error = false;
}
}
_ => {}
};
if is_error {
error!("liquidating account {}: {:?}", pubkey, err);
} else {
trace!("liquidating account {}: {:?}", pubkey, err);
}
} else {
error_tracking.clear(LiqErrorType::Liq, pubkey);
}
result
}
async fn maybe_take_token_conditional_swap(
&mut self,
accounts_iter: impl Iterator<Item = &Pubkey>,
) -> anyhow::Result<bool> {
let accounts = accounts_iter.collect::<Vec<&Pubkey>>();
let now = Instant::now();
let now_ts: u64 = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)?
.as_secs();
let tcs_context = trigger_tcs::Context {
mango_client: self.mango_client.clone(),
account_fetcher: self.account_fetcher.clone(),
token_swap_info: self.token_swap_info.clone(),
config: self.trigger_tcs_config.clone(),
jupiter_quote_cache: Arc::new(trigger_tcs::JupiterQuoteCache::default()),
now_ts,
};
// Find interesting (pubkey, tcsid, volume)
let mut interesting_tcs = Vec::with_capacity(accounts.len());
for pubkey in accounts.iter() {
if let Some(error_entry) =
self.errors
.had_too_many_errors(LiqErrorType::TcsCollectionHard, pubkey, now)
{
trace!(
%pubkey,
error_entry.count,
"skip checking account for tcs, had errors recently",
);
continue;
}
match tcs_context.find_interesting_tcs_for_account(pubkey) {
Ok(v) => {
self.errors.clear(LiqErrorType::TcsCollectionHard, pubkey);
if v.is_empty() {
self.errors
.clear(LiqErrorType::TcsCollectionPartial, pubkey);
self.errors.clear(LiqErrorType::TcsExecution, pubkey);
} else if v.iter().all(|it| it.is_ok()) {
self.errors
.clear(LiqErrorType::TcsCollectionPartial, pubkey);
} else {
for it in v.iter() {
if let Err(e) = it {
self.errors.record(
LiqErrorType::TcsCollectionPartial,
pubkey,
e.to_string(),
);
}
}
}
interesting_tcs.extend(v.iter().filter_map(|it| it.as_ref().ok()));
}
Err(e) => {
self.errors
.record(LiqErrorType::TcsCollectionHard, pubkey, e.to_string());
}
}
}
if interesting_tcs.is_empty() {
return Ok(false);
}
let (txsigs, mut changed_pubkeys) = tcs_context
.execute_tcs(&mut interesting_tcs, &mut self.errors)
.await?;
for pubkey in changed_pubkeys.iter() {
self.errors.clear(LiqErrorType::TcsExecution, pubkey);
}
if txsigs.is_empty() {
return Ok(false);
}
changed_pubkeys.push(self.mango_client.mango_account_address);
// Force a refresh of affected accounts
let slot = self
.account_fetcher
.transaction_max_slot(&txsigs)
.await
.context("transaction_max_slot")?;
if let Err(e) = self
.account_fetcher
.refresh_accounts_via_rpc_until_slot(
&changed_pubkeys,
slot,
self.liquidation_config.refresh_timeout,
)
.await
{
info!(slot, "could not refresh after tcs execution: {}", e);
}
Ok(true)
}
}
fn start_chain_data_metrics(chain: Arc<RwLock<chain_data::ChainData>>, metrics: &metrics::Metrics) { fn start_chain_data_metrics(chain: Arc<RwLock<chain_data::ChainData>>, metrics: &metrics::Metrics) {
let mut interval = mango_v4_client::delay_interval(Duration::from_secs(600)); let mut interval = mango_v4_client::delay_interval(Duration::from_secs(600));

View File

@ -0,0 +1,218 @@
use crate::cli_args::Cli;
use crate::metrics::Metrics;
use crate::token_swap_info::TokenSwapInfoUpdater;
use crate::{trigger_tcs, LiqErrorType, SharedState};
use anchor_lang::prelude::Pubkey;
use anyhow::Context;
use itertools::Itertools;
use mango_v4_client::error_tracking::ErrorTracking;
use mango_v4_client::{chain_data, MangoClient};
use std::sync::{Arc, RwLock};
use std::time::{Duration, Instant};
use tokio::task::JoinHandle;
use tracing::{error, info, trace};
pub fn spawn_tcs_job(
cli: &Cli,
shared_state: &Arc<RwLock<SharedState>>,
tx_trigger_sender: async_channel::Sender<()>,
mut tcs: Box<TcsState>,
metrics: &Metrics,
) -> JoinHandle<()> {
tokio::spawn({
let mut interval =
mango_v4_client::delay_interval(Duration::from_millis(cli.tcs_check_interval_ms));
let mut tcs_start_time = None;
let mut metric_tcs_start_end = metrics.register_latency("tcs_start_end".into());
let shared_state = shared_state.clone();
async move {
loop {
interval.tick().await;
let account_addresses = {
let state = shared_state.write().unwrap();
if !state.one_snapshot_done {
continue;
}
state.mango_accounts.iter().cloned().collect_vec()
};
tcs.errors.write().unwrap().update();
tcs_start_time = Some(tcs_start_time.unwrap_or(Instant::now()));
let found_candidates = tcs
.find_candidates(account_addresses.iter(), |candidate| {
if shared_state
.write()
.unwrap()
.interesting_tcs
.insert(candidate)
{
tx_trigger_sender.try_send(())?;
}
Ok(())
})
.await
.unwrap_or_else(|err| {
error!("error during find_candidate: {err}");
0
});
if found_candidates > 0 {
tracing::debug!("found {} candidates for triggering", found_candidates);
}
let current_time = Instant::now();
metric_tcs_start_end.push(current_time - tcs_start_time.unwrap());
tcs_start_time = None;
}
}
})
}
#[derive(Clone)]
pub struct TcsState {
pub mango_client: Arc<MangoClient>,
pub account_fetcher: Arc<chain_data::AccountFetcher>,
pub token_swap_info: Arc<TokenSwapInfoUpdater>,
pub trigger_tcs_config: trigger_tcs::Config,
pub errors: Arc<RwLock<ErrorTracking<Pubkey, LiqErrorType>>>,
}
impl TcsState {
async fn find_candidates(
&mut self,
accounts_iter: impl Iterator<Item = &Pubkey>,
action: impl Fn((Pubkey, u64, u64)) -> anyhow::Result<()>,
) -> anyhow::Result<usize> {
let accounts = accounts_iter.collect::<Vec<&Pubkey>>();
let now = Instant::now();
let now_ts: u64 = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)?
.as_secs();
let tcs_context = trigger_tcs::Context {
mango_client: self.mango_client.clone(),
account_fetcher: self.account_fetcher.clone(),
token_swap_info: self.token_swap_info.clone(),
config: self.trigger_tcs_config.clone(),
jupiter_quote_cache: Arc::new(trigger_tcs::JupiterQuoteCache::default()),
now_ts,
};
let mut found_counter = 0;
// Find interesting (pubkey, tcsid, volume)
for pubkey in accounts.iter() {
if let Some(error_entry) = self.errors.read().unwrap().had_too_many_errors(
LiqErrorType::TcsCollectionHard,
pubkey,
now,
) {
trace!(
%pubkey,
error_entry.count,
"skip checking account for tcs, had errors recently",
);
continue;
}
let candidates = tcs_context.find_interesting_tcs_for_account(pubkey);
let mut error_guard = self.errors.write().unwrap();
match candidates {
Ok(v) => {
error_guard.clear(LiqErrorType::TcsCollectionHard, pubkey);
if v.is_empty() {
error_guard.clear(LiqErrorType::TcsCollectionPartial, pubkey);
error_guard.clear(LiqErrorType::TcsExecution, pubkey);
} else if v.iter().all(|it| it.is_ok()) {
error_guard.clear(LiqErrorType::TcsCollectionPartial, pubkey);
} else {
for it in v.iter() {
if let Err(e) = it {
error_guard.record(
LiqErrorType::TcsCollectionPartial,
pubkey,
e.to_string(),
);
}
}
}
for interesting_candidate_res in v.iter() {
if let Ok(interesting_candidate) = interesting_candidate_res {
action(*interesting_candidate).expect("failed to send TCS candidate");
found_counter += 1;
}
}
}
Err(e) => {
error_guard.record(LiqErrorType::TcsCollectionHard, pubkey, e.to_string());
}
}
}
return Ok(found_counter);
}
pub async fn maybe_take_token_conditional_swap(
&mut self,
mut interesting_tcs: Vec<(Pubkey, u64, u64)>,
) -> anyhow::Result<bool> {
let now_ts: u64 = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)?
.as_secs();
let tcs_context = trigger_tcs::Context {
mango_client: self.mango_client.clone(),
account_fetcher: self.account_fetcher.clone(),
token_swap_info: self.token_swap_info.clone(),
config: self.trigger_tcs_config.clone(),
jupiter_quote_cache: Arc::new(trigger_tcs::JupiterQuoteCache::default()),
now_ts,
};
if interesting_tcs.is_empty() {
return Ok(false);
}
let (txsigs, mut changed_pubkeys) = tcs_context
.execute_tcs(&mut interesting_tcs, self.errors.clone())
.await?;
for pubkey in changed_pubkeys.iter() {
self.errors
.write()
.unwrap()
.clear(LiqErrorType::TcsExecution, pubkey);
}
if txsigs.is_empty() {
return Ok(false);
}
changed_pubkeys.push(self.mango_client.mango_account_address);
// Force a refresh of affected accounts
let slot = self
.account_fetcher
.transaction_max_slot(&txsigs)
.await
.context("transaction_max_slot")?;
if let Err(e) = self
.account_fetcher
.refresh_accounts_via_rpc_until_slot(
&changed_pubkeys,
slot,
self.trigger_tcs_config.refresh_timeout,
)
.await
{
info!(slot, "could not refresh after tcs execution: {}", e);
}
Ok(true)
}
}

View File

@@ -1,4 +1,5 @@
 use std::collections::HashSet;
+use std::time::Duration;
 use std::{
     collections::HashMap,
     pin::Pin,
@@ -15,6 +16,7 @@ use mango_v4::{
 use mango_v4_client::{chain_data, jupiter, MangoClient, TransactionBuilder};

 use anyhow::Context as AnyhowContext;
+use mango_v4::accounts_ix::HealthCheckKind::MaintRatio;
 use solana_sdk::signature::Signature;
 use tracing::*;
 use {fixed::types::I80F48, solana_sdk::pubkey::Pubkey};
@@ -55,6 +57,7 @@ pub enum Mode {

 #[derive(Clone)]
 pub struct Config {
+    pub refresh_timeout: Duration,
     pub min_health_ratio: f64,
     pub max_trigger_quote_amount: u64,
     pub compute_limit_for_trigger: u32,
@@ -1000,7 +1003,7 @@ impl Context {
     pub async fn execute_tcs(
         &self,
         tcs: &mut [(Pubkey, u64, u64)],
-        error_tracking: &mut ErrorTracking<Pubkey, LiqErrorType>,
+        error_tracking: Arc<RwLock<ErrorTracking<Pubkey, LiqErrorType>>>,
     ) -> anyhow::Result<(Vec<Signature>, Vec<Pubkey>)> {
         use rand::distributions::{Distribution, WeightedError, WeightedIndex};
@@ -1049,7 +1052,7 @@ impl Context {
                     }
                     Err(e) => {
                         trace!(%result.pubkey, "preparation error {:?}", e);
-                        error_tracking.record(
+                        error_tracking.write().unwrap().record(
                             LiqErrorType::TcsExecution,
                             &result.pubkey,
                             e.to_string(),
@@ -1093,7 +1096,7 @@ impl Context {
             };

             // start the new one
-            if let Some(job) = self.prepare_job(&pubkey, tcs_id, volume, error_tracking) {
+            if let Some(job) = self.prepare_job(&pubkey, tcs_id, volume, error_tracking.clone()) {
                 pending_volume += volume;
                 pending.push(job);
             }
@@ -1130,7 +1133,11 @@ impl Context {
                 Ok(v) => Some((pubkey, v)),
                 Err(err) => {
                     trace!(%pubkey, "execution error {:?}", err);
-                    error_tracking.record(LiqErrorType::TcsExecution, &pubkey, err.to_string());
+                    error_tracking.write().unwrap().record(
+                        LiqErrorType::TcsExecution,
+                        &pubkey,
+                        err.to_string(),
+                    );
                     None
                 }
             });
@@ -1145,12 +1152,14 @@ impl Context {
         pubkey: &Pubkey,
         tcs_id: u64,
         volume: u64,
-        error_tracking: &ErrorTracking<Pubkey, LiqErrorType>,
+        error_tracking: Arc<RwLock<ErrorTracking<Pubkey, LiqErrorType>>>,
     ) -> Option<Pin<Box<dyn Future<Output = PreparationResult> + Send>>> {
         // Skip a pubkey if there've been too many errors recently
-        if let Some(error_entry) =
-            error_tracking.had_too_many_errors(LiqErrorType::TcsExecution, pubkey, Instant::now())
-        {
+        if let Some(error_entry) = error_tracking.read().unwrap().had_too_many_errors(
+            LiqErrorType::TcsExecution,
+            pubkey,
+            Instant::now(),
+        ) {
             trace!(
                 "skip checking for tcs on account {pubkey}, had {} errors recently",
                 error_entry.count
@@ -1225,6 +1234,27 @@ impl Context {
             .instructions
             .append(&mut trigger_ixs.instructions);

+        let (_, tcs) = liqee.token_conditional_swap_by_id(pending.tcs_id)?;
+        let affected_tokens = allowed_tokens
+            .iter()
+            .chain(&[tcs.buy_token_index, tcs.sell_token_index])
+            .copied()
+            .collect_vec();
+        let liqor = &self.mango_client.mango_account().await?;
+        tx_builder.instructions.append(
+            &mut self
+                .mango_client
+                .health_check_instruction(
+                    liqor,
+                    self.config.min_health_ratio,
+                    affected_tokens,
+                    vec![],
+                    MaintRatio,
+                )
+                .await?
+                .instructions,
+        );
+
         let txsig = tx_builder
             .send_and_confirm(&self.mango_client.client)
             .await?;

View File

@ -0,0 +1,241 @@
use crate::liquidation_state::LiquidationState;
use crate::tcs_state::TcsState;
use crate::SharedState;
use anchor_lang::prelude::Pubkey;
use async_channel::{Receiver, Sender};
use mango_v4_client::AsyncChannelSendUnlessFull;
use std::sync::{Arc, RwLock};
use tokio::task::JoinHandle;
use tracing::{debug, error, trace};
enum WorkerTask {
Liquidation(Pubkey),
Tcs(Vec<(Pubkey, u64, u64)>),
// Given two workers: #0=LIQ_only, #1=LIQ+TCS
// If both are busy and the scanning jobs find a new TCS and a new LIQ candidate and enqueue them in the channels,
// then if #1 wakes up first, it will consume the LIQ candidate (LIQ always has priority).
// When #0 wakes up, it will not find any LIQ candidate and would do nothing (it won't take a TCS).
// But if we did nothing, #1 would never wake up again (no new task in the channel),
// so #0 handles this with `GiveUpTcs`: it queues a new signal into the channel, waking #1 again.
GiveUpTcs,
// Can happen if TCS are batched (2 TCS enqueued, 2 workers woken, but the first one takes both tasks)
NoWork,
}
pub fn spawn_tx_senders_job(
max_parallel_operations: u64,
enable_liquidation: bool,
tx_liq_trigger_receiver: Receiver<()>,
tx_tcs_trigger_receiver: Receiver<()>,
tx_tcs_trigger_sender: Sender<()>,
rebalance_trigger_sender: Sender<()>,
shared_state: Arc<RwLock<SharedState>>,
liquidation: Box<LiquidationState>,
tcs: Box<TcsState>,
) -> Vec<JoinHandle<()>> {
if max_parallel_operations < 1 {
error!("max_parallel_operations must be >= 1");
std::process::exit(1)
}
let reserve_one_worker_for_liquidation = max_parallel_operations > 1 && enable_liquidation;
let workers: Vec<JoinHandle<()>> = (0..max_parallel_operations)
.map(|worker_id| {
tokio::spawn({
let shared_state = shared_state.clone();
let receiver_liq = tx_liq_trigger_receiver.clone();
let receiver_tcs = tx_tcs_trigger_receiver.clone();
let sender_tcs = tx_tcs_trigger_sender.clone();
let rebalance_trigger_sender = rebalance_trigger_sender.clone();
let liquidation = liquidation.clone();
let tcs = tcs.clone();
async move {
worker_loop(
shared_state,
receiver_liq,
receiver_tcs,
sender_tcs,
rebalance_trigger_sender,
liquidation,
tcs,
worker_id,
reserve_one_worker_for_liquidation && worker_id == 0,
)
.await;
}
})
})
.collect();
workers
}
async fn worker_loop(
shared_state: Arc<RwLock<SharedState>>,
liq_receiver: Receiver<()>,
tcs_receiver: Receiver<()>,
tcs_sender: Sender<()>,
rebalance_trigger_sender: Sender<()>,
mut liquidation: Box<LiquidationState>,
mut tcs: Box<TcsState>,
id: u64,
only_liquidation: bool,
) {
loop {
debug!(
"Worker #{} waiting for task (only_liq={})",
id, only_liquidation
);
let _ = if only_liquidation {
liq_receiver.recv().await.expect("receive failed")
} else {
tokio::select!(
_ = liq_receiver.recv() => {},
_ = tcs_receiver.recv() => {},
)
};
// a task must be available to process
// find it in global shared state, and mark it as processing
let task = worker_pull_task(&shared_state, id, only_liquidation)
.expect("Worker woke up but has nothing to do");
// execute the task
let need_rebalancing = match &task {
WorkerTask::Liquidation(l) => worker_execute_liquidation(&mut liquidation, *l).await,
WorkerTask::Tcs(t) => worker_execute_tcs(&mut tcs, t.clone()).await,
WorkerTask::GiveUpTcs => worker_give_up_tcs(&tcs_sender).await,
WorkerTask::NoWork => false,
};
if need_rebalancing {
rebalance_trigger_sender.send_unless_full(()).unwrap();
}
// remove from shared state
worker_finalize_task(&shared_state, id, task, need_rebalancing);
}
}
async fn worker_give_up_tcs(sender: &Sender<()>) -> bool {
sender.send(()).await.expect("sending task failed");
false
}
async fn worker_execute_tcs(tcs: &mut Box<TcsState>, candidates: Vec<(Pubkey, u64, u64)>) -> bool {
tcs.maybe_take_token_conditional_swap(candidates)
.await
.unwrap_or(false)
}
async fn worker_execute_liquidation(
liquidation: &mut Box<LiquidationState>,
candidate: Pubkey,
) -> bool {
liquidation
.maybe_liquidate_and_log_error(&candidate)
.await
.unwrap_or(false)
}
fn worker_pull_task(
shared_state: &Arc<RwLock<SharedState>>,
id: u64,
only_liquidation: bool,
) -> anyhow::Result<WorkerTask> {
let mut writer = shared_state.write().unwrap();
// print out list of all task for debugging
for x in &writer.liquidation_candidates_accounts {
if !writer.processing_liquidation.contains(x) {
trace!(" - LIQ {:?}", x);
}
}
// next liq task to execute
if let Some(liq_candidate) = writer
.liquidation_candidates_accounts
.iter()
.find(|x| !writer.processing_liquidation.contains(x))
.copied()
{
debug!("worker #{} got a liq candidate -> {}", id, liq_candidate);
writer.processing_liquidation.insert(liq_candidate);
return Ok(WorkerTask::Liquidation(liq_candidate));
}
let tcs_todo = writer.interesting_tcs.len() - writer.processing_tcs.len();
if only_liquidation {
debug!("worker #{} giving up TCS (todo count: {})", id, tcs_todo);
return Ok(WorkerTask::GiveUpTcs);
}
for x in &writer.interesting_tcs {
if !writer.processing_tcs.contains(x) {
trace!(" - TCS {:?}", x);
}
}
// next tcs task to execute
let max_tcs_batch_size = 20;
let tcs_candidates: Vec<(Pubkey, u64, u64)> = writer
.interesting_tcs
.iter()
.filter(|x| !writer.processing_tcs.contains(x))
.take(max_tcs_batch_size)
.copied()
.collect();
for tcs_candidate in &tcs_candidates {
debug!(
"worker #{} got a tcs candidate -> {:?} (out of {})",
id,
tcs_candidate,
writer.interesting_tcs.len()
);
writer.processing_tcs.insert(tcs_candidate.clone());
}
if tcs_candidates.len() > 0 {
Ok(WorkerTask::Tcs(tcs_candidates))
} else {
debug!("worker #{} got nothing", id);
Ok(WorkerTask::NoWork)
}
}
fn worker_finalize_task(
shared_state: &Arc<RwLock<SharedState>>,
id: u64,
task: WorkerTask,
done: bool,
) {
let mut writer = shared_state.write().unwrap();
match task {
WorkerTask::Liquidation(liq) => {
debug!(
"worker #{} - checked liq {:?} with success ? {}",
id, liq, done
);
writer.liquidation_candidates_accounts.shift_remove(&liq);
writer.processing_liquidation.remove(&liq);
}
WorkerTask::Tcs(tcs_list) => {
for tcs in tcs_list {
debug!(
"worker #{} - checked tcs {:?} with success ? {}",
id, tcs, done
);
writer.interesting_tcs.shift_remove(&tcs);
writer.processing_tcs.remove(&tcs);
}
}
WorkerTask::GiveUpTcs => {}
WorkerTask::NoWork => {}
}
}
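The task selection above always prefers liquidation candidates over TCS and batches up to 20 TCS per worker. The sketch below condenses that priority logic with standalone types (it assumes the `indexmap` crate, like the real SharedState, and uses `u64` in place of `Pubkey`):

```rust
// Condensed version of the worker_pull_task priority: take a liquidation
// candidate if any is unclaimed, otherwise (unless this worker is reserved
// for liquidations) take a batch of up to 20 unclaimed TCS candidates.
use indexmap::IndexSet;
use std::collections::HashSet;

type TcsCandidate = (u64, u64, u64); // stand-in for (Pubkey, tcs_id, volume)

enum Task {
    Liquidation(u64),
    Tcs(Vec<TcsCandidate>),
    GiveUpTcs,
    NoWork,
}

fn pull_task(
    liq_candidates: &IndexSet<u64>,
    processing_liq: &mut HashSet<u64>,
    tcs_candidates: &IndexSet<TcsCandidate>,
    processing_tcs: &mut HashSet<TcsCandidate>,
    only_liquidation: bool,
) -> Task {
    if let Some(liq) = liq_candidates
        .iter()
        .find(|c| !processing_liq.contains(c))
        .copied()
    {
        processing_liq.insert(liq);
        return Task::Liquidation(liq);
    }
    if only_liquidation {
        // Hand the signal back so a TCS-capable worker can pick it up.
        return Task::GiveUpTcs;
    }
    let batch: Vec<TcsCandidate> = tcs_candidates
        .iter()
        .filter(|c| !processing_tcs.contains(c))
        .take(20)
        .copied()
        .collect();
    if batch.is_empty() {
        Task::NoWork
    } else {
        processing_tcs.extend(batch.iter().copied());
        Task::Tcs(batch)
    }
}

fn main() {
    let liq: IndexSet<u64> = [7u64].into_iter().collect();
    let tcs: IndexSet<TcsCandidate> = [(1, 2, 3)].into_iter().collect();
    let (mut pl, mut pt) = (HashSet::new(), HashSet::new());
    assert!(matches!(pull_task(&liq, &mut pl, &tcs, &mut pt, false), Task::Liquidation(7)));
    assert!(matches!(pull_task(&liq, &mut pl, &tcs, &mut pt, false), Task::Tcs(_)));
}
```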

View File

@@ -17,7 +17,9 @@ use futures::{stream, StreamExt, TryFutureExt, TryStreamExt};
 use itertools::Itertools;
 use tracing::*;

-use mango_v4::accounts_ix::{Serum3OrderType, Serum3SelfTradeBehavior, Serum3Side};
+use mango_v4::accounts_ix::{
+    HealthCheckKind, Serum3OrderType, Serum3SelfTradeBehavior, Serum3Side,
+};
 use mango_v4::accounts_zerocopy::KeyedAccountSharedData;
 use mango_v4::health::HealthCache;
 use mango_v4::state::{
@@ -80,6 +82,12 @@ pub struct ClientConfig {
     #[builder(default = "Duration::from_secs(60)")]
     pub timeout: Duration,

+    /// Jupiter Timeout, defaults to 30s
+    ///
+    /// This timeout applies to jupiter requests.
+    #[builder(default = "Duration::from_secs(30)")]
+    pub jupiter_timeout: Duration,
+
     #[builder(default)]
     pub transaction_builder_config: TransactionBuilderConfig,
@@ -560,6 +568,48 @@ impl MangoClient {
         self.send_and_confirm_owner_tx(ixs.to_instructions()).await
     }

+    /// Assert that health of account is > N
+    pub async fn health_check_instruction(
+        &self,
+        account: &MangoAccountValue,
+        min_health_value: f64,
+        affected_tokens: Vec<TokenIndex>,
+        affected_perp_markets: Vec<PerpMarketIndex>,
+        check_kind: HealthCheckKind,
+    ) -> anyhow::Result<PreparedInstructions> {
+        let (health_check_metas, health_cu) = self
+            .derive_health_check_remaining_account_metas(
+                account,
+                affected_tokens,
+                vec![],
+                affected_perp_markets,
+            )
+            .await?;
+
+        let ixs = PreparedInstructions::from_vec(
+            vec![Instruction {
+                program_id: mango_v4::id(),
+                accounts: {
+                    let mut ams = anchor_lang::ToAccountMetas::to_account_metas(
+                        &mango_v4::accounts::HealthCheck {
+                            group: self.group(),
+                            account: self.mango_account_address,
+                        },
+                        None,
+                    );
+                    ams.extend(health_check_metas.into_iter());
+                    ams
+                },
+                data: anchor_lang::InstructionData::data(&mango_v4::instruction::HealthCheck {
+                    min_health_value,
+                    check_kind,
+                }),
+            }],
+            self.instruction_cu(health_cu),
+        );
+
+        Ok(ixs)
+    }
+
     /// Creates token withdraw instructions for the MangoClient's account/owner.
     /// The `account` state is passed in separately so changes during the tx can be
     /// accounted for when deriving health accounts.
@@ -2094,7 +2144,10 @@ impl MangoClient {
     // jupiter
     pub fn jupiter_v6(&self) -> jupiter::v6::JupiterV6 {
-        jupiter::v6::JupiterV6 { mango_client: self }
+        jupiter::v6::JupiterV6 {
+            mango_client: self,
+            timeout_duration: self.client.config.jupiter_timeout,
+        }
     }

     pub fn jupiter(&self) -> jupiter::Jupiter {
View File

@@ -1,4 +1,5 @@
 use std::str::FromStr;
+use std::time::Duration;

 use anchor_lang::prelude::Pubkey;
 use serde::{Deserialize, Serialize};
@@ -139,6 +140,7 @@ impl TryFrom<&AccountMeta> for solana_sdk::instruction::AccountMeta {

 pub struct JupiterV6<'a> {
     pub mango_client: &'a MangoClient,
+    pub timeout_duration: Duration,
 }

 impl<'a> JupiterV6<'a> {
@@ -204,6 +206,7 @@ impl<'a> JupiterV6<'a> {
             .http_client
             .get(format!("{}/quote", config.jupiter_v6_url))
             .query(&query_args)
+            .timeout(self.timeout_duration)
             .send()
             .await
             .context("quote request to jupiter")?;
@@ -290,6 +293,7 @@ impl<'a> JupiterV6<'a> {
                 destination_token_account: None, // default to user ata
                 quote_response: quote.clone(),
             })
+            .timeout(self.timeout_duration)
             .send()
             .await
             .context("swap transaction request to jupiter")?;

View File

@@ -20,19 +20,29 @@ impl<T, E: std::fmt::Debug> AnyhowWrap for Result<T, E> {
 /// Push to an async_channel::Sender and ignore if the channel is full
 pub trait AsyncChannelSendUnlessFull<T> {
     /// Send a message if the channel isn't full
-    fn send_unless_full(&self, msg: T) -> Result<(), async_channel::SendError<T>>;
+    fn send_unless_full(&self, msg: T) -> anyhow::Result<()>;
 }

 impl<T> AsyncChannelSendUnlessFull<T> for async_channel::Sender<T> {
-    fn send_unless_full(&self, msg: T) -> Result<(), async_channel::SendError<T>> {
+    fn send_unless_full(&self, msg: T) -> anyhow::Result<()> {
         use async_channel::*;
         match self.try_send(msg) {
             Ok(()) => Ok(()),
-            Err(TrySendError::Closed(msg)) => Err(async_channel::SendError(msg)),
+            Err(TrySendError::Closed(_)) => Err(anyhow::format_err!("channel is closed")),
             Err(TrySendError::Full(_)) => Ok(()),
         }
     }
 }
+
+impl<T> AsyncChannelSendUnlessFull<T> for tokio::sync::mpsc::Sender<T> {
+    fn send_unless_full(&self, msg: T) -> anyhow::Result<()> {
+        use tokio::sync::mpsc::*;
+        match self.try_send(msg) {
+            Ok(()) => Ok(()),
+            Err(error::TrySendError::Closed(_)) => Err(anyhow::format_err!("channel is closed")),
+            Err(error::TrySendError::Full(_)) => Ok(()),
+        }
+    }
+}
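Usage-wise, `send_unless_full` keeps trigger channels from turning into a backlog: with a bounded(1) channel, a second trigger sent while one is already pending is simply dropped. A small usage sketch (assumes the `mango_v4_client` crate is in scope):

```rust
// Small usage sketch: duplicate triggers are dropped instead of queuing up.
use mango_v4_client::AsyncChannelSendUnlessFull;

fn main() {
    let (tx, rx) = async_channel::bounded::<()>(1);
    tx.send_unless_full(()).unwrap(); // queued
    tx.send_unless_full(()).unwrap(); // channel full -> silently ignored, still Ok(())
    assert!(rx.try_recv().is_ok()); // exactly one trigger was delivered
    assert!(rx.try_recv().is_err());
}
```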
 /// Like tokio::time::interval(), but with Delay as default MissedTickBehavior
 ///

View File

@@ -7,8 +7,6 @@ use mango_v4::accounts_ix::{HealthCheck, HealthCheckKind};
 use mango_v4::error::MangoError;
 use solana_sdk::transport::TransportError;

-// TODO FAS
-
 #[tokio::test]
 async fn test_health_check() -> Result<(), TransportError> {
     let context = TestContext::new().await;

View File

@@ -51,6 +51,7 @@ This creates a bunch of to-be-liquidated accounts as well as a LIQOR account.
 Run the liquidator on the group with the liqor account.
 Since devnet doesn't have any jupiter, run with
 ```
 JUPITER_VERSION=mock
 TCS_MODE=borrow-buy