From 9831e4ddad95beb6ebfb4add001b61594d491b1e Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang <93241502+yhchiang-sol@users.noreply.github.com> Date: Fri, 16 Sep 2022 13:12:55 -0700 Subject: [PATCH] Remove daily rewrite/compaction of each ledger file (#27571) #### Problem Previously before #26651, our LedgerCleanupService needs RocksDB background compactions to reclaim ledger disk space via our custom CompactionFilter. However, since RocksDB's compaction isn't smart enough to know which file to pick, we rely on the 1-day compaction period so that each file will be forced to be compacted once a day so that we can reclaim ledger disk space in time. The downside of this is each ledger file will be rewritten once per day. #### Summary of Changes As #26651 makes LedgerCleanupService actively delete those files whose entire slot-range is older than both --limit-ledger-size and the current root, we can remove the 1-day compaction period and get rid of the daily ledger file rewrite. The results on mainnet-beta shows that this PR reduces ~20% write-bytes-per-second and reduces ~50% read-bytes-per-second on ledger disk. --- ledger/src/blockstore_db.rs | 64 ++----------------------------------- 1 file changed, 2 insertions(+), 62 deletions(-) diff --git a/ledger/src/blockstore_db.rs b/ledger/src/blockstore_db.rs index f7003112c7..c9ef037032 100644 --- a/ledger/src/blockstore_db.rs +++ b/ledger/src/blockstore_db.rs @@ -95,9 +95,6 @@ const PROGRAM_COSTS_CF: &str = "program_costs"; /// Column family for optimistic slots const OPTIMISTIC_SLOTS_CF: &str = "optimistic_slots"; -// 1 day is chosen for the same reasoning of DEFAULT_COMPACTION_SLOT_INTERVAL -const PERIODIC_COMPACTION_SECONDS: u64 = 60 * 60 * 24; - #[derive(Error, Debug)] pub enum BlockstoreError { ShredForIndexExists, @@ -281,7 +278,7 @@ impl Rocks { path, Self::cf_descriptors(&options, &oldest_slot), )?, - access_type: access_type.clone(), + access_type, oldest_slot, column_options, write_batch_perf_status: PerfSamplingStatus::default(), @@ -302,70 +299,13 @@ impl Rocks { &secondary_path, Self::cf_descriptors(&options, &oldest_slot), )?, - access_type: access_type.clone(), + access_type, oldest_slot, column_options, write_batch_perf_status: PerfSamplingStatus::default(), } } }; - // This is only needed by solana-validator for LedgerCleanupService so guard with AccessType::Primary - if matches!(access_type, AccessType::Primary) { - for cf_name in Self::columns() { - // these special column families must be excluded from LedgerCleanupService's rocksdb - // compactions - if should_exclude_from_compaction(cf_name) { - continue; - } - - // This is the crux of our write-stall-free storage cleaning strategy with consistent - // state view for higher-layers - // - // For the consistent view, we commit delete_range on pruned slot range by LedgerCleanupService. - // simple story here. - // - // For actual storage cleaning, we employ RocksDB compaction. But default RocksDB compaction - // settings don't work well for us. That's because we're using it rather like a really big - // (100 GBs) ring-buffer. RocksDB is basically assuming uniform data write over the key space for - // efficient compaction, which isn't true for our use as a ring buffer. - // - // So, we customize the compaction strategy with 2 combined tweaks: - // (1) compaction_filter and (2) shortening its periodic cycles. - // - // Via the compaction_filter, we finally reclaim previously delete_range()-ed storage occupied - // by pruned slots. When compaction_filter is set, each SST files are re-compacted periodically - // to hunt for keys newly expired by the compaction_filter re-evaluation. But RocksDb's default - // `periodic_compaction_seconds` is 30 days, which is too long for our case. So, we - // shorten it to a day (24 hours). - // - // As we write newer SST files over time at rather consistent rate of speed, this - // effectively makes each newly-created sets be re-compacted for the filter at - // well-dispersed different timings. - // As a whole, we rewrite the whole dataset at every PERIODIC_COMPACTION_SECONDS, - // slowly over the duration of PERIODIC_COMPACTION_SECONDS. So, this results in - // amortization. - // So, there is a bit inefficiency here because we'll rewrite not-so-old SST files - // too. But longer period would introduce higher variance of ledger storage sizes over - // the long period. And it's much better than the daily IO spike caused by compact_range() by - // previous implementation. - // - // `ttl` and `compact_range`(`ManualCompaction`), doesn't work nicely. That's - // because its original intention is delete_range()s to reclaim disk space. So it tries to merge - // them with N+1 SST files all way down to the bottommost SSTs, often leading to vastly large amount - // (= all) of invalidated SST files, when combined with newer writes happening at the opposite - // edge of the key space. This causes a long and heavy disk IOs and possible write - // stall and ultimately, the deadly Replay/Banking stage stall at higher layers. - db.db - .set_options_cf( - db.cf_handle(cf_name), - &[( - "periodic_compaction_seconds", - &format!("{}", PERIODIC_COMPACTION_SECONDS), - )], - ) - .unwrap(); - } - } Ok(db) }