2021-11-18 17:55:38 -08:00
|
|
|
//! The syncer downloads and verifies large numbers of blocks from peers to Zebra.
|
|
|
|
//!
|
|
|
|
//! It is used when Zebra is a long way behind the current chain tip.
|
|
|
|
|
2023-01-11 15:39:51 -08:00
|
|
|
use std::{cmp::max, collections::HashSet, pin::Pin, task::Poll, time::Duration};
|
2020-07-08 13:33:39 -07:00
|
|
|
|
2020-09-09 15:33:25 -07:00
|
|
|
use color_eyre::eyre::{eyre, Report};
|
2021-10-20 17:34:12 -07:00
|
|
|
use futures::stream::{FuturesUnordered, StreamExt};
|
2022-01-11 09:11:35 -08:00
|
|
|
use indexmap::IndexSet;
|
2022-10-24 16:39:00 -07:00
|
|
|
use serde::{Deserialize, Serialize};
|
2022-11-08 20:42:04 -08:00
|
|
|
use tokio::{sync::watch, time::sleep};
|
2020-09-22 10:46:50 -07:00
|
|
|
use tower::{
|
2020-10-23 18:56:54 -07:00
|
|
|
builder::ServiceBuilder, hedge::Hedge, limit::ConcurrencyLimit, retry::Retry, timeout::Timeout,
|
|
|
|
Service, ServiceExt,
|
2020-09-22 10:46:50 -07:00
|
|
|
};
|
2020-07-08 13:33:39 -07:00
|
|
|
|
2020-06-30 09:42:09 -07:00
|
|
|
use zebra_chain::{
|
2023-01-11 15:39:51 -08:00
|
|
|
block::{self, Height},
|
2022-01-11 09:11:35 -08:00
|
|
|
chain_tip::ChainTip,
|
2020-10-23 18:56:54 -07:00
|
|
|
parameters::genesis_hash,
|
2020-06-30 09:42:09 -07:00
|
|
|
};
|
2020-09-09 12:17:17 -07:00
|
|
|
use zebra_network as zn;
|
2020-07-21 19:22:43 -07:00
|
|
|
use zebra_state as zs;
|
2020-06-22 22:31:26 -07:00
|
|
|
|
2021-10-19 18:07:19 -07:00
|
|
|
use crate::{
|
2021-12-21 12:13:26 -08:00
|
|
|
components::sync::downloads::BlockDownloadVerifyError, config::ZebradConfig, BoxError,
|
2021-10-19 18:07:19 -07:00
|
|
|
};
|
2020-10-23 18:56:54 -07:00
|
|
|
|
2020-09-09 14:45:05 -07:00
|
|
|
mod downloads;
|
2021-10-07 03:46:37 -07:00
|
|
|
mod gossip;
|
2022-06-27 19:51:41 -07:00
|
|
|
mod progress;
|
2021-08-19 16:16:16 -07:00
|
|
|
mod recent_sync_lengths;
|
2021-08-29 17:01:33 -07:00
|
|
|
mod status;
|
2021-08-19 16:16:16 -07:00
|
|
|
|
2021-06-09 16:39:51 -07:00
|
|
|
#[cfg(test)]
|
|
|
|
mod tests;
|
|
|
|
|
2020-09-22 10:46:50 -07:00
|
|
|
use downloads::{AlwaysHedge, Downloads};
|
2021-10-07 03:46:37 -07:00
|
|
|
|
change(state): Write non-finalized blocks to the state in a separate thread, to avoid network and RPC hangs (#5257)
* Add a new block commit task and channels, that don't do anything yet
* Add last_block_hash_sent to the state service, to avoid database accesses
* Update last_block_hash_sent regardless of commit errors
* Rename a field to StateService.max_queued_finalized_height
* Commit finalized blocks to the state in a separate task
* Check for panics in the block write task
* Wait for the block commit task in tests, and check for errors
* Always run a proptest that sleeps once
* Add extra debugging to state shutdowns
* Work around a RocksDB shutdown bug
* Close the finalized block channel when we're finished with it
* Only reset state queue once per error
* Update some TODOs
* Add a module doc comment
* Drop channels and check for closed channels in the block commit task
* Close state channels and tasks on drop
* Remove some duplicate fields across StateService and ReadStateService
* Try tweaking the shutdown steps
* Update and clarify some comments
* Clarify another comment
* Don't try to cancel RocksDB background work on drop
* Fix up some comments
* Remove some duplicate code
* Remove redundant workarounds for shutdown issues
* Remove a redundant channel close in the block commit task
* Remove a mistaken `!force` shutdown condition
* Remove duplicate force-shutdown code and explain it better
* Improve RPC error logging
* Wait for chain tip updates in the RPC tests
* Wait 2 seconds for chain tip updates before skipping them
* Remove an unnecessary block_in_place()
* Fix some test error messages that were changed by earlier fixes
* Expand some comments, fix typos
Co-authored-by: Marek <mail@marek.onl>
* Actually drop children of failed blocks
* Explain why we drop descendants of failed blocks
* Clarify a comment
* Wait for chain tip updates in a failing test on macOS
* Clean duplicate finalized blocks when the non-finalized state activates
* Send an error when receiving a duplicate finalized block
* Update checkpoint block behaviour, document its consensus rule
* Wait for chain tip changes in inbound_block_height_lookahead_limit test
* Wait for the genesis block to commit in the fake peer set mempool tests
* Disable unreliable mempool verification check in the send transaction test
* Appease rustfmt
* Use clear_finalized_block_queue() everywhere that blocks are dropped
* Document how Finalized and NonFinalized clones are different
* sends non-finalized blocks to the block write task
* passes ZebraDb to commit_new_chain, commit_block, and no_duplicates_in_finalized_chain instead of FinalizedState
* Update zebra-state/src/service/write.rs
Co-authored-by: teor <teor@riseup.net>
* updates comments, renames send_process_queued, other minor cleanup
* update assert_block_can_be_validated comment
* removes `mem` field from StateService
* removes `disk` field from StateService and updates block_iter to use `ZebraDb` instead of the finalized state
* updates tests that use the disk to use read_service.db instead
* moves best_tip to a read fn and returns finalized & non-finalized states from setup instead of the state service
* changes `contextual_validity` to get the network from the finalized_state instead of another param
* swaps out StateService with FinalizedState and NonFinalizedState in tests
* adds NotReadyToBeCommitted error and returns it from validate_and_commit when a blocks parent hash is not in any chain
* removes NonFinalizedWriteCmd and calls, moves update_latest_channels above rsp_tx.send
* makes parent_errors_map an indexmap
* clears non-finalized block queue when the receiver is dropped and when the StateService is being dropped
* sends non-finalized blocks to the block write task
* passes ZebraDb to commit_new_chain, commit_block, and no_duplicates_in_finalized_chain instead of FinalizedState
* updates comments, renames send_process_queued, other minor cleanup
* Update zebra-state/src/service/write.rs
Co-authored-by: teor <teor@riseup.net>
* update assert_block_can_be_validated comment
* removes `mem` field from StateService
* removes `disk` field from StateService and updates block_iter to use `ZebraDb` instead of the finalized state
* updates tests that use the disk to use read_service.db instead
* moves best_tip to a read fn and returns finalized & non-finalized states from setup instead of the state service
* changes `contextual_validity` to get the network from the finalized_state instead of another param
* swaps out StateService with FinalizedState and NonFinalizedState in tests
* adds NotReadyToBeCommitted error and returns it from validate_and_commit when a blocks parent hash is not in any chain
* removes NonFinalizedWriteCmd and calls, moves update_latest_channels above rsp_tx.send
* makes parent_errors_map an indexmap
* clears non-finalized block queue when the receiver is dropped and when the StateService is being dropped
* removes duplicate field definitions on StateService that were a result of a bad merge
* update NotReadyToBeCommitted error message
* Appease rustfmt
* Fix doc links
* Rename a function to initial_contextual_validity()
* Do error tasks on Err, and success tasks on Ok
* Simplify parent_error_map truncation
* Rewrite best_tip() to use tip()
* Rename latest_mem() to latest_non_finalized_state()
```sh
fastmod latest_mem latest_non_finalized_state zebra*
cargo fmt --all
```
* Simplify latest_non_finalized_state() using a new WatchReceiver API
* Expand some error messages
* Send the result after updating the channels, and document why
* wait for chain_tip_update before cancelling download in mempool_cancel_mined
* adds `sent_non_finalized_block_hashes` field to StateService
* adds batched sent_hash insertions and checks sent hashes in queue_and_commit_non_finalized before adding a block to the queue
* check that the `curr_buf` in SentHashes is not empty before pushing it to the `sent_bufs`
* Apply suggestions from code review
Co-authored-by: teor <teor@riseup.net>
* Fix rustfmt
* Check for finalized block heights using zs_contains()
* adds known_utxos field to SentHashes
* updates comment on SentHashes.add method
* Apply suggestions from code review
Co-authored-by: teor <teor@riseup.net>
* return early when there's a duplicate hash in QueuedBlocks.queue instead of panicking
* Make finalized UTXOs near the final checkpoint available for full block verification
* Replace a checkpoint height literal with the actual config
* Update mainnet and testnet checkpoints - 7 October 2022
* Fix some state service init arguments
* Allow more lookahead in the downloader, but less lookahead in the syncer
* Add the latest config to the tests, and fix the latest config check
* Increase the number of finalized blocks checked for non-finalized block UTXO spends
* fix(log): reduce verbose logs for block commits (#5348)
* Remove some verbose block write channel logs
* Only warn about tracing endpoint if the address is actually set
* Use CloneError instead of formatting a non-cloneable error
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
* Increase block verify timeout
* Work around a known block timeout bug by using a shorter timeout
Co-authored-by: teor <teor@riseup.net>
Co-authored-by: Marek <mail@marek.onl>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2022-10-11 12:25:45 -07:00
|
|
|
pub use downloads::VERIFICATION_PIPELINE_SCALING_MULTIPLIER;
|
2021-10-07 03:46:37 -07:00
|
|
|
pub use gossip::{gossip_best_tip_block_hashes, BlockGossipError};
|
2022-06-27 19:51:41 -07:00
|
|
|
pub use progress::show_block_chain_progress;
|
2021-09-28 16:06:40 -07:00
|
|
|
pub use recent_sync_lengths::RecentSyncLengths;
|
2021-08-29 17:01:33 -07:00
|
|
|
pub use status::SyncStatus;
|
2020-09-09 14:45:05 -07:00
|
|
|
|
2020-09-01 18:20:32 -07:00
|
|
|
/// Controls the number of peers used for each ObtainTips and ExtendTips request.
const FANOUT: usize = 3;
|
2020-10-16 16:44:30 -07:00
|
|
|
|
2020-09-01 18:20:32 -07:00
|
|
|
/// Controls how many times we will retry each block download.
///
/// Failing block downloads is important because it defends against peers who
/// feed us bad hashes. But spurious failures of valid blocks cause the syncer to
/// restart from the previous checkpoint, potentially re-downloading blocks.
///
/// We also hedge requests, so we may retry up to twice this many times. Hedged
/// retries may be concurrent, inner retries are sequential.
const BLOCK_DOWNLOAD_RETRY_LIMIT: usize = 3;
|
2020-09-01 18:20:32 -07:00
|
|
|
|
2022-07-06 07:13:57 -07:00
|
|
|
/// A lower bound on the user-specified checkpoint verification concurrency limit.
///
/// Set to the maximum checkpoint interval, so the pipeline holds around a checkpoint's
/// worth of blocks.
///
/// ## Security
///
/// If a malicious node is chosen for an ObtainTips or ExtendTips request, it can
/// provide up to 500 malicious block hashes. These block hashes will be
/// distributed across all available peers. Assuming there are around 50 connected
/// peers, the malicious node will receive approximately 10 of those block requests.
///
/// Malicious deserialized blocks can take up a large amount of RAM, see
/// [`super::inbound::downloads::MAX_INBOUND_CONCURRENCY`] and #1880 for details.
/// So we want to keep the lookahead limit reasonably small.
///
/// Once these malicious blocks start failing validation, the syncer will cancel all
/// the pending download and verify tasks, drop all the blocks, and start a new
/// ObtainTips with a new set of peers.
pub const MIN_CHECKPOINT_CONCURRENCY_LIMIT: usize = zebra_consensus::MAX_CHECKPOINT_HEIGHT_GAP;
|
2021-11-18 17:55:38 -08:00
|
|
|
|
|
|
|
/// The default for the user-specified lookahead limit.
///
/// See [`MIN_CHECKPOINT_CONCURRENCY_LIMIT`] for details.
pub const DEFAULT_CHECKPOINT_CONCURRENCY_LIMIT: usize = MAX_TIPS_RESPONSE_HASH_COUNT * 2;
|
2022-07-06 07:13:57 -07:00
|
|
|
|
|
|
|
/// A lower bound on the user-specified concurrency limit.
///
/// If the concurrency limit is 0, Zebra can't download or verify any blocks.
pub const MIN_CONCURRENCY_LIMIT: usize = 1;
|
2020-09-01 18:20:32 -07:00
|
|
|
|
2021-12-17 08:31:51 -08:00
|
|
|
/// The expected maximum number of hashes in an ObtainTips or ExtendTips response.
///
/// This is used to allow block heights that are slightly beyond the lookahead limit,
/// but still limit the number of blocks in the pipeline between the downloader and
/// the state.
///
/// See [`MIN_CHECKPOINT_CONCURRENCY_LIMIT`] for details.
pub const MAX_TIPS_RESPONSE_HASH_COUNT: usize = 500;
|
|
|
|
|
2020-09-01 18:20:32 -07:00
|
|
|
/// Controls how long we wait for a tips response to return.
///
/// ## Correctness
///
/// If this timeout is removed (or set too high), the syncer will sometimes hang.
///
/// If this timeout is set too low, the syncer will sometimes get stuck in a
/// failure loop.
pub const TIPS_RESPONSE_TIMEOUT: Duration = Duration::from_secs(6);
|
2020-09-22 10:46:50 -07:00
|
|
|
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
/// Controls how long we wait for a block download request to complete.
///
/// This timeout makes sure that the syncer doesn't hang when:
/// - the lookahead queue is full, and
/// - we are waiting for a request that is stuck.
/// See [`BLOCK_VERIFY_TIMEOUT`] for details.
///
/// ## Correctness
///
/// If this timeout is removed (or set too high), the syncer will sometimes hang.
///
/// If this timeout is set too low, the syncer will sometimes get stuck in a
/// failure loop.
///
/// We set the timeout so that it requires under 1 Mbps bandwidth for a full 2 MB block.
pub(super) const BLOCK_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
|
2021-01-13 01:08:02 -08:00
|
|
|
|
|
|
|
/// Controls how long we wait for a block verify request to complete.
///
/// This timeout makes sure that the syncer doesn't hang when:
/// - the lookahead queue is full, and
/// - all pending verifications:
///   - are waiting on a missing download request,
///   - are waiting on a download or verify request that has failed, but we have
///     deliberately ignored the error,
///   - are for blocks a long way ahead of the current tip, or
///   - are for invalid blocks which will never verify, because they depend on
///     missing blocks or transactions.
/// These conditions can happen during normal operation - they are not bugs.
///
/// This timeout also mitigates or hides the following kinds of bugs:
/// - all pending verifications:
///   - are waiting on a download or verify request that has failed, but we have
///     accidentally dropped the error,
///   - are waiting on a download request that has hung inside Zebra,
///   - are on tokio threads that are waiting for blocked operations.
///
/// ## Correctness
///
/// If this timeout is removed (or set too high), the syncer will sometimes hang.
///
/// If this timeout is set too low, the syncer will sometimes get stuck in a
/// failure loop.
///
/// We've observed spurious 15 minute timeouts when a lot of blocks are being committed to
/// the state. But there are also some blocks that seem to hang entirely, and never return.
///
/// So we allow about half the spurious timeout, which might cause some re-downloads.
pub(super) const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(8 * 60);
|
change(state): Write non-finalized blocks to the state in a separate thread, to avoid network and RPC hangs (#5257)
* Add a new block commit task and channels, that don't do anything yet
* Add last_block_hash_sent to the state service, to avoid database accesses
* Update last_block_hash_sent regardless of commit errors
* Rename a field to StateService.max_queued_finalized_height
* Commit finalized blocks to the state in a separate task
* Check for panics in the block write task
* Wait for the block commit task in tests, and check for errors
* Always run a proptest that sleeps once
* Add extra debugging to state shutdowns
* Work around a RocksDB shutdown bug
* Close the finalized block channel when we're finished with it
* Only reset state queue once per error
* Update some TODOs
* Add a module doc comment
* Drop channels and check for closed channels in the block commit task
* Close state channels and tasks on drop
* Remove some duplicate fields across StateService and ReadStateService
* Try tweaking the shutdown steps
* Update and clarify some comments
* Clarify another comment
* Don't try to cancel RocksDB background work on drop
* Fix up some comments
* Remove some duplicate code
* Remove redundant workarounds for shutdown issues
* Remove a redundant channel close in the block commit task
* Remove a mistaken `!force` shutdown condition
* Remove duplicate force-shutdown code and explain it better
* Improve RPC error logging
* Wait for chain tip updates in the RPC tests
* Wait 2 seconds for chain tip updates before skipping them
* Remove an unnecessary block_in_place()
* Fix some test error messages that were changed by earlier fixes
* Expand some comments, fix typos
Co-authored-by: Marek <mail@marek.onl>
* Actually drop children of failed blocks
* Explain why we drop descendants of failed blocks
* Clarify a comment
* Wait for chain tip updates in a failing test on macOS
* Clean duplicate finalized blocks when the non-finalized state activates
* Send an error when receiving a duplicate finalized block
* Update checkpoint block behaviour, document its consensus rule
* Wait for chain tip changes in inbound_block_height_lookahead_limit test
* Wait for the genesis block to commit in the fake peer set mempool tests
* Disable unreliable mempool verification check in the send transaction test
* Appease rustfmt
* Use clear_finalized_block_queue() everywhere that blocks are dropped
* Document how Finalized and NonFinalized clones are different
* sends non-finalized blocks to the block write task
* passes ZebraDb to commit_new_chain, commit_block, and no_duplicates_in_finalized_chain instead of FinalizedState
* Update zebra-state/src/service/write.rs
Co-authored-by: teor <teor@riseup.net>
* updates comments, renames send_process_queued, other minor cleanup
* update assert_block_can_be_validated comment
* removes `mem` field from StateService
* removes `disk` field from StateService and updates block_iter to use `ZebraDb` instead of the finalized state
* updates tests that use the disk to use read_service.db instead
* moves best_tip to a read fn and returns finalized & non-finalized states from setup instead of the state service
* changes `contextual_validity` to get the network from the finalized_state instead of another param
* swaps out StateService with FinalizedState and NonFinalizedState in tests
* adds NotReadyToBeCommitted error and returns it from validate_and_commit when a blocks parent hash is not in any chain
* removes NonFinalizedWriteCmd and calls, moves update_latest_channels above rsp_tx.send
* makes parent_errors_map an indexmap
* clears non-finalized block queue when the receiver is dropped and when the StateService is being dropped
* sends non-finalized blocks to the block write task
* passes ZebraDb to commit_new_chain, commit_block, and no_duplicates_in_finalized_chain instead of FinalizedState
* updates comments, renames send_process_queued, other minor cleanup
* Update zebra-state/src/service/write.rs
Co-authored-by: teor <teor@riseup.net>
* update assert_block_can_be_validated comment
* removes `mem` field from StateService
* removes `disk` field from StateService and updates block_iter to use `ZebraDb` instead of the finalized state
* updates tests that use the disk to use read_service.db instead
* moves best_tip to a read fn and returns finalized & non-finalized states from setup instead of the state service
* changes `contextual_validity` to get the network from the finalized_state instead of another param
* swaps out StateService with FinalizedState and NonFinalizedState in tests
* adds NotReadyToBeCommitted error and returns it from validate_and_commit when a blocks parent hash is not in any chain
* removes NonFinalizedWriteCmd and calls, moves update_latest_channels above rsp_tx.send
* makes parent_errors_map an indexmap
* clears non-finalized block queue when the receiver is dropped and when the StateService is being dropped
* removes duplicate field definitions on StateService that were a result of a bad merge
* update NotReadyToBeCommitted error message
* Appease rustfmt
* Fix doc links
* Rename a function to initial_contextual_validity()
* Do error tasks on Err, and success tasks on Ok
* Simplify parent_error_map truncation
* Rewrite best_tip() to use tip()
* Rename latest_mem() to latest_non_finalized_state()
```sh
fastmod latest_mem latest_non_finalized_state zebra*
cargo fmt --all
```
* Simplify latest_non_finalized_state() using a new WatchReceiver API
* Expand some error messages
* Send the result after updating the channels, and document why
* wait for chain_tip_update before cancelling download in mempool_cancel_mined
* adds `sent_non_finalized_block_hashes` field to StateService
* adds batched sent_hash insertions and checks sent hashes in queue_and_commit_non_finalized before adding a block to the queue
* check that the `curr_buf` in SentHashes is not empty before pushing it to the `sent_bufs`
* Apply suggestions from code review
Co-authored-by: teor <teor@riseup.net>
* Fix rustfmt
* Check for finalized block heights using zs_contains()
* adds known_utxos field to SentHashes
* updates comment on SentHashes.add method
* Apply suggestions from code review
Co-authored-by: teor <teor@riseup.net>
* return early when there's a duplicate hash in QueuedBlocks.queue instead of panicking
* Make finalized UTXOs near the final checkpoint available for full block verification
* Replace a checkpoint height literal with the actual config
* Update mainnet and testnet checkpoints - 7 October 2022
* Fix some state service init arguments
* Allow more lookahead in the downloader, but less lookahead in the syncer
* Add the latest config to the tests, and fix the latest config check
* Increase the number of finalized blocks checked for non-finalized block UTXO spends
* fix(log): reduce verbose logs for block commits (#5348)
* Remove some verbose block write channel logs
* Only warn about tracing endpoint if the address is actually set
* Use CloneError instead of formatting a non-cloneable error
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
* Increase block verify timeout
* Work around a known block timeout bug by using a shorter timeout
Co-authored-by: teor <teor@riseup.net>
Co-authored-by: Marek <mail@marek.onl>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2022-10-11 12:25:45 -07:00
|
|
|
|
|
|
|
/// A shorter timeout used for the first few blocks after the final checkpoint.
///
/// This is a workaround for bug #5125, where the first fully validated blocks
/// after the final checkpoint fail with a timeout, due to a UTXO race condition.
const FINAL_CHECKPOINT_BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(120); // 2 minutes
|
change(state): Write non-finalized blocks to the state in a separate thread, to avoid network and RPC hangs (#5257)
* Add a new block commit task and channels, that don't do anything yet
* Add last_block_hash_sent to the state service, to avoid database accesses
* Update last_block_hash_sent regardless of commit errors
* Rename a field to StateService.max_queued_finalized_height
* Commit finalized blocks to the state in a separate task
* Check for panics in the block write task
* Wait for the block commit task in tests, and check for errors
* Always run a proptest that sleeps once
* Add extra debugging to state shutdowns
* Work around a RocksDB shutdown bug
* Close the finalized block channel when we're finished with it
* Only reset state queue once per error
* Update some TODOs
* Add a module doc comment
* Drop channels and check for closed channels in the block commit task
* Close state channels and tasks on drop
* Remove some duplicate fields across StateService and ReadStateService
* Try tweaking the shutdown steps
* Update and clarify some comments
* Clarify another comment
* Don't try to cancel RocksDB background work on drop
* Fix up some comments
* Remove some duplicate code
* Remove redundant workarounds for shutdown issues
* Remove a redundant channel close in the block commit task
* Remove a mistaken `!force` shutdown condition
* Remove duplicate force-shutdown code and explain it better
* Improve RPC error logging
* Wait for chain tip updates in the RPC tests
* Wait 2 seconds for chain tip updates before skipping them
* Remove an unnecessary block_in_place()
* Fix some test error messages that were changed by earlier fixes
* Expand some comments, fix typos
Co-authored-by: Marek <mail@marek.onl>
* Actually drop children of failed blocks
* Explain why we drop descendants of failed blocks
* Clarify a comment
* Wait for chain tip updates in a failing test on macOS
* Clean duplicate finalized blocks when the non-finalized state activates
* Send an error when receiving a duplicate finalized block
* Update checkpoint block behaviour, document its consensus rule
* Wait for chain tip changes in inbound_block_height_lookahead_limit test
* Wait for the genesis block to commit in the fake peer set mempool tests
* Disable unreliable mempool verification check in the send transaction test
* Appease rustfmt
* Use clear_finalized_block_queue() everywhere that blocks are dropped
* Document how Finalized and NonFinalized clones are different
* sends non-finalized blocks to the block write task
* passes ZebraDb to commit_new_chain, commit_block, and no_duplicates_in_finalized_chain instead of FinalizedState
* Update zebra-state/src/service/write.rs
Co-authored-by: teor <teor@riseup.net>
* updates comments, renames send_process_queued, other minor cleanup
* update assert_block_can_be_validated comment
* removes `mem` field from StateService
* removes `disk` field from StateService and updates block_iter to use `ZebraDb` instead of the finalized state
* updates tests that use the disk to use read_service.db instead
* moves best_tip to a read fn and returns finalized & non-finalized states from setup instead of the state service
* changes `contextual_validity` to get the network from the finalized_state instead of another param
* swaps out StateService with FinalizedState and NonFinalizedState in tests
* adds NotReadyToBeCommitted error and returns it from validate_and_commit when a blocks parent hash is not in any chain
* removes NonFinalizedWriteCmd and calls, moves update_latest_channels above rsp_tx.send
* makes parent_errors_map an indexmap
* clears non-finalized block queue when the receiver is dropped and when the StateService is being dropped
* sends non-finalized blocks to the block write task
* passes ZebraDb to commit_new_chain, commit_block, and no_duplicates_in_finalized_chain instead of FinalizedState
* updates comments, renames send_process_queued, other minor cleanup
* Update zebra-state/src/service/write.rs
Co-authored-by: teor <teor@riseup.net>
* update assert_block_can_be_validated comment
* removes `mem` field from StateService
* removes `disk` field from StateService and updates block_iter to use `ZebraDb` instead of the finalized state
* updates tests that use the disk to use read_service.db instead
* moves best_tip to a read fn and returns finalized & non-finalized states from setup instead of the state service
* changes `contextual_validity` to get the network from the finalized_state instead of another param
* swaps out StateService with FinalizedState and NonFinalizedState in tests
* adds NotReadyToBeCommitted error and returns it from validate_and_commit when a blocks parent hash is not in any chain
* removes NonFinalizedWriteCmd and calls, moves update_latest_channels above rsp_tx.send
* makes parent_errors_map an indexmap
* clears non-finalized block queue when the receiver is dropped and when the StateService is being dropped
* removes duplicate field definitions on StateService that were a result of a bad merge
* update NotReadyToBeCommitted error message
* Appease rustfmt
* Fix doc links
* Rename a function to initial_contextual_validity()
* Do error tasks on Err, and success tasks on Ok
* Simplify parent_error_map truncation
* Rewrite best_tip() to use tip()
* Rename latest_mem() to latest_non_finalized_state()
```sh
fastmod latest_mem latest_non_finalized_state zebra*
cargo fmt --all
```
* Simplify latest_non_finalized_state() using a new WatchReceiver API
* Expand some error messages
* Send the result after updating the channels, and document why
* wait for chain_tip_update before cancelling download in mempool_cancel_mined
* adds `sent_non_finalized_block_hashes` field to StateService
* adds batched sent_hash insertions and checks sent hashes in queue_and_commit_non_finalized before adding a block to the queue
* check that the `curr_buf` in SentHashes is not empty before pushing it to the `sent_bufs`
* Apply suggestions from code review
Co-authored-by: teor <teor@riseup.net>
* Fix rustfmt
* Check for finalized block heights using zs_contains()
* adds known_utxos field to SentHashes
* updates comment on SentHashes.add method
* Apply suggestions from code review
Co-authored-by: teor <teor@riseup.net>
* return early when there's a duplicate hash in QueuedBlocks.queue instead of panicking
* Make finalized UTXOs near the final checkpoint available for full block verification
* Replace a checkpoint height literal with the actual config
* Update mainnet and testnet checkpoints - 7 October 2022
* Fix some state service init arguments
* Allow more lookahead in the downloader, but less lookahead in the syncer
* Add the latest config to the tests, and fix the latest config check
* Increase the number of finalized blocks checked for non-finalized block UTXO spends
* fix(log): reduce verbose logs for block commits (#5348)
* Remove some verbose block write channel logs
* Only warn about tracing endpoint if the address is actually set
* Use CloneError instead of formatting a non-cloneable error
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
* Increase block verify timeout
* Work around a known block timeout bug by using a shorter timeout
Co-authored-by: teor <teor@riseup.net>
Co-authored-by: Marek <mail@marek.onl>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
2022-10-11 12:25:45 -07:00
|
|
|
|
|
|
|
/// How many blocks past the final checkpoint are given the shorter
/// [`FINAL_CHECKPOINT_BLOCK_VERIFY_TIMEOUT`].
///
/// We've only seen this error on the first few blocks after the final checkpoint.
//
// NOTE(review): this is `i32` rather than an unsigned type — presumably because it is
// compared against signed block height differences; confirm against the call site.
const FINAL_CHECKPOINT_BLOCK_VERIFY_TIMEOUT_LIMIT: i32 = 100;
|
2020-09-01 18:20:32 -07:00
|
|
|
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
/// Controls how long we wait to restart syncing after finishing a sync run.
///
/// This delay should be long enough to:
/// - allow zcashd peers to process pending requests. If the node only has a
///   few peers, we want to clear as much peer state as possible. In
///   particular, zcashd sends "next block range" hints, based on zcashd's
///   internal model of our sync progress. But we want to discard these hints,
///   so they don't get confused with ObtainTips and ExtendTips responses, and
/// - allow in-progress downloads to time out.
///
/// This delay is particularly important on instances with slow or unreliable
/// networks, and on testnet, which has a small number of slow peers.
///
/// Using a prime number makes sure that syncer fanouts don't synchronise with other crawls.
///
/// ## Correctness
///
/// If this delay is removed (or set too low), the syncer will
/// sometimes get stuck in a failure loop, due to leftover downloads from
/// previous sync runs.
const SYNC_RESTART_DELAY: Duration = Duration::from_secs(67); // prime, see doc above
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
|
2021-06-09 16:39:51 -07:00
|
|
|
/// Controls how long we wait to retry a failed attempt to download
/// and verify the genesis block.
///
/// This timeout gives the crawler time to find better peers.
///
/// ## Security
///
/// If this timeout is removed (or set too low), Zebra will immediately retry
/// to download and verify the genesis block from its peers. This can cause
/// a denial of service on those peers.
const GENESIS_TIMEOUT_RETRY: Duration = Duration::from_secs(5);
|
|
|
|
|
2022-10-24 16:39:00 -07:00
|
|
|
/// Sync configuration section.
|
|
|
|
#[derive(Clone, Debug, Deserialize, Serialize)]
|
|
|
|
#[serde(deny_unknown_fields, default)]
|
|
|
|
pub struct Config {
|
|
|
|
/// The number of parallel block download requests.
|
|
|
|
///
|
|
|
|
/// This is set to a low value by default, to avoid task and
|
|
|
|
/// network contention. Increasing this value may improve
|
|
|
|
/// performance on machines with a fast network connection.
|
|
|
|
#[serde(alias = "max_concurrent_block_requests")]
|
|
|
|
pub download_concurrency_limit: usize,
|
|
|
|
|
|
|
|
/// The number of blocks submitted in parallel to the checkpoint verifier.
|
|
|
|
///
|
|
|
|
/// Increasing this limit increases the buffer size, so it reduces
|
|
|
|
/// the impact of an individual block request failing. However, it
|
|
|
|
/// also increases memory and CPU usage if block validation stalls,
|
|
|
|
/// or there are some large blocks in the pipeline.
|
|
|
|
///
|
|
|
|
/// The block size limit is 2MB, so in theory, this could represent multiple
|
|
|
|
/// gigabytes of data, if we downloaded arbitrary blocks. However,
|
|
|
|
/// because we randomly load balance outbound requests, and separate
|
|
|
|
/// block download from obtaining block hashes, an adversary would
|
|
|
|
/// have to control a significant fraction of our peers to lead us
|
|
|
|
/// astray.
|
|
|
|
///
|
|
|
|
/// For reliable checkpoint syncing, Zebra enforces a
|
|
|
|
/// [`MIN_CHECKPOINT_CONCURRENCY_LIMIT`](MIN_CHECKPOINT_CONCURRENCY_LIMIT).
|
|
|
|
///
|
|
|
|
/// This is set to a high value by default, to avoid verification pipeline stalls.
|
|
|
|
/// Decreasing this value reduces RAM usage.
|
|
|
|
#[serde(alias = "lookahead_limit")]
|
|
|
|
pub checkpoint_verify_concurrency_limit: usize,
|
|
|
|
|
|
|
|
/// The number of blocks submitted in parallel to the full verifier.
|
|
|
|
///
|
|
|
|
/// This is set to a low value by default, to avoid verification timeouts on large blocks.
|
|
|
|
/// Increasing this value may improve performance on machines with many cores.
|
|
|
|
pub full_verify_concurrency_limit: usize,
|
|
|
|
|
|
|
|
/// The number of threads used to verify signatures, proofs, and other CPU-intensive code.
|
|
|
|
///
|
|
|
|
/// Set to `0` by default, which uses one thread per available CPU core.
|
|
|
|
/// For details, see [the `rayon` documentation](https://docs.rs/rayon/latest/rayon/struct.ThreadPoolBuilder.html#method.num_threads).
|
|
|
|
pub parallel_cpu_threads: usize,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Default for Config {
|
|
|
|
fn default() -> Self {
|
|
|
|
Self {
|
|
|
|
// 2/3 of the default outbound peer limit.
|
|
|
|
download_concurrency_limit: 50,
|
|
|
|
|
|
|
|
// A few max-length checkpoints.
|
|
|
|
checkpoint_verify_concurrency_limit: DEFAULT_CHECKPOINT_CONCURRENCY_LIMIT,
|
|
|
|
|
|
|
|
// This default is deliberately very low, so Zebra can verify a few large blocks in under 60 seconds,
|
|
|
|
// even on machines with only a few cores.
|
|
|
|
//
|
|
|
|
// This lets users see the committed block height changing in every progress log,
|
|
|
|
// and avoids hangs due to out-of-order verifications flooding the CPUs.
|
|
|
|
//
|
|
|
|
// TODO:
|
|
|
|
// - limit full verification concurrency based on block transaction counts?
|
|
|
|
// - move more disk work to blocking tokio threads,
|
|
|
|
// and CPU work to the rayon thread pool inside blocking tokio threads
|
|
|
|
full_verify_concurrency_limit: 20,
|
|
|
|
|
|
|
|
// Use one thread per CPU.
|
|
|
|
//
|
|
|
|
// If this causes tokio executor starvation, move CPU-intensive tasks to rayon threads,
|
|
|
|
// or reserve a few cores for tokio threads, based on `num_cpus()`.
|
|
|
|
parallel_cpu_threads: 0,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
/// Helps work around defects in the bitcoin protocol by checking whether
|
|
|
|
/// the returned hashes actually extend a chain tip.
|
2020-09-01 18:20:32 -07:00
|
|
|
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
struct CheckedTip {
|
2020-08-15 23:20:01 -07:00
|
|
|
tip: block::Hash,
|
|
|
|
expected_next: block::Hash,
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
}
|
2020-06-22 19:24:53 -07:00
|
|
|
|
2022-01-11 09:11:35 -08:00
|
|
|
pub struct ChainSync<ZN, ZS, ZV, ZSTip>
|
2020-07-01 13:35:01 -07:00
|
|
|
where
|
2022-01-11 09:11:35 -08:00
|
|
|
ZN: Service<zn::Request, Response = zn::Response, Error = BoxError>
|
|
|
|
+ Send
|
|
|
|
+ Sync
|
|
|
|
+ Clone
|
|
|
|
+ 'static,
|
2020-07-21 13:50:38 -07:00
|
|
|
ZN::Future: Send,
|
2022-01-11 09:11:35 -08:00
|
|
|
ZS: Service<zs::Request, Response = zs::Response, Error = BoxError>
|
|
|
|
+ Send
|
|
|
|
+ Sync
|
|
|
|
+ Clone
|
|
|
|
+ 'static,
|
2020-07-21 13:50:38 -07:00
|
|
|
ZS::Future: Send,
|
2023-01-11 15:39:51 -08:00
|
|
|
ZV: Service<zebra_consensus::Request, Response = block::Hash, Error = BoxError>
|
2022-01-11 09:11:35 -08:00
|
|
|
+ Send
|
|
|
|
+ Sync
|
|
|
|
+ Clone
|
|
|
|
+ 'static,
|
2020-07-21 13:50:38 -07:00
|
|
|
ZV::Future: Send,
|
2022-01-11 09:11:35 -08:00
|
|
|
ZSTip: ChainTip + Clone + Send + 'static,
|
2020-07-01 13:35:01 -07:00
|
|
|
{
|
2021-01-13 01:14:11 -08:00
|
|
|
// Configuration
|
2022-07-06 07:13:57 -07:00
|
|
|
//
|
2021-01-13 01:14:11 -08:00
|
|
|
/// The genesis hash for the configured network
|
2020-08-15 23:20:01 -07:00
|
|
|
genesis_hash: block::Hash,
|
2021-01-13 01:14:11 -08:00
|
|
|
|
2022-07-06 07:13:57 -07:00
|
|
|
/// The largest block height for the checkpoint verifier, based on the current config.
|
|
|
|
max_checkpoint_height: Height,
|
|
|
|
|
|
|
|
/// The configured checkpoint verification concurrency limit, after applying the minimum limit.
|
|
|
|
checkpoint_verify_concurrency_limit: usize,
|
|
|
|
|
|
|
|
/// The configured full verification concurrency limit, after applying the minimum limit.
|
|
|
|
full_verify_concurrency_limit: usize,
|
2021-01-13 01:14:11 -08:00
|
|
|
|
|
|
|
// Services
|
2022-07-06 07:13:57 -07:00
|
|
|
//
|
2021-01-13 01:14:11 -08:00
|
|
|
/// A network service which is used to perform ObtainTips and ExtendTips
|
|
|
|
/// requests.
|
|
|
|
///
|
|
|
|
/// Has no retry logic, because failover is handled using fanout.
|
|
|
|
tip_network: Timeout<ZN>,
|
|
|
|
|
|
|
|
/// A service which downloads and verifies blocks, using the provided
|
|
|
|
/// network and verifier services.
|
2020-10-23 18:56:54 -07:00
|
|
|
downloads: Pin<
|
|
|
|
Box<
|
2021-01-13 01:08:02 -08:00
|
|
|
Downloads<
|
|
|
|
Hedge<ConcurrencyLimit<Retry<zn::RetryLimit, Timeout<ZN>>>, AlwaysHedge>,
|
|
|
|
Timeout<ZV>,
|
2022-01-11 09:11:35 -08:00
|
|
|
ZSTip,
|
2021-01-13 01:08:02 -08:00
|
|
|
>,
|
2020-10-23 18:56:54 -07:00
|
|
|
>,
|
|
|
|
>,
|
2021-01-13 01:14:11 -08:00
|
|
|
|
|
|
|
/// The cached block chain state.
|
|
|
|
state: ZS,
|
|
|
|
|
2022-01-28 14:12:19 -08:00
|
|
|
/// Allows efficient access to the best tip of the blockchain.
|
|
|
|
latest_chain_tip: ZSTip,
|
|
|
|
|
2021-01-13 01:14:11 -08:00
|
|
|
// Internal sync state
|
2022-07-06 07:13:57 -07:00
|
|
|
//
|
2021-01-13 01:14:11 -08:00
|
|
|
/// The tips that the syncer is currently following.
|
|
|
|
prospective_tips: HashSet<CheckedTip>,
|
2021-08-19 16:16:16 -07:00
|
|
|
|
|
|
|
/// The lengths of recent sync responses.
|
|
|
|
recent_syncs: RecentSyncLengths,
|
2022-11-08 20:42:04 -08:00
|
|
|
|
|
|
|
/// Receiver that is `true` when the downloader is past the lookahead limit.
|
|
|
|
/// This is based on the downloaded block height and the state tip height.
|
|
|
|
past_lookahead_limit_receiver: zs::WatchReceiver<bool>,
|
2020-07-01 13:35:01 -07:00
|
|
|
}
|
|
|
|
|
2020-09-18 16:13:57 -07:00
|
|
|
/// Polls the network to determine whether further blocks are available and
|
|
|
|
/// downloads them.
|
|
|
|
///
|
|
|
|
/// This component is used for initial block sync, but the `Inbound` service is
|
|
|
|
/// responsible for participating in the gossip protocols used for block
|
|
|
|
/// diffusion.
|
2022-01-11 09:11:35 -08:00
|
|
|
impl<ZN, ZS, ZV, ZSTip> ChainSync<ZN, ZS, ZV, ZSTip>
|
2020-06-22 19:24:53 -07:00
|
|
|
where
|
2022-01-11 09:11:35 -08:00
|
|
|
ZN: Service<zn::Request, Response = zn::Response, Error = BoxError>
|
|
|
|
+ Send
|
|
|
|
+ Sync
|
|
|
|
+ Clone
|
|
|
|
+ 'static,
|
2020-06-22 19:24:53 -07:00
|
|
|
ZN::Future: Send,
|
2022-01-11 09:11:35 -08:00
|
|
|
ZS: Service<zs::Request, Response = zs::Response, Error = BoxError>
|
|
|
|
+ Send
|
|
|
|
+ Sync
|
|
|
|
+ Clone
|
|
|
|
+ 'static,
|
2020-06-22 19:24:53 -07:00
|
|
|
ZS::Future: Send,
|
2023-01-11 15:39:51 -08:00
|
|
|
ZV: Service<zebra_consensus::Request, Response = block::Hash, Error = BoxError>
|
2022-01-11 09:11:35 -08:00
|
|
|
+ Send
|
|
|
|
+ Sync
|
|
|
|
+ Clone
|
|
|
|
+ 'static,
|
2020-06-30 09:42:09 -07:00
|
|
|
ZV::Future: Send,
|
2022-01-11 09:11:35 -08:00
|
|
|
ZSTip: ChainTip + Clone + Send + 'static,
|
2020-06-22 19:24:53 -07:00
|
|
|
{
|
2020-07-21 19:22:43 -07:00
|
|
|
/// Returns a new syncer instance, using:
|
|
|
|
/// - chain: the zebra-chain `Network` to download (Mainnet or Testnet)
|
|
|
|
/// - peers: the zebra-network peers to contact for downloads
|
|
|
|
/// - verifier: the zebra-consensus verifier that checks the chain
|
2021-12-17 08:31:51 -08:00
|
|
|
/// - state: the zebra-state that stores the chain
|
|
|
|
/// - latest_chain_tip: the latest chain tip from `state`
|
2021-08-19 16:16:16 -07:00
|
|
|
///
|
2021-08-29 17:01:33 -07:00
|
|
|
/// Also returns a [`SyncStatus`] to check if the syncer has likely reached the chain tip.
|
2021-12-17 08:31:51 -08:00
|
|
|
pub fn new(
|
|
|
|
config: &ZebradConfig,
|
2022-07-06 07:13:57 -07:00
|
|
|
max_checkpoint_height: Height,
|
2021-12-17 08:31:51 -08:00
|
|
|
peers: ZN,
|
|
|
|
verifier: ZV,
|
|
|
|
state: ZS,
|
2022-01-11 09:11:35 -08:00
|
|
|
latest_chain_tip: ZSTip,
|
2021-12-17 08:31:51 -08:00
|
|
|
) -> (Self, SyncStatus) {
|
2022-07-06 07:13:57 -07:00
|
|
|
let mut download_concurrency_limit = config.sync.download_concurrency_limit;
|
|
|
|
let mut checkpoint_verify_concurrency_limit =
|
|
|
|
config.sync.checkpoint_verify_concurrency_limit;
|
|
|
|
let mut full_verify_concurrency_limit = config.sync.full_verify_concurrency_limit;
|
|
|
|
|
|
|
|
if download_concurrency_limit < MIN_CONCURRENCY_LIMIT {
|
|
|
|
warn!(
|
|
|
|
"configured download concurrency limit {} too low, increasing to {}",
|
|
|
|
config.sync.download_concurrency_limit, MIN_CONCURRENCY_LIMIT,
|
|
|
|
);
|
|
|
|
|
|
|
|
download_concurrency_limit = MIN_CONCURRENCY_LIMIT;
|
|
|
|
}
|
|
|
|
|
|
|
|
if checkpoint_verify_concurrency_limit < MIN_CHECKPOINT_CONCURRENCY_LIMIT {
|
|
|
|
warn!(
|
|
|
|
"configured checkpoint verify concurrency limit {} too low, increasing to {}",
|
|
|
|
config.sync.checkpoint_verify_concurrency_limit, MIN_CHECKPOINT_CONCURRENCY_LIMIT,
|
|
|
|
);
|
|
|
|
|
|
|
|
checkpoint_verify_concurrency_limit = MIN_CHECKPOINT_CONCURRENCY_LIMIT;
|
|
|
|
}
|
|
|
|
|
|
|
|
if full_verify_concurrency_limit < MIN_CONCURRENCY_LIMIT {
|
|
|
|
warn!(
|
|
|
|
"configured full verify concurrency limit {} too low, increasing to {}",
|
|
|
|
config.sync.full_verify_concurrency_limit, MIN_CONCURRENCY_LIMIT,
|
|
|
|
);
|
|
|
|
|
|
|
|
full_verify_concurrency_limit = MIN_CONCURRENCY_LIMIT;
|
|
|
|
}
|
|
|
|
|
2020-09-01 18:20:32 -07:00
|
|
|
let tip_network = Timeout::new(peers.clone(), TIPS_RESPONSE_TIMEOUT);
|
2022-11-08 20:42:04 -08:00
|
|
|
|
2020-09-22 10:46:50 -07:00
|
|
|
// The Hedge middleware is the outermost layer, hedging requests
|
|
|
|
// between two retry-wrapped networks. The innermost timeout
|
|
|
|
// layer is relatively unimportant, because slow requests will
|
|
|
|
// probably be pre-emptively hedged.
|
|
|
|
//
|
2020-10-24 17:15:42 -07:00
|
|
|
// The Hedge goes outside the Retry, because the Retry layer
|
|
|
|
// abstracts away spurious failures from individual peers
|
|
|
|
// making a less-fallible network service, and the Hedge layer
|
|
|
|
// tries to reduce latency of that less-fallible service.
|
2020-09-22 10:46:50 -07:00
|
|
|
let block_network = Hedge::new(
|
2020-09-09 15:33:25 -07:00
|
|
|
ServiceBuilder::new()
|
2022-07-06 07:13:57 -07:00
|
|
|
.concurrency_limit(download_concurrency_limit)
|
2020-09-09 15:33:25 -07:00
|
|
|
.retry(zn::RetryLimit::new(BLOCK_DOWNLOAD_RETRY_LIMIT))
|
|
|
|
.timeout(BLOCK_DOWNLOAD_TIMEOUT)
|
|
|
|
.service(peers),
|
2020-09-22 10:46:50 -07:00
|
|
|
AlwaysHedge,
|
2020-11-12 11:44:14 -08:00
|
|
|
20,
|
2020-09-22 10:46:50 -07:00
|
|
|
0.95,
|
2021-01-13 01:10:48 -08:00
|
|
|
2 * SYNC_RESTART_DELAY,
|
2020-09-09 15:33:25 -07:00
|
|
|
);
|
2021-08-19 16:16:16 -07:00
|
|
|
|
2021-01-13 01:08:02 -08:00
|
|
|
// We apply a timeout to the verifier to avoid hangs due to missing earlier blocks.
|
|
|
|
let verifier = Timeout::new(verifier, BLOCK_VERIFY_TIMEOUT);
|
2021-08-19 16:16:16 -07:00
|
|
|
|
2021-08-29 17:01:33 -07:00
|
|
|
let (sync_status, recent_syncs) = SyncStatus::new();
|
2021-08-19 16:16:16 -07:00
|
|
|
|
2022-11-08 20:42:04 -08:00
|
|
|
let (past_lookahead_limit_sender, past_lookahead_limit_receiver) = watch::channel(false);
|
|
|
|
let past_lookahead_limit_receiver = zs::WatchReceiver::new(past_lookahead_limit_receiver);
|
|
|
|
|
|
|
|
let downloads = Box::pin(Downloads::new(
|
|
|
|
block_network,
|
|
|
|
verifier,
|
|
|
|
latest_chain_tip.clone(),
|
|
|
|
past_lookahead_limit_sender,
|
|
|
|
max(
|
|
|
|
checkpoint_verify_concurrency_limit,
|
|
|
|
full_verify_concurrency_limit,
|
|
|
|
),
|
|
|
|
max_checkpoint_height,
|
|
|
|
));
|
|
|
|
|
2021-08-19 16:16:16 -07:00
|
|
|
let new_syncer = Self {
|
2021-01-13 01:14:11 -08:00
|
|
|
genesis_hash: genesis_hash(config.network.network),
|
2022-07-06 07:13:57 -07:00
|
|
|
max_checkpoint_height,
|
|
|
|
checkpoint_verify_concurrency_limit,
|
|
|
|
full_verify_concurrency_limit,
|
2020-09-01 18:20:32 -07:00
|
|
|
tip_network,
|
2022-11-08 20:42:04 -08:00
|
|
|
downloads,
|
2021-01-13 01:14:11 -08:00
|
|
|
state,
|
2022-01-28 14:12:19 -08:00
|
|
|
latest_chain_tip,
|
2020-07-21 13:50:38 -07:00
|
|
|
prospective_tips: HashSet::new(),
|
2021-08-19 16:16:16 -07:00
|
|
|
recent_syncs,
|
2022-11-08 20:42:04 -08:00
|
|
|
past_lookahead_limit_receiver,
|
2021-08-19 16:16:16 -07:00
|
|
|
};
|
|
|
|
|
2021-08-29 17:01:33 -07:00
|
|
|
(new_syncer, sync_status)
|
2020-07-21 13:50:38 -07:00
|
|
|
}
|
|
|
|
|
2022-03-17 17:31:12 -07:00
|
|
|
/// Runs the syncer to synchronize the chain and keep it synchronized.
|
2020-07-08 13:33:39 -07:00
|
|
|
#[instrument(skip(self))]
|
2020-11-12 20:01:16 -08:00
|
|
|
pub async fn sync(mut self) -> Result<(), Report> {
|
2020-07-23 10:56:52 -07:00
|
|
|
// We can't download the genesis block using our normal algorithm,
|
|
|
|
// due to protocol limitations
|
|
|
|
self.request_genesis().await?;
|
|
|
|
|
2022-03-17 17:31:12 -07:00
|
|
|
loop {
|
|
|
|
if self.try_to_sync().await.is_err() {
|
2020-10-21 19:30:22 -07:00
|
|
|
self.downloads.cancel_all();
|
|
|
|
}
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
|
2022-03-17 17:31:12 -07:00
|
|
|
self.update_metrics();
|
|
|
|
|
2022-01-28 14:12:19 -08:00
|
|
|
info!(
|
2022-03-17 17:31:12 -07:00
|
|
|
timeout = ?SYNC_RESTART_DELAY,
|
2022-01-28 14:12:19 -08:00
|
|
|
state_tip = ?self.latest_chain_tip.best_tip_height(),
|
2022-03-17 17:31:12 -07:00
|
|
|
"waiting to restart sync"
|
2022-01-28 14:12:19 -08:00
|
|
|
);
|
2022-03-17 17:31:12 -07:00
|
|
|
sleep(SYNC_RESTART_DELAY).await;
|
|
|
|
}
|
|
|
|
}
|
2020-06-22 19:24:53 -07:00
|
|
|
|
2022-03-17 17:31:12 -07:00
|
|
|
/// Tries to synchronize the chain as far as it can.
|
|
|
|
///
|
|
|
|
/// Obtains some prospective tips and iteratively tries to extend them and download the missing
|
|
|
|
/// blocks.
|
|
|
|
///
|
|
|
|
/// Returns `Ok` if it was able to synchronize as much of the chain as it could, and then ran
|
|
|
|
/// out of prospective tips. This happens when synchronization finishes or if Zebra ended up
|
|
|
|
/// following a fork. Either way, Zebra should attempt to obtain some more tips.
|
|
|
|
///
|
|
|
|
/// Returns `Err` if there was an unrecoverable error and restarting the synchronization is
|
|
|
|
/// necessary.
|
|
|
|
#[instrument(skip(self))]
|
2022-05-04 15:04:34 -07:00
|
|
|
async fn try_to_sync(&mut self) -> Result<(), Report> {
|
2022-03-17 17:31:12 -07:00
|
|
|
self.prospective_tips = HashSet::new();
|
2020-07-21 13:50:38 -07:00
|
|
|
|
2022-03-17 17:31:12 -07:00
|
|
|
info!(
|
|
|
|
state_tip = ?self.latest_chain_tip.best_tip_height(),
|
|
|
|
"starting sync, obtaining new tips"
|
|
|
|
);
|
2022-07-06 07:13:57 -07:00
|
|
|
let mut extra_hashes = self.obtain_tips().await.map_err(|e| {
|
2022-03-25 19:28:38 -07:00
|
|
|
info!("temporary error obtaining tips: {:#}", e);
|
2022-07-06 07:13:57 -07:00
|
|
|
e
|
|
|
|
})?;
|
2022-03-17 17:31:12 -07:00
|
|
|
self.update_metrics();
|
2020-11-12 20:35:27 -08:00
|
|
|
|
2022-07-06 07:13:57 -07:00
|
|
|
while !self.prospective_tips.is_empty() || !extra_hashes.is_empty() {
|
2022-03-17 17:31:12 -07:00
|
|
|
// Check whether any block tasks are currently ready:
|
|
|
|
while let Poll::Ready(Some(rsp)) = futures::poll!(self.downloads.next()) {
|
2022-07-06 07:13:57 -07:00
|
|
|
self.handle_block_response(rsp)?;
|
2022-03-17 17:31:12 -07:00
|
|
|
}
|
|
|
|
self.update_metrics();
|
2020-09-01 18:20:32 -07:00
|
|
|
|
2022-11-08 20:42:04 -08:00
|
|
|
// Pause new downloads while the syncer or downloader are past their lookahead limits.
|
|
|
|
//
|
|
|
|
// To avoid a deadlock or long waits for blocks to expire, we ignore the download
|
|
|
|
// lookahead limit when there are only a small number of blocks waiting.
|
|
|
|
while self.downloads.in_flight() >= self.lookahead_limit(extra_hashes.len())
|
|
|
|
|| (self.downloads.in_flight() >= self.lookahead_limit(extra_hashes.len()) / 2
|
|
|
|
&& self.past_lookahead_limit_receiver.cloned_watch_data())
|
|
|
|
{
|
2022-03-17 17:31:12 -07:00
|
|
|
trace!(
|
2020-09-01 18:20:32 -07:00
|
|
|
tips.len = self.prospective_tips.len(),
|
2020-09-09 15:33:25 -07:00
|
|
|
in_flight = self.downloads.in_flight(),
|
2022-07-06 07:13:57 -07:00
|
|
|
extra_hashes = extra_hashes.len(),
|
|
|
|
lookahead_limit = self.lookahead_limit(extra_hashes.len()),
|
2022-01-28 14:12:19 -08:00
|
|
|
state_tip = ?self.latest_chain_tip.best_tip_height(),
|
2022-03-17 17:31:12 -07:00
|
|
|
"waiting for pending blocks",
|
2020-09-01 18:20:32 -07:00
|
|
|
);
|
|
|
|
|
2022-03-17 17:31:12 -07:00
|
|
|
let response = self.downloads.next().await.expect("downloads is nonempty");
|
|
|
|
|
2022-07-06 07:13:57 -07:00
|
|
|
self.handle_block_response(response)?;
|
2020-08-07 01:04:33 -07:00
|
|
|
self.update_metrics();
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|
|
|
|
|
2022-07-06 07:13:57 -07:00
|
|
|
// Once we're below the lookahead limit, we can request more blocks or hashes.
|
|
|
|
if !extra_hashes.is_empty() {
|
|
|
|
debug!(
|
|
|
|
tips.len = self.prospective_tips.len(),
|
|
|
|
in_flight = self.downloads.in_flight(),
|
|
|
|
extra_hashes = extra_hashes.len(),
|
|
|
|
lookahead_limit = self.lookahead_limit(extra_hashes.len()),
|
|
|
|
state_tip = ?self.latest_chain_tip.best_tip_height(),
|
|
|
|
"requesting more blocks",
|
|
|
|
);
|
2022-03-17 17:31:12 -07:00
|
|
|
|
2022-07-06 07:13:57 -07:00
|
|
|
let response = self.request_blocks(extra_hashes).await;
|
|
|
|
extra_hashes = Self::handle_hash_response(response)?;
|
|
|
|
} else {
|
|
|
|
info!(
|
|
|
|
tips.len = self.prospective_tips.len(),
|
|
|
|
in_flight = self.downloads.in_flight(),
|
|
|
|
extra_hashes = extra_hashes.len(),
|
|
|
|
lookahead_limit = self.lookahead_limit(extra_hashes.len()),
|
|
|
|
state_tip = ?self.latest_chain_tip.best_tip_height(),
|
|
|
|
"extending tips",
|
|
|
|
);
|
|
|
|
|
|
|
|
extra_hashes = self.extend_tips().await.map_err(|e| {
|
|
|
|
info!("temporary error extending tips: {:#}", e);
|
|
|
|
e
|
|
|
|
})?;
|
2022-03-17 17:31:12 -07:00
|
|
|
}
|
|
|
|
self.update_metrics();
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|
2022-03-17 17:31:12 -07:00
|
|
|
|
|
|
|
info!("exhausted prospective tip set");
|
|
|
|
|
|
|
|
Ok(())
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Given a block_locator list fan out request for subsequent hashes to
|
|
|
|
/// multiple peers
|
2020-07-08 13:33:39 -07:00
|
|
|
#[instrument(skip(self))]
|
2022-07-06 07:13:57 -07:00
|
|
|
async fn obtain_tips(&mut self) -> Result<IndexSet<block::Hash>, Report> {
|
2020-07-22 18:01:31 -07:00
|
|
|
let block_locator = self
|
|
|
|
.state
|
2021-11-02 11:46:57 -07:00
|
|
|
.ready()
|
2020-07-22 18:01:31 -07:00
|
|
|
.await
|
|
|
|
.map_err(|e| eyre!(e))?
|
2020-09-09 21:19:15 -07:00
|
|
|
.call(zebra_state::Request::BlockLocator)
|
2020-07-22 18:01:31 -07:00
|
|
|
.await
|
|
|
|
.map(|response| match response {
|
2020-09-09 21:19:15 -07:00
|
|
|
zebra_state::Response::BlockLocator(block_locator) => block_locator,
|
2020-07-22 18:01:31 -07:00
|
|
|
_ => unreachable!(
|
|
|
|
"GetBlockLocator request can only result in Response::BlockLocator"
|
|
|
|
),
|
|
|
|
})
|
|
|
|
.map_err(|e| eyre!(e))?;
|
|
|
|
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(
|
|
|
|
tip = ?block_locator.first().expect("we have at least one block locator object"),
|
|
|
|
?block_locator,
|
|
|
|
"got block locator and trying to obtain new chain tips"
|
|
|
|
);
|
2020-06-22 19:24:53 -07:00
|
|
|
|
2020-07-21 13:50:38 -07:00
|
|
|
let mut requests = FuturesUnordered::new();
|
2021-12-19 15:02:31 -08:00
|
|
|
for attempt in 0..FANOUT {
|
|
|
|
if attempt > 0 {
|
|
|
|
// Let other tasks run, so we're more likely to choose a different peer.
|
|
|
|
//
|
|
|
|
// TODO: move fanouts into the PeerSet, so we always choose different peers (#2214)
|
|
|
|
tokio::task::yield_now().await;
|
|
|
|
}
|
|
|
|
|
2022-01-11 09:11:35 -08:00
|
|
|
let ready_tip_network = self.tip_network.ready().await;
|
|
|
|
requests.push(tokio::spawn(ready_tip_network.map_err(|e| eyre!(e))?.call(
|
2021-11-02 11:46:57 -07:00
|
|
|
zn::Request::FindBlocks {
|
|
|
|
known_blocks: block_locator.clone(),
|
|
|
|
stop: None,
|
|
|
|
},
|
2022-01-11 09:11:35 -08:00
|
|
|
)));
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|
|
|
|
|
2022-01-11 09:11:35 -08:00
|
|
|
let mut download_set = IndexSet::new();
|
2020-07-21 13:50:38 -07:00
|
|
|
while let Some(res) = requests.next().await {
|
2022-01-11 09:11:35 -08:00
|
|
|
match res
|
|
|
|
.expect("panic in spawned obtain tips request")
|
|
|
|
.map_err::<Report, _>(|e| eyre!(e))
|
|
|
|
{
|
2020-08-15 23:20:01 -07:00
|
|
|
Ok(zn::Response::BlockHashes(hashes)) => {
|
2022-01-28 09:24:53 -08:00
|
|
|
trace!(?hashes);
|
2020-09-03 15:09:34 -07:00
|
|
|
|
|
|
|
// zcashd sometimes appends an unrelated hash at the start
|
|
|
|
// or end of its response.
|
|
|
|
//
|
|
|
|
// We can't discard the first hash, because it might be a
|
|
|
|
// block we want to download. So we just accept any
|
|
|
|
// out-of-order first hashes.
|
|
|
|
|
|
|
|
// We use the last hash for the tip, and we want to avoid bad
|
|
|
|
// tips. So we discard the last hash. (We don't need to worry
|
|
|
|
// about missed downloads, because we will pick them up again
|
|
|
|
// in ExtendTips.)
|
|
|
|
let hashes = match hashes.as_slice() {
|
|
|
|
[] => continue,
|
|
|
|
[rest @ .., _last] => rest,
|
|
|
|
};
|
|
|
|
|
2020-08-10 16:17:50 -07:00
|
|
|
let mut first_unknown = None;
|
2020-06-22 22:31:26 -07:00
|
|
|
for (i, &hash) in hashes.iter().enumerate() {
|
2020-08-10 16:17:50 -07:00
|
|
|
if !self.state_contains(hash).await? {
|
|
|
|
first_unknown = Some(i);
|
2020-06-22 19:24:53 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2020-08-10 16:17:50 -07:00
|
|
|
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(hashes.len = ?hashes.len(), ?first_unknown);
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
|
|
|
|
let unknown_hashes = if let Some(index) = first_unknown {
|
|
|
|
&hashes[index..]
|
|
|
|
} else {
|
2020-08-10 16:17:50 -07:00
|
|
|
continue;
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
};
|
2020-06-22 19:24:53 -07:00
|
|
|
|
2022-01-28 09:24:53 -08:00
|
|
|
trace!(?unknown_hashes);
|
2020-07-08 13:33:39 -07:00
|
|
|
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
let new_tip = if let Some(end) = unknown_hashes.rchunks_exact(2).next() {
|
|
|
|
CheckedTip {
|
|
|
|
tip: end[0],
|
|
|
|
expected_next: end[1],
|
|
|
|
}
|
|
|
|
} else {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!("discarding response that extends only one block");
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
continue;
|
|
|
|
};
|
|
|
|
|
2020-08-24 05:09:41 -07:00
|
|
|
// Make sure we get the same tips, regardless of the
|
|
|
|
// order of peer responses
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
if !download_set.contains(&new_tip.expected_next) {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(?new_tip,
|
2020-08-24 05:09:41 -07:00
|
|
|
"adding new prospective tip, and removing existing tips in the new block hash list");
|
|
|
|
self.prospective_tips
|
|
|
|
.retain(|t| !unknown_hashes.contains(&t.expected_next));
|
2020-07-08 13:33:39 -07:00
|
|
|
self.prospective_tips.insert(new_tip);
|
|
|
|
} else {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(
|
2020-09-03 15:09:34 -07:00
|
|
|
?new_tip,
|
|
|
|
"discarding prospective tip: already in download set"
|
|
|
|
);
|
2020-07-08 13:33:39 -07:00
|
|
|
}
|
|
|
|
|
2022-01-11 09:11:35 -08:00
|
|
|
// security: the first response determines our download order
|
|
|
|
//
|
|
|
|
// TODO: can we make the download order independent of response order?
|
2020-07-08 13:33:39 -07:00
|
|
|
let prev_download_len = download_set.len();
|
|
|
|
download_set.extend(unknown_hashes);
|
|
|
|
let new_download_len = download_set.len();
|
2021-08-19 16:16:16 -07:00
|
|
|
let new_hashes = new_download_len - prev_download_len;
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(new_hashes, "added hashes to download set");
|
2021-11-02 11:46:57 -07:00
|
|
|
metrics::histogram!("sync.obtain.response.hash.count", new_hashes as f64);
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|
2020-07-21 13:50:38 -07:00
|
|
|
Ok(_) => unreachable!("network returned wrong response"),
|
|
|
|
// We ignore this error because we made multiple fanout requests.
|
2022-01-28 09:24:53 -08:00
|
|
|
Err(e) => debug!(?e),
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(?self.prospective_tips);
|
2020-08-07 01:04:33 -07:00
|
|
|
|
2020-10-22 10:58:49 -07:00
|
|
|
// Check that the new tips we got are actually unknown.
|
|
|
|
for hash in &download_set {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(?hash, "checking if state contains hash");
|
2020-10-22 10:58:49 -07:00
|
|
|
if self.state_contains(*hash).await? {
|
|
|
|
return Err(eyre!("queued download of hash behind our chain tip"));
|
|
|
|
}
|
|
|
|
}
|
2021-08-19 16:16:16 -07:00
|
|
|
|
|
|
|
let new_downloads = download_set.len();
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(new_downloads, "queueing new downloads");
|
2021-08-19 16:16:16 -07:00
|
|
|
metrics::gauge!("sync.obtain.queued.hash.count", new_downloads as f64);
|
|
|
|
|
|
|
|
// security: use the actual number of new downloads from all peers,
|
2022-01-11 09:11:35 -08:00
|
|
|
// so the last peer to respond can't toggle our mempool
|
2021-08-19 16:16:16 -07:00
|
|
|
self.recent_syncs.push_obtain_tips_length(new_downloads);
|
|
|
|
|
2022-05-04 15:04:34 -07:00
|
|
|
let response = self.request_blocks(download_set).await;
|
2020-06-22 19:24:53 -07:00
|
|
|
|
2022-07-06 07:13:57 -07:00
|
|
|
Self::handle_hash_response(response).map_err(Into::into)
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|
|
|
|
|
2020-07-08 13:33:39 -07:00
|
|
|
#[instrument(skip(self))]
|
2022-07-06 07:13:57 -07:00
|
|
|
async fn extend_tips(&mut self) -> Result<IndexSet<block::Hash>, Report> {
|
2020-06-22 19:24:53 -07:00
|
|
|
let tips = std::mem::take(&mut self.prospective_tips);
|
|
|
|
|
2022-01-11 09:11:35 -08:00
|
|
|
let mut download_set = IndexSet::new();
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(tips = ?tips.len(), "trying to extend chain tips");
|
2020-06-22 19:24:53 -07:00
|
|
|
for tip in tips {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(?tip, "asking peers to extend chain tip");
|
2020-07-21 13:50:38 -07:00
|
|
|
let mut responses = FuturesUnordered::new();
|
2021-12-19 15:02:31 -08:00
|
|
|
for attempt in 0..FANOUT {
|
|
|
|
if attempt > 0 {
|
|
|
|
// Let other tasks run, so we're more likely to choose a different peer.
|
|
|
|
//
|
|
|
|
// TODO: move fanouts into the PeerSet, so we always choose different peers (#2214)
|
|
|
|
tokio::task::yield_now().await;
|
|
|
|
}
|
|
|
|
|
2022-01-11 09:11:35 -08:00
|
|
|
let ready_tip_network = self.tip_network.ready().await;
|
|
|
|
responses.push(tokio::spawn(ready_tip_network.map_err(|e| eyre!(e))?.call(
|
2021-11-02 11:46:57 -07:00
|
|
|
zn::Request::FindBlocks {
|
|
|
|
known_blocks: vec![tip.tip],
|
|
|
|
stop: None,
|
|
|
|
},
|
2022-01-11 09:11:35 -08:00
|
|
|
)));
|
2020-07-08 13:33:39 -07:00
|
|
|
}
|
2020-07-21 13:50:38 -07:00
|
|
|
while let Some(res) = responses.next().await {
|
2022-01-11 09:11:35 -08:00
|
|
|
match res
|
|
|
|
.expect("panic in spawned extend tips request")
|
|
|
|
.map_err::<Report, _>(|e| eyre!(e))
|
|
|
|
{
|
2020-08-15 23:20:01 -07:00
|
|
|
Ok(zn::Response::BlockHashes(hashes)) => {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(first = ?hashes.first(), len = ?hashes.len());
|
|
|
|
trace!(?hashes);
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
|
2020-09-03 15:08:19 -07:00
|
|
|
// zcashd sometimes appends an unrelated hash at the
|
|
|
|
// start or end of its response. Check the first hash
|
|
|
|
// against the previous response, and discard mismatches.
|
|
|
|
let unknown_hashes = match hashes.as_slice() {
|
|
|
|
[expected_hash, rest @ ..] if expected_hash == &tip.expected_next => {
|
|
|
|
rest
|
|
|
|
}
|
|
|
|
// If the first hash doesn't match, retry with the second.
|
|
|
|
[first_hash, expected_hash, rest @ ..]
|
|
|
|
if expected_hash == &tip.expected_next =>
|
|
|
|
{
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(?first_hash,
|
2020-09-03 15:08:19 -07:00
|
|
|
?tip.expected_next,
|
|
|
|
?tip.tip,
|
|
|
|
"unexpected first hash, but the second matches: using the hashes after the match");
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
rest
|
2020-07-22 17:39:33 -07:00
|
|
|
}
|
2020-09-03 15:08:19 -07:00
|
|
|
// We ignore these responses
|
|
|
|
[] => continue,
|
|
|
|
[single_hash] => {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(?single_hash,
|
2020-09-03 15:08:19 -07:00
|
|
|
?tip.expected_next,
|
|
|
|
?tip.tip,
|
|
|
|
"discarding response containing a single unexpected hash");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
[first_hash, second_hash, rest @ ..] => {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(?first_hash,
|
2020-09-03 15:08:19 -07:00
|
|
|
?second_hash,
|
|
|
|
rest_len = ?rest.len(),
|
|
|
|
?tip.expected_next,
|
|
|
|
?tip.tip,
|
|
|
|
"discarding response that starts with two unexpected hashes");
|
2020-06-30 09:42:09 -07:00
|
|
|
continue;
|
|
|
|
}
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
};
|
2020-06-22 19:24:53 -07:00
|
|
|
|
2020-09-03 15:09:34 -07:00
|
|
|
// We use the last hash for the tip, and we want to avoid
|
|
|
|
// bad tips. So we discard the last hash. (We don't need
|
|
|
|
// to worry about missed downloads, because we will pick
|
|
|
|
// them up again in the next ExtendTips.)
|
|
|
|
let unknown_hashes = match unknown_hashes {
|
|
|
|
[] => continue,
|
|
|
|
[rest @ .., _last] => rest,
|
|
|
|
};
|
2020-06-30 09:42:09 -07:00
|
|
|
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
let new_tip = if let Some(end) = unknown_hashes.rchunks_exact(2).next() {
|
|
|
|
CheckedTip {
|
|
|
|
tip: end[0],
|
|
|
|
expected_next: end[1],
|
|
|
|
}
|
|
|
|
} else {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!("discarding response that extends only one block");
|
2020-08-10 16:17:50 -07:00
|
|
|
continue;
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
};
|
2020-08-10 16:17:50 -07:00
|
|
|
|
2022-01-28 09:24:53 -08:00
|
|
|
trace!(?unknown_hashes);
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
|
2020-08-24 05:09:41 -07:00
|
|
|
// Make sure we get the same tips, regardless of the
|
|
|
|
// order of peer responses
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
if !download_set.contains(&new_tip.expected_next) {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(?new_tip,
|
2020-08-24 05:09:41 -07:00
|
|
|
"adding new prospective tip, and removing any existing tips in the new block hash list");
|
|
|
|
self.prospective_tips
|
|
|
|
.retain(|t| !unknown_hashes.contains(&t.expected_next));
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
self.prospective_tips.insert(new_tip);
|
|
|
|
} else {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(
|
2020-09-03 15:09:34 -07:00
|
|
|
?new_tip,
|
|
|
|
"discarding prospective tip: already in download set"
|
|
|
|
);
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
}
|
2020-06-22 19:24:53 -07:00
|
|
|
|
2022-01-11 09:11:35 -08:00
|
|
|
// security: the first response determines our download order
|
|
|
|
//
|
|
|
|
// TODO: can we make the download order independent of response order?
|
2020-08-07 01:04:33 -07:00
|
|
|
let prev_download_len = download_set.len();
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
download_set.extend(unknown_hashes);
|
2020-08-07 01:04:33 -07:00
|
|
|
let new_download_len = download_set.len();
|
2021-08-19 16:16:16 -07:00
|
|
|
let new_hashes = new_download_len - prev_download_len;
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(new_hashes, "added hashes to download set");
|
2021-11-02 11:46:57 -07:00
|
|
|
metrics::histogram!("sync.extend.response.hash.count", new_hashes as f64);
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|
2020-07-21 13:50:38 -07:00
|
|
|
Ok(_) => unreachable!("network returned wrong response"),
|
|
|
|
// We ignore this error because we made multiple fanout requests.
|
2022-01-28 09:24:53 -08:00
|
|
|
Err(e) => debug!(?e),
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-08-19 16:16:16 -07:00
|
|
|
let new_downloads = download_set.len();
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(new_downloads, "queueing new downloads");
|
2021-08-19 16:16:16 -07:00
|
|
|
metrics::gauge!("sync.extend.queued.hash.count", new_downloads as f64);
|
|
|
|
|
|
|
|
// security: use the actual number of new downloads from all peers,
|
2022-01-11 09:11:35 -08:00
|
|
|
// so the last peer to respond can't toggle our mempool
|
2021-08-19 16:16:16 -07:00
|
|
|
self.recent_syncs.push_extend_tips_length(new_downloads);
|
|
|
|
|
2022-05-04 15:04:34 -07:00
|
|
|
let response = self.request_blocks(download_set).await;
|
2020-06-22 19:24:53 -07:00
|
|
|
|
2022-07-06 07:13:57 -07:00
|
|
|
Self::handle_hash_response(response).map_err(Into::into)
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|
|
|
|
|
2020-09-03 15:09:34 -07:00
|
|
|
/// Download and verify the genesis block, if it isn't currently known to
|
2020-07-23 10:56:52 -07:00
|
|
|
/// our node.
|
|
|
|
async fn request_genesis(&mut self) -> Result<(), Report> {
|
|
|
|
// Due to Bitcoin protocol limitations, we can't request the genesis
|
|
|
|
// block using our standard tip-following algorithm:
|
|
|
|
// - getblocks requires at least one hash
|
|
|
|
// - responses start with the block *after* the requested block, and
|
|
|
|
// - the genesis hash is used as a placeholder for "no matches".
|
|
|
|
//
|
2020-09-03 15:09:34 -07:00
|
|
|
// So we just download and verify the genesis block here.
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
while !self.state_contains(self.genesis_hash).await? {
|
2022-01-28 09:24:53 -08:00
|
|
|
info!("starting genesis block download and verify");
|
2022-05-04 15:04:34 -07:00
|
|
|
|
|
|
|
let response = self.downloads.download_and_verify(self.genesis_hash).await;
|
|
|
|
Self::handle_response(response).map_err(|e| eyre!(e))?;
|
|
|
|
|
|
|
|
let response = self.downloads.next().await.expect("downloads is nonempty");
|
|
|
|
|
|
|
|
match response {
|
2022-07-06 07:13:57 -07:00
|
|
|
Ok(response) => self
|
|
|
|
.handle_block_response(Ok(response))
|
|
|
|
.expect("never returns Err for Ok"),
|
2022-05-04 15:04:34 -07:00
|
|
|
Err(error) => {
|
|
|
|
// TODO: exit syncer on permanent service errors (NetworkError, VerifierError)
|
|
|
|
if Self::should_restart_sync(&error) {
|
|
|
|
warn!(
|
|
|
|
?error,
|
|
|
|
"could not download or verify genesis block, retrying"
|
|
|
|
);
|
|
|
|
} else {
|
|
|
|
info!(
|
|
|
|
?error,
|
|
|
|
"temporary error downloading or verifying genesis block, retrying"
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2021-06-09 16:39:51 -07:00
|
|
|
tokio::time::sleep(GENESIS_TIMEOUT_RETRY).await;
|
2020-09-03 15:13:00 -07:00
|
|
|
}
|
Fix sync algorithm. (#887)
* checkpoint: reject older of duplicate verification requests.
If we get a duplicate block verification request, we should drop the older one
in favor of the newer one, because the older request is likely to have been
canceled. Previously, this code would accept up to four duplicate verification
requests, then fail all subsequent ones.
* sync: add a timeout layer to block requests.
Note that if this timeout is too short, we'll bring down the peer set in a
retry storm.
* sync: restart syncing on error
Restart the syncing process when an error occurs, rather than ignoring it.
Restarting means we discard all tips and start over with a new block locator,
so we can have another chance to "unstuck" ourselves.
* sync: additional debug info
* sync: handle lookahead limit correctly.
Instead of extracting all the completed task results, the previous code pulled
results out until there were fewer tasks than the lookahead limit, then
stopped. This meant that completed tasks could be left until the limit was
exceeded again. Instead, extract all completed results, and use the number of
pending tasks to decide whether to extend the tip or wait for blocks to finish.
* network: add debug instrumentation to retry policy
* sync: instrument the spawned task
* sync: streamline ObtainTips/ExtendTips logic & tracing
This change does three things:
1. It aligns the implementation of ObtainTips and ExtendTips so that they use
the same deduplication method. This means that when debugging we only have one
deduplication algorithm to focus on.
2. It streamlines the tracing output to not include information already
included in spans. Both obtain_tips and extend_tips have their own spans
attached to the events, so it's not necessary to add Scope: prefixes in
messages.
3. It changes the messages to be focused on reporting the actual
events rather than the interpretation of the events (e.g., "got genesis hash in
response" rather than "peer could not extend tip"). The motivation for this
change is that when debugging, the interpretation of events is already known to
be incorrect, in the sense that the mental model of the code (no bug) does not
match its behavior (has bug), so presenting minimally-interpreted events forces
interpretation relative to the actual code.
* sync: hack to work around zcashd behavior
* sync: localize debug statement in extend_tips
* sync: change algorithm to define tips as pairs of hashes.
This is different enough from the existing description that its comments no
longer apply, so I removed them. A further chunk of work is to change the sync
RFC to document this algorithm.
* sync: reduce block timeout
* state: add resource limits for sled
Closes #888
* sync: add a restart timeout constant
* sync: de-pub constants
2020-08-12 16:48:01 -07:00
|
|
|
}
|
2020-07-23 10:56:52 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2022-07-06 07:13:57 -07:00
|
|
|
/// Queue download and verify tasks for each block that isn't currently known to our node.
|
|
|
|
///
|
|
|
|
/// TODO: turn obtain and extend tips into a separate task, which sends hashes via a channel?
|
2022-05-04 15:04:34 -07:00
|
|
|
async fn request_blocks(
|
|
|
|
&mut self,
|
2022-07-06 07:13:57 -07:00
|
|
|
mut hashes: IndexSet<block::Hash>,
|
|
|
|
) -> Result<IndexSet<block::Hash>, BlockDownloadVerifyError> {
|
|
|
|
let lookahead_limit = self.lookahead_limit(hashes.len());
|
|
|
|
|
|
|
|
debug!(
|
|
|
|
hashes.len = hashes.len(),
|
|
|
|
?lookahead_limit,
|
|
|
|
"requesting blocks",
|
|
|
|
);
|
|
|
|
|
|
|
|
let extra_hashes = if hashes.len() > lookahead_limit {
|
|
|
|
hashes.split_off(lookahead_limit)
|
|
|
|
} else {
|
|
|
|
IndexSet::new()
|
|
|
|
};
|
|
|
|
|
2020-07-21 13:50:38 -07:00
|
|
|
for hash in hashes.into_iter() {
|
2020-10-24 17:31:41 -07:00
|
|
|
self.downloads.download_and_verify(hash).await?;
|
2020-06-30 09:42:09 -07:00
|
|
|
}
|
2020-06-22 19:24:53 -07:00
|
|
|
|
2022-07-06 07:13:57 -07:00
|
|
|
Ok(extra_hashes)
|
|
|
|
}
|
|
|
|
|
|
|
|
/// The configured lookahead limit, based on the currently verified height,
|
2022-11-08 20:42:04 -08:00
|
|
|
/// and the number of hashes we haven't queued yet.
|
2022-07-06 07:13:57 -07:00
|
|
|
fn lookahead_limit(&self, new_hashes: usize) -> usize {
|
|
|
|
let max_checkpoint_height: usize = self
|
|
|
|
.max_checkpoint_height
|
|
|
|
.0
|
|
|
|
.try_into()
|
|
|
|
.expect("fits in usize");
|
|
|
|
|
|
|
|
// When the state is empty, we want to verify using checkpoints
|
|
|
|
let verified_height: usize = self
|
|
|
|
.latest_chain_tip
|
|
|
|
.best_tip_height()
|
|
|
|
.unwrap_or(Height(0))
|
|
|
|
.0
|
|
|
|
.try_into()
|
|
|
|
.expect("fits in usize");
|
|
|
|
|
|
|
|
if verified_height >= max_checkpoint_height {
|
|
|
|
self.full_verify_concurrency_limit
|
|
|
|
} else if (verified_height + new_hashes) >= max_checkpoint_height {
|
|
|
|
// If we're just about to start full verification, allow enough for the remaining checkpoint,
|
|
|
|
// and also enough for a separate full verification lookahead.
|
|
|
|
let checkpoint_hashes = verified_height + new_hashes - max_checkpoint_height;
|
|
|
|
|
|
|
|
self.full_verify_concurrency_limit + checkpoint_hashes
|
|
|
|
} else {
|
|
|
|
self.checkpoint_verify_concurrency_limit
|
|
|
|
}
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|
2020-08-07 01:04:33 -07:00
|
|
|
|
2022-03-17 17:31:12 -07:00
|
|
|
/// Handles a response for a requested block.
|
|
|
|
///
|
2022-07-06 07:13:57 -07:00
|
|
|
/// See [`Self::handle_response`] for more details.
|
2022-11-03 19:00:56 -07:00
|
|
|
#[allow(unknown_lints)]
|
2022-05-04 15:04:34 -07:00
|
|
|
fn handle_block_response(
|
2022-07-06 07:13:57 -07:00
|
|
|
&mut self,
|
|
|
|
response: Result<(Height, block::Hash), BlockDownloadVerifyError>,
|
2022-05-04 15:04:34 -07:00
|
|
|
) -> Result<(), BlockDownloadVerifyError> {
|
2022-03-17 17:31:12 -07:00
|
|
|
match response {
|
2022-07-06 07:13:57 -07:00
|
|
|
Ok((height, hash)) => {
|
|
|
|
trace!(?height, ?hash, "verified and committed block to state");
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
Err(_) => Self::handle_response(response),
|
2022-05-04 15:04:34 -07:00
|
|
|
}
|
2022-07-06 07:13:57 -07:00
|
|
|
}
|
2022-05-04 15:04:34 -07:00
|
|
|
|
2022-07-06 07:13:57 -07:00
|
|
|
/// Handles a response to block hash submission, passing through any extra hashes.
|
|
|
|
///
|
|
|
|
/// See [`Self::handle_response`] for more details.
|
2022-11-03 19:00:56 -07:00
|
|
|
#[allow(unknown_lints)]
|
2022-07-06 07:13:57 -07:00
|
|
|
fn handle_hash_response(
|
|
|
|
response: Result<IndexSet<block::Hash>, BlockDownloadVerifyError>,
|
|
|
|
) -> Result<IndexSet<block::Hash>, BlockDownloadVerifyError> {
|
|
|
|
match response {
|
|
|
|
Ok(extra_hashes) => Ok(extra_hashes),
|
|
|
|
Err(_) => Self::handle_response(response).map(|()| IndexSet::new()),
|
|
|
|
}
|
2022-05-04 15:04:34 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Handles a response to a syncer request.
|
|
|
|
///
|
|
|
|
/// Returns `Ok` if the request was successful, or if an expected error occurred,
|
|
|
|
/// so that the synchronization can continue normally.
|
|
|
|
///
|
|
|
|
/// Returns `Err` if an unexpected error occurred, to force the synchronizer to restart.
|
2022-11-03 19:00:56 -07:00
|
|
|
#[allow(unknown_lints)]
|
2022-07-06 07:13:57 -07:00
|
|
|
fn handle_response<T>(
|
|
|
|
response: Result<T, BlockDownloadVerifyError>,
|
2022-05-04 15:04:34 -07:00
|
|
|
) -> Result<(), BlockDownloadVerifyError> {
|
2022-07-06 07:13:57 -07:00
|
|
|
match response {
|
|
|
|
Ok(_t) => Ok(()),
|
|
|
|
Err(error) => {
|
|
|
|
// TODO: exit syncer on permanent service errors (NetworkError, VerifierError)
|
|
|
|
if Self::should_restart_sync(&error) {
|
|
|
|
Err(error)
|
|
|
|
} else {
|
|
|
|
Ok(())
|
|
|
|
}
|
2022-03-17 17:31:12 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-09-03 15:09:34 -07:00
|
|
|
/// Returns `true` if the hash is present in the state, and `false`
|
2020-08-10 16:17:50 -07:00
|
|
|
/// if the hash is not present in the state.
|
2020-08-15 23:20:01 -07:00
|
|
|
async fn state_contains(&mut self, hash: block::Hash) -> Result<bool, Report> {
|
2020-08-10 16:17:50 -07:00
|
|
|
match self
|
|
|
|
.state
|
2021-11-02 11:46:57 -07:00
|
|
|
.ready()
|
2020-08-10 16:17:50 -07:00
|
|
|
.await
|
|
|
|
.map_err(|e| eyre!(e))?
|
2023-03-24 00:10:07 -07:00
|
|
|
.call(zebra_state::Request::KnownBlock(hash))
|
2020-08-10 16:17:50 -07:00
|
|
|
.await
|
|
|
|
.map_err(|e| eyre!(e))?
|
|
|
|
{
|
2023-03-24 00:10:07 -07:00
|
|
|
zs::Response::KnownBlock(loc) => Ok(loc.is_some()),
|
|
|
|
_ => unreachable!("wrong response to known block request"),
|
2020-08-10 16:17:50 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-01-11 09:11:35 -08:00
|
|
|
fn update_metrics(&mut self) {
|
2020-08-07 01:04:33 -07:00
|
|
|
metrics::gauge!(
|
|
|
|
"sync.prospective_tips.len",
|
2022-05-31 20:53:51 -07:00
|
|
|
self.prospective_tips.len() as f64,
|
2020-08-07 01:04:33 -07:00
|
|
|
);
|
2020-09-09 15:33:25 -07:00
|
|
|
metrics::gauge!(
|
|
|
|
"sync.downloads.in_flight",
|
2022-05-31 20:53:51 -07:00
|
|
|
self.downloads.in_flight() as f64,
|
2020-09-09 15:33:25 -07:00
|
|
|
);
|
2020-08-07 01:04:33 -07:00
|
|
|
}
|
2021-10-19 18:07:19 -07:00
|
|
|
|
|
|
|
/// Return if the sync should be restarted based on the given error
|
|
|
|
/// from the block downloader and verifier stream.
|
2022-05-04 15:04:34 -07:00
|
|
|
fn should_restart_sync(e: &BlockDownloadVerifyError) -> bool {
|
2021-10-19 18:07:19 -07:00
|
|
|
match e {
|
2022-05-04 15:04:34 -07:00
|
|
|
// Structural matches: downcasts
|
2022-12-07 22:11:33 -08:00
|
|
|
BlockDownloadVerifyError::Invalid { error, .. } if error.is_duplicate_request() => {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(error = ?e, "block was already verified, possibly from a previous sync run, continuing");
|
2021-10-19 18:07:19 -07:00
|
|
|
false
|
|
|
|
}
|
2022-05-04 15:04:34 -07:00
|
|
|
|
|
|
|
// Structural matches: direct
|
2022-05-10 23:51:06 -07:00
|
|
|
BlockDownloadVerifyError::CancelledDuringDownload { .. }
|
|
|
|
| BlockDownloadVerifyError::CancelledDuringVerification { .. } => {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(error = ?e, "block verification was cancelled, continuing");
|
2021-12-02 06:28:20 -08:00
|
|
|
false
|
|
|
|
}
|
2022-05-10 23:51:06 -07:00
|
|
|
BlockDownloadVerifyError::BehindTipHeightLimit { .. } => {
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(
|
2022-01-11 09:11:35 -08:00
|
|
|
error = ?e,
|
|
|
|
"block height is behind the current state tip, \
|
|
|
|
assuming the syncer will eventually catch up to the state, continuing"
|
|
|
|
);
|
|
|
|
false
|
|
|
|
}
|
2022-05-04 15:04:34 -07:00
|
|
|
BlockDownloadVerifyError::DuplicateBlockQueuedForDownload { .. } => {
|
|
|
|
debug!(
|
|
|
|
error = ?e,
|
|
|
|
"queued duplicate block hash for download, \
|
|
|
|
assuming the syncer will eventually resolve duplicates, continuing"
|
|
|
|
);
|
|
|
|
false
|
|
|
|
}
|
2021-12-02 06:28:20 -08:00
|
|
|
|
|
|
|
// String matches
|
2022-12-07 22:11:33 -08:00
|
|
|
//
|
|
|
|
// We want to match VerifyChainError::Block(VerifyBlockError::Commit(ref source)),
|
|
|
|
// but that type is boxed.
|
|
|
|
// TODO:
|
|
|
|
// - turn this check into a function on VerifyChainError, like is_duplicate_request()
|
|
|
|
BlockDownloadVerifyError::Invalid { error, .. }
|
|
|
|
if format!("{error:?}").contains("block is already committed to the state")
|
|
|
|
|| format!("{error:?}")
|
|
|
|
.contains("block has already been sent to be committed to the state") =>
|
2022-11-03 23:57:45 -07:00
|
|
|
{
|
2021-12-02 06:28:20 -08:00
|
|
|
// TODO: improve this by checking the type (#2908)
|
2022-11-03 23:57:45 -07:00
|
|
|
debug!(error = ?e, "block is already committed or pending a commit, possibly from a previous sync run, continuing");
|
2021-10-19 18:07:19 -07:00
|
|
|
false
|
|
|
|
}
|
2022-05-10 23:51:06 -07:00
|
|
|
BlockDownloadVerifyError::DownloadFailed { ref error, .. }
|
2022-10-27 06:25:18 -07:00
|
|
|
if format!("{error:?}").contains("NotFound") =>
|
2021-12-02 06:28:20 -08:00
|
|
|
{
|
2022-05-04 15:04:34 -07:00
|
|
|
// Covers these errors:
|
|
|
|
// - NotFoundResponse
|
|
|
|
// - NotFoundRegistry
|
2022-02-14 17:44:33 -08:00
|
|
|
//
|
2021-12-02 06:28:20 -08:00
|
|
|
// TODO: improve this by checking the type (#2908)
|
|
|
|
// restart after a certain number of NotFound errors?
|
2022-01-28 09:24:53 -08:00
|
|
|
debug!(error = ?e, "block was not found, possibly from a peer that doesn't have the block yet, continuing");
|
2021-10-19 18:07:19 -07:00
|
|
|
false
|
|
|
|
}
|
2021-12-02 06:28:20 -08:00
|
|
|
|
2021-10-19 18:07:19 -07:00
|
|
|
_ => {
|
|
|
|
// download_and_verify downcasts errors from the block verifier
|
|
|
|
// into VerifyChainError, and puts the result inside one of the
|
|
|
|
// BlockDownloadVerifyError enumerations. This downcast could
|
|
|
|
// become incorrect e.g. after some refactoring, and it is difficult
|
|
|
|
// to write a test to check it. The test below is a best-effort
|
|
|
|
// attempt to catch if that happens and log it.
|
2022-05-04 15:04:34 -07:00
|
|
|
//
|
2021-10-19 18:07:19 -07:00
|
|
|
// TODO: add a proper test and remove this
|
|
|
|
// https://github.com/ZcashFoundation/zebra/issues/2909
|
2022-10-27 06:25:18 -07:00
|
|
|
let err_str = format!("{e:?}");
|
2021-10-19 18:07:19 -07:00
|
|
|
if err_str.contains("AlreadyVerified")
|
|
|
|
|| err_str.contains("AlreadyInChain")
|
|
|
|
|| err_str.contains("block is already committed to the state")
|
2022-11-03 23:57:45 -07:00
|
|
|
|| err_str.contains("block has already been sent to be committed to the state")
|
2021-12-02 06:28:20 -08:00
|
|
|
|| err_str.contains("NotFound")
|
2021-10-19 18:07:19 -07:00
|
|
|
{
|
2022-01-28 09:24:53 -08:00
|
|
|
error!(?e,
|
2021-10-19 18:07:19 -07:00
|
|
|
"a BlockDownloadVerifyError that should have been filtered out was detected, \
|
|
|
|
which possibly indicates a programming error in the downcast inside \
|
|
|
|
zebrad::components::sync::downloads::Downloads::download_and_verify"
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
2022-01-28 09:24:53 -08:00
|
|
|
warn!(?e, "error downloading and verifying block");
|
2021-10-19 18:07:19 -07:00
|
|
|
true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-06-22 19:24:53 -07:00
|
|
|
}
|