Merge pull request #1352 from LukasKorba/1351-Recover-from-download-issues

[#1351] Recover from block stream issues
Lukas Korba 2024-01-25 13:36:01 +01:00 committed by GitHub
commit 06d2b6986f
3 changed files with 26 additions and 11 deletions


@@ -11,6 +11,9 @@ and this library adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### [#1346] Troubleshooting synchronization
We focused on the performance of synchronization and found a root cause in progress reporting. A simple change reduced synchronization time significantly by reporting progress less frequently. This affects the UX slightly because the sync percentage is now updated only every 500 scanned blocks instead of every 100. A proper solution will be handled in #1353.
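For illustration only, here is a minimal sketch of the throttling idea behind this entry, assuming a hypothetical `ProgressReporter` type. The SDK's actual reporting path differs; only the 500-block interval comes from the text above.

```swift
/// Hypothetical sketch: emit a progress update only once per `reportingInterval` scanned blocks
/// instead of on every block, which cuts the overhead of frequent progress reporting.
final class ProgressReporter {
    private let reportingInterval: Int
    private var scannedSinceLastReport = 0

    /// 500 matches the interval mentioned above; 100 was the previous, chattier value.
    init(reportingInterval: Int = 500) {
        self.reportingInterval = reportingInterval
    }

    /// Call once per scanned block; `report` fires only every `reportingInterval` blocks.
    func blockScanned(scannedBlocks: Int, totalBlocks: Int, report: (Float) -> Void) {
        scannedSinceLastReport += 1
        guard scannedSinceLastReport >= reportingInterval, totalBlocks > 0 else { return }
        scannedSinceLastReport = 0
        report(Float(scannedBlocks) / Float(totalBlocks))
    }
}
```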
### [#1351] Recover from block stream issues
Async block stream gRPC calls sometimes fail with error 14 (`unavailable`), most often surfaced as `Transport became inactive` or `NIOHTTP2.StreamClosed`. Unless the service is truly down, these errors are usually false positives. The SDK could already recover from them when the next sync was triggered, but that takes 10-30 s. This delay is unnecessary, so we made two changes: when such an error is caught, the next sync is triggered immediately (at most 3 times in a row), and the error state is not passed to the clients.
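To make this concrete, here is a minimal, self-contained sketch of the policy described above. Only the cap of 3 (the new `ZcashSDK.blockStreamRetries` constant) and the spirit of the counter come from this change; `SyncError` and `BlockStreamRetryPolicy` are illustrative stand-ins, not SDK API.

```swift
/// Illustrative stand-in for the errors the processor distinguishes; not an SDK type.
enum SyncError: Error {
    case blockStreamFailed        // e.g. "Transport became inactive" or NIOHTTP2.StreamClosed
    case other(String)            // any other sync failure
}

/// Sketch of the capped immediate-retry policy; the real logic lives in CompactBlockProcessor.
struct BlockStreamRetryPolicy {
    let maxBlockStreamRetries = 3          // mirrors ZcashSDK.blockStreamRetries
    private(set) var blockStreamRetryAttempts = 0

    /// Returns true when the sync should silently restart instead of surfacing the error.
    mutating func shouldRetryImmediately(after error: SyncError) -> Bool {
        guard case .blockStreamFailed = error, blockStreamRetryAttempts < maxBlockStreamRetries else {
            return false
        }
        blockStreamRetryAttempts += 1
        return true
    }

    /// A sync that makes progress clears the budget for future transient failures.
    mutating func syncSucceeded() {
        blockStreamRetryAttempts = 0
    }
}

// The first three consecutive stream failures restart the sync immediately and are never
// reported to clients; the fourth falls through to the normal failure path.
var policy = BlockStreamRetryPolicy()
// policy.shouldRetryImmediately(after: .blockStreamFailed)  // true, true, true, then false
// policy.shouldRetryImmediately(after: .other("db error"))  // false: reported right away
```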
# 2.0.5 - 2023-12-15

## Added


@@ -40,6 +40,7 @@ actor CompactBlockProcessor {
    private let fileManager: ZcashFileManager

    private var retryAttempts: Int = 0
+   private var blockStreamRetryAttempts: Int = 0
    private var backoffTimer: Timer?
    private var consecutiveChainValidationErrors: Int = 0
@@ -263,6 +264,7 @@ extension CompactBlockProcessor {
    func start(retry: Bool = false) async {
        if retry {
            self.retryAttempts = 0
+           self.blockStreamRetryAttempts = 0
            self.backoffTimer?.invalidate()
            self.backoffTimer = nil
        }
@@ -289,6 +291,7 @@ extension CompactBlockProcessor {
        self.backoffTimer = nil
        await stopAllActions()
        retryAttempts = 0
+       blockStreamRetryAttempts = 0
    }

    func latestHeight() async throws -> BlockHeight {
@@ -530,7 +533,17 @@ extension CompactBlockProcessor {
        await stopAllActions()
        logger.error("Sync failed with error: \(error)")

-       if Task.isCancelled {
+       // catching the block stream error
+       if case ZcashError.serviceBlockStreamFailed = error, self.blockStreamRetryAttempts < ZcashSDK.blockStreamRetries {
+           // This may be a false positive communication error that is usually resolved by a retry.
+           // We reset the sync and continue, but do so at most `ZcashSDK.blockStreamRetries` times.
+           logger.error("ZcashError.serviceBlockStreamFailed, retry is available, starting the sync all over again.")
+           self.blockStreamRetryAttempts += 1
+           // Start sync all over again
+           await resetContext()
+       } else if Task.isCancelled {
            logger.info("Processing cancelled.")
            do {
                if try await syncTaskWasCancelled() {
@@ -545,13 +558,8 @@
                    break
                }
            } else {
-               if await handleSyncFailure(action: action, error: error) {
-                   // Start sync all over again
-                   await resetContext()
-               } else {
-                   // end the sync loop
-                   break
-               }
+               await handleSyncFailure(action: action, error: error)
+               break
            }
        }
    }
@@ -567,15 +575,13 @@ extension CompactBlockProcessor {
        return try await handleAfterSyncHooks()
    }

-   private func handleSyncFailure(action: Action, error: Error) async -> Bool {
+   private func handleSyncFailure(action: Action, error: Error) async {
        if action.removeBlocksCacheWhenFailed {
            await ifTaskIsNotCanceledClearCompactBlockCache()
        }

        logger.error("Sync failed with error: \(error)")
        await failure(error)
-       return false
    }

    // swiftlint:disable:next cyclomatic_complexity
@@ -642,6 +648,7 @@ extension CompactBlockProcessor {
        latestBlockHeightWhenSyncing > 0 && latestBlockHeightWhenSyncing < latestBlockHeight

        retryAttempts = 0
+       blockStreamRetryAttempts = 0
        consecutiveChainValidationErrors = 0

        let lastScannedHeight = await latestBlocksDataProvider.maxScannedHeight


@@ -105,6 +105,11 @@ public enum ZcashSDK {
    // TODO: [#1304] smart retry logic, https://github.com/zcash/ZcashLightClientKit/issues/1304
    public static let defaultRetries = Int.max
+   /// Communication errors are represented as `serviceBlockStreamFailed: LightWalletServiceError`, unavailable (14).
+   /// These cases are usually false positives and another try typically continues the work; in case the service is
+   /// truly down, we cap the number of retries at this value.
+   public static let blockStreamRetries = 3
    /// The default maximum amount of time to wait during retry backoff intervals. Failed loops will never wait longer than
    /// this before retrying.
    public static let defaultMaxBackOffInterval: TimeInterval = 600
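As a rough usage illustration, the sketch below shows how a caller checks an attempt counter against this cap; the comparison mirrors the guard added to `CompactBlockProcessor` above, while the helper function itself is hypothetical.

```swift
import ZcashLightClientKit   // so ZcashSDK is in scope

// Hypothetical helper: the comparison is the same guard CompactBlockProcessor uses above.
func canRetryBlockStream(blockStreamRetryAttempts: Int) -> Bool {
    blockStreamRetryAttempts < ZcashSDK.blockStreamRetries
}

// canRetryBlockStream(blockStreamRetryAttempts: 0) == true   // the first three consecutive failures restart the sync
// canRetryBlockStream(blockStreamRetryAttempts: 3) == false  // the fourth consecutive failure is reported to clients
```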