From f2a2dfb03c83da6cc2cd4390e5f97200d50feae5 Mon Sep 17 00:00:00 2001 From: Lukas Korba Date: Wed, 24 Jan 2024 14:58:48 +0100 Subject: [PATCH] [#1351] Recover from block stream issues - changelog updated - block stream errors are now handled as a special case of error, retry logic is triggered but at most 3-times in case of service being truly down - the failure is not passed to the clients so ideally the false positive errors are reduced as well as the delay in the sync time [#1351] Recover from block stream issues (#1352) - typo fixed --- CHANGELOG.md | 3 ++ .../Block/CompactBlockProcessor.swift | 29 ++++++++++++------- .../Constants/ZcashSDK.swift | 5 ++++ 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e71c5d6..571191db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,9 @@ and this library adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### [#1346] Troubleshooting synchronization We focused on performance of the synchronization and found out a root cause in progress reporting. Simple change reduced the synchronization significantly by reporting less frequently. This affect the UX a bit because the % of the sync is updated only every 500 scanned blocks instead of every 100. Proper solution is going to be handled in #1353. +### [#1351] Recover from block stream issues +Async block stream grpc calls sometimes fail with unknown error 14, most of the times represented as `Transport became inactive` or `NIOHTTP2.StreamClosed`. Unless the service is truly down, these errors are usually false positive ones. The SDK was able to recover from this error with the next sync triggered but it takes 10-30s to happen. This delay is unnecessary so we made 2 changes. When these errors are caught the next sync is triggered immediately (at most 3 times) + the error state is not passed to the clients. + # 2.0.5 - 2023-12-15 ## Added diff --git a/Sources/ZcashLightClientKit/Block/CompactBlockProcessor.swift b/Sources/ZcashLightClientKit/Block/CompactBlockProcessor.swift index e0cfc5f8..b69a7e3f 100644 --- a/Sources/ZcashLightClientKit/Block/CompactBlockProcessor.swift +++ b/Sources/ZcashLightClientKit/Block/CompactBlockProcessor.swift @@ -40,6 +40,7 @@ actor CompactBlockProcessor { private let fileManager: ZcashFileManager private var retryAttempts: Int = 0 + private var blockStreamRetryAttempts: Int = 0 private var backoffTimer: Timer? private var consecutiveChainValidationErrors: Int = 0 @@ -263,6 +264,7 @@ extension CompactBlockProcessor { func start(retry: Bool = false) async { if retry { self.retryAttempts = 0 + self.blockStreamRetryAttempts = 0 self.backoffTimer?.invalidate() self.backoffTimer = nil } @@ -289,6 +291,7 @@ extension CompactBlockProcessor { self.backoffTimer = nil await stopAllActions() retryAttempts = 0 + blockStreamRetryAttempts = 0 } func latestHeight() async throws -> BlockHeight { @@ -530,7 +533,17 @@ extension CompactBlockProcessor { await stopAllActions() logger.error("Sync failed with error: \(error)") - if Task.isCancelled { + // catching the block stream error + if case ZcashError.serviceBlockStreamFailed = error, self.blockStreamRetryAttempts < ZcashSDK.blockStreamRetries { + // This may be false positive communication error that is usually resolved by retry. + // We will try to reset the sync and continue but this will we done at most `ZcashSDK.blockStreamRetries` times. + logger.error("ZcashError.serviceBlockStreamFailed, retry is available, starting the sync all over again.") + + self.blockStreamRetryAttempts += 1 + + // Start sync all over again + await resetContext() + } else if Task.isCancelled { logger.info("Processing cancelled.") do { if try await syncTaskWasCancelled() { @@ -545,13 +558,8 @@ extension CompactBlockProcessor { break } } else { - if await handleSyncFailure(action: action, error: error) { - // Start sync all over again - await resetContext() - } else { - // end the sync loop - break - } + await handleSyncFailure(action: action, error: error) + break } } } @@ -567,15 +575,13 @@ extension CompactBlockProcessor { return try await handleAfterSyncHooks() } - private func handleSyncFailure(action: Action, error: Error) async -> Bool { + private func handleSyncFailure(action: Action, error: Error) async { if action.removeBlocksCacheWhenFailed { await ifTaskIsNotCanceledClearCompactBlockCache() } logger.error("Sync failed with error: \(error)") await failure(error) - - return false } // swiftlint:disable:next cyclomatic_complexity @@ -642,6 +648,7 @@ extension CompactBlockProcessor { latestBlockHeightWhenSyncing > 0 && latestBlockHeightWhenSyncing < latestBlockHeight retryAttempts = 0 + blockStreamRetryAttempts = 0 consecutiveChainValidationErrors = 0 let lastScannedHeight = await latestBlocksDataProvider.maxScannedHeight diff --git a/Sources/ZcashLightClientKit/Constants/ZcashSDK.swift b/Sources/ZcashLightClientKit/Constants/ZcashSDK.swift index 17ee3dfa..ff89768a 100644 --- a/Sources/ZcashLightClientKit/Constants/ZcashSDK.swift +++ b/Sources/ZcashLightClientKit/Constants/ZcashSDK.swift @@ -105,6 +105,11 @@ public enum ZcashSDK { // TODO: [#1304] smart retry logic, https://github.com/zcash/ZcashLightClientKit/issues/1304 public static let defaultRetries = Int.max + /// The communication errors are represented as serviceBlockStreamFailed : LightWalletServiceError, unavailable 14 + /// These cases are usually false positive and another try will continue the work, in case the service is trully down we + /// cap the amount of retries by this value. + public static let blockStreamRetries = 3 + /// The default maximum amount of time to wait during retry backoff intervals. Failed loops will never wait longer than /// this before retrying. public static let defaultMaxBackOffInterval: TimeInterval = 600