Auto merge of #4447 - nuttycom:bug/4119-rewind_detection, r=str4d
Reproduce and fix off-by-one error in reorg logic (#4119). This attempts to reproduce #4119 using a simple chain split. It currently seems to trigger a different issue: an assertion failure in `CheckBlockIndex` when restarting.
This commit is contained in:
commit eb06b0a8d4
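
For orientation (this note and the snippet below are editorial commentary, not part of the diff): the off-by-one fixed here is in how RewindBlockIndex measures the required rollback. The old code counted from the first insufficiently-validated block; the new code counts from the last validated block. A small Python model of that arithmetic, using the heights produced by the new sapling_rewind_check.py test (the lagging node agrees with the network up to height 14 and then mines 120 blocks, so its tip sits at height 134):

# Illustrative model only; not code from this PR.
def old_rewind_length(tip_height, first_invalid_height):
    # Old logic: count from the first insufficiently-validated block,
    # which under-counts the blocks that actually have to be disconnected.
    return tip_height - first_invalid_height

def new_rewind_length(tip_height, last_valid_height):
    # New logic: count from the last validated block.
    return tip_height - last_valid_height

tip, last_valid = 134, 14
first_invalid = last_valid + 1
assert old_rewind_length(tip, first_invalid) == 119   # reported one block short
assert new_rewind_length(tip, last_valid) == 120      # matches the "roll back 120" log line the test expects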
@@ -83,6 +83,7 @@ testScripts=(
     'turnstile.py'
     'mining_shielded_coinbase.py'
     'framework.py'
+    'sapling_rewind_check.py'
     'feature_zip221.py'
 );
 testScriptsExt=(
@@ -5,9 +5,10 @@
 from test_framework.flyclient import (ZcashMMRNode, append, delete, make_root_commitment)
-from test_framework.mininode import (HEARTWOOD_BRANCH_ID, CBlockHeader)
+from test_framework.mininode import (CBlockHeader)
 from test_framework.test_framework import BitcoinTestFramework
 from test_framework.util import (
+    HEARTWOOD_BRANCH_ID,
     assert_equal,
     bytes_to_hex_str,
     hex_str_to_bytes,
@@ -5,12 +5,13 @@
 from test_framework.test_framework import BitcoinTestFramework
 from test_framework.util import assert_equal, initialize_chain_clean, \
-    start_nodes, start_node, connect_nodes_bi, bitcoind_processes
+    start_nodes, start_node, connect_nodes_bi, bitcoind_processes, \
+    nuparams, OVERWINTER_BRANCH_ID, SAPLING_BRANCH_ID

 import time

-FAKE_SPROUT = ['-nuparams=5ba81b19:210', '-nuparams=76b809bb:220']
-FAKE_OVERWINTER = ['-nuparams=5ba81b19:10', '-nuparams=76b809bb:220']
+FAKE_SPROUT = [nuparams(OVERWINTER_BRANCH_ID, 210), nuparams(SAPLING_BRANCH_ID, 220)]
+FAKE_OVERWINTER = [nuparams(OVERWINTER_BRANCH_ID, 10), nuparams(SAPLING_BRANCH_ID, 220)]

 class RewindBlockIndexTest (BitcoinTestFramework):
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+# Copyright (c) 2014 The Bitcoin Core developers
+# Copyright (c) 2020 The Zcash developers
+# Distributed under the MIT software license, see the accompanying
+# file COPYING or https://www.opensource.org/licenses/mit-license.php .
+
+"""
+Exercise the chain rewind code at the Blossom boundary.
+
+Test case is:
+
+3 nodes are initialized, two of which are aware of the Blossom network upgrade,
+and one of which is not. On each node, the chain is advanced to just prior to
+the Blossom activation height; then, the network is split and each branch of
+the network produces blocks into the range of the upgraded protocol.
+
+The node that is not aware of Blossom activation is advanced beyond the maximum
+reorg length of 99 blocks, and then that node is shut down. When the node is
+restarted with knowledge of the network upgrade activation height, the checks on
+startup identify a need to reorg to come into agreement with the rest of the network.
+However, since the rollback required is greater than the maximum reorg length,
+the node shuts down with an error as a precautionary measure. It was noticed in
+#4119 that the error message indicated an incorrect computation of the rollback
+length. This test reproduces that error, and the associated change in rollback
+length computation (40b5d5e3ea4b602c34c4efaba0b9f6171dddfef5) corrects the issue.
+"""
+
+from test_framework.test_framework import BitcoinTestFramework
+from test_framework.util import (assert_equal, assert_true,
+    initialize_chain_clean, start_nodes, start_node, connect_nodes_bi,
+    bitcoind_processes,
+    nuparams, OVERWINTER_BRANCH_ID, SAPLING_BRANCH_ID)
+
+import re
+import logging
+
+HAS_SAPLING = [nuparams(OVERWINTER_BRANCH_ID, 10), nuparams(SAPLING_BRANCH_ID, 15)]
+NO_SAPLING = [nuparams(OVERWINTER_BRANCH_ID, 10), nuparams(SAPLING_BRANCH_ID, 150)]
+
+class SaplingRewindTest(BitcoinTestFramework):
+    def setup_chain(self):
+        logging.info("Initializing test directory "+self.options.tmpdir)
+        initialize_chain_clean(self.options.tmpdir, 4)
+
+    # This mirrors how the network was set up in the bash test
+    def setup_network(self, split=False):
+        logging.info("Initializing the network in "+self.options.tmpdir)
+        self.nodes = start_nodes(3, self.options.tmpdir, extra_args=[
+            HAS_SAPLING, # The first two nodes have a correct view of the network,
+            HAS_SAPLING, # the third will rewind after upgrading.
+            NO_SAPLING
+        ])
+        connect_nodes_bi(self.nodes,0,1)
+        connect_nodes_bi(self.nodes,1,2)
+        connect_nodes_bi(self.nodes,0,2)
+        self.is_network_split=False
+        self.sync_all()
+
+    def run_test(self):
+        # Generate shared state up to the network split
+        logging.info("Generating initial blocks.")
+        self.nodes[0].generate(13)
+        block14 = self.nodes[0].generate(1)[0]
+        logging.info("Syncing network after initial generation...")
+        self.sync_all() # Everyone is still on overwinter
+
+        logging.info("Checking overwinter block propagation.")
+        assert_equal(self.nodes[0].getbestblockhash(), block14)
+        assert_equal(self.nodes[1].getbestblockhash(), block14)
+        assert_equal(self.nodes[2].getbestblockhash(), block14)
+        logging.info("All nodes are on overwinter.")
+
+        logging.info("Generating network split...")
+        self.is_network_split=True
+
+        # generate past the boundary into sapling; this will become the "canonical" branch
+        self.nodes[0].generate(50)
+        expected = self.nodes[0].getbestblockhash()
+
+        # generate blocks into sapling beyond the maximum rewind length (99 blocks)
+        self.nodes[2].generate(120)
+        self.sync_all()
+
+        assert_true(expected != self.nodes[2].getbestblockhash(), "Split chains have not diverged!")
+
+        # Stop the overwinter node to ensure state is flushed to disk.
+        logging.info("Shutting down lagging node...")
+        self.nodes[2].stop()
+        bitcoind_processes[2].wait()
+
+        # Restart the nodes, reconnect, and sync the network. This succeeds if "-reindex" is passed.
+        logging.info("Reconnecting the network...")
+        try:
+            # expect an exception; the node will refuse to fully start because its last point of
+            # agreement with the rest of the network was prior to the network upgrade activation
+            self.nodes[2] = start_node(2, self.options.tmpdir, extra_args=HAS_SAPLING) # + ["-reindex"])
+        except:
+            logpath = self.options.tmpdir + "/node2/regtest/debug.log"
+            found = False
+            with open(logpath, 'r') as f:
+                for line in f:
+                    # Search for the rollback message in the debug log, and ensure that it has the
+                    # correct expected rollback length.
+                    m = re.search(r'roll back ([0-9]+)', line)
+                    if m is None:
+                        continue
+                    elif m.group(1) == "120":
+                        found = True
+                        break
+                    else:
+                        raise AssertionError("Incorrect rollback length %s found, expected 120." %(m.group(1)))
+
+            if not found:
+                raise AssertionError("Expected rollback message not found in log file.")
+
+            # restart the node with -reindex to allow the test to complete gracefully,
+            # otherwise the node shutdown call in test cleanup will throw an error since
+            # it can't connect
+            self.nodes[2] = start_node(2, self.options.tmpdir, extra_args=NO_SAPLING + ["-reindex"])
+        else:
+            raise AssertionError("Expected node to halt due to excessive rewind length.")
+
+if __name__ == '__main__':
+    SaplingRewindTest().main()
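
As an aside (not part of the diff): the log check at the end of run_test only needs the rollback count from the node's shutdown message. A minimal, self-contained sketch of that check, assuming a log line shaped like the error text in src/main.cpp below:

import re

# Hypothetical sample line; the real test reads node2's debug.log instead.
sample = "Error: A block chain rewind has been detected that would roll back 120 blocks!"
m = re.search(r'roll back ([0-9]+)', sample)
assert m is not None and m.group(1) == "120"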
@@ -53,16 +53,8 @@ OVERWINTER_VERSION_GROUP_ID = 0x03C48270
 SAPLING_VERSION_GROUP_ID = 0x892F2085
 # No transaction format change in Blossom.
-
-SPROUT_BRANCH_ID = 0x00000000
-OVERWINTER_BRANCH_ID = 0x5BA81B19
-SAPLING_BRANCH_ID = 0x76B809BB
-BLOSSOM_BRANCH_ID = 0x2BB40E60
-HEARTWOOD_BRANCH_ID = 0xF5B9230B
-NU4_BRANCH_ID = 0xE9FF75A6
-
 MAX_INV_SZ = 50000


 COIN = 100000000 # 1 zec in zatoshis

 # Keep our own socket map for asyncore, so that we can track disconnects
@@ -30,6 +30,12 @@ COVERAGE_DIR = None
 PRE_BLOSSOM_BLOCK_TARGET_SPACING = 150
 POST_BLOSSOM_BLOCK_TARGET_SPACING = 75

+SPROUT_BRANCH_ID = 0x00000000
+OVERWINTER_BRANCH_ID = 0x5BA81B19
+SAPLING_BRANCH_ID = 0x76B809BB
+BLOSSOM_BRANCH_ID = 0x2BB40E60
+HEARTWOOD_BRANCH_ID = 0xF5B9230B
+NU4_BRANCH_ID = 0xE9FF75A6

 def enable_coverage(dirname):
     """Maintain a log of which RPC calls are made during testing."""
@@ -591,3 +597,6 @@ def check_node_log(self, node_number, line_to_check, stop_node = True):
         if line_to_check in logline:
             return n
     raise AssertionError(repr(line_to_check) + " not found")
+
+def nuparams(branch_id, height):
+    return '-nuparams=%x:%d' % (branch_id, height)
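
For reference (not part of the diff), the new nuparams() helper produces exactly the -nuparams strings that the RewindBlockIndexTest hunk above replaces; a quick sanity check under that assumption:

# Standalone check; constants copied from the util.py hunk above.
OVERWINTER_BRANCH_ID = 0x5BA81B19
SAPLING_BRANCH_ID = 0x76B809BB

def nuparams(branch_id, height):
    return '-nuparams=%x:%d' % (branch_id, height)

# These reproduce the literal FAKE_SPROUT flags removed earlier in this commit.
assert nuparams(OVERWINTER_BRANCH_ID, 210) == '-nuparams=5ba81b19:210'
assert nuparams(SAPLING_BRANCH_ID, 220) == '-nuparams=76b809bb:220'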
src/main.cpp (22 changed lines)
@@ -4752,20 +4752,23 @@ bool RewindBlockIndex(const CChainParams& chainparams, bool& clearWitnessCaches)
         *pindex->nCachedBranchId == CurrentEpochBranchId(pindex->nHeight, consensus);
     };

-    int nHeight = 1;
-    while (nHeight <= chainActive.Height()) {
-        if (!sufficientlyValidated(chainActive[nHeight])) {
+    int lastValidHeight = 0;
+    while (lastValidHeight < chainActive.Height()) {
+        if (sufficientlyValidated(chainActive[lastValidHeight + 1])) {
+            lastValidHeight++;
+        } else {
             break;
         }
-        nHeight++;
     }

-    // nHeight is now the height of the first insufficiently-validated block, or tipheight + 1
-    auto rewindLength = chainActive.Height() - nHeight;
+    // lastValidHeight is now the height of the last valid block below the active chain height
+    auto rewindLength = chainActive.Height() - lastValidHeight;
     clearWitnessCaches = false;

     if (rewindLength > 0) {
-        LogPrintf("*** First insufficiently validated block at height %d, rewind length %d\n", nHeight, rewindLength);
+        LogPrintf("*** Last validated block at height %d, active height is %d; rewind length %d\n", lastValidHeight, chainActive.Height(), rewindLength);
+
+        auto nHeight = lastValidHeight + 1;
         const uint256 *phashFirstInsufValidated = chainActive[nHeight]->phashBlock;
         auto networkID = chainparams.NetworkIDString();
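
To make the new loop's behavior concrete (editorial sketch in Python, not part of the diff): lastValidHeight walks up from 0 and stops at the last block that still passes sufficientlyValidated, so the rewind length counts every block above it.

# Illustrative model only: validated[h] stands in for sufficientlyValidated(chainActive[h]).
def last_valid_height(validated, tip_height):
    last_valid = 0
    while last_valid < tip_height:
        if validated[last_valid + 1]:
            last_valid += 1
        else:
            break
    return last_valid

# Blocks 1-14 validate under the restarted node's rules; 15-134 were mined on the wrong branch.
validated = {h: h <= 14 for h in range(1, 135)}
lv = last_valid_height(validated, 134)
assert lv == 14
assert 134 - lv == 120  # rewind length, with no off-by-one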
@@ -4777,7 +4780,6 @@ bool RewindBlockIndex(const CChainParams& chainparams, bool& clearWitnessCaches)
                 uint256S("002e1d6daf4ab7b296e7df839dc1bee9d615583bb4bc34b1926ce78307532852"));

         clearWitnessCaches = (rewindLength > MAX_REORG_LENGTH && intendedRewind);

         if (clearWitnessCaches) {
             auto msg = strprintf(_(
                 "An intended block chain rewind has been detected: network %s, hash %s, height %d"
@@ -4787,7 +4789,7 @@ bool RewindBlockIndex(const CChainParams& chainparams, bool& clearWitnessCaches)

         if (rewindLength > MAX_REORG_LENGTH && !intendedRewind) {
             auto pindexOldTip = chainActive.Tip();
-            auto pindexRewind = chainActive[nHeight - 1];
+            auto pindexRewind = chainActive[lastValidHeight];
             auto msg = strprintf(_(
                 "A block chain rewind has been detected that would roll back %d blocks! "
                 "This is larger than the maximum of %d blocks, and so the node is shutting down for your safety."
@@ -4807,7 +4809,7 @@ bool RewindBlockIndex(const CChainParams& chainparams, bool& clearWitnessCaches)

     CValidationState state;
     CBlockIndex* pindex = chainActive.Tip();
-    while (chainActive.Height() >= nHeight) {
+    while (chainActive.Height() > lastValidHeight) {
         if (fPruneMode && !(chainActive.Tip()->nStatus & BLOCK_HAVE_DATA)) {
             // If pruning, don't try rewinding past the HAVE_DATA point;
             // since older blocks can't be served anyway, there's