From 6488894210d3f7d26e41d51d6245a2c4443bbe7f Mon Sep 17 00:00:00 2001 From: Anton Kalyaev Date: Tue, 20 Dec 2016 19:25:02 +0400 Subject: [PATCH 01/12] add .vagrant to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 897b9a36..f3af0c40 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ rpc/test/.tendermint remote_dump .revision vendor +.vagrant From 3d47ef9d74b380ef739977a47d186778e1725d1a Mon Sep 17 00:00:00 2001 From: Anton Kalyaev Date: Tue, 20 Dec 2016 19:25:47 +0400 Subject: [PATCH 02/12] fix typo --- test/p2p/fast_sync/test_peer.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/p2p/fast_sync/test_peer.sh b/test/p2p/fast_sync/test_peer.sh index d3c10129..97821aa2 100644 --- a/test/p2p/fast_sync/test_peer.sh +++ b/test/p2p/fast_sync/test_peer.sh @@ -14,12 +14,12 @@ N=$4 ############################################################### -echo "Testing fasysync on node $COUNT" +echo "Testing fastsync on node $COUNT" -# kill peer +# kill peer set +e # circle sigh :( docker rm -vf local_testnet_$COUNT -set -e +set -e # restart peer - should have an empty blockchain SEEDS="$(test/p2p/ip.sh 1):46656" From 1c24031dd2c0c4bcb923df200165fdf198281036 Mon Sep 17 00:00:00 2001 From: Anton Kalyaev Date: Tue, 20 Dec 2016 19:28:41 +0400 Subject: [PATCH 03/12] rename COUNT to ID --- test/p2p/fast_sync/test_peer.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/p2p/fast_sync/test_peer.sh b/test/p2p/fast_sync/test_peer.sh index 97821aa2..135c5ddc 100644 --- a/test/p2p/fast_sync/test_peer.sh +++ b/test/p2p/fast_sync/test_peer.sh @@ -3,7 +3,7 @@ set -eu DOCKER_IMAGE=$1 NETWORK_NAME=$2 -COUNT=$3 +ID=$3 N=$4 ############################################################### @@ -14,11 +14,11 @@ N=$4 ############################################################### -echo "Testing fastsync on node $COUNT" +echo "Testing fastsync on node $ID" # kill peer set +e # 
circle sigh :( -docker rm -vf local_testnet_$COUNT +docker rm -vf local_testnet_$ID set -e # restart peer - should have an empty blockchain @@ -26,10 +26,10 @@ SEEDS="$(test/p2p/ip.sh 1):46656" for j in `seq 2 $N`; do SEEDS="$SEEDS,$(test/p2p/ip.sh $j):46656" done -bash test/p2p/peer.sh $DOCKER_IMAGE $NETWORK_NAME $COUNT $SEEDS +bash test/p2p/peer.sh $DOCKER_IMAGE $NETWORK_NAME $ID $SEEDS # wait for peer to sync and check the app hash -bash test/p2p/client.sh $DOCKER_IMAGE $NETWORK_NAME fs_$COUNT "test/p2p/fast_sync/check_peer.sh $COUNT" +bash test/p2p/client.sh $DOCKER_IMAGE $NETWORK_NAME fs_$ID "test/p2p/fast_sync/check_peer.sh $ID" echo "" echo "PASS" From 57f35924118dd47807dac399c55e005533bcf479 Mon Sep 17 00:00:00 2001 From: Anton Kalyaev Date: Tue, 20 Dec 2016 21:16:39 +0400 Subject: [PATCH 04/12] fix 2 errors when running p2p tests more than once Error #1: ``` Error response from daemon: network with name local_testnet already exists ``` Fixed by stopping and removing local_testnet containers and removing the network Error #2: ``` docker: Error response from daemon: Conflict. The name "/test_container_basic" is already in use by container a7cd15d479a964675e7f259de4ed852e7dfef85b447514728f437cd0b980a709. You have to remove (or rename) that container to beable to reuse that name.. ``` Fixed by adding `--rm` flag. 
--- test/p2p/client.sh | 2 +- .../p2p/{local_testnet.sh => local_testnet_start.sh} | 0 test/p2p/local_testnet_stop.sh | 12 ++++++++++++ test/p2p/test.sh | 7 ++++++- 4 files changed, 19 insertions(+), 2 deletions(-) rename test/p2p/{local_testnet.sh => local_testnet_start.sh} (100%) create mode 100644 test/p2p/local_testnet_stop.sh diff --git a/test/p2p/client.sh b/test/p2p/client.sh index efc5096e..b1ac64e7 100644 --- a/test/p2p/client.sh +++ b/test/p2p/client.sh @@ -8,7 +8,7 @@ CMD=$4 echo "starting test client container with CMD=$CMD" # run the test container on the local network -docker run -t \ +docker run -t --rm \ -v $GOPATH/src/github.com/tendermint/tendermint/test/p2p/:/go/src/github.com/tendermint/tendermint/test/p2p \ --net=$NETWORK_NAME \ --ip=$(test/p2p/ip.sh "-1") \ diff --git a/test/p2p/local_testnet.sh b/test/p2p/local_testnet_start.sh similarity index 100% rename from test/p2p/local_testnet.sh rename to test/p2p/local_testnet_start.sh diff --git a/test/p2p/local_testnet_stop.sh b/test/p2p/local_testnet_stop.sh new file mode 100644 index 00000000..6fe23ab2 --- /dev/null +++ b/test/p2p/local_testnet_stop.sh @@ -0,0 +1,12 @@ +#! 
/bin/bash +set -u + +NETWORK_NAME=$1 +N=$2 + +for i in `seq 1 $N`; do + docker stop local_testnet_$i + docker rm local_testnet_$i +done + +docker network rm $NETWORK_NAME diff --git a/test/p2p/test.sh b/test/p2p/test.sh index 7a64d464..58beb9a4 100644 --- a/test/p2p/test.sh +++ b/test/p2p/test.sh @@ -7,8 +7,13 @@ N=4 cd $GOPATH/src/github.com/tendermint/tendermint +# stop the existing testnet and remove local network +set +e +bash test/p2p/local_testnet_stop.sh $NETWORK_NAME $N +set -e + # start the testnet on a local network -bash test/p2p/local_testnet.sh $DOCKER_IMAGE $NETWORK_NAME $N +bash test/p2p/local_testnet_start.sh $DOCKER_IMAGE $NETWORK_NAME $N # test basic connectivity and consensus # start client container and check the num peers and height for all nodes From 30328548f7616ddd3e78bb145dae8ddf774cff8e Mon Sep 17 00:00:00 2001 From: Anton Kalyaev Date: Wed, 21 Dec 2016 01:36:06 +0400 Subject: [PATCH 05/12] test/p2p: kill and restart all nodes --- .gitignore | 1 + test/p2p/kill_all/check_peers.sh | 48 ++++++++++++++++++++++++++++++++ test/p2p/kill_all/test.sh | 29 +++++++++++++++++++ test/p2p/local_testnet_stop.sh | 2 +- test/p2p/peer.sh | 1 + test/p2p/test.sh | 3 ++ 6 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 test/p2p/kill_all/check_peers.sh create mode 100644 test/p2p/kill_all/test.sh diff --git a/.gitignore b/.gitignore index f3af0c40..acc957a9 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ remote_dump .revision vendor .vagrant +test/p2p/data/ diff --git a/test/p2p/kill_all/check_peers.sh b/test/p2p/kill_all/check_peers.sh new file mode 100644 index 00000000..31f6d550 --- /dev/null +++ b/test/p2p/kill_all/check_peers.sh @@ -0,0 +1,48 @@ +#! 
/bin/bash +set -eu + +NUM_OF_PEERS=$1 + +# how many attempts for each peer to catch up by height +MAX_ATTEMPTS_TO_CATCH_UP=10 + +echo "Waiting for nodes to come online" +set +e +for i in $(seq 1 "$NUM_OF_PEERS"); do + addr=$(test/p2p/ip.sh "$i"):46657 + curl -s "$addr/status" > /dev/null + ERR=$? + while [ "$ERR" != 0 ]; do + sleep 1 + curl -s "$addr/status" > /dev/null + ERR=$? + done + echo "... node $i is up" +done +set -e + +# get the first peer's height +addr=$(test/p2p/ip.sh 1):46657 +h1=$(curl -s "$addr/status" | jq .result[1].latest_block_height) +echo "1st peer is on height $h1" + +echo "Waiting until other peers reporting a height higher than the 1st one" +for i in $(seq 2 "$NUM_OF_PEERS"); do + attempt=1 + hi=0 + + while [[ $hi -le $h1 ]] ; do + addr=$(test/p2p/ip.sh "$i"):46657 + hi=$(curl -s "$addr/status" | jq .result[1].latest_block_height) + + echo "... peer $i is on height $hi" + + ((attempt++)) + if [ "$attempt" -ge $MAX_ATTEMPTS_TO_CATCH_UP ] ; then + echo "$attempt unsuccessful attempts were made to catch up" + exit 1 + fi + + sleep 1 + done +done diff --git a/test/p2p/kill_all/test.sh b/test/p2p/kill_all/test.sh new file mode 100644 index 00000000..e8157d7b --- /dev/null +++ b/test/p2p/kill_all/test.sh @@ -0,0 +1,29 @@ +#! 
/bin/bash +set -eu + +DOCKER_IMAGE=$1 +NETWORK_NAME=$2 +NUM_OF_PEERS=$3 +NUM_OF_CRASHES=$4 + +cd "$GOPATH/src/github.com/tendermint/tendermint" + +############################################################### +# NUM_OF_CRASHES times: +# restart all peers +# wait for them to sync and check that they are making progress +############################################################### + +for i in $(seq 1 "$NUM_OF_CRASHES"); do + # restart all peers + for i in $(seq 1 "$NUM_OF_PEERS"); do + docker stop "local_testnet_$i" + docker start "local_testnet_$i" + done + + bash test/p2p/client.sh "$DOCKER_IMAGE" "$NETWORK_NAME" kill_all "test/p2p/kill_all/check_peers.sh $NUM_OF_PEERS" +done + +echo "" +echo "PASS" +echo "" diff --git a/test/p2p/local_testnet_stop.sh b/test/p2p/local_testnet_stop.sh index 6fe23ab2..8edd3eeb 100644 --- a/test/p2p/local_testnet_stop.sh +++ b/test/p2p/local_testnet_stop.sh @@ -6,7 +6,7 @@ N=$2 for i in `seq 1 $N`; do docker stop local_testnet_$i - docker rm local_testnet_$i + docker rm -vf local_testnet_$i done docker network rm $NETWORK_NAME diff --git a/test/p2p/peer.sh b/test/p2p/peer.sh index d1405eff..32e696e7 100644 --- a/test/p2p/peer.sh +++ b/test/p2p/peer.sh @@ -19,5 +19,6 @@ docker run -d \ --ip=$(test/p2p/ip.sh $ID) \ --name local_testnet_$ID \ --entrypoint tendermint \ + -v $GOPATH/src/github.com/tendermint/tendermint/test/p2p/:/go/src/github.com/tendermint/tendermint/test/p2p \ -e TMROOT=/go/src/github.com/tendermint/tendermint/test/p2p/data/mach$ID/core \ $DOCKER_IMAGE node $SEEDS --proxy_app=dummy diff --git a/test/p2p/test.sh b/test/p2p/test.sh index 58beb9a4..2a45e7f8 100644 --- a/test/p2p/test.sh +++ b/test/p2p/test.sh @@ -26,3 +26,6 @@ bash test/p2p/client.sh $DOCKER_IMAGE $NETWORK_NAME ab "test/p2p/atomic_broadcas # test fast sync (from current state of network): # for each node, kill it and readd via fast sync bash test/p2p/fast_sync/test.sh $DOCKER_IMAGE $NETWORK_NAME $N + +# test killing all peers +bash 
test/p2p/kill_all/test.sh $DOCKER_IMAGE $NETWORK_NAME $N 3 From 69a449a073f85919b622637ab731c173678f5b4e Mon Sep 17 00:00:00 2001 From: Anton Kalyaev Date: Thu, 22 Dec 2016 01:39:06 +0400 Subject: [PATCH 06/12] test/p2p: use PROXY_APP=persistent_dummy --- proxy/client.go | 2 ++ test/p2p/fast_sync/test.sh | 3 ++- test/p2p/fast_sync/test_peer.sh | 3 ++- test/p2p/kill_all/test.sh | 3 +++ test/p2p/local_testnet_start.sh | 3 ++- test/p2p/peer.sh | 16 ++++++++-------- test/p2p/test.sh | 6 ++++-- 7 files changed, 23 insertions(+), 13 deletions(-) diff --git a/proxy/client.go b/proxy/client.go index c6e03e78..587b4546 100644 --- a/proxy/client.go +++ b/proxy/client.go @@ -71,6 +71,8 @@ func DefaultClientCreator(config cfg.Config) ClientCreator { switch addr { case "dummy": return NewLocalClientCreator(dummy.NewDummyApplication()) + case "persistent_dummy": + return NewLocalClientCreator(dummy.NewPersistentDummyApplication(config.GetString("db_dir"))) case "nilapp": return NewLocalClientCreator(nilapp.NewNilApplication()) default: diff --git a/test/p2p/fast_sync/test.sh b/test/p2p/fast_sync/test.sh index b4ac90f9..8820d199 100644 --- a/test/p2p/fast_sync/test.sh +++ b/test/p2p/fast_sync/test.sh @@ -4,12 +4,13 @@ set -eu DOCKER_IMAGE=$1 NETWORK_NAME=$2 N=$3 +PROXY_APP=$4 cd $GOPATH/src/github.com/tendermint/tendermint # run it on each of them for i in `seq 1 $N`; do - bash test/p2p/fast_sync/test_peer.sh $DOCKER_IMAGE $NETWORK_NAME $i $N + bash test/p2p/fast_sync/test_peer.sh $DOCKER_IMAGE $NETWORK_NAME $i $N $PROXY_APP done diff --git a/test/p2p/fast_sync/test_peer.sh b/test/p2p/fast_sync/test_peer.sh index 135c5ddc..a065ea5c 100644 --- a/test/p2p/fast_sync/test_peer.sh +++ b/test/p2p/fast_sync/test_peer.sh @@ -5,6 +5,7 @@ DOCKER_IMAGE=$1 NETWORK_NAME=$2 ID=$3 N=$4 +PROXY_APP=$5 ############################################################### # this runs on each peer: @@ -26,7 +27,7 @@ SEEDS="$(test/p2p/ip.sh 1):46656" for j in `seq 2 $N`; do SEEDS="$SEEDS,$(test/p2p/ip.sh 
$j):46656" done -bash test/p2p/peer.sh $DOCKER_IMAGE $NETWORK_NAME $ID $SEEDS +bash test/p2p/peer.sh $DOCKER_IMAGE $NETWORK_NAME $ID $PROXY_APP $SEEDS # wait for peer to sync and check the app hash bash test/p2p/client.sh $DOCKER_IMAGE $NETWORK_NAME fs_$ID "test/p2p/fast_sync/check_peer.sh $ID" diff --git a/test/p2p/kill_all/test.sh b/test/p2p/kill_all/test.sh index e8157d7b..ab8e29c5 100644 --- a/test/p2p/kill_all/test.sh +++ b/test/p2p/kill_all/test.sh @@ -15,6 +15,9 @@ cd "$GOPATH/src/github.com/tendermint/tendermint" ############################################################### for i in $(seq 1 "$NUM_OF_CRASHES"); do + echo "" + echo "Restarting all peers! Take $i ..." + # restart all peers for i in $(seq 1 "$NUM_OF_PEERS"); do docker stop "local_testnet_$i" diff --git a/test/p2p/local_testnet_start.sh b/test/p2p/local_testnet_start.sh index 50297d62..4dd2ab05 100644 --- a/test/p2p/local_testnet_start.sh +++ b/test/p2p/local_testnet_start.sh @@ -4,6 +4,7 @@ set -eu DOCKER_IMAGE=$1 NETWORK_NAME=$2 N=$3 +APP_PROXY=$4 cd $GOPATH/src/github.com/tendermint/tendermint @@ -17,5 +18,5 @@ done echo "Seeds: $seeds" for i in `seq 1 $N`; do - bash test/p2p/peer.sh $DOCKER_IMAGE $NETWORK_NAME $i $seeds + bash test/p2p/peer.sh $DOCKER_IMAGE $NETWORK_NAME $i $APP_PROXY $seeds done diff --git a/test/p2p/peer.sh b/test/p2p/peer.sh index 32e696e7..76314f58 100644 --- a/test/p2p/peer.sh +++ b/test/p2p/peer.sh @@ -4,9 +4,10 @@ set -eu DOCKER_IMAGE=$1 NETWORK_NAME=$2 ID=$3 +APP_PROXY=$4 set +u -SEEDS=$4 +SEEDS=$5 set -u if [[ "$SEEDS" != "" ]]; then SEEDS=" --seeds $SEEDS " @@ -15,10 +16,9 @@ fi echo "starting tendermint peer ID=$ID" # start tendermint container on the network docker run -d \ - --net=$NETWORK_NAME \ - --ip=$(test/p2p/ip.sh $ID) \ - --name local_testnet_$ID \ - --entrypoint tendermint \ - -v $GOPATH/src/github.com/tendermint/tendermint/test/p2p/:/go/src/github.com/tendermint/tendermint/test/p2p \ - -e 
TMROOT=/go/src/github.com/tendermint/tendermint/test/p2p/data/mach$ID/core \ - $DOCKER_IMAGE node $SEEDS --proxy_app=dummy + --net=$NETWORK_NAME \ + --ip=$(test/p2p/ip.sh $ID) \ + --name local_testnet_$ID \ + --entrypoint tendermint \ + -e TMROOT=/go/src/github.com/tendermint/tendermint/test/p2p/data/mach$ID/core \ + $DOCKER_IMAGE node $SEEDS --proxy_app=$APP_PROXY diff --git a/test/p2p/test.sh b/test/p2p/test.sh index 2a45e7f8..0f29aa19 100644 --- a/test/p2p/test.sh +++ b/test/p2p/test.sh @@ -4,6 +4,7 @@ set -eu DOCKER_IMAGE=$1 NETWORK_NAME=local_testnet N=4 +PROXY_APP=persistent_dummy cd $GOPATH/src/github.com/tendermint/tendermint @@ -13,7 +14,8 @@ bash test/p2p/local_testnet_stop.sh $NETWORK_NAME $N set -e # start the testnet on a local network -bash test/p2p/local_testnet_start.sh $DOCKER_IMAGE $NETWORK_NAME $N +# NOTE we re-use the same network for all tests +bash test/p2p/local_testnet_start.sh $DOCKER_IMAGE $NETWORK_NAME $N $PROXY_APP # test basic connectivity and consensus # start client container and check the num peers and height for all nodes @@ -25,7 +27,7 @@ bash test/p2p/client.sh $DOCKER_IMAGE $NETWORK_NAME ab "test/p2p/atomic_broadcas # test fast sync (from current state of network): # for each node, kill it and readd via fast sync -bash test/p2p/fast_sync/test.sh $DOCKER_IMAGE $NETWORK_NAME $N +bash test/p2p/fast_sync/test.sh $DOCKER_IMAGE $NETWORK_NAME $N $PROXY_APP # test killing all peers bash test/p2p/kill_all/test.sh $DOCKER_IMAGE $NETWORK_NAME $N 3 From e4921733df1d661ffe2dea7452de8b29c1b23c98 Mon Sep 17 00:00:00 2001 From: Ethan Buchman Date: Thu, 22 Dec 2016 02:49:50 -0500 Subject: [PATCH 07/12] test/persist: use fail-test failure indices --- test/persist/test.sh | 69 +----------------- .../{test2.sh => test_failure_indices.sh} | 0 test/persist/test_simple.sh | 70 +++++++++++++++++++ 3 files changed, 72 insertions(+), 67 deletions(-) rename test/persist/{test2.sh => test_failure_indices.sh} (100%) create mode 100644 
test/persist/test_simple.sh diff --git a/test/persist/test.sh b/test/persist/test.sh index 1a94a093..b27394c5 100644 --- a/test/persist/test.sh +++ b/test/persist/test.sh @@ -1,70 +1,5 @@ #! /bin/bash +cd $GOPATH/src/github.com/tendermint/tendermint -export TMROOT=$HOME/.tendermint_persist - -rm -rf $TMROOT -tendermint init - -function start_procs(){ - name=$1 - echo "Starting persistent dummy and tendermint" - dummy --persist $TMROOT/dummy &> "dummy_${name}.log" & - PID_DUMMY=$! - tendermint node &> tendermint_${name}.log & - PID_TENDERMINT=$! - sleep 5 -} - -function kill_procs(){ - kill -9 $PID_DUMMY $PID_TENDERMINT -} - - -function send_txs(){ - # send a bunch of txs over a few blocks - echo "Sending txs" - for i in `seq 1 5`; do - for j in `seq 1 100`; do - tx=`head -c 8 /dev/urandom | hexdump -ve '1/1 "%.2X"'` - curl -s 127.0.0.1:46657/broadcast_tx_async?tx=\"$tx\" &> /dev/null - done - sleep 1 - done -} - - -start_procs 1 -send_txs -kill_procs - -start_procs 2 - -# wait for node to handshake and make a new block -addr="localhost:46657" -curl -s $addr/status > /dev/null -ERR=$? -i=0 -while [ "$ERR" != 0 ]; do - sleep 1 - curl -s $addr/status > /dev/null - ERR=$? - i=$(($i + 1)) - if [[ $i == 10 ]]; then - echo "Timed out waiting for tendermint to start" - exit 1 - fi -done - -# wait for a new block -h1=`curl -s $addr/status | jq .result[1].latest_block_height` -h2=$h1 -while [ "$h2" == "$h1" ]; do - sleep 1 - h2=`curl -s $addr/status | jq .result[1].latest_block_height` -done - -kill_procs -sleep 2 - -echo "Passed Test: Persistence" +bash ./test/persist/test_failure_indices.sh diff --git a/test/persist/test2.sh b/test/persist/test_failure_indices.sh similarity index 100% rename from test/persist/test2.sh rename to test/persist/test_failure_indices.sh diff --git a/test/persist/test_simple.sh b/test/persist/test_simple.sh new file mode 100644 index 00000000..1a94a093 --- /dev/null +++ b/test/persist/test_simple.sh @@ -0,0 +1,70 @@ +#! 
/bin/bash + + +export TMROOT=$HOME/.tendermint_persist + +rm -rf $TMROOT +tendermint init + +function start_procs(){ + name=$1 + echo "Starting persistent dummy and tendermint" + dummy --persist $TMROOT/dummy &> "dummy_${name}.log" & + PID_DUMMY=$! + tendermint node &> tendermint_${name}.log & + PID_TENDERMINT=$! + sleep 5 +} + +function kill_procs(){ + kill -9 $PID_DUMMY $PID_TENDERMINT +} + + +function send_txs(){ + # send a bunch of txs over a few blocks + echo "Sending txs" + for i in `seq 1 5`; do + for j in `seq 1 100`; do + tx=`head -c 8 /dev/urandom | hexdump -ve '1/1 "%.2X"'` + curl -s 127.0.0.1:46657/broadcast_tx_async?tx=\"$tx\" &> /dev/null + done + sleep 1 + done +} + + +start_procs 1 +send_txs +kill_procs + +start_procs 2 + +# wait for node to handshake and make a new block +addr="localhost:46657" +curl -s $addr/status > /dev/null +ERR=$? +i=0 +while [ "$ERR" != 0 ]; do + sleep 1 + curl -s $addr/status > /dev/null + ERR=$? + i=$(($i + 1)) + if [[ $i == 10 ]]; then + echo "Timed out waiting for tendermint to start" + exit 1 + fi +done + +# wait for a new block +h1=`curl -s $addr/status | jq .result[1].latest_block_height` +h2=$h1 +while [ "$h2" == "$h1" ]; do + sleep 1 + h2=`curl -s $addr/status | jq .result[1].latest_block_height` +done + +kill_procs +sleep 2 + +echo "Passed Test: Persistence" From f4e6cf4439b38f6e84d33d84dc8ea915c1e71d8c Mon Sep 17 00:00:00 2001 From: Ethan Buchman Date: Thu, 22 Dec 2016 15:01:02 -0500 Subject: [PATCH 08/12] consensus: sync wal.writeHeight --- consensus/wal.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/consensus/wal.go b/consensus/wal.go index 2c03027c..099e3c1a 100644 --- a/consensus/wal.go +++ b/consensus/wal.go @@ -104,4 +104,9 @@ func (wal *WAL) Save(wmsg WALMessage) { func (wal *WAL) writeHeight(height int) { wal.group.WriteLine(Fmt("#HEIGHT: %v", height)) + + // TODO: only flush when necessary + if err := wal.group.Flush(); err != nil { + PanicQ(Fmt("Error flushing consensus wal buf to file. 
Error: %v \n", err)) + } } From 0e7694ca94374922e3cc5e135a35a31c351c4501 Mon Sep 17 00:00:00 2001 From: Ethan Buchman Date: Thu, 22 Dec 2016 15:01:22 -0500 Subject: [PATCH 09/12] state: AppHashIsStale -> IntermediateState --- state/execution.go | 68 +++++++++++++++++++--------- state/state.go | 67 ++++++++++++++++++++------- test/persist/test_failure_indices.sh | 2 +- 3 files changed, 99 insertions(+), 38 deletions(-) diff --git a/state/execution.go b/state/execution.go index cec4849a..e1cea605 100644 --- a/state/execution.go +++ b/state/execution.go @@ -56,7 +56,9 @@ func (s *State) ExecBlock(eventCache types.Fireable, proxyAppConn proxy.AppConnC // save state with updated height/blockhash/validators // but stale apphash, in case we fail between Commit and Save - s.Save() + s.SaveIntermediate() + + fail.Fail() // XXX return nil } @@ -264,7 +266,6 @@ func (s *State) CommitStateUpdateMempool(proxyAppConn proxy.AppConnConsensus, bl // Set the state's new AppHash s.AppHash = res.Data - s.AppHashIsStale = false // Update mempool. 
mempool.Update(block.Height, block.Txs) @@ -322,7 +323,7 @@ func (h *Handshaker) Handshake(proxyApp proxy.AppConns) error { return nil } - log.Notice("TMSP Handshake", "height", blockInfo.BlockHeight, "app_hash", blockInfo.AppHash) + log.Notice("TMSP Handshake", "appHeight", blockInfo.BlockHeight, "appHash", blockInfo.AppHash) blockHeight := int(blockInfo.BlockHeight) // XXX: beware overflow appHash := blockInfo.AppHash @@ -352,29 +353,46 @@ func (h *Handshaker) Handshake(proxyApp proxy.AppConns) error { func (h *Handshaker) ReplayBlocks(appHash []byte, appBlockHeight int, appConnConsensus proxy.AppConnConsensus) error { storeBlockHeight := h.store.Height() - if storeBlockHeight < appBlockHeight { + stateBlockHeight := h.state.LastBlockHeight + log.Notice("TMSP Replay Blocks", "appHeight", appBlockHeight, "storeHeight", storeBlockHeight, "stateHeight", stateBlockHeight) + + if storeBlockHeight == 0 { + return nil + } else if storeBlockHeight < appBlockHeight { // if the app is ahead, there's nothing we can do return ErrAppBlockHeightTooHigh{storeBlockHeight, appBlockHeight} } else if storeBlockHeight == appBlockHeight { - // if we crashed between Commit and SaveState, - // the state's app hash is stale - // otherwise we're synced - if h.state.AppHashIsStale { - h.state.AppHashIsStale = false + // We ran Commit, but if we crashed before state.Save(), + // load the intermediate state and update the state.AppHash. 
+ // NOTE: If TMSP allowed rollbacks, we could just replay the + // block even though it's been committed + stateAppHash := h.state.AppHash + lastBlockAppHash := h.store.LoadBlock(storeBlockHeight).AppHash + + if bytes.Equal(stateAppHash, appHash) { + // we're all synced up + log.Debug("TMSP RelpayBlocks: Already synced") + } else if bytes.Equal(stateAppHash, lastBlockAppHash) { + // we crashed after commit and before saving state, + // so load the intermediate state and update the hash + h.state.LoadIntermediate() h.state.AppHash = appHash + h.state.Save() + log.Debug("TMSP RelpayBlocks: Loaded intermediate state and updated state.AppHash") + } else { + PanicSanity(Fmt("Unexpected state.AppHash: state.AppHash %X; app.AppHash %X, lastBlock.AppHash %X", stateAppHash, appHash, lastBlockAppHash)) + } return nil - } else if h.state.LastBlockHeight == appBlockHeight { - // store is ahead of app but core's state height is at apps height - // this happens if we crashed after saving the block, - // but before committing it. 
We should be 1 ahead - if storeBlockHeight != appBlockHeight+1 { - PanicSanity(Fmt("core.state.height == app.height but store.height (%d) > app.height+1 (%d)", storeBlockHeight, appBlockHeight+1)) - } + } else if storeBlockHeight == appBlockHeight+1 && + storeBlockHeight == stateBlockHeight+1 { + // We crashed after saving the block + // but before Commit (both the state and app are behind), + // so just replay the block - // check that the blocks last apphash is the states apphash + // check that the lastBlock.AppHash matches the state apphash block := h.store.LoadBlock(storeBlockHeight) if !bytes.Equal(block.Header.AppHash, appHash) { return ErrLastStateMismatch{storeBlockHeight, block.Header.AppHash, appHash} @@ -385,13 +403,19 @@ func (h *Handshaker) ReplayBlocks(appHash []byte, appBlockHeight int, appConnCon h.nBlocks += 1 var eventCache types.Fireable // nil - // replay the block against the actual tendermint state + // replay the latest block return h.state.ApplyBlock(eventCache, appConnConsensus, block, blockMeta.PartsHeader, MockMempool{}) - + } else if storeBlockHeight != stateBlockHeight { + // unless we failed before committing or saving state (previous 2 cases), + // the store and state should be at the same height! 
+ PanicSanity(Fmt("Expected storeHeight (%d) and stateHeight (%d) to match.", storeBlockHeight, stateBlockHeight)) } else { - // either we're caught up or there's blocks to replay + // store is more than one ahead, + // so app wants to replay many blocks + // replay all blocks starting with appBlockHeight+1 var eventCache types.Fireable // nil + var appHash []byte for i := appBlockHeight + 1; i <= storeBlockHeight; i++ { h.nBlocks += 1 @@ -413,8 +437,10 @@ func (h *Handshaker) ReplayBlocks(appHash []byte, appBlockHeight int, appConnCon appHash = res.Data } if !bytes.Equal(h.state.AppHash, appHash) { - return errors.New(Fmt("Tendermint state.AppHash does not match AppHash after replay", "expected", h.state.AppHash, "got", appHash)) + return errors.New(Fmt("Tendermint state.AppHash does not match AppHash after replay. Got %X, expected %X", appHash, h.state.AppHash)) } return nil } + + return nil } diff --git a/state/state.go b/state/state.go index af2f69ca..455ba409 100644 --- a/state/state.go +++ b/state/state.go @@ -14,7 +14,8 @@ import ( ) var ( - stateKey = []byte("stateKey") + stateKey = []byte("stateKey") + stateIntermediateKey = []byte("stateIntermediateKey") ) //----------------------------------------------------------------------------- @@ -36,15 +37,17 @@ type State struct { Validators *types.ValidatorSet LastValidators *types.ValidatorSet // block.LastCommit validated against this - // AppHash is updated after Commit; - // it's stale after ExecBlock and before Commit - AppHashIsStale bool - AppHash []byte + // AppHash is updated after Commit + AppHash []byte } func LoadState(db dbm.DB) *State { + return loadState(db, stateKey) +} + +func loadState(db dbm.DB, key []byte) *State { s := &State{db: db} - buf := db.Get(stateKey) + buf := db.Get(key) if len(buf) == 0 { return nil } else { @@ -60,9 +63,6 @@ func LoadState(db dbm.DB) *State { } func (s *State) Copy() *State { - if s.AppHashIsStale { - PanicSanity(Fmt("App hash is stale: %v", s)) - } return 
&State{ db: s.db, GenesisDoc: s.GenesisDoc, @@ -72,7 +72,6 @@ func (s *State) Copy() *State { LastBlockTime: s.LastBlockTime, Validators: s.Validators.Copy(), LastValidators: s.LastValidators.Copy(), - AppHashIsStale: false, AppHash: s.AppHash, } } @@ -83,6 +82,35 @@ func (s *State) Save() { s.db.SetSync(stateKey, s.Bytes()) } +func (s *State) SaveIntermediate() { + s.mtx.Lock() + defer s.mtx.Unlock() + s.db.SetSync(stateIntermediateKey, s.Bytes()) +} + +// Load the intermediate state into the current state +// and do some sanity checks +func (s *State) LoadIntermediate() { + s2 := loadState(s.db, stateIntermediateKey) + if s.ChainID != s2.ChainID { + PanicSanity(Fmt("State mismatch for ChainID. Got %v, Expected %v", s2.ChainID, s.ChainID)) + } + + if s.LastBlockHeight+1 != s2.LastBlockHeight { + PanicSanity(Fmt("State mismatch for LastBlockHeight. Got %v, Expected %v", s2.LastBlockHeight, s.LastBlockHeight+1)) + } + + if !bytes.Equal(s.Validators.Hash(), s2.LastValidators.Hash()) { + PanicSanity(Fmt("State mismatch for LastValidators. Got %X, Expected %X", s2.LastValidators.Hash(), s.Validators.Hash())) + } + + if !bytes.Equal(s.AppHash, s2.AppHash) { + PanicSanity(Fmt("State mismatch for AppHash. 
Got %X, Expected %X", s2.AppHash, s.AppHash)) + } + + s.setBlockAndValidators(s2.LastBlockHeight, s2.LastBlockID, s2.LastBlockTime, s2.Validators.Copy(), s2.LastValidators.Copy()) +} + func (s *State) Equals(s2 *State) bool { return bytes.Equal(s.Bytes(), s2.Bytes()) } @@ -97,15 +125,22 @@ func (s *State) Bytes() []byte { } // Mutate state variables to match block and validators -// Since we don't have the new AppHash yet, we set s.AppHashIsStale=true +// after running EndBlock func (s *State) SetBlockAndValidators(header *types.Header, blockPartsHeader types.PartSetHeader, prevValSet, nextValSet *types.ValidatorSet) { - s.LastBlockHeight = header.Height - s.LastBlockID = types.BlockID{header.Hash(), blockPartsHeader} - s.LastBlockTime = header.Time + s.setBlockAndValidators(header.Height, + types.BlockID{header.Hash(), blockPartsHeader}, header.Time, + prevValSet, nextValSet) +} + +func (s *State) setBlockAndValidators( + height int, blockID types.BlockID, blockTime time.Time, + prevValSet, nextValSet *types.ValidatorSet) { + + s.LastBlockHeight = height + s.LastBlockID = blockID + s.LastBlockTime = blockTime s.Validators = nextValSet s.LastValidators = prevValSet - - s.AppHashIsStale = true } func (s *State) GetValidators() (*types.ValidatorSet, *types.ValidatorSet) { diff --git a/test/persist/test_failure_indices.sh b/test/persist/test_failure_indices.sh index 509deee7..7302ccac 100644 --- a/test/persist/test_failure_indices.sh +++ b/test/persist/test_failure_indices.sh @@ -14,7 +14,7 @@ function start_procs(){ PID_DUMMY=$! if [[ "$indexToFail" == "" ]]; then # run in background, dont fail - tendermint node &> tendermint_${name}.log & + tendermint node --log_level=debug &> tendermint_${name}.log & PID_TENDERMINT=$! 
else # run in foreground, fail From 0c01b0ded95e568adf829a13f9394b4aa88546ab Mon Sep 17 00:00:00 2001 From: Ethan Buchman Date: Thu, 22 Dec 2016 19:30:09 -0500 Subject: [PATCH 10/12] state.State and wal.writeHeight after handshake --- consensus/replay.go | 1 + consensus/state.go | 8 ++++++++ state/execution.go | 4 +++- test/docker/Dockerfile | 3 +++ test/persist/test_failure_indices.sh | 2 +- 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/consensus/replay.go b/consensus/replay.go index b69b4384..124d3b5f 100644 --- a/consensus/replay.go +++ b/consensus/replay.go @@ -101,6 +101,7 @@ func (cs *ConsensusState) catchupReplay(csHeight int) error { // Search for height marker gr, found, err = cs.wal.group.Search("#HEIGHT: ", makeHeightSearchFunc(csHeight)) if err == io.EOF { + log.Warn("Replay: wal.group.Search returned EOF", "height", csHeight) return nil } else if err != nil { return err diff --git a/consensus/state.go b/consensus/state.go index 31a2e2c9..9dead0cf 100644 --- a/consensus/state.go +++ b/consensus/state.go @@ -362,6 +362,14 @@ func (cs *ConsensusState) OnStart() error { // let's go for it anyways, maybe we're fine } + // If the latest block was applied in the tmsp handshake, + // we may not have written the current height to the wal, + // so write it here in case + if cs.Step == RoundStepNewHeight { + log.Warn("wal.writeHeight", "height", cs.Height) + cs.wal.writeHeight(cs.Height) + } + // now start the receiveRoutine go cs.receiveRoutine(0) diff --git a/state/execution.go b/state/execution.go index e1cea605..47ca0149 100644 --- a/state/execution.go +++ b/state/execution.go @@ -344,6 +344,9 @@ func (h *Handshaker) Handshake(proxyApp proxy.AppConns) error { return errors.New(Fmt("Error on replay: %v", err)) } + // Save the state + h.state.Save() + // TODO: (on restart) replay mempool return nil @@ -378,7 +381,6 @@ func (h *Handshaker) ReplayBlocks(appHash []byte, appBlockHeight int, appConnCon // so load the intermediate state and update 
the hash h.state.LoadIntermediate() h.state.AppHash = appHash - h.state.Save() log.Debug("TMSP RelpayBlocks: Loaded intermediate state and updated state.AppHash") } else { PanicSanity(Fmt("Unexpected state.AppHash: state.AppHash %X; app.AppHash %X, lastBlock.AppHash %X", stateAppHash, appHash, lastBlockAppHash)) diff --git a/test/docker/Dockerfile b/test/docker/Dockerfile index 7cc95254..5a859a28 100644 --- a/test/docker/Dockerfile +++ b/test/docker/Dockerfile @@ -21,5 +21,8 @@ COPY . $REPO RUN go install ./cmd/tendermint RUN bash scripts/install_tmsp_apps.sh +# expose the volume for debugging +VOLUME $REPO + EXPOSE 46656 EXPOSE 46657 diff --git a/test/persist/test_failure_indices.sh b/test/persist/test_failure_indices.sh index 7302ccac..d6012fbe 100644 --- a/test/persist/test_failure_indices.sh +++ b/test/persist/test_failure_indices.sh @@ -18,7 +18,7 @@ function start_procs(){ PID_TENDERMINT=$! else # run in foreground, fail - FAIL_TEST_INDEX=$indexToFail tendermint node &> tendermint_${name}.log + FAIL_TEST_INDEX=$indexToFail tendermint node --log_level=debug &> tendermint_${name}.log PID_TENDERMINT=$! 
fi } From bd222d6e3c902345659cb366d88453a3761a8dbe Mon Sep 17 00:00:00 2001 From: Ethan Buchman Date: Thu, 22 Dec 2016 20:50:13 -0500 Subject: [PATCH 11/12] test: more unique container names --- test/p2p/client.sh | 5 +++-- test/p2p/kill_all/test.sh | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/test/p2p/client.sh b/test/p2p/client.sh index b1ac64e7..f3025ba5 100644 --- a/test/p2p/client.sh +++ b/test/p2p/client.sh @@ -6,13 +6,14 @@ NETWORK_NAME=$2 ID=$3 CMD=$4 +NAME=test_container_$ID + echo "starting test client container with CMD=$CMD" # run the test container on the local network docker run -t --rm \ -v $GOPATH/src/github.com/tendermint/tendermint/test/p2p/:/go/src/github.com/tendermint/tendermint/test/p2p \ --net=$NETWORK_NAME \ --ip=$(test/p2p/ip.sh "-1") \ - --name test_container_$ID \ + --name $NAME \ --entrypoint bash \ $DOCKER_IMAGE $CMD - diff --git a/test/p2p/kill_all/test.sh b/test/p2p/kill_all/test.sh index ab8e29c5..318a1fe4 100644 --- a/test/p2p/kill_all/test.sh +++ b/test/p2p/kill_all/test.sh @@ -19,12 +19,12 @@ for i in $(seq 1 "$NUM_OF_CRASHES"); do echo "Restarting all peers! Take $i ..." 
# restart all peers - for i in $(seq 1 "$NUM_OF_PEERS"); do - docker stop "local_testnet_$i" - docker start "local_testnet_$i" + for j in $(seq 1 "$NUM_OF_PEERS"); do + docker stop "local_testnet_$j" + docker start "local_testnet_$j" done - bash test/p2p/client.sh "$DOCKER_IMAGE" "$NETWORK_NAME" kill_all "test/p2p/kill_all/check_peers.sh $NUM_OF_PEERS" + bash test/p2p/client.sh "$DOCKER_IMAGE" "$NETWORK_NAME" kill_all_$i "test/p2p/kill_all/check_peers.sh $NUM_OF_PEERS" done echo "" From bae0bc02a6006c8ede83a33904746b8b8d9ad701 Mon Sep 17 00:00:00 2001 From: Ethan Buchman Date: Thu, 5 Jan 2017 20:16:42 -0800 Subject: [PATCH 12/12] consensus: be more explicit when we need to write height after handshake --- consensus/state.go | 26 ++++++++++++++++++-------- state/execution.go | 4 +++- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/consensus/state.go b/consensus/state.go index 9dead0cf..4b541ccb 100644 --- a/consensus/state.go +++ b/consensus/state.go @@ -4,6 +4,7 @@ import ( "bytes" "errors" "fmt" + "io" "reflect" "sync" "time" @@ -348,6 +349,23 @@ func (cs *ConsensusState) OnStart() error { return err } + // If the latest block was applied in the tmsp handshake, + // we may not have written the current height to the wal, + // so check here and write it if not found. + // TODO: remove this and run the handshake/replay + // through the consensus state with a mock app + gr, found, err := cs.wal.group.Search("#HEIGHT: ", makeHeightSearchFunc(cs.Height)) + if (err == io.EOF || !found) && cs.Step == RoundStepNewHeight { + log.Warn("Height not found in wal. Writing new height", "height", cs.Height) + rs := cs.RoundStateEvent() + cs.wal.Save(rs) + } else if err != nil { + return err + } + if gr != nil { + gr.Close() + } + // we need the timeoutRoutine for replay so // we don't block on the tick chan. 
// NOTE: we will get a build up of garbage go routines @@ -362,14 +380,6 @@ func (cs *ConsensusState) OnStart() error { // let's go for it anyways, maybe we're fine } - // If the latest block was applied in the tmsp handshake, - // we may not have written the current height to the wal, - // so write it here in case - if cs.Step == RoundStepNewHeight { - log.Warn("wal.writeHeight", "height", cs.Height) - cs.wal.writeHeight(cs.Height) - } - // now start the receiveRoutine go cs.receiveRoutine(0) diff --git a/state/execution.go b/state/execution.go index 47ca0149..f8e2c1d8 100644 --- a/state/execution.go +++ b/state/execution.go @@ -418,6 +418,9 @@ func (h *Handshaker) ReplayBlocks(appHash []byte, appBlockHeight int, appConnCon // replay all blocks starting with appBlockHeight+1 var eventCache types.Fireable // nil + // TODO: use stateBlockHeight instead and let the consensus state + // do the replay + var appHash []byte for i := appBlockHeight + 1; i <= storeBlockHeight; i++ { h.nBlocks += 1 @@ -443,6 +446,5 @@ func (h *Handshaker) ReplayBlocks(appHash []byte, appBlockHeight int, appConnCon } return nil } - return nil }