diff --git a/net/gce.sh b/net/gce.sh index 484b05dac..c36101381 100755 --- a/net/gce.sh +++ b/net/gce.sh @@ -169,6 +169,7 @@ delete) exit 0 fi gcloud_DeleteInstances + rm -f "$configFile" ;; create) diff --git a/net/net.sh b/net/net.sh index 0027e63c9..bde1de727 100755 --- a/net/net.sh +++ b/net/net.sh @@ -17,9 +17,10 @@ usage: $0 [start|stop] Operate a configured testnet - start - Start the network - sanity - Sanity check the network - stop - Stop the network + start - Start the network + sanity - Sanity check the network + stop - Stop the network + restart - Shortcut for stop then start start-specific options: -S snapFilename - Deploy the specified Snap file @@ -29,7 +30,7 @@ Operate a configured testnet Note: if RUST_LOG is set in the environment it will be propogated into the network nodes. - sanity-specific options: + sanity/start-specific options: -o noLedgerVerify - Skip ledger verification -o noValidatorSanity - Skip validatory sanity @@ -49,8 +50,6 @@ sanityExtraArgs= command=$1 [[ -n $command ]] || usage shift -[[ $command = start || $command = sanity || $command = stop ]] || - usage "Invalid command: $command" while getopts "h?S:s:a:o:" opt; do case $opt in @@ -58,7 +57,6 @@ while getopts "h?S:s:a:o:" opt; do usage ;; S) - [[ $command = start ]] || usage "-s is only valid with the 'start' command" snapFilename=$OPTARG [[ -f $snapFilename ]] || usage "Snap not readable: $snapFilename" deployMethod=snap @@ -67,6 +65,7 @@ while getopts "h?S:s:a:o:" opt; do case $OPTARG in edge|beta|stable) snapChannel=$OPTARG + deployMethod=snap ;; *) usage "Invalid snap channel: $OPTARG" @@ -94,6 +93,7 @@ while getopts "h?S:s:a:o:" opt; do done loadConfigFile +expectedNodeCount=$((${#validatorIpList[@]} + 1)) build() { declare MAYBE_DOCKER= @@ -103,8 +103,7 @@ build() { SECONDS=0 ( cd "$SOLANA_ROOT" - echo "****************" - echo "Build started at $(date)" + echo "--- Build started at $(date)" set -x rm -rf farf @@ -121,17 +120,16 @@ common_start_setup() { set -x test -d 
"$SOLANA_ROOT" ssh "${sshOptions[@]}" "$ipAddress" "mkdir -p ~/solana ~/.cargo/bin" - rsync -vPrz -e "ssh ${sshOptions[*]}" \ + rsync -vPr -e "ssh ${sshOptions[*]}" \ "$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \ "$ipAddress":~/solana/ - ) >> "$logFile" + ) >> "$logFile" 2>&1 } startLeader() { declare ipAddress=$1 declare logFile="$2" - echo "****************" - echo "Starting leader: $leaderIp" + echo "--- Starting leader: $leaderIp" common_start_setup "$ipAddress" "$logFile" @@ -141,58 +139,58 @@ startLeader() { set -x case $deployMethod in snap) - rsync -vPrz -e "ssh ${sshOptions[*]}" "$snapFilename" "$ipAddress:~/solana/solana.snap" + rsync -vPr -e "ssh ${sshOptions[*]}" "$snapFilename" "$ipAddress:~/solana/solana.snap" ;; local) - rsync -vPrz -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/" + rsync -vPr -e "ssh ${sshOptions[*]}" "$SOLANA_ROOT"/farf/bin/* "$ipAddress:~/.cargo/bin/" ;; *) usage "Internal error: invalid deployMethod: $deployMethod" ;; esac - ssh "${sshOptions[@]}" -f "$ipAddress" \ - "./solana/net/remote/remote_node.sh $deployMethod leader $leaderIp \"$nodeSetupArgs\" \"$RUST_LOG\"" - ) >> "$logFile" + ssh "${sshOptions[@]}" -n "$ipAddress" \ + "./solana/net/remote/remote_node.sh $deployMethod leader $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\"" + ) >> "$logFile" 2>&1 } startValidator() { declare ipAddress=$1 declare logFile="$2" - echo "*******************" - echo "Starting validator: $leaderIp" - common_start_setup "$ipAddress" "$logFile" + echo "--- Starting validator: $ipAddress" ( + common_start_setup "$ipAddress" /dev/stdout set -x - ssh "${sshOptions[@]}" -f "$ipAddress" \ - "./solana/net/remote/remote_node.sh $deployMethod validator $leaderIp \"$nodeSetupArgs\" \"$RUST_LOG\"" - ) >> "$logFile" + ssh "${sshOptions[@]}" -n "$ipAddress" \ + "./solana/net/remote/remote_node.sh $deployMethod validator $leaderIp $expectedNodeCount \"$nodeSetupArgs\" \"$RUST_LOG\"" + ) >> 
"$netLogDir/validator-$ipAddress.log" 2>&1 & + declare pid=$! + ln -sfT "validator-$ipAddress.log" "$netLogDir/validator-$pid.log" + pids+=("$pid") } startClient() { declare ipAddress=$1 declare logFile="$2" - echo "****************" - echo "Starting client: $leaderIp" + echo "--- Starting client: $ipAddress" common_start_setup "$ipAddress" "$logFile" - declare expectedNodeCount=$((${#validatorIpList[@]} + 1)) - ( set -x ssh "${sshOptions[@]}" -f "$ipAddress" \ "./solana/net/remote/remote_client.sh $deployMethod $leaderIp $expectedNodeCount \"$RUST_LOG\"" - ) >> "$logFile" + ) >> "$logFile" 2>&1 } sanity() { declare expectedNodeCount=$((${#validatorIpList[@]} + 1)) + echo "--- Sanity" ( set -x - # shellcheck disable=SC2029 # remote_client.sh are expanded on client side intentionally... + # shellcheck disable=SC2029 # remote_sanity.sh args are expanded on client side intentionally ssh "${sshOptions[@]}" "$leaderIp" \ - "./solana/net/remote/remote_sanity.sh $deployMethod $leaderIp $expectedNodeCount $sanityExtraArgs" + "./solana/net/remote/remote_sanity.sh $sanityExtraArgs" ) } @@ -200,13 +198,25 @@ start() { case $deployMethod in snap) if [[ -n $snapChannel ]]; then + rm -f "$SOLANA_ROOT"/solana_*.snap if [[ $(uname) != Linux ]]; then - echo Error: snap channel deployment only supported in Linux - exit 1 + ( + set -x + SOLANA_DOCKER_RUN_NOSETUID=1 "$SOLANA_ROOT"/ci/docker-run.sh ubuntu:18.04 bash -c " + set -ex; + apt-get -qq update; + apt-get -qq -y install snapd; + snap download --channel=$snapChannel solana; + " + ) + else + snap download --channel="$snapChannel" solana fi - usage "TODO: the snap download command below is probably wrong..." 
- snap download --"$snapChannel" solana - snapFilename=solana.snap + snapFilename="$(echo "$SOLANA_ROOT"/solana_*.snap)" + [[ -r $snapFilename ]] || { + echo "Error: Snap not readable: $snapFilename" + exit 1 + } fi ;; local) @@ -226,10 +236,21 @@ start() { leaderDeployTime=$SECONDS SECONDS=0 + pids=() for ipAddress in "${validatorIpList[@]}"; do - startValidator "$ipAddress" "$netLogDir/validator-$ipAddress.log" & + startValidator "$ipAddress" done - wait + + for pid in "${pids[@]}"; do + declare ok=true + wait "$pid" || ok=false + if ! $ok; then + cat "$netLogDir/validator-$pid.log" + echo ^^^ +++ + exit 1 + fi + done + validatorDeployTime=$SECONDS sanity @@ -239,7 +260,6 @@ start() { startClient "$ipAddress" "$netLogDir/client-$ipAddress.log" done clientDeployTime=$SECONDS - wait if [[ $deployMethod = "snap" ]]; then IFS=\ read -r _ networkVersion _ < <( @@ -264,8 +284,7 @@ start() { stop_node() { local ipAddress=$1 - echo "**************" - echo "Stopping node: $ipAddress" + echo "--- Stopping node: $ipAddress" ( set -x ssh "${sshOptions[@]}" "$ipAddress" " @@ -273,8 +292,8 @@ stop_node() { if snap list solana; then sudo snap set solana mode=; sudo snap remove solana; - fi; \ - pkill -9 solana- remote_ oom-monitor; + fi; + for pattern in solana- remote_ oom-monitor; do pkill -9 \$pattern; done; " ) || true } @@ -294,10 +313,13 @@ stop() { } case $command in -start) +restart) stop start ;; +start) + start + ;; sanity) sanity ;; diff --git a/net/remote/remote_client.sh b/net/remote/remote_client.sh old mode 100755 new mode 100644 index 5a7e4c5ee..3003fda06 --- a/net/remote/remote_client.sh +++ b/net/remote/remote_client.sh @@ -1,14 +1,15 @@ #!/bin/bash -e +cd "$(dirname "$0")"/../.. + deployMethod="$1" -netEntrypoint="$2" +leaderIp="$2" numNodes="$3" RUST_LOG="$4" [[ -n $deployMethod ]] || exit -[[ -n $netEntrypoint ]] || exit +[[ -n $leaderIp ]] || exit [[ -n $numNodes ]] || exit -cd "$(dirname "$0")"/../.. 
source net/common.sh loadConfigFile @@ -17,14 +18,19 @@ if [[ $threadCount -gt 4 ]]; then threadCount=4 fi -./script/install-earlyoom.sh +scripts/install-earlyoom.sh case $deployMethod in snap) + rsync -vPr "$leaderIp:~/solana/solana.snap" . sudo snap install solana.snap --devmode --dangerous rm solana.snap - sudo snap set solana metrics-config="$SOLANA_METRICS_CONFIG" rust-log="$RUST_LOG" + sudo snap set solana "\ + leader-ip=$leaderIp \ + metrics-config=$SOLANA_METRICS_CONFIG \ + rust-log=$RUST_LOG \ + " solana_bench_tps=/snap/bin/solana.bench-tps ;; local) @@ -32,20 +38,19 @@ local) export USE_INSTALL=1 export RUST_LOG - rsync -vPrz "$netEntrypoint:~/.cargo/bin/solana*" ~/.cargo/bin/ - solana_bench_tps=multinode-demo/client.sh - netEntrypoint="$:~/solana" + rsync -vPr "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/ + solana_bench_tps="multinode-demo/client.sh $leaderIp:~/solana" ;; *) echo "Unknown deployment method: $deployMethod" exit 1 esac -./scripts/oom-monitor.sh > oom-monitor.log 2>&1 & +scripts/oom-monitor.sh > oom-monitor.log 2>&1 & while true; do echo "=== Client start: $(date)" >> client.log - clientCommand="$solana_bench_tps $netEntrypoint $numNodes --loop -s 600 --sustained -t threadCount" + clientCommand="$solana_bench_tps --num-nodes $numNodes --loop -s 600 --sustained -t $threadCount" echo "$ $clientCommand" >> client.log $clientCommand >> client.log 2>&1 diff --git a/net/remote/remote_node.sh b/net/remote/remote_node.sh index 4a5096109..d1439ac7f 100755 --- a/net/remote/remote_node.sh +++ b/net/remote/remote_node.sh @@ -1,37 +1,51 @@ #!/bin/bash -e +cd "$(dirname "$0")"/../.. 
+ deployMethod="$1" nodeType="$2" -netEntrypoint="$3" -setupArgs="$4" -RUST_LOG="$5" +leaderIp="$3" +numNodes="$4" +setupArgs="$5" +RUST_LOG="$6" + +cat > deployConfig < oom-monitor.log 2>&1 & + scripts/oom-monitor.sh > oom-monitor.log 2>&1 & case $nodeType in leader) @@ -64,11 +80,11 @@ local) ./multinode-demo/leader.sh > leader.log 2>&1 & ;; validator) - rsync -vPrz "$netEntrypoint:~/.cargo/bin/solana*" ~/.cargo/bin/ + rsync -vPr "$leaderIp:~/.cargo/bin/solana*" ~/.cargo/bin/ # shellcheck disable=SC2086 # Don't want to double quote "$setupArgs" ./multinode-demo/setup.sh -t validator -p $setupArgs - ./multinode-demo/validator.sh "$netEntrypoint":~/solana "$netEntrypoint" >validator.log 2>&1 & + ./multinode-demo/validator.sh "$leaderIp":~/solana "$leaderIp" >validator.log 2>&1 & ;; *) echo "Error: unknown node type: $nodeType" @@ -80,4 +96,3 @@ local) echo "Unknown deployment method: $deployMethod" exit 1 esac - diff --git a/net/remote/remote_sanity.sh b/net/remote/remote_sanity.sh index aac32bd27..f4cae9bf0 100755 --- a/net/remote/remote_sanity.sh +++ b/net/remote/remote_sanity.sh @@ -1,14 +1,25 @@ #!/bin/bash -e -deployMethod="$1" -netEntrypoint="$2" -numNodes="$3" +cd "$(dirname "$0")"/../.. -[[ -n $deployMethod ]] || exit -[[ -n $netEntrypoint ]] || exit -[[ -n $numNodes ]] || exit +deployMethod= +leaderIp= +numNodes= +# shellcheck source=/dev/null # deployConfig is written by remote_node.sh +source deployConfig -shift 3 +[[ -n $deployMethod ]] || { + echo "deployMethod empty" + exit 1 +} +[[ -n $leaderIp ]] || { + echo "leaderIp empty" + exit 1 +} +[[ -n $numNodes ]] || { + echo "numNodes empty" + exit 1 +} ledgerVerify=true validatorSanity=true @@ -29,26 +40,25 @@ while [[ $1 = "-o" ]]; do esac done - -cd "$(dirname "$0")"/../.. 
source net/common.sh loadConfigFile case $deployMethod in snap) + PATH="/snap/bin:$PATH" export USE_SNAP=1 - solana_bench_tps=/snap/bin/solana.bench-tps - solana_ledger_tool=/snap/bin/solana.ledger-tool + + solana_bench_tps=solana.bench-tps + solana_ledger_tool=solana.ledger-tool ledger=/var/snap/solana/current/config/ledger ;; local) PATH="$HOME"/.cargo/bin:"$PATH" export USE_INSTALL=1 - solana_bench_tps=multinode-demo/client.sh + solana_bench_tps="multinode-demo/client.sh $leaderIp:~/solana" solana_ledger_tool=solana-ledger-tool ledger=config/ledger - netEntrypoint="$:~/solana" ;; *) echo "Unknown deployment method: $deployMethod" @@ -56,19 +66,19 @@ local) esac -echo "--- $netEntrypoint: wallet sanity" +echo "--- $leaderIp: wallet sanity" ( set -x - multinode-demo/test/wallet-sanity.sh "$netEntrypoint" + multinode-demo/test/wallet-sanity.sh "$leaderIp" ) -echo "--- $netEntrypoint: node count" +echo "--- $leaderIp: node count" ( set -x - $solana_bench_tps "$netEntrypoint" "$numNodes" -c + $solana_bench_tps --num-nodes "$numNodes" --converge-only ) -echo "--- $netEntrypoint: verify ledger" +echo "--- $leaderIp: verify ledger" if $ledgerVerify; then if [[ -d $ledger ]]; then ( @@ -87,12 +97,12 @@ else fi -echo "--- $netEntrypoint: validator sanity" +echo "--- $leaderIp: validator sanity" if $validatorSanity; then ( ./multinode-demo/setup.sh -t validator set -e pipefail - timeout 10s ./multinode-demo/validator.sh "$netEntrypoint" 2>&1 | tee validator.log + timeout 10s ./multinode-demo/validator.sh "$leaderIp" 2>&1 | tee validator.log ) wc -l validator.log if grep -C100 panic validator.log; then diff --git a/net/remote/remote_startup.sh b/net/remote/remote_startup.sh index 388f138db..0c511b033 100644 --- a/net/remote/remote_startup.sh +++ b/net/remote/remote_startup.sh @@ -9,3 +9,12 @@ systemctl disable apt-daily.service # disable run when system boot systemctl disable apt-daily.timer # disable timer run apt-get --assume-yes install rsync libssl-dev +cat > 
/etc/rsyncd.conf <<-EOF +[config] +path = /var/snap/solana/current/config +hosts allow = * +read only = true +EOF + +systemctl enable rsync +systemctl start rsync