Use a common solana user on all testnet instances

This commit is contained in:
Michael Vines 2018-09-08 19:19:12 -07:00 committed by Grimes
parent 7029e4395c
commit ebcac3c2d1
8 changed files with 116 additions and 181 deletions

View File

@ -7,8 +7,12 @@
# shellcheck disable=2034
#
netConfigDir="$(dirname "${BASH_SOURCE[0]}")"/config
netLogDir="$(dirname "${BASH_SOURCE[0]}")"/log
netDir=$(
cd "$(dirname "${BASH_SOURCE[0]}")" || exit
echo "$PWD"
)
netConfigDir="$netDir"/config
netLogDir="$netDir"/log
mkdir -p "$netConfigDir" "$netLogDir"
# shellcheck source=scripts/configure-metrics.sh
@ -21,7 +25,6 @@ publicNetwork=
leaderIp=
netBasename=
sshPrivateKey=
sshUsername=
clientIpList=()
sshOptions=()
validatorIpList=()
@ -31,9 +34,10 @@ buildSshOptions() {
-o "BatchMode=yes"
-o "StrictHostKeyChecking=no"
-o "UserKnownHostsFile=/dev/null"
-o "User=$sshUsername"
-o "User=solana"
-o "IdentityFile=$sshPrivateKey"
-o "LogLevel=ERROR"
-F /dev/null
)
}
@ -47,7 +51,6 @@ loadConfigFile() {
[[ -n "$leaderIp" ]] || usage "Config file invalid, leaderIp unspecified: $configFile"
[[ -n "$netBasename" ]] || usage "Config file invalid, netBasename unspecified: $configFile"
[[ -n $sshPrivateKey ]] || usage "Config file invalid, sshPrivateKey unspecified: $configFile"
[[ -n $sshUsername ]] || usage "Config file invalid, sshUsername unspecified: $configFile"
[[ ${#validatorIpList[@]} -gt 0 ]] || usage "Config file invalid, validatorIpList unspecified: $configFile"
buildSshOptions

View File

@ -106,6 +106,7 @@ done
shift $((OPTIND - 1))
[[ -z $1 ]] || usage "Unexpected argument: $1"
sshPrivateKey="$netConfigDir/id_$prefix"
prepareInstancesAndWriteConfigFile() {
$metricsWriteDatapoint "testnet-deploy net-config-begin=1"
@ -114,15 +115,10 @@ prepareInstancesAndWriteConfigFile() {
# autogenerated at $(date)
netBasename=$prefix
publicNetwork=$publicNetwork
sshPrivateKey=$sshPrivateKey
EOF
declare sshPrivateKey="$netConfigDir/id_$prefix"
rm -rf "$sshPrivateKey"{,.pub}
(
set -x
ssh-keygen -t ecdsa -N '' -f "$sshPrivateKey"
)
echo "sshPrivateKey=$sshPrivateKey" >> "$configFile"
buildSshOptions
recordInstanceIp() {
declare name="$1"
@ -141,38 +137,79 @@ EOF
fi
}
waitForStartupComplete() {
declare name="$1"
declare publicIp="$3"
echo "Waiting for $name to finish booting..."
(
for i in $(seq 1 30); do
if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.gce-startup-complete"); then
break
fi
sleep 2
echo "Retry $i..."
done
)
}
echo "Looking for leader instance..."
gcloud_FindInstances "name=$prefix-leader" show
[[ ${#instances[@]} -eq 1 ]] || {
echo "Unable to start leader"
echo "Unable to find leader"
exit 1
}
gcloud_FigureRemoteUsername "${instances[0]}"
sshUsername=$gcloud_username
echo "sshUsername=$sshUsername" >> "$configFile"
buildSshOptions
gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey"
echo "Fetching $sshPrivateKey from $leaderName"
(
rm -rf "$sshPrivateKey"{,pub}
declare leaderName
declare leaderZone
declare leaderIp
IFS=: read -r leaderName leaderZone leaderIp _ < <(echo "${instances[0]}")
set -x
# Try to ping the machine first. There can be a delay between when the
# instance is reported as RUNNING and when it's reachable over the network
timeout 30s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done"
# Try to scp in a couple times, sshd may not yet be up even though the
# machine can be pinged...
set -o pipefail
for i in $(seq 1 10); do
if gcloud compute scp --zone "$leaderZone" \
"$leaderName:/solana-id_ecdsa" "$sshPrivateKey"; then
break
fi
sleep 1
echo "Retry $i..."
done
chmod 400 "$sshPrivateKey"
)
echo "leaderIp=()" >> "$configFile"
gcloud_ForEachInstance recordInstanceIp leaderIp
gcloud_ForEachInstance waitForStartupComplete
echo "Looking for validator instances..."
gcloud_FindInstances "name~^$prefix-validator" show
[[ ${#instances[@]} -gt 0 ]] || {
echo "Unable to start validators"
echo "Unable to find validators"
exit 1
}
echo "validatorIpList=()" >> "$configFile"
gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey"
gcloud_ForEachInstance recordInstanceIp validatorIpList
gcloud_ForEachInstance waitForStartupComplete
echo "clientIpList=()" >> "$configFile"
echo "Looking for client instances..."
gcloud_FindInstances "name~^$prefix-client" show
[[ ${#instances[@]} -eq 0 ]] || {
gcloud_PrepInstancesForSsh "$gcloud_username" "$sshPrivateKey"
gcloud_ForEachInstance recordInstanceIp clientIpList
gcloud_ForEachInstance waitForStartupComplete
}
echo "Wrote $configFile"
@ -206,6 +243,9 @@ create)
$metricsWriteDatapoint "testnet-deploy net-create-begin=1"
rm -rf "$sshPrivateKey"{,.pub}
ssh-keygen -t ecdsa -N '' -f "$sshPrivateKey"
printNetworkInfo() {
cat <<EOF
========================================================================================
@ -233,15 +273,29 @@ cat > /etc/motd <<EOM
See "startup-script" log messages in /var/log/syslog for status:
$ sudo cat /var/log/syslog | grep startup-script
To block until setup is complete, run:
$ until [[ -f /.gce-startup-complete ]]; do sleep 1; done
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
EOM
# Place the generated private key at /solana-id_ecdsa so it's retrievable by anybody
# who is able to log into this machine
cat > /solana-id_ecdsa <<EOK
$(cat "$sshPrivateKey")
EOK
cat > /solana-id_ecdsa.pub <<EOK
$(cat "$sshPrivateKey.pub")
EOK
chmod 444 /solana-id_ecdsa
USER=\$(id -un)
$(
cd "$here"/scripts/
cat \
disable-background-upgrades.sh \
create-solana-user.sh \
install-earlyoom.sh \
install-rsync.sh \
install-libssl-compatability.sh \
@ -251,6 +305,8 @@ cat > /etc/motd <<EOM
$(printNetworkInfo)
EOM
touch /.gce-startup-complete
EOF
gcloud_CreateInstances "$prefix-leader" 1 "$zone" \

View File

@ -116,16 +116,7 @@ build() {
startCommon() {
declare ipAddress=$1
test -d "$SOLANA_ROOT"
ssh "${sshOptions[@]}" "$ipAddress" "
mkdir -p ~/solana ~/.cargo/bin
# Help other users of the machine locate network logs
[[ -d /tmp/solana/ ]] || {
mkdir /tmp/solana/
chmod go+w /tmp/solana/
}
ln -sfT ~/solana /tmp/solana/=
"
ssh "${sshOptions[@]}" "$ipAddress" "mkdir -p ~/solana ~/.cargo/bin"
rsync -vPrc -e "ssh ${sshOptions[*]}" \
"$SOLANA_ROOT"/{fetch-perf-libs.sh,scripts,net,multinode-demo} \
"$ipAddress":~/solana/
@ -231,7 +222,10 @@ start() {
"
)
else
snap download --channel="$snapChannel" solana
(
cd "$SOLANA_ROOT"
snap download --channel="$snapChannel" solana
)
fi
snapFilename="$(echo "$SOLANA_ROOT"/solana_*.snap)"
[[ -r $snapFilename ]] || {

View File

@ -2,6 +2,8 @@
cd "$(dirname "$0")"/../..
echo "$(date) | $0 $*" > client.log
deployMethod="$1"
entrypointIp="$2"
numNodes="$3"

View File

@ -0,0 +1,27 @@
#!/bin/bash -ex
[[ $(uname) = Linux ]] || exit 1
[[ $USER = root ]] || exit 1
adduser solana --gecos "" --disabled-password --quiet
adduser solana sudo
echo "solana ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers
id solana
[[ -r /solana-id_ecdsa ]] || exit 1
[[ -r /solana-id_ecdsa.pub ]] || exit 1
sudo -u solana bash -c "
mkdir -p /home/solana/.ssh/
cd /home/solana/.ssh/
cp /solana-id_ecdsa.pub authorized_keys
umask 377
cp /solana-id_ecdsa id_ecdsa
echo \"
Host *
BatchMode yes
IdentityFile ~/.ssh/id_ecdsa
StrictHostKeyChecking no
\" > config
"

View File

@ -1,5 +1,5 @@
#!/bin/bash -ex
#
# Prevent background upgrades that block |apt-get|
#
# TODO: This approach is pretty uncompromising. An alternative solution that
@ -18,4 +18,3 @@ while fuser /var/lib/dpkg/lock; do
sleep 1
done

View File

@ -185,149 +185,3 @@ gcloud_DeleteInstances() {
)
}
#
# gcloud_FigureRemoteUsername [instanceInfo]
#
# The remote username when ssh-ing into GCP instances tends to not be the same
# as the user's local username, but it needs to be discovered by ssh-ing into an
# instance and examining the system.
#
# On success the gcloud_username global variable is updated
#
# instanceInfo - an entry from the `instances` array
#
# example:
# gcloud_FigureRemoteUsername "name:zone:..."
#
gcloud_FigureRemoteUsername() {
if [[ -n $gcloud_username ]]; then
return
fi
declare instanceInfo="$1"
declare name zone publicIp
IFS=: read -r name zone publicIp _ < <(echo "$instanceInfo")
echo "Detecting remote username using $zone in $zone:"
# Figure the gcp ssh username
(
set -x
# Try to ping the machine first. There can be a delay between when the
# instance is reported as RUNNING and when it's reachable over the network
timeout 30s bash -c "set -o pipefail; until ping -c 3 $publicIp | tr - _; do echo .; done"
# Try to ssh in a couple times, sshd may not yet be up even though the
# machine can be pinged...
set -o pipefail
for i in $(seq 1 10); do
if gcloud compute ssh "$name" \
--zone "$zone" -- "echo whoami:\$USER:iamwho" \
| tr -d $'\r '| tee /tmp/whoami-$$; then
break
fi
sleep 1
echo "Retry $i..."
done
)
while IFS=: read -r whoami gcloud_username iamwho ; do
[[ $whoami == "whoami" && $iamwho == "iamwho" ]] && break;
done < /tmp/whoami-$$
rm -f /tmp/whoami-$$
if [[ -z $gcloud_username ]]; then
echo Unable to figure remote user name
exit 1
fi
echo "Remote username: $gcloud_username"
}
#
# gcloud_PrepInstancesForSsh [username] [privateKey]
#
# Prepares all the instances in the `instances` array for ssh with the specified
# keypair. This eliminates the need to use the restrictive |gcloud compute ssh|,
# use plain |ssh| instead.
#
# username - gcp ssh username as computed by gcloud_FigureRemoteUsername
# privateKey - private key to install on all the instances
#
gcloud_PrepInstancesForSsh() {
declare username="$1"
declare privateKey="$2"
declare publicKey="$privateKey".pub
declare logDir=log/
mkdir -p $logDir
rm -rf $logDir/gcloud_PrepInstancesForSsh-*
[[ -r $publicKey ]] || {
echo "Unable to read public key: $publicKey"
exit 1
}
[[ -r $privateKey ]] || {
echo "Unable to read private key: $privateKey"
exit 1
}
[[ -d $logDir ]] || {
echo "logDir does not exist: $logDir"
exit 1
}
declare -a pids
for instanceInfo in "${instances[@]}"; do
declare name zone publicIp
IFS=: read -r name zone publicIp _ < <(echo "$instanceInfo")
logFile="$logDir/gcloud_PrepInstancesForSsh-$name.log"
# TODO: This next subshell runs in series because for unknown reason running
# multiple |gcloud compute ssh| commands in parallel cause the macOS
# terminal to misbehave
(
set -x
# Try to ping the machine first. There can be a delay between when the
# instance is reported as RUNNING and when it's reachable over the network
timeout 60s bash -c "set -o pipefail; until ping -c 3 $publicIp | tr - _; do echo .; done"
gcloud compute ssh --zone "$zone" "$name" -- "
set -x;
mkdir -p .ssh;
echo \"$(cat "$publicKey")\" >> .ssh/authorized_keys;
echo \"
Host *
BatchMode yes
IdentityFile ~/.ssh/id_testnet
StrictHostKeyChecking no
\" > .ssh/config;
"
) >> "$logFile" 2>&1
(
set -x
scp \
-o StrictHostKeyChecking=no \
-o UserKnownHostsFile=/dev/null \
-i "$privateKey" \
"$privateKey" "$username@$publicIp:.ssh/id_testnet"
) >> "$logFile" 2>&1 &
declare pid=$!
ln -sfT "$logFile" "$logDir/gcloud_PrepInstancesForSsh-$pid.log"
pids+=("$pid")
done
for pid in "${pids[@]}"; do
declare ok=true
wait "$pid" || ok=false
if ! $ok; then
cat "$logDir/gcloud_PrepInstancesForSsh-$pid.log"
echo ^^^ +++
exit 1
fi
done
}

View File

@ -46,7 +46,7 @@ fi
printNode() {
declare nodeType=$1
declare ip=$2
printf " %-25s | For logs run: $0 $ip tail -f /tmp/solana/=/$nodeType.log\n" "$0 $ip"
printf " %-25s | For logs run: $0 $ip tail -f solana/$nodeType.log\n" "$0 $ip"
}
echo Leader: