diff --git a/ci/README.md b/ci/README.md index 45ff539d56..7e8081608e 100644 --- a/ci/README.md +++ b/ci/README.md @@ -2,7 +2,7 @@ Our CI infrastructure is built around [BuildKite](https://buildkite.com) with some additional GitHub integration provided by https://github.com/mvines/ci-gate -## Agent Queues +# Agent Queues We define two [Agent Queues](https://buildkite.com/docs/agent/v3/queues): `queue=default` and `queue=cuda`. The `default` queue should be favored and @@ -12,9 +12,52 @@ be run on the `default` queue, and the [buildkite artifact system](https://buildkite.com/docs/builds/artifacts) used to transfer build products over to a GPU instance for testing. -## Buildkite Agent Management +# Buildkite Agent Management -### Buildkite Azure Setup +## Manual Node Setup for Colocated Hardware + +This section describes how to set up a new machine that does not have a +pre-configured image with all the requirements installed. Used for custom-built +hardware at a colocation or office facility. Also works for vanilla Ubuntu cloud +instances. + +### Pre-Requisites + + - Install Ubuntu 18.04 LTS Server + - Log in as a local or remote user with `sudo` privileges + +### Install Core Requirements + +##### Non-GPU enabled machines +```bash +sudo ./setup-new-buildkite-agent/setup-new-machine.sh +``` + +##### GPU-enabled machines + - 1 or more NVIDIA GPUs should be installed in the machine (tested with 2080Ti) +```bash +sudo CUDA=1 ./setup-new-buildkite-agent/setup-new-machine.sh +``` + +### Configure Node for Buildkite-agent based CI + +- Install `buildkite-agent` and set up its user environment with: +```bash +sudo ./setup-new-buildkite-agent/setup-buildkite.sh +``` +- Copy the pubkey contents from `~buildkite-agent/.ssh/id_ecdsa.pub` and +add the pubkey as an authorized SSH key on GitHub. 
+- Edit `/etc/buildkite-agent/buildkite-agent.cfg` and/or `/etc/systemd/system/buildkite-agent@*` to the desired configuration of the agent(s) +- Copy `ejson` keys from another CI node at `/opt/ejson/keys/` +to the same location on the new node. +- Start the new agent(s) with `sudo systemctl enable --now buildkite-agent` + +# Reference + +This section contains details regarding previous CI setups that have been used, +and that we may return to one day. + +## Buildkite Azure Setup Create a new Azure-based "queue=default" agent by running the following command: ``` @@ -35,7 +78,7 @@ Creating a "queue=cuda" agent follows the same process but additionally: 2. Edit the tags field in /etc/buildkite-agent/buildkite-agent.cfg to `tags="queue=cuda,queue=default"` and decrease the value of the priority field by one -#### Updating the CI Disk Image +### Updating the CI Disk Image 1. Create a new VM Instance as described above 1. Modify it as required @@ -48,12 +91,7 @@ Creating a "queue=cuda" agent follows the same process but additionally: 1. Goto the `ci` resource group in the Azure portal and remove all resources with the XYZ name in them -## Reference - -This section contains details regarding previous CI setups that have been used, -and that we may return to one day. - -### Buildkite AWS CloudFormation Setup +## Buildkite AWS CloudFormation Setup **AWS CloudFormation is currently inactive, although it may be restored in the future** @@ -62,7 +100,7 @@ AWS CloudFormation can be used to scale machines up and down based on the current CI load. If no machine is currently running it can take up to 60 seconds to spin up a new instance, please remain calm during this time. -#### AMI +### AMI We use a custom AWS AMI built via https://github.com/solana-labs/elastic-ci-stack-for-aws/tree/solana/cuda. Use the following process to update this AMI as dependencies change: @@ -84,13 +122,13 @@ The new AMI should also now be visible in your EC2 Dashboard. 
Go to the desired AWS CloudFormation stack, update the **ImageId** field to the new AMI id, and *apply* the stack changes. -### Buildkite GCP Setup +## Buildkite GCP Setup CI runs on Google Cloud Platform via two Compute Engine Instance groups: `ci-default` and `ci-cuda`. Autoscaling is currently disabled and the number of VM Instances in each group is manually adjusted. -#### Updating a CI Disk Image +### Updating a CI Disk Image Each Instance group has its own disk image, `ci-default-vX` and `ci-cuda-vY`, where *X* and *Y* are incremented each time the image is changed. diff --git a/net/datacenter-node-install/disable-networkd-wait.sh b/ci/setup-new-buildkite-agent/disable-networkd-wait.sh similarity index 78% rename from net/datacenter-node-install/disable-networkd-wait.sh rename to ci/setup-new-buildkite-agent/disable-networkd-wait.sh index 35cf3b0fa1..01a6c5969d 100755 --- a/net/datacenter-node-install/disable-networkd-wait.sh +++ b/ci/setup-new-buildkite-agent/disable-networkd-wait.sh @@ -2,7 +2,7 @@ HERE="$(dirname "$0")" -# shellcheck source=net/datacenter-node-install/utils.sh +# shellcheck source=ci/setup-new-buildkite-agent/utils.sh source "$HERE"/utils.sh ensure_env || exit 1 diff --git a/net/datacenter-node-install/disable-nouveau.sh b/ci/setup-new-buildkite-agent/disable-nouveau.sh old mode 100644 new mode 100755 similarity index 78% rename from net/datacenter-node-install/disable-nouveau.sh rename to ci/setup-new-buildkite-agent/disable-nouveau.sh index f88691bda6..94f92cd21c --- a/net/datacenter-node-install/disable-nouveau.sh +++ b/ci/setup-new-buildkite-agent/disable-nouveau.sh @@ -2,7 +2,7 @@ HERE="$(dirname "$0")" -# shellcheck source=net/datacenter-node-install/utils.sh +# shellcheck source=ci/setup-new-buildkite-agent/utils.sh source "$HERE"/utils.sh ensure_env || exit 1 diff --git a/ci/setup-new-buildkite-agent/enable-buildkite.sh b/ci/setup-new-buildkite-agent/enable-buildkite.sh new file mode 100755 index 0000000000..4514fac225 --- 
/dev/null +++ b/ci/setup-new-buildkite-agent/enable-buildkite.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash + +sudo systemctl daemon-reload +sudo systemctl enable --now buildkite-agent diff --git a/net/datacenter-node-install/set-hostname.sh b/ci/setup-new-buildkite-agent/set-hostname.sh old mode 100644 new mode 100755 similarity index 81% rename from net/datacenter-node-install/set-hostname.sh rename to ci/setup-new-buildkite-agent/set-hostname.sh index 78b0d153fa..d25363f268 --- a/net/datacenter-node-install/set-hostname.sh +++ b/ci/setup-new-buildkite-agent/set-hostname.sh @@ -2,7 +2,7 @@ HERE="$(dirname "$0")" -# shellcheck source=net/datacenter-node-install/utils.sh +# shellcheck source=ci/setup-new-buildkite-agent/utils.sh source "$HERE"/utils.sh ensure_env || exit 1 diff --git a/ci/setup-new-buildkite-agent/setup-buildkite.sh b/ci/setup-new-buildkite-agent/setup-buildkite.sh new file mode 100755 index 0000000000..5387762ebd --- /dev/null +++ b/ci/setup-new-buildkite-agent/setup-buildkite.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash + +HERE="$(dirname "$0")" + +# shellcheck source=ci/setup-new-buildkite-agent/utils.sh +source "$HERE"/utils.sh + +ensure_env || exit 1 + +set -e + +# Install buildkite-agent +echo "deb https://apt.buildkite.com/buildkite-agent stable main" | tee /etc/apt/sources.list.d/buildkite-agent.list +apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 32A37959C2FA5C3C99EFBC32A79206696452D198 +apt-get update +apt-get install -y buildkite-agent + + +# Configure the installation +echo "Go to https://buildkite.com/organizations/solana-labs/agents" +echo "Click Reveal Agent Token" +echo "Paste the Agent Token, then press Enter:" + +read -r agent_token +sudo sed -i "s/xxx/$agent_token/g" /etc/buildkite-agent/buildkite-agent.cfg + +cat > /etc/buildkite-agent/hooks/environment < /lib/systemd/system/buildkite-agent.service < /etc/modprobe.d/nvidia-enable-user-profiling.conf + +# setup persistence mode across reboots +TMPDIR="$(mktemp -d)" +if 
pushd "$TMPDIR"; then + tar -xvf /usr/share/doc/NVIDIA_GLX-1.0/samples/nvidia-persistenced-init.tar.bz2 + ./nvidia-persistenced-init/install.sh systemd + popd + rm -rf "$TMPDIR" +fi + +nvidia-smi -pm ENABLED \ No newline at end of file diff --git a/net/datacenter-node-install/setup-limits.sh b/ci/setup-new-buildkite-agent/setup-limits.sh old mode 100644 new mode 100755 similarity index 78% rename from net/datacenter-node-install/setup-limits.sh rename to ci/setup-new-buildkite-agent/setup-limits.sh index df7d219f75..841ab84108 --- a/net/datacenter-node-install/setup-limits.sh +++ b/ci/setup-new-buildkite-agent/setup-limits.sh @@ -2,7 +2,7 @@ HERE="$(dirname "$0")" -# shellcheck source=net/datacenter-node-install/utils.sh +# shellcheck source=ci/setup-new-buildkite-agent/utils.sh source "$HERE"/utils.sh ensure_env || exit 1 diff --git a/ci/setup-new-buildkite-agent/setup-new-machine.sh b/ci/setup-new-buildkite-agent/setup-new-machine.sh new file mode 100755 index 0000000000..b8ee0805fe --- /dev/null +++ b/ci/setup-new-buildkite-agent/setup-new-machine.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +HERE="$(dirname "$0")" +SOLANA_ROOT="$HERE"/../.. 
+ +# shellcheck source=ci/setup-new-buildkite-agent/utils.sh +source "$HERE"/utils.sh + +ensure_env || exit 1 + +set -ex + +apt update +apt upgrade -y + +cat >/etc/apt/apt.conf.d/99-solana <<'EOF' +// Set and persist extra caps on iftop binary +Dpkg::Post-Invoke { "which iftop 2>&1 >/dev/null && setcap cap_net_raw=eip $(which iftop) || true"; }; +EOF + +apt install -y build-essential pkg-config clang cmake sysstat linux-tools-common \ + linux-generic-hwe-18.04-edge linux-tools-generic-hwe-18.04-edge \ + iftop heaptrack jq ruby python3-venv gcc-multilib libudev-dev + +gem install ejson ejson2env +mkdir -p /opt/ejson/keys + +"$SOLANA_ROOT"/net/scripts/install-docker.sh +usermod -aG docker "$SETUP_USER" +"$SOLANA_ROOT"/net/scripts/install-certbot.sh +"$HERE"/setup-sudoers.sh +"$HERE"/setup-ssh.sh + +"$HERE"/disable-nouveau.sh +"$HERE"/disable-networkd-wait.sh + +"$SOLANA_ROOT"/net/scripts/install-earlyoom.sh +"$SOLANA_ROOT"/net/scripts/install-nodejs.sh +"$SOLANA_ROOT"/net/scripts/localtime.sh +"$SOLANA_ROOT"/net/scripts/install-redis.sh +"$SOLANA_ROOT"/net/scripts/install-rsync.sh +"$SOLANA_ROOT"/net/scripts/install-libssl-compatability.sh + +"$HERE"/setup-procfs-knobs.sh +"$HERE"/setup-limits.sh + +[[ -n $CUDA ]] && "$HERE"/setup-cuda.sh + +exit 0 diff --git a/net/datacenter-node-install/setup-partner-node.sh b/ci/setup-new-buildkite-agent/setup-partner-node.sh old mode 100644 new mode 100755 similarity index 84% rename from net/datacenter-node-install/setup-partner-node.sh rename to ci/setup-new-buildkite-agent/setup-partner-node.sh index d0136eff59..11b50e32c2 --- a/net/datacenter-node-install/setup-partner-node.sh +++ b/ci/setup-new-buildkite-agent/setup-partner-node.sh @@ -2,12 +2,12 @@ HERE="$(dirname "$0")" -# shellcheck source=net/datacenter-node-install/utils.sh +# shellcheck source=ci/setup-new-buildkite-agent/utils.sh source "$HERE"/utils.sh ensure_env || exit 1 -set -xe +set -ex "$HERE"/disable-nouveau.sh "$HERE"/disable-networkd-wait.sh diff --git 
a/net/datacenter-node-install/setup-procfs-knobs.sh b/ci/setup-new-buildkite-agent/setup-procfs-knobs.sh old mode 100644 new mode 100755 similarity index 88% rename from net/datacenter-node-install/setup-procfs-knobs.sh rename to ci/setup-new-buildkite-agent/setup-procfs-knobs.sh index 725e681591..8e6e2507d7 --- a/net/datacenter-node-install/setup-procfs-knobs.sh +++ b/ci/setup-new-buildkite-agent/setup-procfs-knobs.sh @@ -2,7 +2,7 @@ HERE="$(dirname "$0")" -# shellcheck source=net/datacenter-node-install/utils.sh +# shellcheck source=ci/setup-new-buildkite-agent/utils.sh source "$HERE"/utils.sh ensure_env || exit 1 diff --git a/net/datacenter-node-install/setup-ssh.sh b/ci/setup-new-buildkite-agent/setup-ssh.sh old mode 100644 new mode 100755 similarity index 85% rename from net/datacenter-node-install/setup-ssh.sh rename to ci/setup-new-buildkite-agent/setup-ssh.sh index d9de90fe43..48324a98a5 --- a/net/datacenter-node-install/setup-ssh.sh +++ b/ci/setup-new-buildkite-agent/setup-ssh.sh @@ -2,7 +2,7 @@ HERE="$(dirname "$0")" -# shellcheck source=net/datacenter-node-install/utils.sh +# shellcheck source=ci/setup-new-buildkite-agent/utils.sh source "$HERE"/utils.sh ensure_env || exit 1 diff --git a/net/datacenter-node-install/setup-sudoers.sh b/ci/setup-new-buildkite-agent/setup-sudoers.sh old mode 100644 new mode 100755 similarity index 94% rename from net/datacenter-node-install/setup-sudoers.sh rename to ci/setup-new-buildkite-agent/setup-sudoers.sh index 1964aa7a05..bcf9428390 --- a/net/datacenter-node-install/setup-sudoers.sh +++ b/ci/setup-new-buildkite-agent/setup-sudoers.sh @@ -2,7 +2,7 @@ HERE="$(dirname "$0")" -# shellcheck source=net/datacenter-node-install/utils.sh +# shellcheck source=ci/setup-new-buildkite-agent/utils.sh source "$HERE"/utils.sh ensure_env || exit 1 diff --git a/net/datacenter-node-install/utils.sh b/ci/setup-new-buildkite-agent/utils.sh old mode 100644 new mode 100755 similarity index 100% rename from 
net/datacenter-node-install/utils.sh rename to ci/setup-new-buildkite-agent/utils.sh diff --git a/net/datacenter-node-install/README.md b/net/datacenter-node-install/README.md deleted file mode 100644 index 1d204ddf89..0000000000 --- a/net/datacenter-node-install/README.md +++ /dev/null @@ -1,25 +0,0 @@ -# Introduction - -These scripts are intended to facilitate the preparation of dedicated Solana -nodes. They have been tested as working from a clean installation of Ubuntu -18.04 Server. Use elsewhere is unsupported. - -# Installation - -Both installation methods require that the NVIDIA proprietary driver installer -programs be downloaded alongside [setup-cuda.sh](./setup-cuda.sh). If they do -not exist at runtime, an attempt will be made to download them automatically. To -avoid downloading the installers at runtime, they may be downloaded in advance -and placed as siblings to [setup-cuda.sh](./setup-cuda.sh). - -For up-to-date NVIDIA driver version requirements, see [setup-cuda.sh](./setup-cuda.sh) - -## Datacenter Node - -1) `sudo ./setup-dc-node-1.sh` -2) `sudo reboot` -3) `sudo ./setup-dc-node-2.sh` - -## Partner Node - -1) `$ sudo ./setup-partner-node.sh` diff --git a/net/datacenter-node-install/setup-dc-node-1.sh b/net/datacenter-node-install/setup-dc-node-1.sh deleted file mode 100644 index 632d3577a7..0000000000 --- a/net/datacenter-node-install/setup-dc-node-1.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env bash - -HERE="$(dirname "$0")" - -# shellcheck source=net/datacenter-node-install/utils.sh -source "$HERE"/utils.sh - -ensure_env || exit 1 - -if [[ -n "$1" ]]; then - PUBKEY_FILE="$1" -else - cat </etc/apt/apt.conf.d/99-solana <<'EOF' -// Set and persist extra caps on iftop binary -Dpkg::Post-Invoke { "which iftop 2>&1 >/dev/null && setcap cap_net_raw=eip $(which iftop) || true"; }; -EOF - -apt install -y build-essential pkg-config clang cmake sysstat linux-tools-common \ - linux-generic-hwe-18.04-edge linux-tools-generic-hwe-18.04-edge \ - iftop 
heaptrack - -"$HERE"/../scripts/install-docker.sh -usermod -aG docker "$SETUP_USER" -"$HERE"/../scripts/install-certbot.sh -"$HERE"/setup-sudoers.sh -"$HERE"/setup-ssh.sh - -# Allow admin user to log in -BASE_SSH_DIR="${SETUP_HOME}/.ssh" -mkdir "$BASE_SSH_DIR" -chown "$SETUP_USER:$SETUP_USER" "$BASE_SSH_DIR" -cat "$PUBKEY_FILE" > "${BASE_SSH_DIR}/authorized_keys" -chown "$SETUP_USER:$SETUP_USER" "${BASE_SSH_DIR}/.ssh/authorized_keys" - -"$HERE"/disable-nouveau.sh -"$HERE"/disable-networkd-wait.sh -"$HERE"/setup-grub.sh -"$HERE"/../scripts/install-earlyoom.sh -"$HERE"/../scripts/install-nodeljs.sh -"$HERE"/../scripts/localtime.sh -"$HERE"/../scripts/install-redis.sh -"$HERE"/../scripts/install-rsync.sh -"$HERE"/../scripts/install-libssl-compatability.sh -"$HERE"/setup-procfs-knobs.sh -"$HERE"/setup-limits.sh - -echo "Please reboot then run setup-dc-node-2.sh" diff --git a/net/datacenter-node-install/setup-dc-node-2.sh b/net/datacenter-node-install/setup-dc-node-2.sh deleted file mode 100644 index 1fc9b6667a..0000000000 --- a/net/datacenter-node-install/setup-dc-node-2.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash - -HERE="$(dirname "$0")" - -# shellcheck source=net/datacenter-node-install/utils.sh -source "$HERE"/utils.sh - -ensure_env || exit 1 - -set -xe - -"$HERE"/setup-cuda.sh - -# setup persistence mode across reboots -TMPDIR="$(mktemp)" -mkdir -p "$TMPDIR" -if pushd "$TMPDIR"; then - tar -xvf /usr/share/doc/NVIDIA_GLX-1.0/sample/nvidia-persistenced-init.tar.bz2 - ./nvidia-persistenced-init/install.sh systemd - popd - rm -rf "$TMPDIR" -fi diff --git a/net/datacenter-node-install/setup-grub.sh b/net/datacenter-node-install/setup-grub.sh deleted file mode 100644 index 6ae0f6caae..0000000000 --- a/net/datacenter-node-install/setup-grub.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash - -HERE="$(dirname "$0")" - -# shellcheck source=net/datacenter-node-install/utils.sh -source "$HERE"/utils.sh - -ensure_env || exit 1 - -set -xe - -printf 
"GRUB_GFXPAYLOAD_LINUX=1280x1024x32\n\n" >> /etc/default/grub -update-grub diff --git a/net/scripts/install-docker.sh b/net/scripts/install-docker.sh index fe783ceec9..9eedd4417c 100755 --- a/net/scripts/install-docker.sh +++ b/net/scripts/install-docker.sh @@ -18,9 +18,62 @@ add-apt-repository \ apt-get update apt-get install -y docker-ce -docker run hello-world + +cat > /lib/systemd/system/docker.service <