feat: Add backup option for cosmovisor (#9652)

<!--
The default pull request template is for types feat, fix, or refactor.
For other templates, add one of the following parameters to the url:
- template=docs.md
- template=other.md
-->

## Description

Ref: https://github.com/cosmos/cosmos-sdk/issues/9616#issuecomment-873051972

depends: #8590

<!-- Add a description of the changes that this PR introduces and the files that
are the most critical to review. -->

This PR adds a full backup option for cosmovisor. 
`UNSAFE_SKIP_BACKUP` is an `env` setting introduced newly.
- if `false` (default, **recommended**), cosmovisor will try to take backup and then upgrade. In case of failure while taking backup, it will just halt the process there and won't try the upgrade.
- If `true`, the cosmovisor will try to upgrade without any backup. This setting makes it hard to recover from a failed upgrade. Node operators either need to sync from a healthy node or use a snapshot from others.

---

### Author Checklist

*All items are required. Please add a note to the item if the item is not applicable and
please add links to any relevant follow up issues.*

I have...

- [x] included the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title
- [ ] added `!` to the type prefix if API or client breaking change
- [ ] targeted the correct branch (see [PR Targeting](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#pr-targeting))
- [x] provided a link to the relevant issue or specification
- [ ] followed the guidelines for [building modules](https://github.com/cosmos/cosmos-sdk/blob/master/docs/building-modules)
- [ ] included the necessary unit and integration [tests](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#testing)
- [ ] added a changelog entry to `CHANGELOG.md`
- [x] included comments for [documenting Go code](https://blog.golang.org/godoc)
- [x] updated the relevant documentation or specification
- [x] reviewed "Files changed" and left comments if necessary
- [ ] confirmed all CI checks have passed

### Reviewers Checklist

*All items are required. Please add a note if the item is not applicable and please add
your handle next to the items reviewed if you only reviewed selected items.*

I have...

- [ ] confirmed the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title
- [ ] confirmed `!` in the type prefix if API or client breaking change
- [ ] confirmed all author checklist items have been addressed 
- [ ] reviewed state machine logic
- [ ] reviewed API design and naming
- [ ] reviewed documentation is accurate
- [ ] reviewed tests and test coverage
- [ ] manually tested (if applicable)
This commit is contained in:
Anil Kumar Kammari 2021-08-06 01:30:08 +05:30 committed by GitHub
parent eb79dd022f
commit 5a47154f6c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 60 additions and 2 deletions

View File

@ -22,6 +22,7 @@ All arguments passed to `cosmovisor` will be passed to the application binary (a
* `DAEMON_NAME` is the name of the binary itself (e.g. `gaiad`, `regend`, `simd`, etc.).
* `DAEMON_ALLOW_DOWNLOAD_BINARIES` (*optional*), if set to `true`, will enable auto-downloading of new binaries (for security reasons, this is intended for full nodes rather than validators). By default, `cosmovisor` will not auto-download new binaries.
* `DAEMON_RESTART_AFTER_UPGRADE` (*optional*), if set to `true`, will restart the subprocess with the same command-line arguments and flags (but with the new binary) after a successful upgrade. By default, `cosmovisor` stops running after an upgrade and requires the system administrator to manually restart it. Note that `cosmovisor` will not auto-restart the subprocess if there was an error.
* `UNSAFE_SKIP_BACKUP` (defaults to `false`), if set to `false`, will backup the data before trying the upgrade. Otherwise it will upgrade directly without doing any backup. This is useful (and recommended) in case of failures and when needed to rollback. It is advised to use backup option, i.e., `UNSAFE_SKIP_BACKUP=false`
## Folder Layout

View File

@ -24,6 +24,7 @@ type Config struct {
AllowDownloadBinaries bool
RestartAfterUpgrade bool
LogBufferSize int
UnsafeSkipBackup bool
}
// Root returns the root directory where all info lives
@ -113,6 +114,8 @@ func GetConfigFromEnv() (*Config, error) {
cfg.LogBufferSize = bufio.MaxScanTokenSize
}
cfg.UnsafeSkipBackup = os.Getenv("UNSAFE_SKIP_BACKUP") == "true"
if err := cfg.validate(); err != nil {
return nil, err
}

View File

@ -22,6 +22,7 @@ func Run(args []string) error {
}
doUpgrade, err := cosmovisor.LaunchProcess(cfg, args, os.Stdout, os.Stderr)
// if RestartAfterUpgrade, we launch after a successful upgrade (only condition LaunchProcess returns nil)
for cfg.RestartAfterUpgrade && err == nil && doUpgrade {
doUpgrade, err = cosmovisor.LaunchProcess(cfg, args, os.Stdout, os.Stderr)

View File

@ -2,15 +2,21 @@ package cosmovisor
import (
"bufio"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"log"
"os"
"os/exec"
"os/signal"
"path/filepath"
"strings"
"sync"
"syscall"
"time"
"github.com/otiai10/copy"
)
// LaunchProcess runs a subprocess and returns when the subprocess exits,
@ -70,12 +76,59 @@ func LaunchProcess(cfg *Config, args []string, stdout, stderr io.Writer) (bool,
}
if upgradeInfo != nil {
if err := doBackup(cfg); err != nil {
return false, err
}
return true, DoUpgrade(cfg, upgradeInfo)
}
return false, nil
}
func doBackup(cfg *Config) error {
// take backup if `UNSAFE_SKIP_BACKUP` is not set.
if !cfg.UnsafeSkipBackup {
// check if upgrade-info.json is not empty.
var uInfo UpgradeInfo
upgradeInfoFile, err := ioutil.ReadFile(filepath.Join(cfg.Home, "data", "upgrade-info.json"))
if err != nil {
return fmt.Errorf("error while reading upgrade-info.json: %w", err)
}
err = json.Unmarshal(upgradeInfoFile, &uInfo)
if err != nil {
return err
}
if uInfo.Name == "" {
return fmt.Errorf("upgrade-info.json is empty")
}
// a destination directory, Format YYYY-MM-DD
st := time.Now()
stStr := fmt.Sprintf("%d-%d-%d", st.Year(), st.Month(), st.Day())
dst := filepath.Join(cfg.Home, fmt.Sprintf("data"+"-backup-%s", stStr))
fmt.Printf("starting to take backup of data directory at time %s", st)
// copy the $DAEMON_HOME/data to a backup dir
err = copy.Copy(filepath.Join(cfg.Home, "data"), dst)
if err != nil {
return fmt.Errorf("error while taking data backup: %w", err)
}
// backup is done, lets check endtime to calculate total time taken for backup process
et := time.Now()
timeTaken := et.Sub(st)
fmt.Printf("backup saved at location: %s, completed at time: %s\n"+
"time taken to complete the backup: %s", dst, et, timeTaken)
}
return nil
}
// WaitResult is used to wrap feedback on cmd state with some mutex logic.
// This is needed as multiple go-routines can affect this - two read pipes that can trigger upgrade
// As well as the command, which can fail

View File

@ -23,7 +23,7 @@ func TestProcessTestSuite(t *testing.T) {
// and args are passed through
func (s *processTestSuite) TestLaunchProcess() {
home := copyTestData(s.T(), "validate")
cfg := &cosmovisor.Config{Home: home, Name: "dummyd"}
cfg := &cosmovisor.Config{Home: home, Name: "dummyd", UnsafeSkipBackup: true}
// should run the genesis binary and produce expected output
var stdout, stderr bytes.Buffer
@ -65,7 +65,7 @@ func (s *processTestSuite) TestLaunchProcessWithDownloads() {
// zip_binary -> "chain3" = ref_zipped -> zip_directory
// zip_directory no upgrade
home := copyTestData(s.T(), "download")
cfg := &cosmovisor.Config{Home: home, Name: "autod", AllowDownloadBinaries: true}
cfg := &cosmovisor.Config{Home: home, Name: "autod", AllowDownloadBinaries: true, UnsafeSkipBackup: true}
// should run the genesis binary and produce expected output
var stdout, stderr bytes.Buffer