cosmos-sdk/cosmovisor/process.go

207 lines
5.9 KiB
Go
Raw Normal View History

package cosmovisor
import (
"bufio"
feat: Add backup option for cosmovisor (#9652) <!-- The default pull request template is for types feat, fix, or refactor. For other templates, add one of the following parameters to the url: - template=docs.md - template=other.md --> ## Description Ref: https://github.com/cosmos/cosmos-sdk/issues/9616#issuecomment-873051972 depends: #8590 <!-- Add a description of the changes that this PR introduces and the files that are the most critical to review. --> This PR adds a full backup option for cosmovisor. `UNSAFE_SKIP_BACKUP` is an `env` setting introduced newly. - if `false` (default, **recommended**), cosmovisor will try to take backup and then upgrade. In case of failure while taking backup, it will just halt the process there and won't try the upgrade. - If `true`, the cosmovisor will try to upgrade without any backup. This setting makes it hard to recover from a failed upgrade. Node operators either need to sync from a healthy node or use a snapshot from others. --- ### Author Checklist *All items are required. Please add a note to the item if the item is not applicable and please add links to any relevant follow up issues.* I have... - [x] included the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] added `!` to the type prefix if API or client breaking change - [ ] targeted the correct branch (see [PR Targeting](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#pr-targeting)) - [x] provided a link to the relevant issue or specification - [ ] followed the guidelines for [building modules](https://github.com/cosmos/cosmos-sdk/blob/master/docs/building-modules) - [ ] included the necessary unit and integration [tests](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#testing) - [ ] added a changelog entry to `CHANGELOG.md` - [x] included comments for [documenting Go code](https://blog.golang.org/godoc) - [x] updated the relevant documentation or specification - [x] reviewed "Files changed" and left comments if necessary - [ ] confirmed all CI checks have passed ### Reviewers Checklist *All items are required. Please add a note if the item is not applicable and please add your handle next to the items reviewed if you only reviewed selected items.* I have... - [ ] confirmed the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] confirmed `!` in the type prefix if API or client breaking change - [ ] confirmed all author checklist items have been addressed - [ ] reviewed state machine logic - [ ] reviewed API design and naming - [ ] reviewed documentation is accurate - [ ] reviewed tests and test coverage - [ ] manually tested (if applicable)
2021-08-05 13:00:08 -07:00
"encoding/json"
"fmt"
"io"
feat: Add backup option for cosmovisor (#9652) <!-- The default pull request template is for types feat, fix, or refactor. For other templates, add one of the following parameters to the url: - template=docs.md - template=other.md --> ## Description Ref: https://github.com/cosmos/cosmos-sdk/issues/9616#issuecomment-873051972 depends: #8590 <!-- Add a description of the changes that this PR introduces and the files that are the most critical to review. --> This PR adds a full backup option for cosmovisor. `UNSAFE_SKIP_BACKUP` is an `env` setting introduced newly. - if `false` (default, **recommended**), cosmovisor will try to take backup and then upgrade. In case of failure while taking backup, it will just halt the process there and won't try the upgrade. - If `true`, the cosmovisor will try to upgrade without any backup. This setting makes it hard to recover from a failed upgrade. Node operators either need to sync from a healthy node or use a snapshot from others. --- ### Author Checklist *All items are required. Please add a note to the item if the item is not applicable and please add links to any relevant follow up issues.* I have... - [x] included the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] added `!` to the type prefix if API or client breaking change - [ ] targeted the correct branch (see [PR Targeting](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#pr-targeting)) - [x] provided a link to the relevant issue or specification - [ ] followed the guidelines for [building modules](https://github.com/cosmos/cosmos-sdk/blob/master/docs/building-modules) - [ ] included the necessary unit and integration [tests](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#testing) - [ ] added a changelog entry to `CHANGELOG.md` - [x] included comments for [documenting Go code](https://blog.golang.org/godoc) - [x] updated the relevant documentation or specification - [x] reviewed "Files changed" and left comments if necessary - [ ] confirmed all CI checks have passed ### Reviewers Checklist *All items are required. Please add a note if the item is not applicable and please add your handle next to the items reviewed if you only reviewed selected items.* I have... - [ ] confirmed the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] confirmed `!` in the type prefix if API or client breaking change - [ ] confirmed all author checklist items have been addressed - [ ] reviewed state machine logic - [ ] reviewed API design and naming - [ ] reviewed documentation is accurate - [ ] reviewed tests and test coverage - [ ] manually tested (if applicable)
2021-08-05 13:00:08 -07:00
"io/ioutil"
"log"
"os"
"os/exec"
"os/signal"
feat: Add backup option for cosmovisor (#9652) <!-- The default pull request template is for types feat, fix, or refactor. For other templates, add one of the following parameters to the url: - template=docs.md - template=other.md --> ## Description Ref: https://github.com/cosmos/cosmos-sdk/issues/9616#issuecomment-873051972 depends: #8590 <!-- Add a description of the changes that this PR introduces and the files that are the most critical to review. --> This PR adds a full backup option for cosmovisor. `UNSAFE_SKIP_BACKUP` is an `env` setting introduced newly. - if `false` (default, **recommended**), cosmovisor will try to take backup and then upgrade. In case of failure while taking backup, it will just halt the process there and won't try the upgrade. - If `true`, the cosmovisor will try to upgrade without any backup. This setting makes it hard to recover from a failed upgrade. Node operators either need to sync from a healthy node or use a snapshot from others. --- ### Author Checklist *All items are required. Please add a note to the item if the item is not applicable and please add links to any relevant follow up issues.* I have... - [x] included the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] added `!` to the type prefix if API or client breaking change - [ ] targeted the correct branch (see [PR Targeting](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#pr-targeting)) - [x] provided a link to the relevant issue or specification - [ ] followed the guidelines for [building modules](https://github.com/cosmos/cosmos-sdk/blob/master/docs/building-modules) - [ ] included the necessary unit and integration [tests](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#testing) - [ ] added a changelog entry to `CHANGELOG.md` - [x] included comments for [documenting Go code](https://blog.golang.org/godoc) - [x] updated the relevant documentation or specification - [x] reviewed "Files changed" and left comments if necessary - [ ] confirmed all CI checks have passed ### Reviewers Checklist *All items are required. Please add a note if the item is not applicable and please add your handle next to the items reviewed if you only reviewed selected items.* I have... - [ ] confirmed the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] confirmed `!` in the type prefix if API or client breaking change - [ ] confirmed all author checklist items have been addressed - [ ] reviewed state machine logic - [ ] reviewed API design and naming - [ ] reviewed documentation is accurate - [ ] reviewed tests and test coverage - [ ] manually tested (if applicable)
2021-08-05 13:00:08 -07:00
"path/filepath"
"strings"
"sync"
"syscall"
feat: Add backup option for cosmovisor (#9652) <!-- The default pull request template is for types feat, fix, or refactor. For other templates, add one of the following parameters to the url: - template=docs.md - template=other.md --> ## Description Ref: https://github.com/cosmos/cosmos-sdk/issues/9616#issuecomment-873051972 depends: #8590 <!-- Add a description of the changes that this PR introduces and the files that are the most critical to review. --> This PR adds a full backup option for cosmovisor. `UNSAFE_SKIP_BACKUP` is an `env` setting introduced newly. - if `false` (default, **recommended**), cosmovisor will try to take backup and then upgrade. In case of failure while taking backup, it will just halt the process there and won't try the upgrade. - If `true`, the cosmovisor will try to upgrade without any backup. This setting makes it hard to recover from a failed upgrade. Node operators either need to sync from a healthy node or use a snapshot from others. --- ### Author Checklist *All items are required. Please add a note to the item if the item is not applicable and please add links to any relevant follow up issues.* I have... - [x] included the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] added `!` to the type prefix if API or client breaking change - [ ] targeted the correct branch (see [PR Targeting](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#pr-targeting)) - [x] provided a link to the relevant issue or specification - [ ] followed the guidelines for [building modules](https://github.com/cosmos/cosmos-sdk/blob/master/docs/building-modules) - [ ] included the necessary unit and integration [tests](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#testing) - [ ] added a changelog entry to `CHANGELOG.md` - [x] included comments for [documenting Go code](https://blog.golang.org/godoc) - [x] updated the relevant documentation or specification - [x] reviewed "Files changed" and left comments if necessary - [ ] confirmed all CI checks have passed ### Reviewers Checklist *All items are required. Please add a note if the item is not applicable and please add your handle next to the items reviewed if you only reviewed selected items.* I have... - [ ] confirmed the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] confirmed `!` in the type prefix if API or client breaking change - [ ] confirmed all author checklist items have been addressed - [ ] reviewed state machine logic - [ ] reviewed API design and naming - [ ] reviewed documentation is accurate - [ ] reviewed tests and test coverage - [ ] manually tested (if applicable)
2021-08-05 13:00:08 -07:00
"time"
"github.com/otiai10/copy"
)
// LaunchProcess runs a subprocess and returns when the subprocess exits,
// either when it dies, or *after* a successful upgrade.
func LaunchProcess(cfg *Config, args []string, stdout, stderr io.Writer) (bool, error) {
bin, err := cfg.CurrentBin()
if err != nil {
return false, fmt.Errorf("error creating symlink to genesis: %w", err)
}
if err := EnsureBinary(bin); err != nil {
return false, fmt.Errorf("current binary invalid: %w", err)
}
cmd := exec.Command(bin, args...)
outpipe, err := cmd.StdoutPipe()
if err != nil {
return false, err
}
errpipe, err := cmd.StderrPipe()
if err != nil {
return false, err
}
scanOut := bufio.NewScanner(io.TeeReader(outpipe, stdout))
scanErr := bufio.NewScanner(io.TeeReader(errpipe, stderr))
// set scanner's buffer size to cfg.LogBufferSize, and ensure larger than bufio.MaxScanTokenSize otherwise fallback to bufio.MaxScanTokenSize
var maxCapacity int
if cfg.LogBufferSize < bufio.MaxScanTokenSize {
maxCapacity = bufio.MaxScanTokenSize
} else {
maxCapacity = cfg.LogBufferSize
}
bufOut := make([]byte, maxCapacity)
bufErr := make([]byte, maxCapacity)
scanOut.Buffer(bufOut, maxCapacity)
scanErr.Buffer(bufErr, maxCapacity)
if err := cmd.Start(); err != nil {
return false, fmt.Errorf("launching process %s %s: %w", bin, strings.Join(args, " "), err)
}
sigs := make(chan os.Signal, 1)
signal.Notify(sigs, syscall.SIGQUIT, syscall.SIGTERM)
go func() {
sig := <-sigs
if err := cmd.Process.Signal(sig); err != nil {
log.Fatal(err)
}
}()
// three ways to exit - command ends, find regexp in scanOut, find regexp in scanErr
upgradeInfo, err := WaitForUpgradeOrExit(cmd, scanOut, scanErr)
if err != nil {
return false, err
}
if upgradeInfo != nil {
feat: Add backup option for cosmovisor (#9652) <!-- The default pull request template is for types feat, fix, or refactor. For other templates, add one of the following parameters to the url: - template=docs.md - template=other.md --> ## Description Ref: https://github.com/cosmos/cosmos-sdk/issues/9616#issuecomment-873051972 depends: #8590 <!-- Add a description of the changes that this PR introduces and the files that are the most critical to review. --> This PR adds a full backup option for cosmovisor. `UNSAFE_SKIP_BACKUP` is an `env` setting introduced newly. - if `false` (default, **recommended**), cosmovisor will try to take backup and then upgrade. In case of failure while taking backup, it will just halt the process there and won't try the upgrade. - If `true`, the cosmovisor will try to upgrade without any backup. This setting makes it hard to recover from a failed upgrade. Node operators either need to sync from a healthy node or use a snapshot from others. --- ### Author Checklist *All items are required. Please add a note to the item if the item is not applicable and please add links to any relevant follow up issues.* I have... - [x] included the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] added `!` to the type prefix if API or client breaking change - [ ] targeted the correct branch (see [PR Targeting](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#pr-targeting)) - [x] provided a link to the relevant issue or specification - [ ] followed the guidelines for [building modules](https://github.com/cosmos/cosmos-sdk/blob/master/docs/building-modules) - [ ] included the necessary unit and integration [tests](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#testing) - [ ] added a changelog entry to `CHANGELOG.md` - [x] included comments for [documenting Go code](https://blog.golang.org/godoc) - [x] updated the relevant documentation or specification - [x] reviewed "Files changed" and left comments if necessary - [ ] confirmed all CI checks have passed ### Reviewers Checklist *All items are required. Please add a note if the item is not applicable and please add your handle next to the items reviewed if you only reviewed selected items.* I have... - [ ] confirmed the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] confirmed `!` in the type prefix if API or client breaking change - [ ] confirmed all author checklist items have been addressed - [ ] reviewed state machine logic - [ ] reviewed API design and naming - [ ] reviewed documentation is accurate - [ ] reviewed tests and test coverage - [ ] manually tested (if applicable)
2021-08-05 13:00:08 -07:00
if err := doBackup(cfg); err != nil {
return false, err
}
return true, DoUpgrade(cfg, upgradeInfo)
}
return false, nil
}
feat: Add backup option for cosmovisor (#9652) <!-- The default pull request template is for types feat, fix, or refactor. For other templates, add one of the following parameters to the url: - template=docs.md - template=other.md --> ## Description Ref: https://github.com/cosmos/cosmos-sdk/issues/9616#issuecomment-873051972 depends: #8590 <!-- Add a description of the changes that this PR introduces and the files that are the most critical to review. --> This PR adds a full backup option for cosmovisor. `UNSAFE_SKIP_BACKUP` is an `env` setting introduced newly. - if `false` (default, **recommended**), cosmovisor will try to take backup and then upgrade. In case of failure while taking backup, it will just halt the process there and won't try the upgrade. - If `true`, the cosmovisor will try to upgrade without any backup. This setting makes it hard to recover from a failed upgrade. Node operators either need to sync from a healthy node or use a snapshot from others. --- ### Author Checklist *All items are required. Please add a note to the item if the item is not applicable and please add links to any relevant follow up issues.* I have... - [x] included the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] added `!` to the type prefix if API or client breaking change - [ ] targeted the correct branch (see [PR Targeting](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#pr-targeting)) - [x] provided a link to the relevant issue or specification - [ ] followed the guidelines for [building modules](https://github.com/cosmos/cosmos-sdk/blob/master/docs/building-modules) - [ ] included the necessary unit and integration [tests](https://github.com/cosmos/cosmos-sdk/blob/master/CONTRIBUTING.md#testing) - [ ] added a changelog entry to `CHANGELOG.md` - [x] included comments for [documenting Go code](https://blog.golang.org/godoc) - [x] updated the relevant documentation or specification - [x] reviewed "Files changed" and left comments if necessary - [ ] confirmed all CI checks have passed ### Reviewers Checklist *All items are required. Please add a note if the item is not applicable and please add your handle next to the items reviewed if you only reviewed selected items.* I have... - [ ] confirmed the correct [type prefix](https://github.com/commitizen/conventional-commit-types/blob/v3.0.0/index.json) in the PR title - [ ] confirmed `!` in the type prefix if API or client breaking change - [ ] confirmed all author checklist items have been addressed - [ ] reviewed state machine logic - [ ] reviewed API design and naming - [ ] reviewed documentation is accurate - [ ] reviewed tests and test coverage - [ ] manually tested (if applicable)
2021-08-05 13:00:08 -07:00
func doBackup(cfg *Config) error {
// take backup if `UNSAFE_SKIP_BACKUP` is not set.
if !cfg.UnsafeSkipBackup {
// check if upgrade-info.json is not empty.
var uInfo UpgradeInfo
upgradeInfoFile, err := ioutil.ReadFile(filepath.Join(cfg.Home, "data", "upgrade-info.json"))
if err != nil {
return fmt.Errorf("error while reading upgrade-info.json: %w", err)
}
err = json.Unmarshal(upgradeInfoFile, &uInfo)
if err != nil {
return err
}
if uInfo.Name == "" {
return fmt.Errorf("upgrade-info.json is empty")
}
// a destination directory, Format YYYY-MM-DD
st := time.Now()
stStr := fmt.Sprintf("%d-%d-%d", st.Year(), st.Month(), st.Day())
dst := filepath.Join(cfg.Home, fmt.Sprintf("data"+"-backup-%s", stStr))
fmt.Printf("starting to take backup of data directory at time %s", st)
// copy the $DAEMON_HOME/data to a backup dir
err = copy.Copy(filepath.Join(cfg.Home, "data"), dst)
if err != nil {
return fmt.Errorf("error while taking data backup: %w", err)
}
// backup is done, lets check endtime to calculate total time taken for backup process
et := time.Now()
timeTaken := et.Sub(st)
fmt.Printf("backup saved at location: %s, completed at time: %s\n"+
"time taken to complete the backup: %s", dst, et, timeTaken)
}
return nil
}
// WaitResult is used to wrap feedback on cmd state with some mutex logic.
// This is needed as multiple go-routines can affect this - two read pipes that can trigger upgrade
// As well as the command, which can fail
type WaitResult struct {
// both err and info may be updated from several go-routines
// access is wrapped by mutex and should only be done through methods
err error
info *UpgradeInfo
mutex sync.Mutex
}
// AsResult reads the data protected by mutex to avoid race conditions
func (u *WaitResult) AsResult() (*UpgradeInfo, error) {
u.mutex.Lock()
defer u.mutex.Unlock()
return u.info, u.err
}
// SetError will set with the first error using a mutex
// don't set it once info is set, that means we chose to kill the process
func (u *WaitResult) SetError(myErr error) {
u.mutex.Lock()
defer u.mutex.Unlock()
if u.info == nil && myErr != nil {
u.err = myErr
}
}
// SetUpgrade sets first non-nil upgrade info, ensure error is then nil
// pass in a command to shutdown on successful upgrade
func (u *WaitResult) SetUpgrade(up *UpgradeInfo) {
u.mutex.Lock()
defer u.mutex.Unlock()
if u.info == nil && up != nil {
u.info = up
u.err = nil
}
}
// WaitForUpgradeOrExit listens to both output streams of the process, as well as the process state itself
// When it returns, the process is finished and all streams have closed.
//
// It returns (info, nil) if an upgrade should be initiated (and we killed the process)
// It returns (nil, err) if the process died by itself, or there was an issue reading the pipes
// It returns (nil, nil) if the process exited normally without triggering an upgrade. This is very unlikely
// to happened with "start" but may happened with short-lived commands like `gaiad export ...`
func WaitForUpgradeOrExit(cmd *exec.Cmd, scanOut, scanErr *bufio.Scanner) (*UpgradeInfo, error) {
var res WaitResult
waitScan := func(scan *bufio.Scanner) {
upgrade, err := WaitForUpdate(scan)
if err != nil {
res.SetError(err)
} else if upgrade != nil {
res.SetUpgrade(upgrade)
// now we need to kill the process
_ = cmd.Process.Kill()
}
}
// wait for the scanners, which can trigger upgrade and kill cmd
go waitScan(scanOut)
go waitScan(scanErr)
// if the command exits normally (eg. short command like `gaiad version`), just return (nil, nil)
// we often get broken read pipes if it runs too fast.
// if we had upgrade info, we would have killed it, and thus got a non-nil error code
err := cmd.Wait()
if err == nil {
return nil, nil
}
// this will set the error code if it wasn't killed due to upgrade
res.SetError(err)
return res.AsResult()
}