gecko/snow/engine/snowman/transitive.go

// (c) 2019-2020, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package snowman

import (
	"time"

	"github.com/ava-labs/gecko/ids"
	"github.com/ava-labs/gecko/network"
	"github.com/ava-labs/gecko/snow"
	"github.com/ava-labs/gecko/snow/choices"
	"github.com/ava-labs/gecko/snow/consensus/snowman"
	"github.com/ava-labs/gecko/snow/engine/common"
	"github.com/ava-labs/gecko/snow/engine/snowman/poll"
	"github.com/ava-labs/gecko/snow/events"
	"github.com/ava-labs/gecko/utils/formatting"
	"github.com/ava-labs/gecko/utils/wrappers"
)

const (
	// TODO define this constant in one place rather than here and in snowman
	// NOTE(review): this file is itself in package snowman, so the TODO above
	// looks copied from another engine package -- confirm where the duplicate
	// definition lives.
	//
	// maxContainersLen is the maximum combined size of the containers placed
	// in a single MultiPut message. It is 4/5 of network.DefaultMaxMessageSize;
	// GetAncestors stops appending ancestors once this limit would be reached.
	maxContainersLen = int(4 * network.DefaultMaxMessageSize / 5)
)

// Transitive implements the Engine interface by attempting to fetch all
// transitive dependencies.
//
// The engine starts out in bootstrapping mode. While in that mode most
// incoming messages are dropped; once finishBootstrapping runs, the
// bootstrapped flag is set and the engine starts participating in consensus.
type Transitive struct {
	// engine configuration; stored by Initialize
	Config
	// embedded bootstrapping logic; Initialize wires its onFinished callback
	// to finishBootstrapping and then hands it config.BootstrapConfig
	bootstrapper

	// track outstanding preference requests
	polls poll.Set

	// blocks that have outstanding get requests
	blkReqs common.Requests

	// blocks that are fetched but haven't been issued due to missing
	// dependencies
	pending ids.Set

	// operations that are blocked on a block being issued. This could be
	// issuing another block, responding to a query, or applying votes to
	// consensus
	blocked events.Blocker

	// mark for if the engine has been bootstrapped or not
	bootstrapped bool

	// errs tracks if an error has occurred in a callback. Handlers return
	// errs.Err after registering or abandoning blocked operations so that such
	// an error is surfaced to the caller.
	errs wrappers.Errs
}

// Initialize implements the Engine interface.
//
// It stores the supplied configuration, sets up metrics and the poll set,
// registers finishBootstrapping as the bootstrapper's completion callback and
// finally starts the bootstrapper. The returned error is whatever the
// bootstrapper's Initialize returns.
func (t *Transitive) Initialize(config Config) error {
	log := config.Context.Log
	log.Info("initializing consensus engine")

	t.Config = config

	namespace, registerer := config.Params.Namespace, config.Params.Metrics
	t.metrics.Initialize(log, namespace, registerer)

	// once bootstrapping completes, switch over to normal consensus operation
	t.onFinished = t.finishBootstrapping

	t.polls = poll.NewSet(
		poll.NewEarlyTermNoTraversalFactory(int(config.Params.Alpha)),
		log,
		namespace,
		registerer,
	)

	return t.bootstrapper.Initialize(config.BootstrapConfig)
}

// finishBootstrapping is called once bootstrapping has completed. It flips the
// engine into consensus mode and initializes consensus with the VM's last
// accepted block.
//
// An error is returned if the last accepted block cannot be loaded from the
// VM, or if delivering one of its oracle options fails.
func (t *Transitive) finishBootstrapping() error {
	// from here on, incoming messages are handled rather than dropped
	t.bootstrapped = true

	// consensus starts from the last accepted block
	lastAcceptedID := t.Config.VM.LastAccepted()
	t.Consensus.Initialize(t.Config.Context, t.Params, lastAcceptedID)

	// the block itself is needed to find out whether we bootstrapped into an
	// oracle block, whose options must be issued to keep the invariant that
	// oracle blocks are issued in the correct preferences
	lastAccepted, err := t.Config.VM.GetBlock(lastAcceptedID)
	if err != nil {
		t.Config.Context.Log.Error("failed to get last accepted block due to: %s", err)
		return err
	}

	if oracle, isOracle := lastAccepted.(OracleBlock); isOracle {
		for _, option := range oracle.Options() {
			// deliver takes care of setting the VM's preference
			if err := t.deliver(option); err != nil {
				return err
			}
		}
	} else {
		// nothing to deliver on startup, so the preference is simply the last
		// accepted block
		t.Config.VM.SetPreference(lastAcceptedID)
	}

	t.Config.Context.Log.Info("bootstrapping finished with %s as the last accepted block", lastAcceptedID)
	return nil
}

// Gossip implements the Engine interface.
//
// It gossips the VM's last accepted block to the network. If that block can't
// be loaded the request is dropped with a warning. Gossip always returns nil.
func (t *Transitive) Gossip() error {
	lastAcceptedID := t.Config.VM.LastAccepted()

	lastAccepted, err := t.Config.VM.GetBlock(lastAcceptedID)
	if err == nil {
		t.Config.Context.Log.Verbo("gossiping %s as accepted to the network", lastAcceptedID)
		t.Config.Sender.Gossip(lastAcceptedID, lastAccepted.Bytes())
	} else {
		t.Config.Context.Log.Warn("dropping gossip request as %s couldn't be loaded due to %s", lastAcceptedID, err)
	}
	return nil
}

// Shutdown implements the Engine interface.
//
// It logs the shutdown and forwards the request to the VM, returning the VM's
// result.
func (t *Transitive) Shutdown() error {
	cfg := &t.Config
	cfg.Context.Log.Info("shutting down consensus engine")

	err := cfg.VM.Shutdown()
	return err
}

// Context implements the Engine interface.
//
// It returns the snow context held in the engine's configuration.
func (t *Transitive) Context() *snow.Context {
	ctx := t.Config.Context
	return ctx
}

// Get implements the Engine interface.
//
// If the VM can provide the requested block, it is sent back to vdr in a Put
// carrying the same requestID. Otherwise the request is dropped. Get always
// returns nil.
func (t *Transitive) Get(vdr ids.ShortID, requestID uint32, blkID ids.ID) error {
	if blk, err := t.Config.VM.GetBlock(blkID); err != nil {
		// Either something unexpected went wrong, the validator isn't
		// following the protocol, or the block has been pruned.
		t.Config.Context.Log.Debug("Get(%s, %d, %s) failed with: %s", vdr, requestID, blkID, err)
	} else {
		// Answer with the block, echoing the requestID.
		t.Config.Sender.Put(vdr, requestID, blkID, blk.Bytes())
	}
	return nil
}

// GetAncestors implements the Engine interface.
//
// It replies to vdr with a MultiPut containing the requested block followed by
// its parent, grandparent, and so on. The walk stops when any of these holds:
//   - common.MaxContainersPerMultiPut containers have been collected
//   - common.MaxTimeFetchingAncestors has elapsed since the call started
//   - an ancestor has status Unknown
//   - adding the next ancestor would reach maxContainersLen
//
// If the requested block itself can't be loaded, the request is dropped.
// GetAncestors always returns nil.
func (t *Transitive) GetAncestors(vdr ids.ShortID, requestID uint32, blkID ids.ID) error {
	begin := time.Now()

	blk, err := t.Config.VM.GetBlock(blkID)
	if err != nil {
		// We don't have the block, so there is nothing to answer with.
		t.Config.Context.Log.Verbo("couldn't get block %s. dropping GetAncestors(%s, %d, %s)", blkID, vdr, requestID, blkID)
		return nil
	}

	// containers[0] is the requested block; ancestors follow, nearest first.
	containers := make([][]byte, 1, common.MaxContainersPerMultiPut)
	containers[0] = blk.Bytes()

	// Running size of the response. Each container is accompanied by its
	// length, which occupies wrappers.IntLen.
	totalLen := len(blk.Bytes()) + wrappers.IntLen

	for len(containers) < common.MaxContainersPerMultiPut && time.Since(begin) < common.MaxTimeFetchingAncestors {
		blk = blk.Parent()
		if blk.Status() == choices.Unknown {
			break
		}

		raw := blk.Bytes()
		grown := wrappers.IntLen + totalLen + len(raw)
		if grown >= maxContainersLen {
			// the response would become too large
			break
		}

		containers = append(containers, raw)
		totalLen = grown
	}

	t.Config.Sender.MultiPut(vdr, requestID, containers)
	return nil
}

// Put implements the Engine interface.
//
// While bootstrapping, the message is dropped. Otherwise the block bytes are
// parsed and the resulting block is inserted into consensus via insertFrom. A
// parse failure is treated like a failed Get for (vdr, requestID).
func (t *Transitive) Put(vdr ids.ShortID, requestID uint32, blkID ids.ID, blkBytes []byte) error {
	log := t.Config.Context.Log

	// Before bootstrapping finishes we haven't sent any Gets, so this Put
	// can't be a response to one of ours.
	if !t.bootstrapped {
		switch requestID {
		case network.GossipMsgRequestID:
			log.Verbo("dropping gossip Put(%s, %d, %s) due to bootstrapping", vdr, requestID, blkID)
		default:
			log.Debug("dropping Put(%s, %d, %s) due to bootstrapping", vdr, requestID, blkID)
		}
		return nil
	}

	parsed, parseErr := t.Config.VM.ParseBlock(blkBytes)
	if parseErr != nil {
		log.Debug("failed to parse block %s: %s", blkID, parseErr)
		log.Verbo("block:\n%s", formatting.DumpBytes{Bytes: blkBytes})
		// GetFailed doesn't assume a Get was actually sent, so it is safe to
		// call here; it abandons the request if one is outstanding.
		return t.GetFailed(vdr, requestID)
	}

	// Hand the block to consensus. This is a noop for an already issued block.
	// Missing ancestors are requested from vdr, except for ancestors that were
	// already fetched and are themselves waiting on dependencies.
	if _, err := t.insertFrom(vdr, parsed); err != nil {
		return err
	}
	return nil
}

// GetFailed implements the Engine interface.
//
// It handles the failure of the Get request identified by (vdr, requestID).
// If such a request is outstanding, it is removed and the block it was for is
// abandoned, which in turn abandons every operation blocked on that block.
//
// The message is dropped while bootstrapping, and also when no matching
// request is outstanding. The returned error is t.errs.Err, i.e. any error
// recorded by a callback.
func (t *Transitive) GetFailed(vdr ids.ShortID, requestID uint32) error {
	// not done bootstrapping --> didn't send a get --> this message is invalid
	if !t.bootstrapped {
		// vdr and requestID must be passed explicitly; without them the format
		// verbs have no operands and the log line carries no useful identity
		t.Config.Context.Log.Debug("dropping GetFailed(%s, %d) due to bootstrapping", vdr, requestID)
		return nil
	}

	// we don't use the assumption that this function is called after a failed
	// Get message. So we first check to see if we have an outstanding request
	// and also get what the request was for if it exists
	blkID, ok := t.blkReqs.Remove(vdr, requestID)
	if !ok {
		t.Config.Context.Log.Debug("getFailed(%s, %d) called without having sent corresponding Get", vdr, requestID)
		return nil
	}

	// because the get request was dropped, we no longer expect blkID to be
	// issued.
	t.blocked.Abandon(blkID)
	return t.errs.Err
}

// PullQuery implements the Engine interface.
//
// It registers a reply to vdr's query. If the queried block can't be issued
// right away, the reply is made to depend on that block being issued. Queries
// are dropped while bootstrapping.
func (t *Transitive) PullQuery(vdr ids.ShortID, requestID uint32, blkID ids.ID) error {
	// we can't answer queries before bootstrapping has finished
	if !t.bootstrapped {
		t.Config.Context.Log.Debug("dropping PullQuery(%s, %d, %s) due to bootstrapping", vdr, requestID, blkID)
		return nil
	}

	reply := &convincer{
		errs:      &t.errs,
		requestID: requestID,
		vdr:       vdr,
		sender:    t.Config.Sender,
		consensus: t.Consensus,
	}

	issued, err := t.reinsertFrom(vdr, blkID)
	switch {
	case err != nil:
		return err
	case !issued:
		// the block isn't in consensus yet, so the reply has to wait for it
		reply.deps.Add(blkID)
	}

	t.blocked.Register(reply)
	return t.errs.Err
}

// PushQuery implements the Engine interface.
//
// It parses the pushed block, inserts it into consensus and then handles the
// query exactly like a PullQuery for the parsed block's ID. The message is
// dropped while bootstrapping or when the block fails to parse.
func (t *Transitive) PushQuery(vdr ids.ShortID, requestID uint32, blkID ids.ID, blkBytes []byte) error {
	log := t.Config.Context.Log

	// we can't answer queries before bootstrapping has finished
	if !t.bootstrapped {
		log.Debug("dropping PushQuery(%s, %d, %s) due to bootstrapping", vdr, requestID, blkID)
		return nil
	}

	pushed, parseErr := t.Config.VM.ParseBlock(blkBytes)
	if parseErr != nil {
		// we never asked for this block, so an unparsable one is just dropped
		log.Debug("failed to parse block %s: %s", blkID, parseErr)
		log.Verbo("block:\n%s", formatting.DumpBytes{Bytes: blkBytes})
		return nil
	}

	// Hand the block to consensus. This is a noop for an already issued block.
	// Missing ancestors are requested from vdr, except for ancestors that were
	// already fetched and are themselves waiting on dependencies.
	_, insertErr := t.insertFrom(vdr, pushed)
	if insertErr != nil {
		return insertErr
	}

	// now register the chit request itself
	return t.PullQuery(vdr, requestID, pushed.ID())
}

// Chits implements the Engine interface.
//
// It registers vdr's vote for the poll identified by requestID. If the voted
// block can't be issued right away, applying the vote is made to depend on
// that block being issued. A response that doesn't contain exactly one vote is
// treated as a failed query. Chits are dropped while bootstrapping.
func (t *Transitive) Chits(vdr ids.ShortID, requestID uint32, votes ids.Set) error {
	log := t.Config.Context.Log

	// no queries were sent before bootstrapping finished
	if !t.bootstrapped {
		log.Debug("dropping Chits(%s, %d) due to bootstrapping", vdr, requestID)
		return nil
	}

	// snowman votes name exactly one block
	if numVotes := votes.Len(); numVotes != 1 {
		log.Debug("Chits(%s, %d) was called with %d votes (expected 1)", vdr, requestID, numVotes)
		// QueryFailed doesn't assume a query was actually sent, so it is safe
		// to call here; it abandons the request if one is outstanding.
		return t.QueryFailed(vdr, requestID)
	}
	votedID := votes.List()[0]

	log.Verbo("Chits(%s, %d) contains vote for %s", vdr, requestID, votedID)

	pendingVote := &voter{
		response:  votedID,
		requestID: requestID,
		vdr:       vdr,
		t:         t,
	}

	issued, err := t.reinsertFrom(vdr, votedID)
	switch {
	case err != nil:
		return err
	case !issued:
		// the vote can only be applied once its block has been issued
		pendingVote.deps.Add(votedID)
	}

	t.blocked.Register(pendingVote)
	return t.errs.Err
}

// QueryFailed implements the Engine interface.
//
// It registers a voter without a response for (vdr, requestID). The message is
// dropped while bootstrapping.
func (t *Transitive) QueryFailed(vdr ids.ShortID, requestID uint32) error {
	// no queries were sent before bootstrapping finished
	if !t.bootstrapped {
		t.Config.Context.Log.Warn("dropping QueryFailed(%s, %d) due to bootstrapping", vdr, requestID)
		return nil
	}

	failed := &voter{
		requestID: requestID,
		vdr:       vdr,
		t:         t,
	}
	t.blocked.Register(failed)
	return t.errs.Err
}

// Notify implements the Engine interface.
//
// On common.PendingTxs the VM is asked to build a block, which is then issued
// through insertAll. Any other message is logged as unexpected. Notifications
// are dropped while bootstrapping. Only an error from insertAll is returned;
// a BuildBlock failure is logged and swallowed.
func (t *Transitive) Notify(msg common.Message) error {
	log := t.Config.Context.Log

	// until bootstrapping completes the engine must not issue blocks
	if !t.bootstrapped {
		log.Debug("dropping Notify due to bootstrapping")
		return nil
	}

	log.Verbo("snowman engine notified of %s from the vm", msg)

	if msg != common.PendingTxs {
		log.Warn("unexpected message from the VM: %s", msg)
		return nil
	}

	// pending txs means the VM has something to put into a block
	built, err := t.Config.VM.BuildBlock()
	if err != nil {
		log.Debug("VM.BuildBlock errored with: %s", err)
		return nil
	}

	// A freshly built block should be processing; anything else hints at a
	// problem in the VM.
	if status := built.Status(); status != choices.Processing {
		log.Warn("attempting to issue a block with status: %s, expected Processing", status)
	}

	// Building on anything but the preferred block lowers the new block's
	// chance of being confirmed.
	builtOn := built.Parent().ID()
	if preferred := t.Consensus.Preference(); !builtOn.Equals(preferred) {
		log.Warn("built block with parent: %s, expected %s", builtOn, preferred)
	}

	issued, err := t.insertAll(built)
	if err != nil {
		return err
	}

	// a block built by the VM isn't expected to have missing ancestors
	if !issued {
		log.Warn("VM.BuildBlock returned a block that is pending for ancestors")
		return nil
	}
	log.Verbo("successfully issued new block from the VM")
	return nil
}

// repoll issues pull queries for the current preference until the number of
// outstanding polls counted at entry, plus the queries attempted here, reaches
// Params.ConcurrentRepolls.
func (t *Transitive) repoll() {
	// query our current preference so that the most likely branch propagates
	// as quickly as possible
	preferred := t.Consensus.Preference()

	outstanding := t.polls.Len()
	for remaining := t.Params.ConcurrentRepolls - outstanding; remaining > 0; remaining-- {
		t.pullSample(preferred)
	}
}

// reinsertFrom tries to issue the branch ending at the block with the given
// ID. It first asks the VM for the block, in case it is already available
// locally. If the VM can't provide it, a Get is sent to vdr and false is
// returned. Otherwise the work is delegated to insertFrom, whose result is
// returned unchanged.
func (t *Transitive) reinsertFrom(vdr ids.ShortID, blkID ids.ID) (bool, error) {
	if blk, err := t.Config.VM.GetBlock(blkID); err == nil {
		return t.insertFrom(vdr, blk)
	}

	// we don't have the block locally, so ask the validator for it
	t.sendRequest(vdr, blkID)
	return false, nil
}

// insertFrom tries to issue the branch ending at blk. Starting at blk it walks
// towards the root, inserting every block that is neither issued nor pending.
// When it reaches a parent that hasn't been fetched, that parent is requested
// from vdr and false is returned.
//
// Otherwise the walk ends at the first issued or pending block, and the result
// reports whether that block has been issued to consensus.
func (t *Transitive) insertFrom(vdr ids.ShortID, blk snowman.Block) (bool, error) {
	cur := blk
	curID := cur.ID()

	for {
		// issued blocks need no work; pending blocks must not be inserted a
		// second time
		if t.Consensus.Issued(cur) || t.pending.Contains(curID) {
			break
		}

		if err := t.insert(cur); err != nil {
			return false, err
		}

		cur = cur.Parent()
		curID = cur.ID()
		if cur.Status().Fetched() {
			continue
		}

		// the block just inserted can't be issued without its parent, so the
		// parent has to be requested
		t.sendRequest(vdr, curID)
		return false, nil
	}

	return t.Consensus.Issued(cur), nil
}

// insertAll tries to issue the branch ending at blk without asking any
// validator for missing blocks. Starting at blk it walks towards the root,
// inserting every fetched block that is neither issued nor pending.
//
// It returns true if the block the walk stopped at has been issued. If that
// block is instead the subject of an outstanding request, false is returned and
// the branch keeps waiting. Otherwise the block is abandoned, since nothing
// suggests it will ever be issued.
func (t *Transitive) insertAll(blk snowman.Block) (bool, error) {
	cur, curID := blk, blk.ID()

	for cur.Status().Fetched() && !t.Consensus.Issued(cur) && !t.pending.Contains(curID) {
		if err := t.insert(cur); err != nil {
			return false, err
		}

		cur = cur.Parent()
		curID = cur.ID()
	}

	switch {
	case t.Consensus.Issued(cur):
		// happy path: the whole branch could be issued
		return true, nil
	case t.blkReqs.Contains(curID):
		// someone was already asked for this block; wait for that request to
		// succeed or fail
		return false, nil
	default:
		// nothing will ever unblock this branch; abandon it to avoid a memory
		// leak
		t.blocked.Abandon(curID)
		return false, t.errs.Err
	}
}

// insert registers an issuer for blk. If blk's parent hasn't been issued, the
// issuer waits until the parent's issuance is either fulfilled or abandoned.
// The block is marked pending and any outstanding requests for it are cleared.
func (t *Transitive) insert(blk snowman.Block) error {
	id := blk.ID()

	// the block is now fetched but not yet issued
	t.pending.Add(id)

	// whatever requests were out for this block are no longer needed
	t.blkReqs.RemoveAny(id)

	job := &issuer{
		blk: blk,
		t:   t,
	}

	// wait for the parent if consensus doesn't know it yet
	parent := blk.Parent()
	if !t.Consensus.Issued(parent) {
		parentID := parent.ID()
		t.Config.Context.Log.Verbo("block %s waiting for parent %s", id, parentID)
		job.deps.Add(parentID)
	}

	t.blocked.Register(job)

	// Tracks performance statistics
	numRequests := float64(t.blkReqs.Len())
	t.numBlkRequests.Set(numRequests)
	numPending := float64(t.pending.Len())
	t.numBlockedBlk.Set(numPending)

	return t.errs.Err
}

// sendRequest asks vdr for the block with the given ID using a fresh request
// ID and records the request. Nothing happens if a request for that block is
// already outstanding.
func (t *Transitive) sendRequest(vdr ids.ShortID, blkID ids.ID) {
	// at most one request per block at any time
	alreadyRequested := t.blkReqs.Contains(blkID)
	if alreadyRequested {
		return
	}

	t.RequestID++
	reqID := t.RequestID

	t.blkReqs.Add(vdr, reqID, blkID)
	t.Config.Context.Log.Verbo("sending Get(%s, %d, %s)", vdr, reqID, blkID)
	t.Config.Sender.Get(vdr, reqID, blkID)

	// Tracks performance statistics
	outstanding := t.blkReqs.Len()
	t.numBlkRequests.Set(float64(outstanding))
}

// pullSample samples K validators and sends them a pull query for blkID under
// a fresh request ID. The query is only sent if exactly K validators were
// sampled and the poll could be added. Sampling fewer than K validators is
// logged as an error.
func (t *Transitive) pullSample(blkID ids.ID) {
	t.Config.Context.Log.Verbo("about to sample from: %s", t.Config.Validators)

	k := t.Consensus.Parameters().K
	sampled := t.Config.Validators.Sample(k)

	polled := ids.ShortSet{}
	for i := range sampled {
		polled.Add(sampled[i].ID())
	}

	// separate copy of the sampled set, used as the recipients of the query
	recipients := ids.ShortSet{}
	recipients.Union(polled)

	t.RequestID++

	switch numSampled := len(sampled); {
	case numSampled < k:
		t.Config.Context.Log.Error("query for %s was dropped due to an insufficient number of validators", blkID)
	case numSampled == k && t.polls.Add(t.RequestID, polled):
		t.Config.Sender.PullQuery(recipients, t.RequestID, blkID)
	}
}

// pushSample samples K validators and sends them a push query containing blk
// under a fresh request ID. The query is only sent if exactly K validators
// were sampled and the poll could be added. Sampling fewer than K validators
// is logged as an error.
func (t *Transitive) pushSample(blk snowman.Block) {
	t.Config.Context.Log.Verbo("about to sample from: %s", t.Config.Validators)

	k := t.Consensus.Parameters().K
	sampled := t.Config.Validators.Sample(k)

	polled := ids.ShortSet{}
	for i := range sampled {
		polled.Add(sampled[i].ID())
	}

	// separate copy of the sampled set, used as the recipients of the query
	recipients := ids.ShortSet{}
	recipients.Union(polled)

	t.RequestID++

	switch numSampled := len(sampled); {
	case numSampled < k:
		t.Config.Context.Log.Error("query for %s was dropped due to an insufficient number of validators", blk.ID())
	case numSampled == k && t.polls.Add(t.RequestID, polled):
		t.Config.Sender.PushQuery(recipients, t.RequestID, blk.ID(), blk.Bytes())
	}
}

// deliver verifies blk and, on success, adds it to consensus, issues a push
// query for it, and fulfills every operation that was blocked on it. If blk is
// an OracleBlock, each of its options is verified too; options that verify are
// added and queried, options that fail are abandoned.
//
// If blk itself fails verification it is abandoned instead. A block that has
// already been issued is left untouched. The returned error is t.errs.Err,
// i.e. any error recorded by a callback.
func (t *Transitive) deliver(blk snowman.Block) error {
	// nothing to do if consensus already has this block
	if t.Consensus.Issued(blk) {
		return nil
	}

	// we are adding the block to consensus, so it is no longer pending
	blkID := blk.ID()
	t.pending.Remove(blkID)

	if err := blk.Verify(); err != nil {
		t.Config.Context.Log.Debug("block failed verification due to %s, dropping block", err)

		// if verify fails, then all descendants are also invalid
		t.blocked.Abandon(blkID)
		t.numBlockedBlk.Set(float64(t.pending.Len())) // Tracks performance statistics
		return t.errs.Err
	}

	t.Config.Context.Log.Verbo("adding block to consensus: %s", blkID)
	t.Consensus.Add(blk)

	// Add all the oracle blocks if they exist. We call verify on all the blocks
	// and add them to consensus before marking anything as fulfilled to avoid
	// any potential reentrant bugs.
	added := []snowman.Block{}
	dropped := []snowman.Block{}
	switch blk := blk.(type) {
	case OracleBlock:
		for _, blk := range blk.Options() {
			if err := blk.Verify(); err != nil {
				t.Config.Context.Log.Debug("block failed verification due to %s, dropping block", err)
				dropped = append(dropped, blk)
			} else {
				t.Consensus.Add(blk)
				added = append(added, blk)
			}
		}
	}

	// consensus may prefer a different block after the additions above, so
	// pass the current preference on to the VM
	t.Config.VM.SetPreference(t.Consensus.Preference())

	// launch a query for the newly added block
	t.pushSample(blk)

	// unblock everything that was waiting on blk, then do the same for each
	// option that was added
	t.blocked.Fulfill(blkID)
	for _, blk := range added {
		t.pushSample(blk)

		blkID := blk.ID()
		t.pending.Remove(blkID)
		t.blocked.Fulfill(blkID)
	}
	// options that failed verification will never be issued
	for _, blk := range dropped {
		blkID := blk.ID()
		t.pending.Remove(blkID)
		t.blocked.Abandon(blkID)
	}

	// If we should issue multiple queries at the same time, we need to repoll
	t.repoll()

	// Tracks performance statistics
	t.numBlkRequests.Set(float64(t.blkReqs.Len()))
	t.numBlockedBlk.Set(float64(t.pending.Len()))
	return t.errs.Err
}