tendermint/blockchain/reactor.go

391 lines
12 KiB
Go
Raw Normal View History

package blockchain
import (
"bytes"
"errors"
"reflect"
"time"
2017-05-02 00:53:32 -07:00
wire "github.com/tendermint/go-wire"
2017-04-08 19:04:06 -07:00
"github.com/tendermint/tendermint/p2p"
2015-12-01 20:12:01 -08:00
"github.com/tendermint/tendermint/proxy"
2015-04-01 17:30:16 -07:00
sm "github.com/tendermint/tendermint/state"
"github.com/tendermint/tendermint/types"
2017-04-08 19:04:06 -07:00
cmn "github.com/tendermint/tmlibs/common"
2017-10-20 12:56:21 -07:00
"github.com/tendermint/tmlibs/log"
)
const (
// BlockchainChannel is a channel for blocks and status updates (`BlockStore` height)
BlockchainChannel = byte(0x40)
defaultChannelCapacity = 1000
trySyncIntervalMS = 50
// stop syncing when last block's time is
// within this much of the system time.
2015-04-23 14:59:12 -07:00
// stopSyncingDurationMinutes = 10
// ask for best height every 10s
statusUpdateIntervalSeconds = 10
// check if we should switch to consensus reactor
switchToConsensusIntervalSeconds = 1
)
type consensusReactor interface {
// for when we switch from blockchain reactor and fast sync to
// the consensus machine
SwitchToConsensus(*sm.State, int)
}
// BlockchainReactor handles long-term catchup syncing.
type BlockchainReactor struct {
p2p.BaseReactor
state *sm.State
2016-08-17 19:28:08 -07:00
proxyAppConn proxy.AppConnConsensus // same as consensus.proxyAppConn
store *BlockStore
pool *BlockPool
fastSync bool
requestsCh chan BlockRequest
timeoutsCh chan string
new pubsub package comment out failing consensus tests for now rewrite rpc httpclient to use new pubsub package import pubsub as tmpubsub, query as tmquery make event IDs constants EventKey -> EventTypeKey rename EventsPubsub to PubSub mempool does not use pubsub rename eventsSub to pubsub new subscribe API fix channel size issues and consensus tests bugs refactor rpc client add missing discardFromChan method add mutex rename pubsub to eventBus remove IsRunning from WSRPCConnection interface (not needed) add a comment in broadcastNewRoundStepsAndVotes rename registerEventCallbacks to broadcastNewRoundStepsAndVotes See https://dave.cheney.net/2014/03/19/channel-axioms stop eventBuses after reactor tests remove unnecessary Unsubscribe return subscribe helper function move discardFromChan to where it is used subscribe now returns an err this gives us ability to refuse to subscribe if pubsub is at its max capacity. use context for control overflow cache queries handle err when subscribing in replay_test rename testClientID to testSubscriber extract var set channel buffer capacity to 1 in replay_file fix byzantine_test unsubscribe from single event, not all events refactor httpclient to return events to appropriate channels return failing testReplayCrashBeforeWriteVote test fix TestValidatorSetChanges refactor code a bit fix testReplayCrashBeforeWriteVote add comment fix TestValidatorSetChanges fixes from Bucky's review update comment [ci skip] test TxEventBuffer update changelog fix TestValidatorSetChanges (2nd attempt) only do wg.Done when no errors benchmark event bus create pubsub server inside NewEventBus only expose config params (later if needed) set buffer capacity to 0 so we are not testing cache new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ} This should allow to subscribe to all transactions! or a specific one using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'" use TimeoutCommit instead of afterPublishEventNewBlockTimeout TimeoutCommit is the time a node waits after committing a block, before it goes into the next height. So it will finish everything from the last block, but then wait a bit. The idea is this gives it time to hear more votes from other validators, to strengthen the commit it includes in the next block. But it also gives it time to hear about new transactions. waitForBlockWithUpdatedVals rewrite WAL crash tests Task: test that we can recover from any WAL crash. Solution: the old tests were relying on event hub being run in the same thread (we were injecting the private validator's last signature). when considering a rewrite, we considered two possible solutions: write a "fuzzy" testing system where WAL is crashing upon receiving a new message, or inject failures and trigger them in tests using something like https://github.com/coreos/gofail. remove sleep no cs.Lock around wal.Save test different cases (empty block, non-empty block, ...) comments add comments test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks fixes as per Bucky's last review reset subscriptions on UnsubscribeAll use a simple counter to track message for which we panicked also, set a smaller part size for all test cases
2017-06-26 08:00:30 -07:00
eventBus *types.EventBus
}
// NewBlockchainReactor returns new reactor instance.
2017-04-28 20:50:24 -07:00
func NewBlockchainReactor(state *sm.State, proxyAppConn proxy.AppConnConsensus, store *BlockStore, fastSync bool) *BlockchainReactor {
if state.LastBlockHeight == store.Height()-1 {
store.height-- // XXX HACK, make this better
}
if state.LastBlockHeight != store.Height() {
cmn.PanicSanity(cmn.Fmt("state (%v) and store (%v) height mismatch", state.LastBlockHeight, store.Height()))
}
requestsCh := make(chan BlockRequest, defaultChannelCapacity)
timeoutsCh := make(chan string, defaultChannelCapacity)
pool := NewBlockPool(
store.Height()+1,
requestsCh,
timeoutsCh,
)
bcR := &BlockchainReactor{
state: state,
proxyAppConn: proxyAppConn,
store: store,
pool: pool,
fastSync: fastSync,
requestsCh: requestsCh,
timeoutsCh: timeoutsCh,
}
2017-05-02 00:53:32 -07:00
bcR.BaseReactor = *p2p.NewBaseReactor("BlockchainReactor", bcR)
return bcR
}
2017-10-20 12:56:21 -07:00
// SetLogger implements cmn.Service by setting the logger on reactor and pool.
func (bcR *BlockchainReactor) SetLogger(l log.Logger) {
bcR.BaseService.Logger = l
bcR.pool.Logger = l
}
// OnStart implements cmn.Service.
func (bcR *BlockchainReactor) OnStart() error {
2015-07-21 18:31:01 -07:00
bcR.BaseReactor.OnStart()
if bcR.fastSync {
_, err := bcR.pool.Start()
if err != nil {
return err
}
go bcR.poolRoutine()
}
return nil
}
2017-10-20 12:56:21 -07:00
// OnStop implements cmn.Service.
2015-07-21 18:31:01 -07:00
func (bcR *BlockchainReactor) OnStop() {
bcR.BaseReactor.OnStop()
bcR.pool.Stop()
}
// GetChannels implements Reactor
func (bcR *BlockchainReactor) GetChannels() []*p2p.ChannelDescriptor {
return []*p2p.ChannelDescriptor{
&p2p.ChannelDescriptor{
ID: BlockchainChannel,
Priority: 10,
SendQueueCapacity: 1000,
},
}
}
// AddPeer implements Reactor by sending our state to peer.
2017-09-12 17:49:22 -07:00
func (bcR *BlockchainReactor) AddPeer(peer p2p.Peer) {
if !peer.Send(BlockchainChannel, struct{ BlockchainMessage }{&bcStatusResponseMessage{bcR.store.Height()}}) {
// doing nothing, will try later in `poolRoutine`
}
}
// RemovePeer implements Reactor by removing peer from the pool.
2017-09-12 17:49:22 -07:00
func (bcR *BlockchainReactor) RemovePeer(peer p2p.Peer, reason interface{}) {
bcR.pool.RemovePeer(peer.Key())
}
// respondToPeer loads a block and sends it to the requesting peer,
// if we have it. Otherwise, we'll respond saying we don't have it.
// According to the Tendermint spec, if all nodes are honest,
// no node should be requesting for a block that's non-existent.
func (bcR *BlockchainReactor) respondToPeer(msg *bcBlockRequestMessage, src p2p.Peer) (queued bool) {
block := bcR.store.LoadBlock(msg.Height)
if block != nil {
msg := &bcBlockResponseMessage{Block: block}
return src.TrySend(BlockchainChannel, struct{ BlockchainMessage }{msg})
}
bcR.Logger.Info("Peer asking for a block we don't have", "src", src, "height", msg.Height)
return src.TrySend(BlockchainChannel, struct{ BlockchainMessage }{
&bcNoBlockResponseMessage{Height: msg.Height},
})
}
// Receive implements Reactor by handling 4 types of messages (look below).
2017-09-12 17:49:22 -07:00
func (bcR *BlockchainReactor) Receive(chID byte, src p2p.Peer, msgBytes []byte) {
_, msg, err := DecodeMessage(msgBytes, bcR.maxMsgSize())
if err != nil {
bcR.Logger.Error("Error decoding message", "err", err)
return
}
2017-05-02 00:53:32 -07:00
bcR.Logger.Debug("Receive", "src", src, "chID", chID, "msg", msg)
2017-06-23 18:36:47 -07:00
// TODO: improve logic to satisfy megacheck
2015-04-14 15:57:16 -07:00
switch msg := msg.(type) {
case *bcBlockRequestMessage:
if queued := bcR.respondToPeer(msg, src); !queued {
// Unfortunately not queued since the queue is full.
}
case *bcBlockResponseMessage:
// Got a block.
2017-09-12 17:49:22 -07:00
bcR.pool.AddBlock(src.Key(), msg.Block, len(msgBytes))
case *bcStatusRequestMessage:
// Send peer our state.
queued := src.TrySend(BlockchainChannel, struct{ BlockchainMessage }{&bcStatusResponseMessage{bcR.store.Height()}})
if !queued {
// sorry
}
case *bcStatusResponseMessage:
// Got a peer status. Unverified.
2017-09-12 17:49:22 -07:00
bcR.pool.SetPeerHeight(src.Key(), msg.Height)
default:
2017-05-02 00:53:32 -07:00
bcR.Logger.Error(cmn.Fmt("Unknown message type %v", reflect.TypeOf(msg)))
}
}
// maxMsgSize returns the maximum allowable size of a
// message on the blockchain reactor.
func (bcR *BlockchainReactor) maxMsgSize() int {
return bcR.state.Params.BlockSizeParams.MaxBytes + 2
}
// Handle messages from the poolReactor telling the reactor what to do.
// NOTE: Don't sleep in the FOR_LOOP or otherwise slow it down!
// (Except for the SYNC_LOOP, which is the primary purpose and must be synchronous.)
func (bcR *BlockchainReactor) poolRoutine() {
trySyncTicker := time.NewTicker(trySyncIntervalMS * time.Millisecond)
statusUpdateTicker := time.NewTicker(statusUpdateIntervalSeconds * time.Second)
switchToConsensusTicker := time.NewTicker(switchToConsensusIntervalSeconds * time.Second)
blocksSynced := 0
chainID := bcR.state.ChainID
lastHundred := time.Now()
lastRate := 0.0
FOR_LOOP:
for {
select {
case request := <-bcR.requestsCh: // chan BlockRequest
peer := bcR.Switch.Peers().Get(request.PeerID)
if peer == nil {
continue FOR_LOOP // Peer has since been disconnected.
}
msg := &bcBlockRequestMessage{request.Height}
queued := peer.TrySend(BlockchainChannel, struct{ BlockchainMessage }{msg})
if !queued {
// We couldn't make the request, send-queue full.
// The pool handles timeouts, just let it go.
continue FOR_LOOP
}
case peerID := <-bcR.timeoutsCh: // chan string
// Peer timed out.
peer := bcR.Switch.Peers().Get(peerID)
2015-03-25 12:21:52 -07:00
if peer != nil {
bcR.Switch.StopPeerForError(peer, errors.New("BlockchainReactor Timeout"))
2015-03-25 12:21:52 -07:00
}
case <-statusUpdateTicker.C:
// ask for status updates
go bcR.BroadcastStatusRequest()
case <-switchToConsensusTicker.C:
height, numPending, lenRequesters := bcR.pool.GetStatus()
outbound, inbound, _ := bcR.Switch.NumPeers()
bcR.Logger.Debug("Consensus ticker", "numPending", numPending, "total", lenRequesters,
"outbound", outbound, "inbound", inbound)
if bcR.pool.IsCaughtUp() {
2017-05-02 00:53:32 -07:00
bcR.Logger.Info("Time to switch to consensus reactor!", "height", height)
bcR.pool.Stop()
conR := bcR.Switch.Reactor("CONSENSUS").(consensusReactor)
conR.SwitchToConsensus(bcR.state, blocksSynced)
break FOR_LOOP
}
case <-trySyncTicker.C: // chan time
// This loop can be slow as long as it's doing syncing work.
SYNC_LOOP:
for i := 0; i < 10; i++ {
// See if there are any blocks to sync.
first, second := bcR.pool.PeekTwoBlocks()
2017-05-02 00:53:32 -07:00
//bcR.Logger.Info("TrySync peeked", "first", first, "second", second)
if first == nil || second == nil {
// We need both to sync the first block.
break SYNC_LOOP
}
firstParts := first.MakePartSet(bcR.state.Params.BlockPartSizeBytes)
firstPartsHeader := firstParts.Header()
2016-04-02 09:10:16 -07:00
// Finally, verify the first block using the second's commit
2016-08-16 14:59:19 -07:00
// NOTE: we can probably make this more efficient, but note that calling
// first.Hash() doesn't verify the tx contents, so MakePartSet() is
// currently necessary.
err := bcR.state.Validators.VerifyCommit(
chainID, types.BlockID{first.Hash(), firstPartsHeader}, first.Height, second.LastCommit)
if err != nil {
2017-10-02 12:24:30 -07:00
bcR.Logger.Error("Error in validation", "err", err)
bcR.pool.RedoRequest(first.Height)
break SYNC_LOOP
} else {
bcR.pool.PopRequest()
2016-12-06 20:01:55 -08:00
bcR.store.SaveBlock(first, firstParts, second.LastCommit)
2016-11-19 16:32:35 -08:00
// TODO: should we be firing events? need to fire NewBlock events manually ...
2016-08-27 11:15:18 -07:00
// NOTE: we could improve performance if we
// didn't make the app commit to disk every block
// ... but we would need a way to get the hash without it persisting
new pubsub package comment out failing consensus tests for now rewrite rpc httpclient to use new pubsub package import pubsub as tmpubsub, query as tmquery make event IDs constants EventKey -> EventTypeKey rename EventsPubsub to PubSub mempool does not use pubsub rename eventsSub to pubsub new subscribe API fix channel size issues and consensus tests bugs refactor rpc client add missing discardFromChan method add mutex rename pubsub to eventBus remove IsRunning from WSRPCConnection interface (not needed) add a comment in broadcastNewRoundStepsAndVotes rename registerEventCallbacks to broadcastNewRoundStepsAndVotes See https://dave.cheney.net/2014/03/19/channel-axioms stop eventBuses after reactor tests remove unnecessary Unsubscribe return subscribe helper function move discardFromChan to where it is used subscribe now returns an err this gives us ability to refuse to subscribe if pubsub is at its max capacity. use context for control overflow cache queries handle err when subscribing in replay_test rename testClientID to testSubscriber extract var set channel buffer capacity to 1 in replay_file fix byzantine_test unsubscribe from single event, not all events refactor httpclient to return events to appropriate channels return failing testReplayCrashBeforeWriteVote test fix TestValidatorSetChanges refactor code a bit fix testReplayCrashBeforeWriteVote add comment fix TestValidatorSetChanges fixes from Bucky's review update comment [ci skip] test TxEventBuffer update changelog fix TestValidatorSetChanges (2nd attempt) only do wg.Done when no errors benchmark event bus create pubsub server inside NewEventBus only expose config params (later if needed) set buffer capacity to 0 so we are not testing cache new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ} This should allow to subscribe to all transactions! or a specific one using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'" use TimeoutCommit instead of afterPublishEventNewBlockTimeout TimeoutCommit is the time a node waits after committing a block, before it goes into the next height. So it will finish everything from the last block, but then wait a bit. The idea is this gives it time to hear more votes from other validators, to strengthen the commit it includes in the next block. But it also gives it time to hear about new transactions. waitForBlockWithUpdatedVals rewrite WAL crash tests Task: test that we can recover from any WAL crash. Solution: the old tests were relying on event hub being run in the same thread (we were injecting the private validator's last signature). when considering a rewrite, we considered two possible solutions: write a "fuzzy" testing system where WAL is crashing upon receiving a new message, or inject failures and trigger them in tests using something like https://github.com/coreos/gofail. remove sleep no cs.Lock around wal.Save test different cases (empty block, non-empty block, ...) comments add comments test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks fixes as per Bucky's last review reset subscriptions on UnsubscribeAll use a simple counter to track message for which we panicked also, set a smaller part size for all test cases
2017-06-26 08:00:30 -07:00
err := bcR.state.ApplyBlock(bcR.eventBus, bcR.proxyAppConn, first, firstPartsHeader, types.MockMempool{})
2016-12-06 20:01:55 -08:00
if err != nil {
// TODO This is bad, are we zombie?
cmn.PanicQ(cmn.Fmt("Failed to process committed block (%d:%X): %v", first.Height, first.Hash(), err))
}
blocksSynced += 1
if blocksSynced%100 == 0 {
lastRate = 0.9*lastRate + 0.1*(100/time.Since(lastHundred).Seconds())
bcR.Logger.Info("Fast Sync Rate", "height", bcR.pool.height,
"max_peer_height", bcR.pool.MaxPeerHeight(), "blocks/s", lastRate)
lastHundred = time.Now()
}
}
}
continue FOR_LOOP
case <-bcR.Quit:
break FOR_LOOP
}
}
}
// BroadcastStatusRequest broadcasts `BlockStore` height.
func (bcR *BlockchainReactor) BroadcastStatusRequest() error {
bcR.Switch.Broadcast(BlockchainChannel, struct{ BlockchainMessage }{&bcStatusRequestMessage{bcR.store.Height()}})
return nil
}
new pubsub package comment out failing consensus tests for now rewrite rpc httpclient to use new pubsub package import pubsub as tmpubsub, query as tmquery make event IDs constants EventKey -> EventTypeKey rename EventsPubsub to PubSub mempool does not use pubsub rename eventsSub to pubsub new subscribe API fix channel size issues and consensus tests bugs refactor rpc client add missing discardFromChan method add mutex rename pubsub to eventBus remove IsRunning from WSRPCConnection interface (not needed) add a comment in broadcastNewRoundStepsAndVotes rename registerEventCallbacks to broadcastNewRoundStepsAndVotes See https://dave.cheney.net/2014/03/19/channel-axioms stop eventBuses after reactor tests remove unnecessary Unsubscribe return subscribe helper function move discardFromChan to where it is used subscribe now returns an err this gives us ability to refuse to subscribe if pubsub is at its max capacity. use context for control overflow cache queries handle err when subscribing in replay_test rename testClientID to testSubscriber extract var set channel buffer capacity to 1 in replay_file fix byzantine_test unsubscribe from single event, not all events refactor httpclient to return events to appropriate channels return failing testReplayCrashBeforeWriteVote test fix TestValidatorSetChanges refactor code a bit fix testReplayCrashBeforeWriteVote add comment fix TestValidatorSetChanges fixes from Bucky's review update comment [ci skip] test TxEventBuffer update changelog fix TestValidatorSetChanges (2nd attempt) only do wg.Done when no errors benchmark event bus create pubsub server inside NewEventBus only expose config params (later if needed) set buffer capacity to 0 so we are not testing cache new tx event format: key = "Tx" plus a tag {"tx.hash": XYZ} This should allow to subscribe to all transactions! or a specific one using a query: "tm.events.type = Tx and tx.hash = '013ABF99434...'" use TimeoutCommit instead of afterPublishEventNewBlockTimeout TimeoutCommit is the time a node waits after committing a block, before it goes into the next height. So it will finish everything from the last block, but then wait a bit. The idea is this gives it time to hear more votes from other validators, to strengthen the commit it includes in the next block. But it also gives it time to hear about new transactions. waitForBlockWithUpdatedVals rewrite WAL crash tests Task: test that we can recover from any WAL crash. Solution: the old tests were relying on event hub being run in the same thread (we were injecting the private validator's last signature). when considering a rewrite, we considered two possible solutions: write a "fuzzy" testing system where WAL is crashing upon receiving a new message, or inject failures and trigger them in tests using something like https://github.com/coreos/gofail. remove sleep no cs.Lock around wal.Save test different cases (empty block, non-empty block, ...) comments add comments test 4 cases: empty block, non-empty block, non-empty block with smaller part size, many blocks fixes as per Bucky's last review reset subscriptions on UnsubscribeAll use a simple counter to track message for which we panicked also, set a smaller part size for all test cases
2017-06-26 08:00:30 -07:00
// SetEventBus sets event bus.
func (bcR *BlockchainReactor) SetEventBus(b *types.EventBus) {
bcR.eventBus = b
}
//-----------------------------------------------------------------------------
// Messages
const (
msgTypeBlockRequest = byte(0x10)
msgTypeBlockResponse = byte(0x11)
msgTypeNoBlockResponse = byte(0x12)
msgTypeStatusResponse = byte(0x20)
msgTypeStatusRequest = byte(0x21)
)
// BlockchainMessage is a generic message for this reactor.
2015-04-14 15:57:16 -07:00
type BlockchainMessage interface{}
2015-07-25 15:45:45 -07:00
var _ = wire.RegisterInterface(
2015-04-14 15:57:16 -07:00
struct{ BlockchainMessage }{},
2015-07-25 15:45:45 -07:00
wire.ConcreteType{&bcBlockRequestMessage{}, msgTypeBlockRequest},
wire.ConcreteType{&bcBlockResponseMessage{}, msgTypeBlockResponse},
wire.ConcreteType{&bcNoBlockResponseMessage{}, msgTypeNoBlockResponse},
2015-07-25 15:45:45 -07:00
wire.ConcreteType{&bcStatusResponseMessage{}, msgTypeStatusResponse},
wire.ConcreteType{&bcStatusRequestMessage{}, msgTypeStatusRequest},
2015-04-14 15:57:16 -07:00
)
// DecodeMessage decodes BlockchainMessage.
// TODO: ensure that bz is completely read.
func DecodeMessage(bz []byte, maxSize int) (msgType byte, msg BlockchainMessage, err error) {
msgType = bz[0]
2015-11-10 13:10:43 -08:00
n := int(0)
r := bytes.NewReader(bz)
msg = wire.ReadBinary(struct{ BlockchainMessage }{}, r, maxSize, &n, &err).(struct{ BlockchainMessage }).BlockchainMessage
2015-11-10 13:10:43 -08:00
if err != nil && n != len(bz) {
err = errors.New("DecodeMessage() had bytes left over")
}
return
}
//-------------------------------------
type bcBlockRequestMessage struct {
Height int
}
func (m *bcBlockRequestMessage) String() string {
return cmn.Fmt("[bcBlockRequestMessage %v]", m.Height)
}
type bcNoBlockResponseMessage struct {
Height int
}
func (brm *bcNoBlockResponseMessage) String() string {
return cmn.Fmt("[bcNoBlockResponseMessage %d]", brm.Height)
}
//-------------------------------------
2015-11-10 13:10:43 -08:00
// NOTE: keep up-to-date with maxBlockchainResponseSize
type bcBlockResponseMessage struct {
Block *types.Block
}
func (m *bcBlockResponseMessage) String() string {
return cmn.Fmt("[bcBlockResponseMessage %v]", m.Block.Height)
}
//-------------------------------------
type bcStatusRequestMessage struct {
Height int
}
func (m *bcStatusRequestMessage) String() string {
return cmn.Fmt("[bcStatusRequestMessage %v]", m.Height)
}
//-------------------------------------
type bcStatusResponseMessage struct {
Height int
}
func (m *bcStatusResponseMessage) String() string {
return cmn.Fmt("[bcStatusResponseMessage %v]", m.Height)
}