node: switch to use recovery and cleanly restart watcher
parent 5d8072e3da
commit 4ddeca4dbd
@@ -1136,7 +1136,7 @@ func runNode(cmd *cobra.Command, args []string) {
 		readiness.RegisterComponent(common.ReadinessNearSyncing)
 		chainObsvReqC[vaa.ChainIDNear] = make(chan *gossipv1.ObservationRequest, observationRequestBufferSize)
 		if err := supervisor.Run(ctx, "nearwatch",
-			near.NewWatcher(*nearRPC, *nearContract, lockC, chainObsvReqC[vaa.ChainIDNear], !(*unsafeDevMode || *testnetMode)).Run); err != nil {
+			common.WrapWithScissors(near.NewWatcher(*nearRPC, *nearContract, lockC, chainObsvReqC[vaa.ChainIDNear], !(*unsafeDevMode || *testnetMode)).Run)); err != nil {
 			return err
 		}
 	}
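Context for the one-line change above: supervisor.Run restarts a Runnable whenever it returns an error, but a panic inside the watcher would previously escape the runnable and take down the whole guardiand process. common.WrapWithScissors (added below) converts such a panic into an ordinary error return, so the supervisor can cleanly restart the NEAR watcher instead. A small self-contained sketch of that restart behaviour; restartLoop merely stands in for the real supervisor package, and all names here are illustrative:

package main

import (
    "context"
    "fmt"
)

type runnable func(ctx context.Context) error

// wrapRecover turns a panic inside r into an error return, which is the
// mechanism WrapWithScissors provides for supervisor.Runnable values.
func wrapRecover(r runnable) runnable {
    return func(ctx context.Context) (err error) {
        defer func() {
            if rec := recover(); rec != nil {
                err = fmt.Errorf("recovered: %v", rec)
            }
        }()
        return r(ctx)
    }
}

// restartLoop stands in for the supervisor: restart the runnable when it fails.
func restartLoop(ctx context.Context, r runnable) {
    for attempt := 1; attempt <= 3; attempt++ {
        if err := r(ctx); err != nil {
            fmt.Printf("attempt %d failed, restarting: %v\n", attempt, err)
            continue
        }
        fmt.Println("runnable exited cleanly")
        return
    }
}

func main() {
    calls := 0
    watcherLike := wrapRecover(func(ctx context.Context) error {
        calls++
        if calls < 3 {
            panic("simulated watcher crash") // without the wrapper this would kill the process
        }
        return nil
    })
    restartLoop(context.Background(), watcherLike)
}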
@@ -0,0 +1,66 @@
+package common
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/certusone/wormhole/node/pkg/supervisor"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+)
+
+var (
+	ScissorsErrors = promauto.NewCounter(
+		prometheus.CounterOpts{
+			Name: "scissor_errors_caught",
+			Help: "Total number of unhandled errors caught",
+		})
+)
+
+// RunWithScissors starts the runnable in a goroutine and recovers from any panic, sending it as an error on the given error channel.
+func RunWithScissors(ctx context.Context, errC chan error, name string, runnable supervisor.Runnable) {
+	go func() {
+		defer func() {
+			if r := recover(); r != nil {
+				switch x := r.(type) {
+				case error:
+					errC <- fmt.Errorf("%s: %w", name, x)
+				default:
+					errC <- fmt.Errorf("%s: %v", name, x)
+				}
+				ScissorsErrors.Inc()
+			}
+		}()
+		err := runnable(ctx)
+		if err != nil {
+			errC <- err
+		}
+	}()
+}
+
+type (
+	Scissors struct {
+		runnable supervisor.Runnable
+	}
+)
+
+func WrapWithScissors(runnable supervisor.Runnable) supervisor.Runnable {
+	s := Scissors{runnable: runnable}
+	return s.Run
+}
+
+func (e *Scissors) Run(ctx context.Context) (result error) {
+	defer func() {
+		if r := recover(); r != nil {
+			switch x := r.(type) {
+			case error:
+				result = x
+			default:
+				result = fmt.Errorf("%v", x)
+			}
+			ScissorsErrors.Inc()
+		}
+	}()
+
+	return e.runnable(ctx)
+}
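A short sketch of how a caller consumes the two entry points; the package name, pollOnce, and "poller" below are illustrative, and the import path for the common package is assumed from the module layout seen elsewhere in this commit. RunWithScissors is for worker goroutines that report into a shared error channel, WrapWithScissors is for a Runnable handed straight to supervisor.Run:

package example

import (
    "context"

    "github.com/certusone/wormhole/node/pkg/common"
    "github.com/certusone/wormhole/node/pkg/supervisor"
)

// pollOnce stands in for a watcher sub-task.
func pollOnce(ctx context.Context) error {
    var m map[string]int
    m["boom"] = 1 // panics: assignment to entry in nil map
    return nil
}

func startWorkers(ctx context.Context, errC chan error) {
    // A panic inside pollOnce is recovered and arrives on errC as
    // "poller: assignment to entry in nil map".
    common.RunWithScissors(ctx, errC, "poller", pollOnce)
}

func startUnderSupervisor(ctx context.Context) error {
    // Same protection for a runnable managed by the supervisor: the panic
    // becomes an error return, so the supervisor restarts it.
    return supervisor.Run(ctx, "poller", common.WrapWithScissors(pollOnce))
}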
@@ -0,0 +1,80 @@
+package common
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func throwNil(ctx context.Context) error {
+	var x *int = nil
+	*x = 5
+	return nil
+}
+
+func runTest(t *testing.T, ctx context.Context, testCase int) (result error) {
+	t.Helper()
+
+	defer func() {
+		if r := recover(); r != nil {
+			switch x := r.(type) {
+			case string:
+				result = errors.New(x)
+			case error:
+				result = x
+			default:
+				result = fmt.Errorf("unknown panic in runTest/%d", testCase)
+			}
+		}
+	}()
+
+	errC := make(chan error)
+
+	switch testCase {
+	case 0:
+		_ = throwNil(ctx) // fall into defer above
+	case 1:
+		RunWithScissors(ctx, errC, "test1Thread", throwNil)
+	case 2:
+		RunWithScissors(ctx, errC, "test2Thread", func(ctx context.Context) error {
+			<-ctx.Done()
+			return nil
+		})
+		_ = throwNil(ctx)
+
+	case 3:
+		go func() { _ = throwNil(ctx) }() // uncatchable
+	}
+
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case err := <-errC:
+		return err
+	}
+}
+
+func TestSupervisor(t *testing.T) {
+	for i := 0; i < 3; i++ {
+		t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
+			rootCtx := context.Background()
+			ctx, fn := context.WithCancel(rootCtx)
+
+			err := runTest(t, ctx, i)
+
+			switch i {
+			case 0:
+				assert.EqualError(t, err, "runtime error: invalid memory address or nil pointer dereference")
+			case 1:
+				assert.EqualError(t, err, "test1Thread: runtime error: invalid memory address or nil pointer dereference")
+			case 2:
+				assert.EqualError(t, err, "runtime error: invalid memory address or nil pointer dereference")
+			}
+			fn()
+		},
+		)
+	}
+}
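One detail worth calling out: the loop in TestSupervisor only exercises cases 0 through 2; case 3 (marked "uncatchable") is left out because a panic in a plain go func() can only be recovered by a deferred recover inside that same goroutine, which is exactly why RunWithScissors installs its recover inside the goroutine it spawns. A tiny self-contained illustration of that scoping rule (not part of the commit):

package main

import "fmt"

func main() {
    defer func() {
        // This recover only covers panics on the main goroutine.
        if r := recover(); r != nil {
            fmt.Println("recovered in main:", r)
        }
    }()

    done := make(chan struct{})
    go func() {
        defer close(done)
        defer func() {
            // Each goroutine needs its own recover; this one catches the panic below.
            if r := recover(); r != nil {
                fmt.Println("recovered inside goroutine:", r)
            }
        }()
        panic("worker panic")
    }()
    <-done
}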
@@ -79,6 +79,9 @@ type (
 		eventChanTxProcessedDuration chan time.Duration
 		eventChan                    chan eventType // whenever a message is confirmed, post true in here
 
+		// Error channel
+		errC chan error
+
 		// sub-components
 		finalizer Finalizer
 		nearAPI   nearapi.NearApi
@@ -129,8 +132,6 @@ func (e *Watcher) runBlockPoll(ctx context.Context) error {
 
 	highestFinalBlockHeightObserved := finalBlock.Header.Height - 1 // minus one because we still want to process this block, just no blocks before it
 
-	supervisor.Signal(ctx, supervisor.SignalHealthy)
-
 	timer := time.NewTimer(time.Nanosecond) // this is just for the first iteration.
 
 	for {
@@ -180,8 +181,6 @@ func (e *Watcher) runChunkFetcher(ctx context.Context) error {
 func (e *Watcher) runObsvReqProcessor(ctx context.Context) error {
 	logger := supervisor.Logger(ctx)
 
-	supervisor.Signal(ctx, supervisor.SignalHealthy)
-
 	for {
 		select {
 		case <-ctx.Done():
@@ -207,7 +206,7 @@ func (e *Watcher) runObsvReqProcessor(ctx context.Context) error {
 
 func (e *Watcher) runTxProcessor(ctx context.Context) error {
 	logger := supervisor.Logger(ctx)
-	supervisor.Signal(ctx, supervisor.SignalHealthy)
+
 	for {
 		select {
 		case <-ctx.Done():
@@ -252,6 +251,8 @@ func (e *Watcher) runTxProcessor(ctx context.Context) error {
 
 func (e *Watcher) Run(ctx context.Context) error {
 	logger := supervisor.Logger(ctx)
 
+	e.errC = make(chan error)
+
 	e.nearAPI = nearapi.NewNearApiImpl(nearapi.NewHttpNearRpc(e.nearRPC))
 	e.finalizer = newFinalizer(e.eventChan, e.nearAPI, e.mainnet)
@@ -262,61 +263,52 @@ func (e *Watcher) Run(ctx context.Context) error {
 	logger.Info("Near watcher connecting to RPC node ", zap.String("url", e.nearRPC))
 
 	// start metrics reporter
-	err := supervisor.Run(ctx, "metrics", e.runMetrics)
-	if err != nil {
-		return err
-	}
+	common.RunWithScissors(ctx, e.errC, "metrics", e.runMetrics)
 	// start one poller
-	err = supervisor.Run(ctx, "blockPoll", e.runBlockPoll)
-	if err != nil {
-		return err
-	}
+	common.RunWithScissors(ctx, e.errC, "blockPoll", e.runBlockPoll)
 	// start one obsvReqC runner
-	err = supervisor.Run(ctx, "obsvReqProcessor", e.runObsvReqProcessor)
-	if err != nil {
-		return err
-	}
+	common.RunWithScissors(ctx, e.errC, "obsvReqProcessor", e.runObsvReqProcessor)
 	// start `workerCount` many chunkFetcher runners
 	for i := 0; i < workerChunkFetching; i++ {
-		err := supervisor.Run(ctx, fmt.Sprintf("chunk_fetcher_%d", i), e.runChunkFetcher)
-		if err != nil {
-			return err
-		}
+		common.RunWithScissors(ctx, e.errC, fmt.Sprintf("chunk_fetcher_%d", i), e.runChunkFetcher)
 	}
 	// start `workerCount` many transactionProcessing runners
 	for i := 0; i < workerCountTxProcessing; i++ {
-		err := supervisor.Run(ctx, fmt.Sprintf("txProcessor_%d", i), e.runTxProcessor)
-		if err != nil {
-			return err
-		}
+		common.RunWithScissors(ctx, e.errC, fmt.Sprintf("txProcessor_%d", i), e.runTxProcessor)
 	}
 
 	supervisor.Signal(ctx, supervisor.SignalHealthy)
 
-	<-ctx.Done()
-	return ctx.Err()
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case err := <-e.errC:
+		return err
+	}
 }
 
 // schedule pushes a job to workers after delay. It is context aware and will not execute the job if the context
 // is cancelled before delay has passed and the job is picked up by a worker.
 func (e *Watcher) schedule(ctx context.Context, job *transactionProcessingJob, delay time.Duration) {
-	go func() {
-		timer := time.NewTimer(delay)
-		defer timer.Stop()
+	common.RunWithScissors(ctx, e.errC, "scheduledThread",
+		func(ctx context.Context) error {
+			timer := time.NewTimer(delay)
+			defer timer.Stop()
 
 			e.transactionProcessingQueueCounter.Add(1)
 			defer e.transactionProcessingQueueCounter.Add(-1)
 
-		select {
-		case <-ctx.Done():
-			return
-		case <-timer.C:
-			// Don't block on processing if the context is cancelled
-			select {
-			case <-ctx.Done():
-				return
-			case e.transactionProcessingQueue <- job:
-			}
-		}
-	}()
+			select {
+			case <-ctx.Done():
+				return nil
+			case <-timer.C:
+				// Don't block on processing if the context is cancelled
+				select {
+				case <-ctx.Done():
+					return nil
+				case e.transactionProcessingQueue <- job:
+				}
+			}
+			return nil
+		})
 }
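One idiom in schedule worth spelling out: the nested selects make both waits cancellable. The outer select waits for either the delay timer or cancellation; once the timer fires, the inner select ensures the send into transactionProcessingQueue does not block forever if the queue is full and the context is cancelled in the meantime. The same idiom in isolation, as a runnable toy with illustrative names (not watcher code):

package main

import (
    "context"
    "fmt"
    "time"
)

// enqueueAfter waits for delay, then tries to hand job to the queue, giving up
// cleanly if ctx is cancelled during either wait.
func enqueueAfter(ctx context.Context, queue chan<- string, job string, delay time.Duration) error {
    timer := time.NewTimer(delay)
    defer timer.Stop()

    select {
    case <-ctx.Done():
        return ctx.Err() // cancelled while waiting for the delay
    case <-timer.C:
        select {
        case <-ctx.Done():
            return ctx.Err() // cancelled while the queue was full
        case queue <- job:
            return nil
        }
    }
}

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
    defer cancel()

    queue := make(chan string) // unbuffered and never drained: the send would block
    err := enqueueAfter(ctx, queue, "job-1", 10*time.Millisecond)
    fmt.Println("enqueueAfter:", err) // prints the context deadline error
}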