Node: RunWithScissors should not hang on error if no listener (#2649)

* Node: RunWithScissors should not hang * Tests should check error text * Fix data race in tests * Add comments to explain the tests.
2023-04-18 09:05:20 -05:00 · 2023-04-18 09:05:20 -05:00 · c8e18ba72c
parent 8a866c3c1d
commit c8e18ba72c
2 changed files with 152 additions and 9 deletions
--- a/node/pkg/common/scissors.go
+++ b/node/pkg/common/scissors.go
@ -10,38 +10,56 @@ import (
 )

 var (
-	ScissorsErrors = promauto.NewCounterVec(
+	ScissorsErrorsCaught = promauto.NewCounterVec( // incremented when a RunWithScissors runnable returns a non-nil error; labeled by runnable name
 		prometheus.CounterOpts{
 			Name: "scissor_errors_caught",
 			Help: "Total number of unhandled errors caught",
-		}, []string{"scissors", "name"})
+		}, []string{"name"})
+	ScissorsPanicsCaught = promauto.NewCounterVec( // incremented when RunWithScissors or WrapWithScissors recovers a panic; labeled by runnable name
+		prometheus.CounterOpts{
+			Name: "scissor_panics_caught",
+			Help: "Total number of panics caught",
+		}, []string{"name"})
 )

 // Start a go routine with recovering from any panic by sending an error to a error channel
 func RunWithScissors(ctx context.Context, errC chan error, name string, runnable supervisor.Runnable) {
-	ScissorsErrors.WithLabelValues("scissors", name).Add(0)
+	ScissorsErrorsCaught.WithLabelValues(name).Add(0) // Add(0) leaves the value unchanged; presumably it pre-creates the labeled series so it is exported as 0 (confirm)
+	ScissorsPanicsCaught.WithLabelValues(name).Add(0)
 	go func() {
 		defer func() {
 			if r := recover(); r != nil {
+				var err error
 				switch x := r.(type) {
 				case error:
-					errC <- fmt.Errorf("%s: %w", name, x)
+					err = fmt.Errorf("%s: %w", name, x) // panic value is an error: wrap it so callers can still unwrap it
 				default:
-					errC <- fmt.Errorf("%s: %v", name, x)
+					err = fmt.Errorf("%s: %v", name, x) // any other panic value is formatted into the message
 				}
-				ScissorsErrors.WithLabelValues("scissors", name).Inc()
+				// We don't want this to hang if the listener has already gone away.
+				select {
+				case errC <- err:
+				default: // no receiver is ready right now: the error is dropped
+				}
+				ScissorsPanicsCaught.WithLabelValues(name).Inc() // NOTE(review): runs after the send attempt, so a receiver on errC can read the counter before this Inc

 			}
 		}()
 		err := runnable(ctx)
 		if err != nil {
-			errC <- err
+			// We don't want this to hang if the listener has already gone away.
+			select {
+			case errC <- err: // the runnable's error is forwarded as-is, without the name prefix
+			default: // no receiver is ready right now: the error is dropped
+			}
+			ScissorsErrorsCaught.WithLabelValues(name).Inc() // NOTE(review): runs after the send attempt, so a receiver on errC can read the counter before this Inc
 		}
 	}()
 }

 func WrapWithScissors(runnable supervisor.Runnable, name string) supervisor.Runnable {
-	ScissorsErrors.WithLabelValues("scissors", name).Add(0)
+	ScissorsErrorsCaught.WithLabelValues(name).Add(0)
+	ScissorsPanicsCaught.WithLabelValues(name).Add(0)
 	return func(ctx context.Context) (result error) {
 		defer func() {
 			if r := recover(); r != nil {
@ -51,7 +69,7 @@ func WrapWithScissors(runnable supervisor.Runnable, name string) supervisor.Runn
 				default:
 					result = fmt.Errorf("%s: %v", name, x)
 				}
-				ScissorsErrors.WithLabelValues("scissors", name).Inc()
+				ScissorsPanicsCaught.WithLabelValues(name).Inc()
 			}
 		}()

--- a/node/pkg/common/scissors_test.go
+++ b/node/pkg/common/scissors_test.go
@ -5,10 +5,23 @@ import (
 	"errors"
 	"fmt"
 	"testing"
+	"time"

 	"github.com/stretchr/testify/assert"
+	"github.com/test-go/testify/require"
+
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
 )

+func getCounterValue(metric *prometheus.CounterVec, runnableName string) float64 { // returns the current value of metric's series labeled runnableName
+	var m = &dto.Metric{}
+	if err := metric.WithLabelValues(runnableName).Write(m); err != nil {
+		return 0 // a failed Write is reported as 0, which callers cannot tell apart from a counter that really is zero
+	}
+	return m.Counter.GetValue()
+}
+
 func throwNil(ctx context.Context) error {
 	var x *int = nil
 	*x = 5
@ -78,3 +91,115 @@ func TestSupervisor(t *testing.T) {
 		)
 	}
 }
+
+func TestRunWithScissorsCleanExit(t *testing.T) { // a runnable that returns nil must leave both the error and the panic counter at zero
+	ctx := context.Background()
+	errC := make(chan error) // unbuffered and never read in this test
+
+	itRan := make(chan bool, 1) // buffered so the runnable's send cannot block
+	RunWithScissors(ctx, errC, "TestRunWithScissorsCleanExit", func(ctx context.Context) error {
+		itRan <- true
+		return nil
+	})
+
+	shouldHaveRun := <-itRan // blocks until the runnable has actually started
+	require.Equal(t, true, shouldHaveRun)
+
+	// Need to wait a bit to make sure the scissors code completes without hanging.
+	time.Sleep(100 * time.Millisecond)
+
+	assert.Equal(t, 0.0, getCounterValue(ScissorsErrorsCaught, "TestRunWithScissorsCleanExit"))
+	assert.Equal(t, 0.0, getCounterValue(ScissorsPanicsCaught, "TestRunWithScissorsCleanExit"))
+}
+
+func TestRunWithScissorsPanicReturned(t *testing.T) { // a panic in the runnable must arrive on errC as "<name>: <panic value>" and be counted as a panic, not an error
+	ctx := context.Background()
+	errC := make(chan error)
+
+	itRan := make(chan bool, 1) // buffered so the runnable's send cannot block
+	RunWithScissors(ctx, errC, "TestRunWithScissorsPanicReturned", func(ctx context.Context) error {
+		itRan <- true
+		panic("Some random panic")
+	})
+
+	var err error
+	select {
+	case <-ctx.Done(): // NOTE(review): ctx is context.Background(), so this case never fires; if the non-blocking send in RunWithScissors is dropped, this select blocks forever
+		break
+	case err = <-errC:
+		break
+	}
+
+	shouldHaveRun := <-itRan
+	require.Equal(t, true, shouldHaveRun)
+	assert.Error(t, err)
+	assert.Equal(t, "TestRunWithScissorsPanicReturned: Some random panic", err.Error())
+	assert.Equal(t, 0.0, getCounterValue(ScissorsErrorsCaught, "TestRunWithScissorsPanicReturned"))
+	assert.Equal(t, 1.0, getCounterValue(ScissorsPanicsCaught, "TestRunWithScissorsPanicReturned")) // NOTE(review): RunWithScissors increments this after its send on errC, so this read may run before the Inc
+}
+
+func TestRunWithScissorsPanicDoesNotBlockWhenNoListener(t *testing.T) { // with nobody receiving on errC, a panic must still be counted instead of blocking the goroutine on the send
+	ctx := context.Background()
+	errC := make(chan error) // unbuffered and never read in this test, so any blocking send would hang
+
+	itRan := make(chan bool, 1) // buffered so the runnable's send cannot block
+	RunWithScissors(ctx, errC, "TestRunWithScissorsPanicDoesNotBlockWhenNoListener", func(ctx context.Context) error {
+		itRan <- true
+		panic("Some random panic")
+	})
+
+	shouldHaveRun := <-itRan // blocks until the runnable has actually started
+	require.Equal(t, true, shouldHaveRun)
+
+	// Need to wait a bit to make sure the scissors code completes without hanging.
+	time.Sleep(100 * time.Millisecond)
+
+	assert.Equal(t, 0.0, getCounterValue(ScissorsErrorsCaught, "TestRunWithScissorsPanicDoesNotBlockWhenNoListener"))
+	assert.Equal(t, 1.0, getCounterValue(ScissorsPanicsCaught, "TestRunWithScissorsPanicDoesNotBlockWhenNoListener")) // the Inc sits after the send attempt, so reaching 1 shows the send did not block
+}
+
+func TestRunWithScissorsErrorReturned(t *testing.T) { // an error returned by the runnable must arrive on errC unchanged (no name prefix) and be counted as an error, not a panic
+	ctx := context.Background()
+	errC := make(chan error)
+
+	itRan := make(chan bool, 1) // buffered so the runnable's send cannot block
+	RunWithScissors(ctx, errC, "TestRunWithScissorsErrorReturned", func(ctx context.Context) error {
+		itRan <- true
+		return fmt.Errorf("Some random error")
+	})
+
+	var err error
+	select {
+	case <-ctx.Done(): // NOTE(review): ctx is context.Background(), so this case never fires; if the non-blocking send in RunWithScissors is dropped, this select blocks forever
+		break
+	case err = <-errC:
+		break
+	}
+
+	shouldHaveRun := <-itRan
+	require.Equal(t, true, shouldHaveRun)
+	assert.Error(t, err)
+	assert.Equal(t, "Some random error", err.Error())
+	assert.Equal(t, 1.0, getCounterValue(ScissorsErrorsCaught, "TestRunWithScissorsErrorReturned")) // NOTE(review): RunWithScissors increments this after its send on errC, so this read may run before the Inc
+	assert.Equal(t, 0.0, getCounterValue(ScissorsPanicsCaught, "TestRunWithScissorsErrorReturned"))
+}
+
+func TestRunWithScissorsErrorDoesNotBlockWhenNoListener(t *testing.T) { // with nobody receiving on errC, a returned error must still be counted instead of blocking the goroutine on the send
+	ctx := context.Background()
+	errC := make(chan error) // unbuffered and never read in this test, so any blocking send would hang
+
+	itRan := make(chan bool, 1) // buffered so the runnable's send cannot block
+	RunWithScissors(ctx, errC, "TestRunWithScissorsErrorDoesNotBlockWhenNoListener", func(ctx context.Context) error {
+		itRan <- true
+		return fmt.Errorf("Some random error")
+	})
+
+	shouldHaveRun := <-itRan // blocks until the runnable has actually started
+	require.Equal(t, true, shouldHaveRun)
+
+	// Need to wait a bit to make sure the scissors code completes without hanging.
+	time.Sleep(100 * time.Millisecond)
+
+	assert.Equal(t, 1.0, getCounterValue(ScissorsErrorsCaught, "TestRunWithScissorsErrorDoesNotBlockWhenNoListener")) // the Inc sits after the send attempt, so reaching 1 shows the send did not block
+	assert.Equal(t, 0.0, getCounterValue(ScissorsPanicsCaught, "TestRunWithScissorsErrorDoesNotBlockWhenNoListener"))
+}