From fcd13e7d902d9910c57f03bc7145ad659d81cde7 Mon Sep 17 00:00:00 2001 From: Tyler Smith Date: Tue, 5 May 2020 22:44:45 -0700 Subject: [PATCH] FEATURE: Add Health API. --- api/health/checks.go | 74 ++++++++++++++++++++++++++++++ api/health/service.go | 79 ++++++++++++++++++++++++++++++++ main/params.go | 1 + networking/handshake_handlers.go | 18 ++++++++ node/config.go | 1 + node/node.go | 20 +++++++- 6 files changed, 191 insertions(+), 2 deletions(-) create mode 100644 api/health/checks.go create mode 100644 api/health/service.go diff --git a/api/health/checks.go b/api/health/checks.go new file mode 100644 index 0000000..5a6a569 --- /dev/null +++ b/api/health/checks.go @@ -0,0 +1,74 @@ +// (c) 2020, Ava Labs, Inc. All rights reserved. +// See the file LICENSE for licensing terms. + +package health + +import ( + "errors" + "time" +) + +var ( + // ErrHeartbeatNotDetected is returned from a HeartbeatCheckFn when the + // heartbeat has not been detected recently enough + ErrHeartbeatNotDetected = errors.New("heartbeat not detected") +) + +// CheckFn returns optional status information and an error indicating health or +// non-health +type CheckFn func() (interface{}, error) + +// Check defines a single health check that we want to monitor and consider as +// part of our wider healthiness +type Check struct { + // Name is the identifier for this check and must be unique among all Checks + Name string + + // CheckFn is the function to call to perform the health check + CheckFn CheckFn + + // ExecutionPeriod is the duration to wait between executions of this Check + ExecutionPeriod time.Duration + + // InitialDelay is the duration to wait before executing the first time + InitialDelay time.Duration + + // InitiallyPassing is whether or not to consider the Check healthy before the + // initial execution + InitiallyPassing bool +} + +// gosundheitCheck implements the health.Check interface backed by a CheckFn +type gosundheitCheck struct { + name string + checkFn 
CheckFn +} + +// Name implements the health.Check interface by returning a unique name +func (c gosundheitCheck) Name() string { return c.name } + +// Execute implements the health.Check interface by executing the checkFn and +// returning the results +func (c gosundheitCheck) Execute() (interface{}, error) { return c.checkFn() } + +// heartbeater provides a getter to the most recently observed heartbeat +type heartbeater interface { + GetHeartbeat() time.Time +} + +// HeartbeatCheckFn returns a CheckFn that checks the given heartbeater has +// pulsed within the given duration +func HeartbeatCheckFn(hb heartbeater, max time.Duration) CheckFn { + return func() (data interface{}, err error) { + // Get the heartbeat and create a data set to return to the caller + hb := hb.GetHeartbeat() + data = map[string]int64{"heartbeat": hb.UTC().Unix()} + + // If the current time is after the last known heartbeat + the limit then + // mark our check as failed + if hb.Add(max).Before(time.Now()) { + err = ErrHeartbeatNotDetected + } + return data, err + } +} diff --git a/api/health/service.go b/api/health/service.go new file mode 100644 index 0000000..c1f0e40 --- /dev/null +++ b/api/health/service.go @@ -0,0 +1,79 @@ +// (c) 2020, Ava Labs, Inc. All rights reserved. +// See the file LICENSE for licensing terms. 
+ +package health + +import ( + "net/http" + "time" + + "github.com/AppsFlyer/go-sundheit" + "github.com/ava-labs/gecko/snow/engine/common" + "github.com/ava-labs/gecko/utils/json" + "github.com/ava-labs/gecko/utils/logging" + "github.com/gorilla/rpc/v2" +) + +// defaultCheckOpts is a Check whose properties represent a default Check +var defaultCheckOpts = Check{ExecutionPeriod: time.Minute} + +// Health observes a set of vital signs and makes them available over RPC +type Health struct { + log logging.Logger + health health.Health +} + +// NewService creates a new Health service +func NewService(log logging.Logger) *Health { + return &Health{log, health.New()} +} + +// Handler returns an HTTPHandler providing RPC access to the Health service +func (h *Health) Handler() *common.HTTPHandler { + newServer := rpc.NewServer() + codec := json.NewCodec() + newServer.RegisterCodec(codec, "application/json") + newServer.RegisterCodec(codec, "application/json;charset=UTF-8") + newServer.RegisterService(h, "health") + return &common.HTTPHandler{LockOptions: common.NoLock, Handler: newServer} +} + +// RegisterHeartbeat adds a check with default options and a CheckFn that checks +// the given heartbeater for a recent heartbeat +func (h *Health) RegisterHeartbeat(name string, hb heartbeater, max time.Duration) error { + return h.RegisterCheckFunc(name, HeartbeatCheckFn(hb, max)) +} + +// RegisterCheckFunc adds a Check with default options and the given CheckFn +func (h *Health) RegisterCheckFunc(name string, checkFn CheckFn) error { + check := defaultCheckOpts + check.Name = name + check.CheckFn = checkFn + return h.RegisterCheck(check) +} + +// RegisterCheck adds the given Check +func (h *Health) RegisterCheck(c Check) error { + return h.health.RegisterCheck(&health.Config{ + InitialDelay: c.InitialDelay, + ExecutionPeriod: c.ExecutionPeriod, + InitiallyPassing: c.InitiallyPassing, + Check: gosundheitCheck{c.Name, c.CheckFn}, + }) +} + +// GetHealthArgs are the arguments for GetHealth +type 
GetHealthArgs struct{} + +// GetHealthReply is the response for GetHealth +type GetHealthReply struct { + Checks map[string]health.Result `json:"checks"` + Healthy bool `json:"healthy"` +} + +// GetHealth returns a summary of the health of the node +func (service *Health) GetHealth(_ *http.Request, _ *GetHealthArgs, reply *GetHealthReply) error { + service.log.Debug("Health: GetHealth called") + reply.Checks, reply.Healthy = service.health.Results() + return nil +} diff --git a/main/params.go b/main/params.go index c18a937..c4c2d8a 100644 --- a/main/params.go +++ b/main/params.go @@ -128,6 +128,7 @@ func init() { fs.BoolVar(&Config.AdminAPIEnabled, "api-admin-enabled", true, "If true, this node exposes the Admin API") fs.BoolVar(&Config.KeystoreAPIEnabled, "api-keystore-enabled", true, "If true, this node exposes the Keystore API") fs.BoolVar(&Config.MetricsAPIEnabled, "api-metrics-enabled", true, "If true, this node exposes the Metrics API") + fs.BoolVar(&Config.HealthAPIEnabled, "api-health-enabled", true, "If true, this node exposes the Health API") fs.BoolVar(&Config.IPCEnabled, "api-ipcs-enabled", false, "If true, IPCs can be opened") // Throughput Server diff --git a/networking/handshake_handlers.go b/networking/handshake_handlers.go index 554f4fb..2fa7645 100644 --- a/networking/handshake_handlers.go +++ b/networking/handshake_handlers.go @@ -147,6 +147,8 @@ type Handshake struct { // If any chain is blocked on connecting to peers, track these blockers here awaitingLock sync.Mutex awaiting []*networking.AwaitingConnections + + lastHeartbeat time.Time } // Initialize to the c networking library. 
This should only be done once during @@ -201,6 +203,8 @@ func (nm *Handshake) Initialize( net.RegHandler(Version, salticidae.MsgNetworkMsgCallback(C.version), nil) net.RegHandler(GetPeerList, salticidae.MsgNetworkMsgCallback(C.getPeerList), nil) net.RegHandler(PeerList, salticidae.MsgNetworkMsgCallback(C.peerList), nil) + + nm.heartbeat() } // ConnectTo add the peer as a connection and connects to them. @@ -593,6 +597,16 @@ func (nm *Handshake) checkCompatibility(peerVersion string) bool { return true } +// heartbeat registers a new heartbeat to signal liveness +func (nm *Handshake) heartbeat() { + nm.lastHeartbeat = nm.clock.Time() +} + +// GetHeartbeat returns the most recent heartbeat time +func (nm *Handshake) GetHeartbeat() time.Time { + return nm.lastHeartbeat +} + // peerHandler notifies a change to the set of connected peers // connected is true if a new peer is connected // connected is false if a formerly connected peer has disconnected @@ -667,6 +681,7 @@ func pong(*C.struct_msg_t, *C.struct_msgnetwork_conn_t, unsafe.Pointer) {} //export getVersion func getVersion(_msg *C.struct_msg_t, _conn *C.struct_msgnetwork_conn_t, _ unsafe.Pointer) { HandshakeNet.numGetVersionReceived.Inc() + HandshakeNet.heartbeat() conn := salticidae.PeerNetworkConnFromC(salticidae.CPeerNetworkConn(_conn)) peer := conn.GetPeerID(false) @@ -679,6 +694,7 @@ func getVersion(_msg *C.struct_msg_t, _conn *C.struct_msgnetwork_conn_t, _ unsaf //export version func version(_msg *C.struct_msg_t, _conn *C.struct_msgnetwork_conn_t, _ unsafe.Pointer) { HandshakeNet.numVersionReceived.Inc() + HandshakeNet.heartbeat() msg := salticidae.MsgFromC(salticidae.CMsg(_msg)) conn := salticidae.PeerNetworkConnFromC(salticidae.CPeerNetworkConn(_conn)) @@ -763,6 +779,7 @@ func version(_msg *C.struct_msg_t, _conn *C.struct_msgnetwork_conn_t, _ unsafe.P //export getPeerList func getPeerList(_ *C.struct_msg_t, _conn *C.struct_msgnetwork_conn_t, _ unsafe.Pointer) { HandshakeNet.numGetPeerlistReceived.Inc() + 
HandshakeNet.heartbeat() conn := salticidae.PeerNetworkConnFromC(salticidae.CPeerNetworkConn(_conn)) peer := conn.GetPeerID(false) @@ -775,6 +792,7 @@ func getPeerList(_ *C.struct_msg_t, _conn *C.struct_msgnetwork_conn_t, _ unsafe. //export peerList func peerList(_msg *C.struct_msg_t, _conn *C.struct_msgnetwork_conn_t, _ unsafe.Pointer) { HandshakeNet.numPeerlistReceived.Inc() + HandshakeNet.heartbeat() msg := salticidae.MsgFromC(salticidae.CMsg(_msg)) build := Builder{} diff --git a/node/config.go b/node/config.go index b35d997..9612746 100644 --- a/node/config.go +++ b/node/config.go @@ -51,6 +51,7 @@ type Config struct { AdminAPIEnabled bool KeystoreAPIEnabled bool MetricsAPIEnabled bool + HealthAPIEnabled bool // Logging configuration LoggingConfig logging.Config diff --git a/node/node.go b/node/node.go index 289de18..424f044 100644 --- a/node/node.go +++ b/node/node.go @@ -17,12 +17,14 @@ import ( "os" "path" "sync" + "time" "unsafe" "github.com/ava-labs/salticidae-go" "github.com/ava-labs/gecko/api" "github.com/ava-labs/gecko/api/admin" + "github.com/ava-labs/gecko/api/health" "github.com/ava-labs/gecko/api/ipcs" "github.com/ava-labs/gecko/api/keystore" "github.com/ava-labs/gecko/api/metrics" @@ -559,6 +561,19 @@ func (n *Node) initAdminAPI() { } } +// initHealthAPI initializes the Health API service +// Assumes n.Log, n.ValidatorAPI, n.APIServer, and n.HTTPLog already initialized +func (n *Node) initHealthAPI() { + if !n.Config.HealthAPIEnabled { + return + } + + n.Log.Info("initializing Health API") + service := health.NewService(n.Log) + service.RegisterHeartbeat("network.validators.heartbeat", n.ValidatorAPI, 5*time.Minute) + n.APIServer.AddRoute(service.Handler(), &sync.RWMutex{}, "health", "", n.HTTPLog) +} + // initIPCAPI initializes the IPC API service // Assumes n.log and n.chainManager already initialized func (n *Node) initIPCAPI() { @@ -650,8 +665,9 @@ func (n *Node) Initialize(Config *Config, logger logging.Logger, logFactory logg n.initClients() // Set 
up the client servers } - n.initAdminAPI() // Start the Admin API - n.initIPCAPI() // Start the IPC API + n.initAdminAPI() // Start the Admin API + n.initHealthAPI() // Start the Health API + n.initIPCAPI() // Start the IPC API if err := n.initAliases(); err != nil { // Set up aliases return err