mirror of https://github.com/certusone/kb.git
Import knowledge base
This commit is contained in:
commit
cdd8111364
|
@ -0,0 +1 @@
|
|||
build
|
|
@ -0,0 +1,20 @@
|
|||
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line. SPHINXOPTS and
# SPHINXBUILD use ?= so that values exported in the environment are
# honoured as well (a plain = would silently override them).
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SPHINXPROJ    = CertusOneKnowledgeBase
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# "Makefile" is listed as phony so the catch-all rule below never tries to
# rebuild the Makefile itself through sphinx-build.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
|
@ -0,0 +1,21 @@
|
|||
#!/bin/bash
# Makes sure the Sphinx tooling is installed, patches a packaging problem,
# then runs sphinx-autobuild for source/ -> build/html on 127.0.0.1:8080.
# Any extra command-line arguments are passed through to sphinx-autobuild.
set -euo pipefail

# ensure_package NAME
#   Installs the RPM package NAME via pkcon unless `rpm -q` reports that it
#   is already installed. The argument is quoted so that it is always passed
#   as a single word.
function ensure_package() {
    if ! rpm -q --quiet "$1"; then
        pkcon -y install "$1"
    fi
}

ensure_package python3-sphinx
ensure_package python3-sphinx-autobuild
ensure_package python3-sphinx-theme-alabaster
ensure_package python3-port-for

# Fix for broken Fedora package: drop the port_for line from
# sphinx-autobuild's requires.txt if it is present.
# "python3.*" (rather than "python3.?") also matches two-digit minor
# versions such as python3.10. $PKDEF is deliberately left unquoted below so
# that the shell expands the glob.
PKDEF=/usr/lib/python3.*/site-packages/sphinx_autobuild-*.egg-info/requires.txt
if grep -q port_for $PKDEF; then
    sudo sed -i '/port_for/d' $PKDEF
fi

sphinx-autobuild-3 -p 8080 -H 127.0.0.1 source build/html -n "$@"
|
|
@ -0,0 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="WEB_MODULE" version="4">
|
||||
<component name="NewModuleRootManager" inherit-compiler-output="true">
|
||||
<exclude-output />
|
||||
<content url="file://$MODULE_DIR$" />
|
||||
<orderEntry type="inheritedJdk" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
|
@ -0,0 +1,36 @@
|
|||
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

REM Use SPHINXBUILD from the environment when set, else fall back to
REM sphinx-build on PATH.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
set SPHINXPROJ=CertusOneKnowledgeBase

REM Probe for sphinx-build before dispatching, so that running this script
REM without an argument (the help case) also reports a missing installation.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	REM Undo the pushd above before leaving with an error.
	popd
	exit /b 1
)

if "%1" == "" goto help

REM The trailing O variable is a shortcut for extra sphinx-build options,
REM matching the O variable accepted by the Makefile.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
|
|
@ -0,0 +1,171 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Certus One Knowledge Base documentation build configuration file, created by
# sphinx-quickstart on Tue Sep 25 07:43:02 2018.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ['sphinx.ext.todo',
              'sphinx.ext.ifconfig',
              'sphinx.ext.githubpages']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'Certus One Knowledge Base'
copyright = '2018, Certus One GmbH'
author = 'Certus One GmbH'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = ''
# The full version, including alpha/beta/rc tags.
release = ''

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
#
# Set explicitly to 'en': newer Sphinx releases warn about `language = None`,
# while 'en' is accepted by old and new versions alike.
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path.
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = True


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# This is required for the alabaster theme
# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
html_sidebars = {
    '**': [
        'relations.html',  # needs 'show_related': True theme option to display
        'searchbox.html',
    ]
}


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'CertusOneKnowledgeBasedoc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'CertusOneKnowledgeBase.tex', 'Certus One Knowledge Base Documentation',
     'Certus One GmbH', 'manual'),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'certusoneknowledgebase', 'Certus One Knowledge Base Documentation',
     [author], 1)
]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'CertusOneKnowledgeBase', 'Certus One Knowledge Base Documentation',
     author, 'CertusOneKnowledgeBase', 'One line description of project.',
     'Miscellaneous'),
]
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
.. Certus One Knowledge Base documentation master file, created by
|
||||
sphinx-quickstart on Tue Sep 25 07:43:02 2018.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
Validator Operations Guide
|
||||
==========================
|
||||
|
||||
This guide is a living document which details a set of best practices for
|
||||
running a validator service, as implemented by Certus One. Running a validator
|
||||
puts a much greater emphasis on technical correctness, sound systems architecture,
|
||||
security, and overall operational excellence.
|
||||
|
||||
The aim of this document is to provide a baseline for validator operation, both to
|
||||
make it easier for new validators to get started, and to provide input to other
|
||||
teams. We believe that collaboration and openness strongly benefit the overall
|
||||
ecosystem - the more well-run validators there are, the more resilient the
|
||||
network will be.
|
||||
|
||||
While this document's focus is running a `Cosmos`_ validator, most of its
|
||||
content is applicable to operating any highly available, distributed service.
|
||||
|
||||
The document's source code is available on `GitHub`_.
|
||||
Contributions are greatly appreciated.
|
||||
|
||||
While it's hard to provide an implementation that fits all use cases, we usually
|
||||
provide reference implementations which implement our guidelines.
|
||||
|
||||
.. _GitHub: https://github.com/certusone/kb
|
||||
.. _Cosmos: https://cosmos.network
|
||||
|
||||
Contents
|
||||
++++++++
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:caption: Contents:
|
||||
|
||||
monitoring
|
|
@ -0,0 +1,83 @@
|
|||
Monitoring
|
||||
==========
|
||||
|
||||
Monitoring is an integral part of providing highly available services. By monitoring, we mean:
|
||||
|
||||
1. Instrumenting your applications and servers and collecting as many useful **metrics** as
|
||||
possible. Having detailed metrics is invaluable for debugging, incident response and
|
||||
performance analysis (observability - you can't improve what you can't measure).
|
||||
\
|
||||
|
||||
2. Generating on-call **alerts** when a production system breaks and needs fixing.
|
||||
\
|
||||
|
||||
3. Handling one-time **events** which need attention.
|
||||
|
||||
Traditionally, most teams used different systems for metrics and alerting, with a time-series
|
||||
database like Graphite for metrics and a check-based monitoring system like Nagios for alerts.
|
||||
Nowadays, a different paradigm is getting popular - **metrics-based alerting**. Instead of having
|
||||
separate tooling for instrumentation and monitoring, alerts are realized by configuring rules
|
||||
that are evaluated against collected metrics.
|
||||
|
||||
This is a very powerful approach, since it radically simplifies the monitoring stack and allows
|
||||
for sophisticated queries over time ranges and groups of metrics.
|
||||
|
||||
Modern monitoring tools like Prometheus make metrics-based alerting very approachable.
|
||||
Gone are the days of fiddling with Nagios performance and check scripts!
|
||||
|
||||
However, no tool can solve the hardest part about monitoring - figuring out what to monitor and
|
||||
when to alert. As any on-call engineer can attest, the most common failure mode in alerting is
|
||||
having *too many* alerts, rather than too few, with important alerts getting lost in the noise.
|
||||
|
||||
Therefore, the main goal of a good alerting system is a **high signal-to-noise ratio** -
|
||||
every alert should be *relevant* (impact a production system in a way that requires
|
||||
immediate attention), and *actionable* (require human input to resolve). Any condition that isn't
|
||||
both relevant and actionable must not result in an alert.
|
||||
|
||||
You also need to figure out whom to alert - also called **on-call rotation**, how to compensate
|
||||
for it, how to fairly distribute the load in the team, how to effectively communicate during an
|
||||
incident, and handle the feedback loop (we'll dedicate a separate article to this topic).
|
||||
|
||||
A low number of meaningful, simple alerts paired with robust business procedures which ensure that
|
||||
alerts are acted upon, false positives are quickly eliminated and outages are followed up on are much
|
||||
more powerful than "magic" approaches like anomaly detection, which sound good in theory, but don't
|
||||
work well in practice [#anomaly]_.
|
||||
|
||||
.. [#anomaly] Anomaly detection - while useful in other contexts - tends to quickly break down for
|
||||
monitoring use cases. As soon as you compare a sufficiently large number of metrics, spurious
|
||||
correlations will show up and result in false positives. Your online shop saw a 10x traffic
|
||||
spike, but didn't go down - why would you page someone? Everything is working fine.
|
||||
Simple time offsets (do we see less traffic than last week?), linear extrapolation and
|
||||
even fixed thresholds are much more specific.
|
||||
|
||||
Symptoms-based alerting
|
||||
+++++++++++++++++++++++
|
||||
|
||||
For on-call alerts, you want to strictly limit the number of pages you send out in order to
|
||||
maintain a high signal-to-noise ratio. Every page interrupts the on-call engineer's workflow, and in
|
||||
the worst case, it wakes them up at night.
|
||||
|
||||
In order to do this, you want to alert on symptoms *as far up the stack as possible*. Instead of
|
||||
having an alert that says "one of our MySQL database servers is down", you want "the website error
|
||||
rate went up", which catches a huge number of potential issues, whereas a redundant database server
|
||||
going down may not have any impact whatsoever.
|
||||
|
||||
Once you've been woken up, you can then use your detailed metrics and dashboards to narrow down
|
||||
the cause of the outage, but there's no point in alerting on them.
|
||||
|
||||
Exceptions to this are cases where you can reliably extrapolate - like "this disk *will* fill up in
|
||||
30 minutes, and it *will* take down our service unless it's fixed".
|
||||
|
||||
Low-severity alerts
|
||||
-------------------
|
||||
|
||||
There are many conditions that need to be taken care of, just not *immediately*. Your redundant database
|
||||
server went down and nobody got paged - great! But, you still need someone to fix that database
|
||||
server, or clean up the log partition that will soon fill up, or refill your coffee maker (you're
|
||||
monitoring your coffee maker, right? It's critical infrastructure!).
|
||||
|
||||
A common approach is to have a separate, **low-severity notification channel** that won't wake anyone
|
||||
up, but still ensures that the issue is resolved. We recommend a channel in your favorite business
|
||||
messaging application, *plus* a dashboard which shows outstanding alerts (the dashboard is really
|
||||
important, since it ensures that alerts are acted upon - it's basically a to-do list).
|
||||
|
Loading…
Reference in New Issue