Import knowledge base

2018-09-25 07:43:20 +02:00 · 2018-09-25 07:43:20 +02:00 · cdd8111364
commit cdd8111364
8 changed files with 380 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+build
--- a/20
+++ b/20
@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+# Every target is forwarded to "sphinx-build -M <target>", so the set of
+# valid targets is whatever Sphinx's make mode supports.
+
+# You can set these variables from the command line, and also
+# from the environment for the first two (?= keeps a value that is
+# already defined, matching how make.bat treats SPHINXBUILD).
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SPHINXPROJ    = CertusOneKnowledgeBase
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+# "Makefile" is declared phony so that the match-anything rule below is
+# not applied to this file itself when make checks its makefiles.
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/docserver.sh
+++ b/docserver.sh
@ -0,0 +1,21 @@
+#!/bin/bash
+set -euo pipefail
+
+# Install the RPM package named by $1 via pkcon, unless "rpm -q" reports
+# that it is already installed.
+# The argument is quoted so it is never word-split or glob-expanded.
+function ensure_package() {
+  if ! rpm -q --quiet "$1"; then
+    pkcon -y install "$1"
+  fi
+}
+
+ensure_package python3-sphinx
+ensure_package python3-sphinx-autobuild
+ensure_package python3-sphinx-theme-alabaster
+ensure_package python3-port-for
+
+# Fix for broken Fedora package
+PKDEF=/usr/lib/python3.?/site-packages/sphinx_autobuild-*.egg-info/requires.txt
+if grep -q port_for $PKDEF; then
+  sudo sed -i '/port_for/d' $PKDEF
+fi 
+
+sphinx-autobuild-3 -p 8080 -H 127.0.0.1 source build/html -n "$@"
--- a/kb.iml
+++ b/kb.iml
@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="WEB_MODULE" version="4">
+  <component name="NewModuleRootManager" inherit-compiler-output="true">
+    <exclude-output />
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
--- a/make.bat
+++ b/make.bat
@ -0,0 +1,36 @@
+@ECHO OFF
+
+REM Run from the directory that contains this script; undone by popd below.
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+REM Keep a SPHINXBUILD value that is already set, else use sphinx-build.
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+set SPHINXPROJ=CertusOneKnowledgeBase
+
+REM Probe for sphinx-build before dispatching any target, so that the help
+REM target also gets the friendly message below when Sphinx is missing.
+REM The check treats an errorlevel of 9009 or above as command not found.
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	popd
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+REM The O variable is a shortcut for extra Sphinx options, like in the Makefile.
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/source/conf.py
+++ b/source/conf.py
@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Certus One Knowledge Base documentation build configuration file, created by
+# sphinx-quickstart on Tue Sep 25 07:43:02 2018.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['sphinx.ext.todo',
+    'sphinx.ext.ifconfig',
+    'sphinx.ext.githubpages']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffixes as a list of strings:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document. This name is reused as the start file in the
+# LaTeX, manual page and Texinfo tables further down in this file.
+master_doc = 'index'
+
+# General information about the project. `author` is reused by the manual
+# page and Texinfo tables further down in this file.
+project = 'Certus One Knowledge Base'
+copyright = '2018, Certus One GmbH'
+author = 'Certus One GmbH'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents. Both values are left empty here.
+#
+# The short X.Y version.
+version = ''
+# The full version, including alpha/beta/rc tags.
+release = ''
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path.
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+# (Option of the 'sphinx.ext.todo' extension listed in `extensions`.)
+todo_include_todos = True
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# This is required for the alabaster theme
+# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
+html_sidebars = {
+    '**': [
+        'relations.html',  # needs 'show_related': True theme option to display
+        'searchbox.html',
+    ]
+}
+
+
+# -- Options for HTMLHelp output ------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'CertusOneKnowledgeBasedoc'
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'CertusOneKnowledgeBase.tex', 'Certus One Knowledge Base Documentation',
+     'Certus One GmbH', 'manual'),
+]
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'certusoneknowledgebase', 'Certus One Knowledge Base Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'CertusOneKnowledgeBase', 'Certus One Knowledge Base Documentation',
+     author, 'CertusOneKnowledgeBase', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+
--- a/source/index.rst
+++ b/source/index.rst
@ -0,0 +1,39 @@
+.. Certus One Knowledge Base documentation master file, created by
+   sphinx-quickstart on Tue Sep 25 07:43:02 2018.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Validator Operations Guide
+==========================
+
+This guide is a living document which details a set of best practices for
+running a validator service, as implemented by Certus One. Running a validator
+puts a much greater emphasis on technical correctness, sound systems architecture,
+security, and overall operational excellence.
+
+The aim of this document is to provide a baseline for validator operation, both to
+make it easier for new validators to get started, and to provide input to other
+teams. We believe that collaboration and openness strongly benefit the overall
+ecosystem - the more well-run validators there are, the more resilient the
+network will be.
+
+While this document's focus is running a `Cosmos`_ validator, most of its
+content is applicable to operating any highly available, distributed service.
+
+The document's source code is available on `GitHub`_.
+Contributions are greatly appreciated.
+
+While it's hard to provide an implementation that fits all use cases, we usually
+provide reference implementations which implement our guidelines.
+
+.. _GitHub: https://github.com/certusone/kb
+.. _Cosmos: https://cosmos.network
+
+Contents
+++++++++
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   monitoring
--- a/source/monitoring.rst
+++ b/source/monitoring.rst
@ -0,0 +1,83 @@
+Monitoring
+==========
+
+Monitoring is an integral part of providing highly available services. By monitoring, we mean:
+
+1.  Instrumenting your applications and servers and collecting as many useful **metrics** as
+    possible. Having detailed metrics is invaluable for debugging, incident response and
+    performance analysis (observability - you can't improve what you can't measure).
+     \
+
+2.  Generating on-call **alerts** when a production system breaks and needs fixing.
+     \
+
+3.  Handling one-time **events** which need attention.
+
+Traditionally, most teams used different systems for metrics and alerting, with a time-series
+database like Graphite for metrics and a check-based monitoring system like Nagios for alerts.
+Nowadays, a different paradigm is getting popular - **metrics-based alerting**. Instead of having
+separate tooling for instrumentation and monitoring, alerts are realized by configuring rules
+that are evaluated against collected metrics.
+
+This is a very powerful approach, since it radically simplifies the monitoring stack and allows
+for sophisticated queries over time ranges and groups of metrics.
+
+Modern monitoring tools like Prometheus make metrics-based alerting very approachable.
+Gone are the days of fiddling with Nagios performance and check scripts!
+
+However, no tool can solve the hardest part about monitoring - figuring out what to monitor and
+when to alert. As any on-call engineer can attest, the most common failure mode in alerting is
+having *too many* alerts, rather than too few, with important alerts getting lost in the noise.
+
+Therefore, the main goal of a good alerting system is a **high signal-to-noise ratio** -
+every alert should be *relevant* (impact a production system in a way that requires
+immediate attention), and *actionable* (require human input to resolve). Any condition that isn't
+both relevant and actionable must not result in an alert.
+
+You also need to figure out whom to alert - also called **on-call rotation**, how to compensate
+for it, how to fairly distribute the load in the team, how to effectively communicate during an
+incident, and handle the feedback loop (we'll dedicate a separate article to this topic).
+
+A low number of meaningful, simple alerts, paired with robust business procedures which ensure that
+alerts are acted upon, false positives are quickly eliminated and outages are followed up, is much
+more powerful than "magic" approaches like anomaly detection, which sound good in theory, but don't
+work well in practice [#anomaly]_.
+
+..   [#anomaly] Anomaly detection - while useful in other contexts - tends to quickly break down for
+     monitoring use cases. As soon as you compare a sufficiently large number of metrics, spurious
+     correlations will show up and result in false positives. Your online shop saw a 10x traffic
+     spike, but didn't go down - why would you page someone? Everything is working fine.
+     Simple time offsets (do we see less traffic than last week?), linear extrapolation and
+     even fixed thresholds are much more specific.
+
+Symptoms-based alerting
++++++++++++++++++++++++
+
+For on-call alerts, you want to strictly limit the number of pages you send out in order to
+maintain a high signal-to-noise ratio. Every page interrupts the on-call engineer's workflow, and in
+the worst case, it wakes them up at night.
+
+In order to do this, you want to alert on symptoms *as far up the stack as possible*. Instead of
+having an alert that says "one of our MySQL database servers is down", you want "the website error
+rate went up", which catches a huge number of potential issues, whereas a redundant database server
+going down may not have any impact whatsoever.
+
+Once you've been woken up, you can then use your detailed metrics and dashboards to narrow down
+the cause of the outage, but there's no point in alerting on them.
+
+Exceptions to this are cases where you can reliably extrapolate - like "this disk *will* fill up in
+30 minutes, and it *will* take down our service unless it's fixed".
+
+Low-severity alerts
+-------------------
+
+There are many conditions that need to be taken care of, just not *immediately*. Your redundant database
+server went down and nobody got paged - great! But, you still need someone to fix that database
+server, or clean up the log partition that will soon fill up, or refill your coffee maker (you're
+monitoring your coffee maker, right? It's critical infrastructure!).
+
+A common approach is to have a separate, **low-severity notification channel** that won't wake anyone
+up, but still ensures that the issue is resolved. We recommend a channel in your favorite business
+messaging application, *plus* a dashboard which shows outstanding alerts (the dashboard is really
+important, since it ensures that alerts are acted upon - it's basically a to-do list).
+