diff --git a/metrics/README.md b/metrics/README.md
index e648ce515..5c8b49e27 100644
--- a/metrics/README.md
+++ b/metrics/README.md
@@ -1,47 +1,47 @@
 # Metrics
-## Testnet Grafana Dashboard
+## InfluxDB
-There are three versions of the testnet dashboard, corresponding to the three
-release channels:
-* https://metrics.solana.com:3000/d/monitor-edge/cluster-telemetry-edge
-* https://metrics.solana.com:3000/d/monitor-beta/cluster-telemetry-beta
-* https://metrics.solana.com:3000/d/monitor/cluster-telemetry
+In order to explore validator-specific metrics from mainnet-beta, testnet or devnet, you can use Chronograf:
-The dashboard for each channel is defined from the
-`metrics/scripts/grafana-provisioning/dashboards/cluster-monitor.json` source
-file in the git branch associated with that channel, and deployed by automation
-running `ci/publish-metrics-dashboard.sh`.
+* https://metrics.solana.com:8888/ (production environment)
+* https://metrics.solana.com:8889/ (testing environment)
-A deploy can be triggered at any time via the `New Build` button of
-https://buildkite.com/solana-labs/publish-metrics-dashboard.
+For local cluster deployments, you should use:
-### Modifying a Dashboard
+* https://internal-metrics.solana.com:8888/
-Dashboard updates are accomplished by modifying
-`metrics/scripts/grafana-provisioning/dashboards/cluster-monitor.json`,
-**manual edits made directly in Grafana will be overwritten**.
+## Public Grafana Dashboards
-* Check out metrics to add at https://metrics.solana.com:8888/ in the data explorer.
-* When editing a query for a dashboard graph, use the "Toggle Edit Mode" selection
-  behind the hamburger button to use raw SQL and copy the query into the text field.
-  You may have to fixup the query with the dashboard variables like $testnet or $timeFilter,
-  check other functioning fields in the dashboard for examples.
+There are three main public dashboards for cluster-related metrics:
-1. Open the desired dashboard in Grafana
-2. Create a development copy of the dashboard by selecting `Save As..` in the
-   `Settings` menu for the dashboard
-3. Edit dashboard as desired
-4. Extract the JSON Model by selecting `JSON Model` in the `Settings` menu. Copy the JSON to the clipboard
-   and paste into `metrics/scripts/grafana-provisioning/dashboards/cluster-monitor.json`,
-5. Delete your development dashboard: `Settings` => `Delete`
+* https://metrics.solana.com/d/monitor-edge/cluster-telemetry
+* https://metrics.solana.com/d/0n54roOVz/fee-market
+* https://metrics.solana.com/d/UpIWbId4k/ping-result
-### Deploying a Dashboard Manually
+For local cluster deployments, you should use:
-If you need to immediately deploy a dashboard using the contents of
-`cluster-monitor.json` in your local workspace,
-```
-$ export GRAFANA_API_TOKEN="an API key from https://metrics.solana.com:3000/org/apikeys"
-$ metrics/publish-metrics-dashboard.sh (edge|beta|stable)
-```
-Note that automation will eventually overwrite your manual deploy.
+* https://internal-metrics.solana.com:3000/
+
+### Cluster Telemetry
+
+The cluster telemetry dashboard shows the current state of the cluster:
+
+1. Cluster Stability
+2. Validator Streamer
+3. Tower Consensus
+4. IP Network
+5. Snapshots
+6. RPC Send Transaction Service
+
+### Fee Market
+
+The fee market dashboard shows:
+
+1. Total Prioritization Fees
+2. Block Min Prioritization Fees
+3. Cost Tracker Stats
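The data behind these dashboards can also be queried directly over the InfluxDB HTTP API that the Chronograf containers in this change point at (https://metrics.solana.com:8086). A minimal sketch, assuming a read-only account; the database name and credentials are placeholders, so check the Chronograf data explorer for the databases and measurements that actually exist:

```bash
#!/usr/bin/env bash
# Sketch only: "mainnet-beta" and the credentials below are placeholders, not
# confirmed by this change. List a few measurements to see what is available.
DB="mainnet-beta"
curl -sG "https://metrics.solana.com:8086/query" \
  --data-urlencode "db=$DB" \
  --data-urlencode "u=$METRICS_READ_USER" \
  --data-urlencode "p=$METRICS_READ_PASSWORD" \
  --data-urlencode "q=SHOW MEASUREMENTS LIMIT 10"
```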
+
+### Ping Results
+
+The ping results dashboard displays relevant information about the Ping API.
diff --git a/metrics/grafcli.conf b/metrics/grafcli.conf
deleted file mode 100644
index 96c170871..000000000
--- a/metrics/grafcli.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-[grafcli]
-editor = vim
-mergetool = vimdiff
-verbose = on
-force = on
-
-[resources]
-
-[hosts]
-metrics = on
-
-[metrics]
-type = api
-url = https://metrics.solana.com:3000/api
-ssl = off
diff --git a/metrics/influx-enterprise/README.md b/metrics/influx-enterprise/README.md
new file mode 100644
index 000000000..46920c438
--- /dev/null
+++ b/metrics/influx-enterprise/README.md
@@ -0,0 +1,10 @@
+![image](https://user-images.githubusercontent.com/110216567/182764431-504557e4-92ac-41ff-82a5-b87c88c19c1d.png)
+# InfluxDB Enterprise
+[Influx_Enterprise](https://solana-labs.atlassian.net/wiki/spaces/DEVOPS/pages/25788425/Influx+Enterprise+Integration)
+
+
+influxdb-meta.conf -- is the meta node configuration file, in which we define the servers and their configuration.
+
+influxdb.conf -- is the data node configuration file, in which we define the servers and their configuration.
+
+default -- is the nginx load balancer configuration file on the VM named influxdb-enterprise.
diff --git a/metrics/influx-enterprise/default b/metrics/influx-enterprise/default
new file mode 100644
index 000000000..e575ca350
--- /dev/null
+++ b/metrics/influx-enterprise/default
@@ -0,0 +1,102 @@
+##
+# You should look at the following URL's in order to grasp a solid understanding
+# of Nginx configuration files in order to fully unleash the power of Nginx.
+# https://www.nginx.com/resources/wiki/start/
+# https://www.nginx.com/resources/wiki/start/topics/tutorials/config_pitfalls/
+# https://wiki.debian.org/Nginx/DirectoryStructure
+#
+# In most cases, administrators will remove this file from sites-enabled/ and
+# leave it as reference inside of sites-available where it will continue to be
+# updated by the nginx packaging team.
+#
+# This file will automatically load configuration files provided by other
+# applications, such as Drupal or Wordpress. These applications will be made
+# available underneath a path with that package name, such as /drupal8.
+#
+# Please see /usr/share/doc/nginx-doc/examples/ for more detailed examples.
+##
+
+# Default server configuration
+#
+upstream backend {
+  server 145.40.69.29:8086 max_fails=0;
+  server 147.28.151.45:8086 max_fails=0;
+  server 147.28.151.201:8086 max_fails=0;
+  server 86.109.7.147:8086 max_fails=0;
+  server 147.28.151.73:8086 max_fails=0;
+  server 147.28.129.143:8086 max_fails=0;
+  }
+server {
+  listen 8086 default_server;
+  listen [::]:8086 default_server;
+
+  # SSL configuration
+  #
+  # listen 443 ssl default_server;
+  # listen [::]:443 ssl default_server;
+  #
+  # Note: You should disable gzip for SSL traffic.
+  # See: https://bugs.debian.org/773332
+  #
+  # Read up on ssl_ciphers to ensure a secure configuration.
+  # See: https://bugs.debian.org/765782
+  #
+  # Self signed certs generated by the ssl-cert package
+  # Don't use them in a production server!
+  #
+  # include snippets/snakeoil.conf;
+
+  root /var/www/html;
+
+  # Add index.php to the list if you are using PHP
+  index index.html index.htm index.nginx-debian.html;
+
+  server_name _;
+
+  location / {
+    proxy_connect_timeout 1200s;
+    proxy_send_timeout 1200s;
+    proxy_read_timeout 1200s;
+    proxy_pass http://backend;
+    # First attempt to serve request as file, then
+    # as directory, then fall back to displaying a 404.
+# try_files $uri $uri/ =404; + } + + # pass PHP scripts to FastCGI server + # + #location ~ \.php$ { + # include snippets/fastcgi-php.conf; + # + # # With php-fpm (or other unix sockets): + # fastcgi_pass unix:/var/run/php/php7.4-fpm.sock; + # # With php-cgi (or other tcp sockets): + # fastcgi_pass 127.0.0.1:9000; + #} + + # deny access to .htaccess files, if Apache's document root + # concurs with nginx's one + # + #location ~ /\.ht { + # deny all; + #} +} + +# Virtual Host configuration for example.com +# +# You can move that to a different file under sites-available/ and symlink that +# to sites-enabled/ to enable it. +# +#server { +# listen 80; +# listen [::]:80; +# +# server_name example.com; +# +# root /var/www/example.com; +# index index.html; +# +# location / { +# try_files $uri $uri/ =404; +# } +#} diff --git a/metrics/influx-enterprise/influxdb-meta.conf b/metrics/influx-enterprise/influxdb-meta.conf new file mode 100644 index 000000000..807c720a8 --- /dev/null +++ b/metrics/influx-enterprise/influxdb-meta.conf @@ -0,0 +1,140 @@ +### Welcome to the InfluxDB Enterprise configuration file. + +# The values in this file override the default values used by the system if +# a config option is not specified. The commented out lines are the configuration +# field and the default value used. Uncommenting a line and changing the value +# will change the value used at runtime when the process is restarted. + +# Once every 24 hours InfluxDB Enterprise will report usage data to usage.influxdata.com +# The data includes a random ID, os, arch, version, the number of series and other +# usage data. No data from user databases is ever transmitted. +# Change this option to true to disable reporting. +# reporting-disabled = false + +# The TCP bind address to use for the cluster-internal meta services. +# bind-address = ":8091" + +# Hostname advertised by this host for remote addresses. This must be resolvable by all +# other nodes in the cluster. + hostname = "dev-equinix-washington-24" + +### +### [enterprise] +### +### Settings related to enterprise licensing. +### + +[enterprise] + # Must be set to true to use the Enterprise Web UI + # registration-enabled = false + + # Must include the protocol (http://) + # registration-server-url = "" + + # license-key and license-path are mutually exclusive, use only one and leave the other blank + license-key = "" + + # license-key and license-path are mutually exclusive, use only one and leave the other blank + license-path = "" + +### +### [meta] +### +### Settings specific to meta node operation. +### +# +[meta] + # Directory where cluster meta data is stored. + dir = "/var/lib/influxdb/meta" + + # The default address for raft, cluster, snapshot, etc. + # bind-address = ":8089" + + # The default address to bind the API to. + # http-bind-address = ":8091" + + # Determines whether meta nodes use HTTPS to communicate with each other. + # https-enabled = false + + # The SSL certificate to use when HTTPS is enabled. The certificate should be a PEM encoded + # bundle of the certificate and key. If it is just the certificate, a key must be + # specified in https-private-key. + # https-certificate = "" + + # Use a separate private key location. + # https-private-key = "" + + # Whether meta nodes will skip certificate validation communicating with each other over HTTPS. + # This is useful when testing with self-signed certificates. + # https-insecure-tls = false + + # Whether to use TLS to communicate with data nodes. 
+ # data-use-tls = false + + # Whether meta nodes will skip certificate validation communicating with data nodes over TLS. + # This is useful when testing with self-signed certificates. + # data-insecure-tls = false + + # The default frequency with which the node will gossip its known announcements. + # gossip-frequency = "5s" + + # The default length of time an announcement is kept before it is considered too old. + # announcement-expiration = "30s" + + # Automatically create a default retention policy when creating a database. + # retention-autocreate = true + + # The amount of time in candidate state without a leader before we attempt an election. + # election-timeout = "1s" + + # The amount of time in follower state without a leader before we attempt an election. + # heartbeat-timeout = "1s" + + # Control how long the "lease" lasts for being the leader without being able to contact a quorum + # of nodes. If we reach this interval without contact, we will step down as leader. + # leader-lease-timeout = "500ms" + + # The amount of time without an Apply() operation before we heartbeat to ensure a timely + # commit. Due to random staggering, may be delayed as much as 2x this value. + # commit-timeout = "50ms" + + # Timeout waiting for consensus before getting the latest Raft snapshot. + # consensus-timeout = "30s" + + # Enables cluster level trace logging. + # cluster-tracing = false + + # Enables cluster API level trace logging. + # logging-enabled = true + + # Determines whether the pprof endpoint is enabled. This endpoint is used for + # troubleshooting and monitoring. + # pprof-enabled = true + + # The default duration of leases. + # lease-duration = "1m0s" + + # If true, HTTP endpoints require authentication. + # This setting must have the same value as the data nodes' meta.meta-auth-enabled + # configuration. + # auth-enabled = false + + # Whether LDAP is allowed to be set. + # If true, you will need to use `influxd ldap set-config` and set enabled=true to use LDAP authentication. + # ldap-allowed = false + + # The shared secret used by the API for JWT authentication. + # shared-secret = "" + + # The shared secret used by the internal API for JWT authentication. + # This setting must have the same value as the data nodes' + # meta.meta-internal-shared-secret configuration. + internal-shared-secret = "this is meta node" + + # Configures password hashing scheme. Use "pbkdf2-sha256" or "pbkdf2-sha512" + # for a FIPS-ready password hash. This setting must have the same value as + # the data nodes' meta.password-hash configuration. + # password-hash = "bcrypt" + + # Configures strict FIPS-readiness check on startup. + # ensure-fips = false diff --git a/metrics/influx-enterprise/influxdb.conf b/metrics/influx-enterprise/influxdb.conf new file mode 100644 index 000000000..d70944aec --- /dev/null +++ b/metrics/influx-enterprise/influxdb.conf @@ -0,0 +1,701 @@ +### Welcome to the InfluxDB Enterprise configuration file. + +# The values in this file override the default values used by the system if +# a config option is not specified. The commented out lines are the configuration +# field and the default value used. Uncommenting a line and changing the value +# will change the value used at runtime when the process is restarted. + +# Once every 24 hours InfluxDB Enterprise will report usage data to usage.influxdata.com +# The data includes a random ID, os, arch, version, the number of series and other +# usage data. No data from user databases is ever transmitted. 
+# Change this option to true to disable reporting. +# reporting-disabled = false + +# The TCP bind address to use for cluster-internal services. +# bind-address = ":8088" + +# Hostname advertised by this host for remote addresses. This must be resolvable by all +# other nodes in the cluster. + hostname = "dev-equinix-washington-27" + +# How often to update the cluster with this node's internal status. +# gossip-frequency = "3s" + +### +### [enterprise] +### +### Settings related to enterprise licensing. +### + +[enterprise] + # Must be set to true to use the Enterprise Web UI. + # registration-enabled = false + + # Must include the protocol (http://). + # registration-server-url = "" + + # license-key and license-path are mutually exclusive, use only one and leave the other blank. + license-key = "" + + # The path to a valid license file. license-key and license-path are mutually exclusive, + # use only one and leave the other blank. + license-path = "" + +### +### [meta] +### +### Settings related to how the data nodes interact with the meta nodes. +### + +[meta] + # Directory where the cluster metadata is stored. + dir = "/var/lib/influxdb/meta" + + # Whether to use TLS when connecting to meta nodes. + # meta-tls-enabled = false + + # The shared secret used by the internal API for JWT authentication. This setting + # must have the same value as the meta nodes' meta.auth-enabled configuration. + meta-auth-enabled = true + + # This setting must have the same value as the meta nodes' meta.internal-shared-secret configuration + # and must be non-empty if set. + meta-internal-shared-secret = "this is meta node" + + # Allows insecure TLS connections to meta nodes. This is useful when testing with self- + # signed certificates. + # meta-insecure-tls = false + + # Whether log messages are printed for the meta service. + # logging-enabled = true + + # Configures password hashing scheme. Use "pbkdf2-sha256" or "pbkdf2-sha512" + # for a FIPS-ready password hash. This setting must have the same value as + # the meta nodes' meta.password-hash configuration. + # password-hash = "bcrypt" + + # Configures strict FIPS-readiness check on startup. + # ensure-fips = false + + +### +### [data] +### +### Controls where the actual shard data for InfluxDB lives and how it is +### compacted from the WAL. "dir" may need to be changed to a suitable place +### for your system. The defaults should work for most systems. +### + +[data] + # The directory where the TSM storage engine stores TSM (read-optimized) files. + dir = "/var/lib/influxdb/data" + + # The directory where the TSM storage engine stores WAL (write-optimized) files. + wal-dir = "/var/lib/influxdb/wal" + + # Trace logging provides more verbose output around the tsm engine. Turning + # this on can provide more useful output for debugging tsm engine issues. + # trace-logging-enabled = false + + # Whether queries should be logged before execution. Very useful for troubleshooting, but will + # log any sensitive data contained within a query. + # query-log-enabled = true + + # Validates incoming writes to ensure keys only have valid unicode characters. + # This setting will incur a small overhead because every key must be checked. + # validate-keys = false + + # Settings for the TSM engine + + # The amount of time that a write will wait before fsyncing. A duration + # greater than 0 can be used to batch up multiple fsync calls. This is useful for slower + # disks or when WAL write contention is seen. A value of 0s fsyncs every write to the WAL. 
+ # Values in the range of 0-100ms are recommended for non-SSD disks. + # wal-fsync-delay = "0s" + + # CacheMaxMemorySize is the maximum size a shard's cache can + # reach before it starts rejecting writes. + # cache-max-memory-size = "1g" + + # CacheSnapshotMemorySize is the size at which the engine will + # snapshot the cache and write it to a TSM file, freeing up memory. + # cache-snapshot-memory-size = "25m" + + # CacheSnapshotWriteColdDuration is the length of time at + # which the engine will snapshot the cache and write it to + # a new TSM file if the shard hasn't received writes or deletes. + # cache-snapshot-write-cold-duration = "10m" + + # The maximum number of concurrent full and level compactions that can run at one time. + # value of 0 results in 50% of runtime.GOMAXPROCS(0) used at runtime. Any number greater + # than 0 limits compactions to that value. This setting does not apply to cache snapshotting. + # max-concurrent-compactions = 0 + + # MaxConcurrentDeletes is the maximum number of simultaneous DELETE calls on a shard + # The default is 1, and should be left unchanged for most users + # max-concurrent-deletes = 1 + + # CompactFullWriteColdDuration is the duration at which the engine + # will compact all TSM files in a shard if it hasn't received a + # write or delete. + # compact-full-write-cold-duration = "4h" + + # CompactThroughput is the rate limit in bytes per second that we will allow + # TSM compactions to write to disk. Note that short bursts are allowed + # to happen at a possibly larger value, set by CompactThroughputBurst + # compact-throughput = "48m" + + # CompactThroughputBurst is the rate limit in bytes per second that we + # will allow TSM compactions to write to disk. + # compact-throughput-burst = "48m" + + # The maximum series allowed per database before writes are dropped. This limit can prevent + # high cardinality issues at the database level. This limit can be disabled by setting it to + # 0. + # max-series-per-database = 1000000 + + # The maximum number of tag values per tag that are allowed before writes are dropped. This limit + # can prevent high cardinality tag values from being written to a measurement. This limit can be + # disabled by setting it to 0. + # max-values-per-tag = 100000 + + # (TSI indexes only) The threshold, in bytes, when an index write-ahead log + # file will compact into an index file. Lower sizes will cause log files to be + # compacted more quickly and result in lower heap usage at the expense of write + # throughput. Higher sizes will be compacted less frequently, store more series + # in-memory, and provide higher write throughput. + # Valid size suffixes are k, m, or g (case insensitive, 1024 = 1k). + # Values without a size suffix are in bytes. + # max-index-log-file-size = "1m" + + # If true, then the mmap advise value MADV_WILLNEED will be provided to the kernel with respect to + # TSM files. This setting has been found to be problematic on some kernels, and defaults to off. + # It might help users who have slow disks in some cases. + # tsm-use-madv-willneed = false + +### +### [cluster] +### +### Settings related to how the data nodes interact with other data nodes. +### + +[cluster] + # The default timeout when establishing a new connection to a node. + # dial-timeout = "1s" + + # The default time a stream will remain idle in the connection pool before being reaped. + # pool-max-idle-time = "60s" + + # The default maximum number of streams that can be idle in a pool, per node. 
+ # The number of active streams can exceed the maximum, but they will not return to the pool when released. + # pool-max-idle-streams = 100 + + # The default timeout set on shard readers. + # shard-reader-timeout = "0" + + # Determines whether data nodes use HTTPS to communicate with each other. + # https-enabled = false + + # The SSL certificate to use when HTTPS is enabled. The certificate should be a PEM encoded + # bundle of the certificate and key. If it is just the certificate, a key must be + # specified in https-private-key. + # https-certificate = "" + + # Use a separate private key location. + # https-private-key = "" + + # Whether data nodes will skip certificate validation communicating with each other over HTTPS. + # This is useful when testing with self-signed certificates. + # https-insecure-tls = false + + # Enables cluster trace logging. + # cluster-tracing = false + + # The default time a write request will wait until a "timeout" error is returned to the caller. + # write-timeout = "10s" + + # The maximum number of concurrent queries allowed to be executing at one time. If a query is + # executed and exceeds this limit, an error is returned to the caller. This limit can be disabled + # by setting it to 0. + # max-concurrent-queries = 0 + + # The maximum time a query will is allowed to execute before being killed by the system. This limit + # can help prevent run away queries. Setting the value to 0 disables the limit. + # query-timeout = "0s" + + # The time threshold when a query will be logged as a slow query. This limit can be set to help + # discover slow or resource intensive queries. Setting the value to 0 disables the slow query logging. + # log-queries-after = "0s" + + # The maximum number of points a SELECT can process. A value of 0 will make the maximum + # point count unlimited. + # max-select-point = 0 + + # The maximum number of series a SELECT can run. A value of zero will make the maximum series + # count unlimited. + # max-select-series = 0 + + # The maximum number of group by time buckets a SELECT can create. A value of zero will make the maximum + # number of buckets unlimited. + # max-select-buckets = 0 + + # Whether to print a list of running queries when a data node receives a SIGTERM (sent when a process + # exceeds a container memory limit, or by the kill command. + # termination-query-log = false + +### +### [hinted-handoff] +### +### Settings for how write data is queued locally when the remote node is unable to accept a write. +### + +[hinted-handoff] + # Determines whether hinted handoff is enabled. + # enabled = true + + # The directory where the hinted handoff queues are stored. + dir = "/var/lib/influxdb/hh" + + # The default maximum size of all hinted handoff queues in bytes. + # max-size = "10g" + + # The default maximum amount of time that a hinted handoff write can stay in the queue. + # After this time, the write will be purged. + # max-age = "168h0m0s" + + # The maximum number of concurrent queued writes to process at a time. + # retry-concurrency = 20 + + # The default rate that hinted handoffs will be retried. The rate is in bytes per second + # and applies across all nodes when retried. A value of 0 disables the rate limit. + # retry-rate-limit = 0 + + # The default amount of time the system waits before attempting to flush hinted handoff + # queues. With each failure of a hinted handoff write, this retry interval increases + # exponentially until it reaches the maximum. 
+ # retry-interval = "1s" + + # The maximum the hinted handoff retry interval will ever be. + # retry-max-interval = "10s" + + # The amount of time the system waits before attempting to purge hinted handoff data due + # to age or inactive nodes. + # purge-interval = "1m0s" + + # Maximum number of bytes to write to a shard in a single request + # batch-size = 512000 + + # Maximum number of writes into the hinted-handoff queue that can be pending. + # This is writes incoming to the hh queue, not outbound from the queue. + # max-pending-writes = 1024 + +### +### [anti-entropy] +### +### Controls the copying and repairing of shards to ensure that data nodes contain +### the shard data they are supposed to. The Anti-Entropy feature is disabled by +### default. + +[anti-entropy] + # Determines whether the service is enabled. + # enabled = false + + # The interval of time when anti-entropy checks run on each data node. + # check-interval = "5m" + + # The maximum number of shards that a single data node will copy or repair + # concurrently. + # max-fetch = 10 + + # How many concurrent sync operations should be performed. + # max-sync = 1 + + # When set to true, missing shards will be automatically repaired. + # auto-repair-missing = true + +### +### [retention] +### +### Controls the enforcement of retention policies for evicting old data. +### + +[retention] + # Determines whether retention policy enforcement enabled. + # enabled = true + + # The interval of time when retention policy enforcement checks run. + # check-interval = "30m" + +### +### [shard-precreation] +### +### Controls the precreation of shards, so they are available before data arrives. +### Only shards that, after creation, will have both a start- and end-time in the +### future, will ever be created. Shards are never precreated that would be wholly +### or partially in the past. + +[shard-precreation] + # Determines whether shard pre-creation service is enabled. + # enabled = true + + # The interval of time when the check to pre-create new shards runs. + # check-interval = "10m" + + # The default period ahead of the endtime of a shard group that its successor + # group is created. + # advance-period = "30m" + +### +### Controls the system's self-monitoring, statistics and diagnostics. +### +### The internal database for monitoring data is created automatically if +### it does not already exist. The target retention within this database +### is called 'monitor' and is also created with a retention period of 7 days +### and a replication factor of 1, if it does not exist. In all cases the +### this retention policy is configured as the default for the database. + +[monitor] + # Whether to record statistics internally. + # store-enabled = true + + # The destination database for recorded statistics. + # store-database = "_internal" + + # The interval at which to record statistics. + # store-interval = "10s" + + # How often to poll other data nodes' stats when aggregating cluster stats. + # remote-collect-interval = "10s" + +### +### [http] +### +### Controls how the HTTP endpoints are configured. These are the primary +### mechanism for getting data into and out of InfluxDB. +### + +[http] + # Determines whether HTTP endpoint is enabled. + # enabled = true + + # The bind address used by the HTTP service. + # bind-address = ":8086" + + # Determines whether HTTP authentication is enabled. + auth-enabled = false + + # The default realm sent back when issuing a basic auth challenge. 
+ # realm = "InfluxDB" + + # Determines whether HTTP request logging is enabled. + # log-enabled = true + + # When HTTP request logging is enabled, this option specifies the path where + # log entries should be written. If unspecified, the default is to write to stderr, which + # intermingles HTTP logs with internal InfluxDB logging. + # + # If influxd is unable to access the specified path, it will log an error and fall back to writing + # the request log to stderr. + # access-log-path = "" + + # Filters which requests should be logged. Each filter is of the pattern NNN, NNX, or NXX where N is + # a number and X is a wildcard for any number. To filter all 5xx responses, use the string 5xx. + # If multiple filters are used, then only one has to match. The default is to have no filters which + # will cause every request to be printed. + # access-log-status-filters = [] + + # Determines whether detailed write logging is enabled. + # write-tracing = false + + # Determines whether the pprof endpoint is enabled. This endpoint is used for + # troubleshooting and monitoring. + # pprof-enabled = true + + # Enables authentication on pprof endpoints. Users will need admin permissions + # to access the pprof endpoints when this setting is enabled. This setting has + # no effect if either auth-enabled or pprof-enabled are set to false. + # pprof-auth-enabled = false + + # Enables a pprof endpoint that binds to localhost:6060 immediately on startup. + # This is only needed to debug startup issues. + # debug-pprof-enabled = false + + # Enables authentication on the /ping, /metrics, and deprecated /status + # endpoints. This setting has no effect if auth-enabled is set to false. + # ping-auth-enabled = false + + # Determines whether HTTPS is enabled. + # https-enabled = false + + # The SSL certificate to use when HTTPS is enabled. The certificate should be a PEM encoded + # bundle of the certificate and key. If it is just the certificate, a key must be + # specified in https-private-key. + # https-certificate = "/etc/ssl/influxdb.pem" + + # Use a separate private key location. + # https-private-key = "" + + # The JWT auth shared secret to validate requests using JSON web tokens. + # shared-secret = "" + + # The default chunk size for result sets that should be chunked. + # max-row-limit = 10000 + + # The maximum number of HTTP connections that may be open at once. New connections that + # would exceed this limit are dropped. Setting this value to 0 disables the limit. + # max-connection-limit = 0 + + # Whether to enable http service over unix domain socket. + # unix-socket-enabled = false + + # The permissions to use on the socket, if enabled. + # unix-socket-permissions = "0777" + + # The path of the unix domain socket. + # bind-socket = "/var/run/influxdb.sock" + + # The maximum size of a client request body, in bytes. Setting this value to 0 + # disables the limit. + # max-body-size = 25000000 + + # The maximum number of writes processed concurrently. + # Setting this to 0 disables the limit. + # max-concurrent-write-limit = 0 + + # The maximum number of writes queued for processing. + # Setting this to 0 disables the limit. + # max-enqueued-write-limit = 0 + + # The maximum duration for a write to wait in the queue to be processed. + # Setting this to 0 or setting max-concurrent-write-limit to 0 disables the limit. + # enqueued-write-timeout = 30000000000 + +### +### [logging] +### +### Controls how the logger emits logs to the output. 
+### + +[logging] + # Determines which log encoder to use for logs. Available options + # are auto, logfmt, and json. auto will use a more a more user-friendly + # output format if the output terminal is a TTY, but the format is not as + # easily machine-readable. When the output is a non-TTY, auto will use + # logfmt. + # format = "logfmt" + + # Determines which level of logs will be emitted. + # level = "info" + + # Suppresses the logo output that is printed when the program is started. + # suppress-logo = false + +### +### [subscriber] +### +### Controls the subscriptions, which can be used to fork a copy of all data +### received by the InfluxDB host. +### + +[subscriber] + # Determines whether the subscriber service is enabled. + # enabled = true + + # The default timeout for HTTP writes to subscribers. + # http-timeout = "30s" + + # Allows insecure HTTPS connections to subscribers. This is useful when testing with self- + # signed certificates. + # insecure-skip-verify = false + + # The path to the PEM encoded CA certs file. If the empty string, the default system certs will be used. + # ca-certs = "" + + # The number of writer goroutines processing the write channel. + # write-concurrency = 40 + + # The number of in-flight writes buffered in the write channel. + # write-buffer-size = 1000 + + +### +### [[graphite]] +### +### Controls one or many listeners for Graphite data. +### + +[[graphite]] + # Determines whether the graphite endpoint is enabled. + # enabled = false + # database = "graphite" + # retention-policy = "" + # bind-address = ":2003" + # protocol = "tcp" + # consistency-level = "one" + + # These next lines control how batching works. You should have this enabled + # otherwise you could get dropped metrics or poor performance. Batching + # will buffer points in memory if you have many coming in. + + # Flush if this many points get buffered. + # batch-size = 5000 + + # Number of batches that may be pending in memory. + # batch-pending = 10 + + # Flush at least this often even if we haven't hit buffer limit. + # batch-timeout = "1s" + + # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. + # udp-read-buffer = 0 + + # This string joins multiple matching 'measurement' values providing more control over the final measurement name. + # separator = "." + + # Default tags that will be added to all metrics. These can be overridden at the template level + # or by tags extracted from metric. + # tags = ["region=us-east", "zone=1c"] + + # Each template line requires a template pattern. It can have an optional + # filter before the template and separated by spaces. It can also have optional extra + # tags following the template. Multiple tags should be separated by commas and no spaces + # similar to the line protocol format. There can be only one default template. + # templates = [ + # "*.app env.service.resource.measurement", + # # Default template + # "server.*", + # ] + +### +### [collectd] +### +### Controls one or many listeners for collectd data. +### + +[[collectd]] + # enabled = false + # bind-address = ":25826" + # database = "collectd" + # retention-policy = "" + # typesdb = "/usr/share/collectd/types.db" + + # The collectd security level can be "" or "none", "sign", or "encrypt". + # security-level = "" + + # Path to the collectd auth file. Must be set if security level is sign or encrypt. + # auth-file = "" + + # These next lines control how batching works. 
You should have this enabled + # otherwise you could get dropped metrics or poor performance. Batching + # will buffer points in memory if you have many coming in. + + # Flush if this many points get buffered. + # batch-size = 5000 + + # Number of batches that may be pending in memory. + # batch-pending = 10 + + # Flush at least this often even if we haven't hit buffer limit. + # batch-timeout = "10s" + + # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. + # read-buffer = 0 + +### +### [opentsdb] +### +### Controls one or many listeners for OpenTSDB data. +### + +[[opentsdb]] + # enabled = false + # bind-address = ":4242" + # database = "opentsdb" + # retention-policy = "" + # consistency-level = "one" + # tls-enabled = false + # certificate= "/etc/ssl/influxdb.pem" + + # Log an error for every malformed point. + # log-point-errors = true + + # These next lines control how batching works. You should have this enabled + # otherwise you could get dropped metrics or poor performance. Only points + # metrics received over the telnet protocol undergo batching. + + # Flush if this many points get buffered. + # batch-size = 1000 + + # Number of batches that may be pending in memory. + # batch-pending = 5 + + # Flush at least this often even if we haven't hit buffer limit. + # batch-timeout = "1s" + +### +### [[udp]] +### +### Controls one or many listeners for InfluxDB line protocol data via UDP. +### + +[[udp]] + # enabled = false + # bind-address = ":8089" + # database = "udp" + # retention-policy = "" + + # InfluxDB precision for timestamps on received points ("" or "n", "u", "ms", "s", "m", "h") + # precision = "" + + # These next lines control how batching works. You should have this enabled + # otherwise you could get dropped metrics or poor performance. Batching + # will buffer points in memory if you have many coming in. + + # Flush if this many points get buffered. + # batch-size = 5000 + + # Number of batches that may be pending in memory. + # batch-pending = 10 + + # Will flush at least this often even if we haven't hit buffer limit. + # batch-timeout = "1s" + + # UDP Read buffer size, 0 means OS default. UDP listener will fail if set above OS max. + # read-buffer = 0 + +### +### [continuous_queries] +### +### Controls how continuous queries are run within InfluxDB. +### + +[continuous_queries] + # Determines whether the continuous query service is enabled. + # enabled = true + + # Controls whether queries are logged when executed by the CQ service. + # log-enabled = true + + # Controls whether queries are logged to the self-monitoring data store. + # query-stats-enabled = false + + # Interval for how often continuous queries will be checked whether they need to run. + # run-interval = "1s" + +[tls] + # Determines the available set of cipher suites. See https://golang.org/pkg/crypto/tls/#pkg-constants + # for a list of available ciphers, which depends on the version of Go (use the query + # SHOW DIAGNOSTICS to see the version of Go used to build InfluxDB). If not specified, uses + # the default settings from Go's crypto/tls package. + # ciphers = [ + # "TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305", + # "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256", + # ] + + # Minimum version of the tls protocol that will be negotiated. If not specified, uses the + # default settings from Go's crypto/tls package. + # min-version = "tls1.2" + + # Maximum version of the tls protocol that will be negotiated. 
If not specified, uses the
+  # default settings from Go's crypto/tls package.
+  # max-version = "tls1.2"
diff --git a/metrics/metrics-internal/README.md b/metrics/metrics-internal/README.md
new file mode 100644
index 000000000..a5b29f5df
--- /dev/null
+++ b/metrics/metrics-internal/README.md
@@ -0,0 +1,16 @@
+![image](https://user-images.githubusercontent.com/110216567/182764431-504557e4-92ac-41ff-82a5-b87c88c19c1d.png)
+
+
+Services:
+1. Influxdb
+2. Chronograf (on port 8888)
+3. Chronograf_8889 (on port 8889)
+4. Grafana
+
+To install all of the services on the metrics-internal server, run the ./start.sh script.
+
+Install the Buildkite agent so that the pipeline which checks the status of each container can run.
+
+If a container is not in the running state (for example, it has exited), the pipeline redeploys that container based on its status.
+
+**Note:** If you delete or remove a container manually, you can also run that container's script to redeploy it.
diff --git a/metrics/metrics-internal/chronograf_8888_internal.sh b/metrics/metrics-internal/chronograf_8888_internal.sh
new file mode 100644
index 000000000..9d85fc456
--- /dev/null
+++ b/metrics/metrics-internal/chronograf_8888_internal.sh
@@ -0,0 +1,56 @@
+#!/bin/bash -ex
+#
+# (Re)starts the Chronograf containers
+#
+cd "$(dirname "$0")"
+
+if [[ -z $HOST ]]; then
+  HOST=metrics.solana.com
+fi
+echo "HOST: $HOST"
+
+: "${CHRONOGRAF_IMAGE:=chronograf:1.8.8}"
+
+# Remove the container
+container=chronograf_8888_internal
+[[ -w /var/lib/$container ]]
+[[ -x /var/lib/$container ]]
+
+(
+  set +e
+  sudo docker kill $container
+  sudo docker rm -f $container
+  exit 0
+)
+
+pwd
+rm -rf certs
+mkdir -p certs
+chmod 700 certs
+sudo cp /etc/letsencrypt/live/"$HOST"/fullchain.pem certs/
+sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/
+sudo chmod 0444 certs/*
+sudo chown buildkite-agent:buildkite-agent certs
+
+
+# (Re)start the container
+sudo docker run \
+  --detach \
+  --env AUTH_DURATION=24h \
+  --env TLS_CERTIFICATE=/certs/fullchain.pem \
+  --env TLS_PRIVATE_KEY=/certs/privkey.pem \
+  --env GOOGLE_CLIENT_ID="$GOOGLE_CLIENT_ID_8888" \
+  --env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8888" \
+  --env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets \
+  --env PUBLIC_URL=https://internal-metrics.solana.com:8888 \
+  --env TOKEN_SECRET="$TOKEN_SECRET" \
+  --env inactivity-duration=48h \
+  --name=chronograf_8888_internal \
+  --net=influxdb \
+  --publish 8888:8888 \
+  --user "$(id -u):$(id -g)" \
+  --volume "$PWD"/certs:/certs \
+  --volume /var/lib/chronograf:/var/lib/chronograf \
+  --log-opt max-size=1g \
+  --log-opt max-file="5" \
+  $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
diff --git a/metrics/metrics-internal/chronograf_8889_internal.sh b/metrics/metrics-internal/chronograf_8889_internal.sh
new file mode 100644
index 000000000..9c48fff6d
--- /dev/null
+++ b/metrics/metrics-internal/chronograf_8889_internal.sh
@@ -0,0 +1,56 @@
+#!/bin/bash -ex
+#
+# (Re)starts the Chronograf_8889 containers
+#
+cd "$(dirname "$0")"
+
+if [[ -z $HOST ]]; then
+  HOST=metrics.solana.com
+fi
+echo "HOST: $HOST"
+
+: "${CHRONOGRAF_IMAGE:=chronograf:1.8.8}"
+
+# Remove the container
+container=chronograf_8889_internal
+[[ -w /var/lib/$container ]]
+[[ -x /var/lib/$container ]]
+
+(
+  set +e
+  sudo docker kill $container
+  sudo docker rm -f $container
+  exit 0
+)
+
+pwd
+rm -rf certs
+mkdir -p certs
+chmod 700 certs
+sudo cp /etc/letsencrypt/live/"$HOST"/fullchain.pem certs/
+sudo cp 
/etc/letsencrypt/live/"$HOST"/privkey.pem certs/ +sudo chmod 0444 certs/* +sudo chown buildkite-agent:buildkite-agent certs + + +# (Re)start the container +sudo docker run \ + --detach \ + --env AUTH_DURATION=24h \ + --env TLS_CERTIFICATE=/certs/fullchain.pem \ + --env TLS_PRIVATE_KEY=/certs/privkey.pem \ + --env GOOGLE_CLIENT_ID="$GOOGLE_CLIENT_ID_8889" \ + --env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8889" \ + --env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets \ + --env PUBLIC_URL=https://internal-metrics.solana.com:8889 \ + --env TOKEN_SECRET= \ + --env inactivity-duration=48h \ + --name=chronograf_8889_internal \ + --net=influxdb \ + --publish 8889:8888 \ + --user "$(id -u):$(id -g)" \ + --volume "$PWD"/certs:/certs \ + --volume /var/lib/chronograf_8889:/var/lib/chronograf \ + --log-opt max-size=1g \ + --log-opt max-file="5" \ + $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 diff --git a/metrics/metrics-internal/grafana-internal-metrics.solana.com.ini b/metrics/metrics-internal/grafana-internal-metrics.solana.com.ini new file mode 100644 index 000000000..25cc69673 --- /dev/null +++ b/metrics/metrics-internal/grafana-internal-metrics.solana.com.ini @@ -0,0 +1,453 @@ +##################### Grafana Configuration Example ##################### +# +# Everything has defaults so you only need to uncomment things you want to +# change + +# possible values : production, development +;app_mode = production + +# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty +;instance_name = ${HOSTNAME} + +#################################### Paths #################################### +[paths] +# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used) +;data = /var/lib/grafana + +# Directory where grafana can store logs +;logs = /var/log/grafana + +# Directory where grafana will automatically scan and look for plugins +;plugins = /var/lib/grafana/plugins + +# folder that contains provisioning config files that grafana will apply on startup and while running. +;provisioning = conf/provisioning + +#################################### Server #################################### +[server] +# Protocol (http, https, socket) +protocol = https + +# The ip address to bind to, empty will bind to all interfaces +;http_addr = + +# The http port to use +;http_port = 3000 + +# The public facing domain name used to access grafana from a browser +domain = internal-metrics.solana.com + +# Redirect to correct domain if host header does not match domain +# Prevents DNS rebinding attacks +;enforce_domain = false + +# The full public facing url you use in browser, used for redirects and emails +# If you use reverse proxy and sub path specify full url (with sub path) +;root_url = http://internal-metrics.solana.com:3000 + +# Log web requests +;router_logging = false + +# the path relative working path +;static_root_path = public + +# enable gzip +;enable_gzip = false + +# https certs & key file +cert_file = /certs/fullchain.pem +cert_key = /certs/privkey.pem + +# Unix socket path +;socket = + +#################################### Database #################################### +[database] +# You can configure the database connection by specifying type, host, name, user and password +# as separate properties or as on string using the url properties. 
+ +# Either "mysql", "postgres" or "sqlite3", it's your choice +;type = sqlite3 +;host = 127.0.0.1:3306 +;name = grafana +;user = root +# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;""" +;password = + +# Use either URL or the previous fields to configure the database +# Example: mysql://user:secret@host:port/database +;url = + +# For "postgres" only, either "disable", "require" or "verify-full" +;ssl_mode = disable + +# For "sqlite3" only, path relative to data_path setting +;path = grafana.db + +# Max idle conn setting default is 2 +;max_idle_conn = 2 + +# Max conn setting default is 0 (mean not set) +;max_open_conn = + +# Connection Max Lifetime default is 14400 (means 14400 seconds or 4 hours) +;conn_max_lifetime = 14400 + +# Set to true to log the sql calls and execution times. +log_queries = + +#################################### Session #################################### +[session] +# Either "memory", "file", "redis", "mysql", "postgres", default is "file" +;provider = file + +# Provider config options +# memory: not have any config yet +# file: session dir path, is relative to grafana data_path +# redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=grafana` +# mysql: go-sql-driver/mysql dsn config string, e.g. `user:password@tcp(127.0.0.1:3306)/database_name` +# postgres: user=a password=b host=localhost port=5432 dbname=c sslmode=disable +;provider_config = sessions + +# Session cookie name +;cookie_name = grafana_sess + +# If you use session in https only, default is false +;cookie_secure = false + +# Session life time, default is 86400 +;session_life_time = 86400 + +#################################### Data proxy ########################### +[dataproxy] + +# This enables data proxy logging, default is false +;logging = false + +#################################### Analytics #################################### +[analytics] +# Server reporting, sends usage counters to stats.grafana.org every 24 hours. +# No ip addresses are being tracked, only simple counters to track +# running instances, dashboard and error counts. It is very helpful to us. +# Change this option to false to disable reporting. 
+;reporting_enabled = true + +# Set to false to disable all checks to https://grafana.net +# for new vesions (grafana itself and plugins), check is used +# in some UI views to notify that grafana or plugin update exists +# This option does not cause any auto updates, nor send any information +# only a GET request to http://grafana.com to get latest versions +;check_for_updates = true + +# Google Analytics universal tracking code, only enabled if you specify an id here +;google_analytics_ua_id = + +#################################### Security #################################### +[security] +# default admin user, created on startup +admin_user = $ADMIN_USER_GRAFANA + +# default admin password, can be changed before first start of grafana, or in profile settings +admin_password = $ADMIN_PASSWORD_GRAFANA +# used for signing +;secret_key = SW2YcwTIb9zpOOhoPsMm + +# Auto-login remember days +;login_remember_days = 7 +;cookie_username = grafana_user +;cookie_remember_name = grafana_remember + +# disable gravatar profile images +;disable_gravatar = false + +# data source proxy whitelist (ip_or_domain:port separated by spaces) +;data_source_proxy_whitelist = + +# disable protection against brute force login attempts +;disable_brute_force_login_protection = false + +#################################### Snapshots ########################### +[snapshots] +# snapshot sharing options +;external_enabled = true +;external_snapshot_url = https://snapshots-origin.raintank.io +;external_snapshot_name = Publish to snapshot.raintank.io + +# remove expired snapshot +;snapshot_remove_expired = true + +#################################### Dashboards History ################## +[dashboards] +# Number dashboard versions to keep (per dashboard). Default: 20, Minimum: 1 +;versions_to_keep = 20 + +#################################### Users ############################### +[users] +# disable user signup / registration +;allow_sign_up = true + +# Allow non admin users to create organizations +;allow_org_create = true + +# Set to true to automatically assign new users to the default organization (id 1) +auto_assign_org = true + +# Default role new users will be automatically assigned (if disabled above is set to true) +auto_assign_org_role = Editor + +# Background text for the user field on the login page +;login_hint = email or username + +# Default UI theme ("dark" or "light") +;default_theme = dark + +# External user management, these options affect the organization users view +;external_manage_link_url = +;external_manage_link_name = +;external_manage_info = + +# Viewers can edit/inspect dashboard settings in the browser. But not save the dashboard. +viewers_can_edit = false + +[auth] +# Set to true to disable (hide) the login form, useful if you use OAuth, defaults to false +;disable_login_form = false + +# Set to true to disable the signout link in the side menu. 
useful if you use auth.proxy, defaults to false +;disable_signout_menu = false + +# URL to redirect the user to after sign out +;signout_redirect_url = + +#################################### Anonymous Auth ########################## +[auth.anonymous] +# enable anonymous access +enabled = true + +# specify organization name that should be used for unauthenticated users +org_name = Solana Public + +# specify role for unauthenticated users +org_role = Viewer + +#################################### Github Auth ########################## +[auth.github] +#enabled = true +#allow_sign_up = true +#client_id = +#client_secret = +#scopes = user:email,read:org +#auth_url = https://github.com/login/oauth/authorize +#token_url = https://github.com/login/oauth/access_token +#api_url = https://api.github.com/user +;team_ids = +#allowed_organizations = solana-labs + +#################################### Google Auth ########################## +[auth.google] +enabled = true +allow_sign_up = true +client_id =$GOOGLE_CLIENT_ID +client_secret =$GOOGLE_CLIENT_SECRET +scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email +auth_url = https://accounts.google.com/o/oauth2/auth +token_url = https://accounts.google.com/o/oauth2/token +api_url = https://www.googleapis.com/oauth2/v1/userinfo +allowed_domains = solana.com,jito.wtf,jumpcrypto.com,certus.one +#################################### Generic OAuth ########################## +[auth.generic_oauth] +;enabled = false +;name = OAuth +;allow_sign_up = true +;client_id = some_id +;client_secret = some_secret +;scopes = user:email,read:org +;auth_url = https://foo.bar/login/oauth/authorize +;token_url = https://foo.bar/login/oauth/access_token +;api_url = https://foo.bar/user +;team_ids = +;allowed_organizations = + +#################################### Grafana.com Auth #################### +[auth.grafana_com] +;enabled = false +;allow_sign_up = true +;client_id = some_id +;client_secret = some_secret +;scopes = user:email +;allowed_organizations = + +#################################### Auth Proxy ########################## +[auth.proxy] +;enabled = false +;header_name = X-WEBAUTH-USER +;header_property = username +;auto_sign_up = true +;ldap_sync_ttl = 60 +;whitelist = 192.168.1.1, 192.168.2.1 + +#################################### Basic Auth ########################## +[auth.basic] +;enabled = true + +#################################### Auth LDAP ########################## +[auth.ldap] +;enabled = false +;config_file = /etc/grafana/ldap.toml +;allow_sign_up = true + +#################################### SMTP / Emailing ########################## +[smtp] +;enabled = false +;host = localhost:25 +;user = +# If the password contains # or ; you have to wrap it with trippel quotes. Ex """#password;""" +;password = +;cert_file = +;key_file = +;skip_verify = false +;from_address = admin@grafana.localhost +;from_name = Grafana +# EHLO identity in SMTP dialog (defaults to instance_name) +;ehlo_identity = dashboard.example.com + +[emails] +;welcome_email_on_sign_up = false + +#################################### Logging ########################## +[log] +# Either "console", "file", "syslog". Default is console and file +# Use space to separate multiple modes, e.g. "console file" +;mode = console file + +# Either "debug", "info", "warn", "error", "critical", default is "info" +;level = info + +# optional settings to set different levels for specific loggers. 
Ex filters = sqlstore:debug +;filters = + +# For "console" mode only +[log.console] +;level = + +# log line format, valid options are text, console and json +;format = console + +# For "file" mode only +[log.file] +;level = + +# log line format, valid options are text, console and json +;format = text + +# This enables automated log rotate(switch of following options), default is true +;log_rotate = true + +# Max line number of single file, default is 1000000 +;max_lines = 1000000 + +# Max size shift of single file, default is 28 means 1 << 28, 256MB +;max_size_shift = 28 + +# Segment log daily, default is true +;daily_rotate = true + +# Expired days of log file(delete after max days), default is 7 +;max_days = 7 + +[log.syslog] +;level = + +# log line format, valid options are text, console and json +;format = text + +# Syslog network type and address. This can be udp, tcp, or unix. If left blank, the default unix endpoints will be used. +;network = +;address = + +# Syslog facility. user, daemon and local0 through local7 are valid. +;facility = + +# Syslog tag. By default, the process' argv[0] is used. +;tag = + +#################################### Alerting ############################ +[alerting] +# Disable alerting engine & UI features +;enabled = true +# Makes it possible to turn off alert rule execution but alerting UI is visible +;execute_alerts = true + +#################################### Explore ############################# +[explore] +# Enable the Explore section +enabled = false + +#################################### Internal Grafana Metrics ########################## +# Metrics available at HTTP API Url /metrics +[metrics] +# Disable / Enable internal metrics +;enabled = true + +# Publish interval +;interval_seconds = 10 + +# Send internal metrics to Graphite +[metrics.graphite] +# Enable by setting the address setting (ex localhost:2003) +;address = +;prefix = prod.grafana.%(instance_name)s. + +#################################### Distributed tracing ############ +[tracing.jaeger] +# Enable by setting the address sending traces to jaeger (ex localhost:6831) +;address = localhost:6831 +# Tag that will always be included in when creating new spans. ex (tag1:value1,tag2:value2) +;always_included_tag = tag1:value1 +# Type specifies the type of the sampler: const, probabilistic, rateLimiting, or remote +;sampler_type = const +# jaeger samplerconfig param +# for "const" sampler, 0 or 1 for always false/true respectively +# for "probabilistic" sampler, a probability between 0 and 1 +# for "rateLimiting" sampler, the number of spans per second +# for "remote" sampler, param is the same as for "probabilistic" +# and indicates the initial sampling rate before the actual one +# is received from the mothership +;sampler_param = 1 + +#################################### Grafana.com integration ########################## +# Url used to to import dashboards directly from Grafana.com +[grafana_com] +;url = https://grafana.com + +#################################### External image storage ########################## +[external_image_storage] +# Used for uploading images to public servers so they can be included in slack/email messages. 
+# you can choose between (s3, webdav, gcs, azure_blob, local) +;provider = + +[external_image_storage.s3] +;bucket = +;region = +;path = +;access_key = +;secret_key = + +[external_image_storage.webdav] +;url = +;public_url = +;username = +;password = + +[external_image_storage.gcs] +;key_file = +;bucket = +;path = + +[external_image_storage.azure_blob] +;account_name = +;account_key = +;container_name = + +[external_image_storage.local] +# does not require any configuration diff --git a/metrics/metrics-internal/grafana_internal.sh b/metrics/metrics-internal/grafana_internal.sh new file mode 100644 index 000000000..d83b90231 --- /dev/null +++ b/metrics/metrics-internal/grafana_internal.sh @@ -0,0 +1,49 @@ +#!/bin/bash -ex +# +# (Re)starts the Grafana containers +# +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +: "${GRAFANA_IMAGE:=grafana/grafana:9.4.7}" + +# remove the container +container=grafana_internal +[[ -w /var/lib/$container ]] +[[ -x /var/lib/$container ]] + +( + set +e + sudo docker kill $container + sudo docker rm -f $container + exit 0 +) + +pwd +rm -rf certs +mkdir -p certs +chmod 700 certs +sudo cp /etc/letsencrypt/live/"$HOST"/fullchain.pem certs/ +sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/ +sudo chmod 0444 certs/* +sudo chown buildkite-agent:buildkite-agent certs + + +#(Re)start the container +sudo docker run \ + --detach \ + --name=grafana_internal \ + --net=influxdb \ + --publish 3000:3000 \ + --user root:root \ + --env GF_PATHS_CONFIG=/grafana.ini \ + --volume "$PWD"/certs:/certs:ro \ + --volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \ + --volume /var/lib/grafana:/var/lib/grafana \ + --log-opt max-size=1g \ + --log-opt max-file=5 \ + $GRAFANA_IMAGE diff --git a/metrics/metrics-internal/host.sh b/metrics/metrics-internal/host.sh new file mode 100644 index 000000000..9575960b1 --- /dev/null +++ b/metrics/metrics-internal/host.sh @@ -0,0 +1,6 @@ +# |source| me + +if [[ -z $HOST ]]; then + HOST=internal-metrics.solana.com +fi +echo "HOST: $HOST" diff --git a/metrics/metrics-internal/influxdb.conf b/metrics/metrics-internal/influxdb.conf new file mode 100644 index 000000000..c9ff9b026 --- /dev/null +++ b/metrics/metrics-internal/influxdb.conf @@ -0,0 +1,142 @@ +reporting-disabled = false +bind-address = "127.0.0.1:8088" + +[meta] + dir = "/var/lib/influxdb/meta" + retention-autocreate = true + logging-enabled = true + +[data] + dir = "/var/lib/influxdb/data" + index-version = "inmem" + wal-dir = "/var/lib/influxdb/wal" + wal-fsync-delay = "0s" + query-log-enabled = true + cache-max-memory-size = "50g" + cache-snapshot-memory-size = 26214400 + cache-snapshot-write-cold-duration = "10m0s" + compact-full-write-cold-duration = "4h0m0s" + max-series-per-database = 0 + max-values-per-tag = 0 + max-concurrent-compactions = 0 + max-index-log-file-size = 1048576 + trace-logging-enabled = false + +[coordinator] + write-timeout = "20s" + max-concurrent-queries = 0 + query-timeout = "120s" + log-queries-after = "10s" + max-select-point = 0 + max-select-series = 0 + max-select-buckets = 0 + +[retention] + enabled = true + check-interval = "30m0s" + +[shard-precreation] + enabled = true + check-interval = "10m0s" + advance-period = "30m0s" + +[monitor] + store-enabled = true + store-database = "_internal" + store-interval = "10s" + +[subscriber] + enabled = true + http-timeout = "30s" + insecure-skip-verify = false + ca-certs = "" + write-concurrency = 200 + write-buffer-size = 10000 + +[http] + enabled = true + 
bind-address = ":8086" + auth-enabled = true + log-enabled = true + write-tracing = false + pprof-enabled = true + debug-pprof-enabled = false + https-enabled = true + https-certificate = "/certs/fullchain.pem" + https-private-key = "/certs/privkey.pem" + max-row-limit = 0 + max-connection-limit = 0 + shared-secret = "" + realm = "Solana InfluxDB" + unix-socket-enabled = false + bind-socket = "/var/run/influxdb.sock" + max-body-size = 25000000 + access-log-path = "" + flux-enabled=true + +[logging] + format = "auto" + level = "info" + suppress-logo = false + +[ifql] + enabled = false + log-enabled = true + bind-address = ":8082" + +[[graphite]] + enabled = false + bind-address = ":2003" + database = "graphite" + retention-policy = "" + protocol = "tcp" + batch-size = 5000 + batch-pending = 10 + batch-timeout = "1s" + consistency-level = "one" + separator = "." + udp-read-buffer = 0 + +[[collectd]] + enabled = false + bind-address = ":25826" + database = "collectd" + retention-policy = "" + batch-size = 5000 + batch-pending = 10 + batch-timeout = "10s" + read-buffer = 0 + typesdb = "/usr/share/collectd/types.db" + security-level = "none" + auth-file = "/etc/collectd/auth_file" + parse-multivalue-plugin = "split" + +[[opentsdb]] + enabled = false + bind-address = ":4242" + database = "opentsdb" + retention-policy = "" + consistency-level = "one" + tls-enabled = false + certificate = "/etc/ssl/influxdb.pem" + batch-size = 1000 + batch-pending = 5 + batch-timeout = "1s" + log-point-errors = true + +[[udp]] + enabled = false + bind-address = ":8089" + database = "udp" + retention-policy = "" + batch-size = 5000 + batch-pending = 10 + read-buffer = 0 + batch-timeout = "1s" + precision = "" + +[continuous_queries] + log-enabled = true + enabled = true + query-stats-enabled = false + run-interval = "1s" diff --git a/metrics/metrics-internal/influxdb_internal.sh b/metrics/metrics-internal/influxdb_internal.sh new file mode 100644 index 000000000..6c4ec17e4 --- /dev/null +++ b/metrics/metrics-internal/influxdb_internal.sh @@ -0,0 +1,48 @@ +#!/bin/bash -ex +# +# (Re)starts the InfluxDB containers +# +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +: "${INFLUXDB_IMAGE:=influxdb:1.7}" + +# Remove the container +container=influxdb_internal +[[ -w /var/lib/$container ]] +[[ -x /var/lib/$container ]] + +( + set +e + sudo docker kill $container + sudo docker rm -f $container + exit 0 +) + +pwd +rm -rf certs +mkdir -p certs +chmod 700 certs +sudo cp /etc/letsencrypt/live/"$HOST"/fullchain.pem certs/ +sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/ +sudo chmod 0444 certs/* +sudo chown buildkite-agent:buildkite-agent certs + +# (Re) start the container +sudo docker run \ + --detach \ + --name=influxdb_internal \ + --net=influxdb \ + --publish 8086:8086 \ + --user "$(id -u):$(id -g)" \ + --volume "$PWD"/certs:/certs \ + --volume "$PWD"/influxdb.conf:/etc/influxdb/influxdb.conf:ro \ + --volume /var/lib/influxdb:/var/lib/influxdb \ + --log-opt max-size=1g \ + --log-opt max-file=5 \ + --cpus=10 \ + $INFLUXDB_IMAGE -config /etc/influxdb/influxdb.conf diff --git a/metrics/metrics-internal/nginx/fastcgi.conf b/metrics/metrics-internal/nginx/fastcgi.conf new file mode 100644 index 000000000..091738c60 --- /dev/null +++ b/metrics/metrics-internal/nginx/fastcgi.conf @@ -0,0 +1,26 @@ + +fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name; +fastcgi_param QUERY_STRING $query_string; +fastcgi_param REQUEST_METHOD $request_method; +fastcgi_param 
CONTENT_TYPE $content_type; +fastcgi_param CONTENT_LENGTH $content_length; + +fastcgi_param SCRIPT_NAME $fastcgi_script_name; +fastcgi_param REQUEST_URI $request_uri; +fastcgi_param DOCUMENT_URI $document_uri; +fastcgi_param DOCUMENT_ROOT $document_root; +fastcgi_param SERVER_PROTOCOL $server_protocol; +fastcgi_param REQUEST_SCHEME $scheme; +fastcgi_param HTTPS $https if_not_empty; + +fastcgi_param GATEWAY_INTERFACE CGI/1.1; +fastcgi_param SERVER_SOFTWARE nginx/$nginx_version; + +fastcgi_param REMOTE_ADDR $remote_addr; +fastcgi_param REMOTE_PORT $remote_port; +fastcgi_param SERVER_ADDR $server_addr; +fastcgi_param SERVER_PORT $server_port; +fastcgi_param SERVER_NAME $server_name; + +# PHP only, required if PHP was built with --enable-force-cgi-redirect +fastcgi_param REDIRECT_STATUS 200; diff --git a/metrics/metrics-internal/nginx/fastcgi_params b/metrics/metrics-internal/nginx/fastcgi_params new file mode 100644 index 000000000..28decb955 --- /dev/null +++ b/metrics/metrics-internal/nginx/fastcgi_params @@ -0,0 +1,25 @@ + +fastcgi_param QUERY_STRING $query_string; +fastcgi_param REQUEST_METHOD $request_method; +fastcgi_param CONTENT_TYPE $content_type; +fastcgi_param CONTENT_LENGTH $content_length; + +fastcgi_param SCRIPT_NAME $fastcgi_script_name; +fastcgi_param REQUEST_URI $request_uri; +fastcgi_param DOCUMENT_URI $document_uri; +fastcgi_param DOCUMENT_ROOT $document_root; +fastcgi_param SERVER_PROTOCOL $server_protocol; +fastcgi_param REQUEST_SCHEME $scheme; +fastcgi_param HTTPS $https if_not_empty; + +fastcgi_param GATEWAY_INTERFACE CGI/1.1; +fastcgi_param SERVER_SOFTWARE nginx/$nginx_version; + +fastcgi_param REMOTE_ADDR $remote_addr; +fastcgi_param REMOTE_PORT $remote_port; +fastcgi_param SERVER_ADDR $server_addr; +fastcgi_param SERVER_PORT $server_port; +fastcgi_param SERVER_NAME $server_name; + +# PHP only, required if PHP was built with --enable-force-cgi-redirect +fastcgi_param REDIRECT_STATUS 200; diff --git a/metrics/metrics-internal/nginx/koi-utf b/metrics/metrics-internal/nginx/koi-utf new file mode 100644 index 000000000..e7974ff6a --- /dev/null +++ b/metrics/metrics-internal/nginx/koi-utf @@ -0,0 +1,109 @@ + +# This map is not a full koi8-r <> utf8 map: it does not contain +# box-drawing and some other characters. Besides this map contains +# several koi8-u and Byelorussian letters which are not in koi8-r. +# If you need a full and standard map, use contrib/unicode2nginx/koi-utf +# map instead. 
+ +charset_map koi8-r utf-8 { + + 80 E282AC ; # euro + + 95 E280A2 ; # bullet + + 9A C2A0 ; #   + + 9E C2B7 ; # · + + A3 D191 ; # small yo + A4 D194 ; # small Ukrainian ye + + A6 D196 ; # small Ukrainian i + A7 D197 ; # small Ukrainian yi + + AD D291 ; # small Ukrainian soft g + AE D19E ; # small Byelorussian short u + + B0 C2B0 ; # ° + + B3 D081 ; # capital YO + B4 D084 ; # capital Ukrainian YE + + B6 D086 ; # capital Ukrainian I + B7 D087 ; # capital Ukrainian YI + + B9 E28496 ; # numero sign + + BD D290 ; # capital Ukrainian soft G + BE D18E ; # capital Byelorussian short U + + BF C2A9 ; # (C) + + C0 D18E ; # small yu + C1 D0B0 ; # small a + C2 D0B1 ; # small b + C3 D186 ; # small ts + C4 D0B4 ; # small d + C5 D0B5 ; # small ye + C6 D184 ; # small f + C7 D0B3 ; # small g + C8 D185 ; # small kh + C9 D0B8 ; # small i + CA D0B9 ; # small j + CB D0BA ; # small k + CC D0BB ; # small l + CD D0BC ; # small m + CE D0BD ; # small n + CF D0BE ; # small o + + D0 D0BF ; # small p + D1 D18F ; # small ya + D2 D180 ; # small r + D3 D181 ; # small s + D4 D182 ; # small t + D5 D183 ; # small u + D6 D0B6 ; # small zh + D7 D0B2 ; # small v + D8 D18C ; # small soft sign + D9 D18B ; # small y + DA D0B7 ; # small z + DB D188 ; # small sh + DC D18D ; # small e + DD D189 ; # small shch + DE D187 ; # small ch + DF D18A ; # small hard sign + + E0 D0AE ; # capital YU + E1 D090 ; # capital A + E2 D091 ; # capital B + E3 D0A6 ; # capital TS + E4 D094 ; # capital D + E5 D095 ; # capital YE + E6 D0A4 ; # capital F + E7 D093 ; # capital G + E8 D0A5 ; # capital KH + E9 D098 ; # capital I + EA D099 ; # capital J + EB D09A ; # capital K + EC D09B ; # capital L + ED D09C ; # capital M + EE D09D ; # capital N + EF D09E ; # capital O + + F0 D09F ; # capital P + F1 D0AF ; # capital YA + F2 D0A0 ; # capital R + F3 D0A1 ; # capital S + F4 D0A2 ; # capital T + F5 D0A3 ; # capital U + F6 D096 ; # capital ZH + F7 D092 ; # capital V + F8 D0AC ; # capital soft sign + F9 D0AB ; # capital Y + FA D097 ; # capital Z + FB D0A8 ; # capital SH + FC D0AD ; # capital E + FD D0A9 ; # capital SHCH + FE D0A7 ; # capital CH + FF D0AA ; # capital hard sign +} diff --git a/metrics/metrics-internal/nginx/koi-win b/metrics/metrics-internal/nginx/koi-win new file mode 100644 index 000000000..72afabe89 --- /dev/null +++ b/metrics/metrics-internal/nginx/koi-win @@ -0,0 +1,103 @@ + +charset_map koi8-r windows-1251 { + + 80 88 ; # euro + + 95 95 ; # bullet + + 9A A0 ; #   + + 9E B7 ; # · + + A3 B8 ; # small yo + A4 BA ; # small Ukrainian ye + + A6 B3 ; # small Ukrainian i + A7 BF ; # small Ukrainian yi + + AD B4 ; # small Ukrainian soft g + AE A2 ; # small Byelorussian short u + + B0 B0 ; # ° + + B3 A8 ; # capital YO + B4 AA ; # capital Ukrainian YE + + B6 B2 ; # capital Ukrainian I + B7 AF ; # capital Ukrainian YI + + B9 B9 ; # numero sign + + BD A5 ; # capital Ukrainian soft G + BE A1 ; # capital Byelorussian short U + + BF A9 ; # (C) + + C0 FE ; # small yu + C1 E0 ; # small a + C2 E1 ; # small b + C3 F6 ; # small ts + C4 E4 ; # small d + C5 E5 ; # small ye + C6 F4 ; # small f + C7 E3 ; # small g + C8 F5 ; # small kh + C9 E8 ; # small i + CA E9 ; # small j + CB EA ; # small k + CC EB ; # small l + CD EC ; # small m + CE ED ; # small n + CF EE ; # small o + + D0 EF ; # small p + D1 FF ; # small ya + D2 F0 ; # small r + D3 F1 ; # small s + D4 F2 ; # small t + D5 F3 ; # small u + D6 E6 ; # small zh + D7 E2 ; # small v + D8 FC ; # small soft sign + D9 FB ; # small y + DA E7 ; # small z + DB F8 ; # small sh + DC FD ; # small e + DD F9 ; # small shch + DE 
F7 ; # small ch + DF FA ; # small hard sign + + E0 DE ; # capital YU + E1 C0 ; # capital A + E2 C1 ; # capital B + E3 D6 ; # capital TS + E4 C4 ; # capital D + E5 C5 ; # capital YE + E6 D4 ; # capital F + E7 C3 ; # capital G + E8 D5 ; # capital KH + E9 C8 ; # capital I + EA C9 ; # capital J + EB CA ; # capital K + EC CB ; # capital L + ED CC ; # capital M + EE CD ; # capital N + EF CE ; # capital O + + F0 CF ; # capital P + F1 DF ; # capital YA + F2 D0 ; # capital R + F3 D1 ; # capital S + F4 D2 ; # capital T + F5 D3 ; # capital U + F6 C6 ; # capital ZH + F7 C2 ; # capital V + F8 DC ; # capital soft sign + F9 DB ; # capital Y + FA C7 ; # capital Z + FB D8 ; # capital SH + FC DD ; # capital E + FD D9 ; # capital SHCH + FE D7 ; # capital CH + FF DA ; # capital hard sign +} diff --git a/metrics/metrics-internal/nginx/mime.types b/metrics/metrics-internal/nginx/mime.types new file mode 100644 index 000000000..89be9a4cd --- /dev/null +++ b/metrics/metrics-internal/nginx/mime.types @@ -0,0 +1,89 @@ + +types { + text/html html htm shtml; + text/css css; + text/xml xml; + image/gif gif; + image/jpeg jpeg jpg; + application/javascript js; + application/atom+xml atom; + application/rss+xml rss; + + text/mathml mml; + text/plain txt; + text/vnd.sun.j2me.app-descriptor jad; + text/vnd.wap.wml wml; + text/x-component htc; + + image/png png; + image/tiff tif tiff; + image/vnd.wap.wbmp wbmp; + image/x-icon ico; + image/x-jng jng; + image/x-ms-bmp bmp; + image/svg+xml svg svgz; + image/webp webp; + + application/font-woff woff; + application/java-archive jar war ear; + application/json json; + application/mac-binhex40 hqx; + application/msword doc; + application/pdf pdf; + application/postscript ps eps ai; + application/rtf rtf; + application/vnd.apple.mpegurl m3u8; + application/vnd.ms-excel xls; + application/vnd.ms-fontobject eot; + application/vnd.ms-powerpoint ppt; + application/vnd.wap.wmlc wmlc; + application/vnd.google-earth.kml+xml kml; + application/vnd.google-earth.kmz kmz; + application/x-7z-compressed 7z; + application/x-cocoa cco; + application/x-java-archive-diff jardiff; + application/x-java-jnlp-file jnlp; + application/x-makeself run; + application/x-perl pl pm; + application/x-pilot prc pdb; + application/x-rar-compressed rar; + application/x-redhat-package-manager rpm; + application/x-sea sea; + application/x-shockwave-flash swf; + application/x-stuffit sit; + application/x-tcl tcl tk; + application/x-x509-ca-cert der pem crt; + application/x-xpinstall xpi; + application/xhtml+xml xhtml; + application/xspf+xml xspf; + application/zip zip; + + application/octet-stream bin exe dll; + application/octet-stream deb; + application/octet-stream dmg; + application/octet-stream iso img; + application/octet-stream msi msp msm; + + application/vnd.openxmlformats-officedocument.wordprocessingml.document docx; + application/vnd.openxmlformats-officedocument.spreadsheetml.sheet xlsx; + application/vnd.openxmlformats-officedocument.presentationml.presentation pptx; + + audio/midi mid midi kar; + audio/mpeg mp3; + audio/ogg ogg; + audio/x-m4a m4a; + audio/x-realaudio ra; + + video/3gpp 3gpp 3gp; + video/mp2t ts; + video/mp4 mp4; + video/mpeg mpeg mpg; + video/quicktime mov; + video/webm webm; + video/x-flv flv; + video/x-m4v m4v; + video/x-mng mng; + video/x-ms-asf asx asf; + video/x-ms-wmv wmv; + video/x-msvideo avi; +} diff --git a/metrics/metrics-internal/nginx/modules-enabled/50-mod-http-image-filter.conf b/metrics/metrics-internal/nginx/modules-enabled/50-mod-http-image-filter.conf new file mode 
100644 index 000000000..dfa29399d --- /dev/null +++ b/metrics/metrics-internal/nginx/modules-enabled/50-mod-http-image-filter.conf @@ -0,0 +1 @@ +load_module modules/ngx_http_image_filter_module.so; diff --git a/metrics/metrics-internal/nginx/modules-enabled/50-mod-http-xslt-filter.conf b/metrics/metrics-internal/nginx/modules-enabled/50-mod-http-xslt-filter.conf new file mode 100644 index 000000000..a4f87ac30 --- /dev/null +++ b/metrics/metrics-internal/nginx/modules-enabled/50-mod-http-xslt-filter.conf @@ -0,0 +1 @@ +load_module modules/ngx_http_xslt_filter_module.so; diff --git a/metrics/metrics-internal/nginx/modules-enabled/50-mod-mail.conf b/metrics/metrics-internal/nginx/modules-enabled/50-mod-mail.conf new file mode 100644 index 000000000..cfd4a4d23 --- /dev/null +++ b/metrics/metrics-internal/nginx/modules-enabled/50-mod-mail.conf @@ -0,0 +1 @@ +load_module modules/ngx_mail_module.so; diff --git a/metrics/metrics-internal/nginx/modules-enabled/50-mod-stream.conf b/metrics/metrics-internal/nginx/modules-enabled/50-mod-stream.conf new file mode 100644 index 000000000..f2b265785 --- /dev/null +++ b/metrics/metrics-internal/nginx/modules-enabled/50-mod-stream.conf @@ -0,0 +1 @@ +load_module modules/ngx_stream_module.so; diff --git a/metrics/metrics-internal/nginx/nginx.conf b/metrics/metrics-internal/nginx/nginx.conf new file mode 100644 index 000000000..a5834c4f9 --- /dev/null +++ b/metrics/metrics-internal/nginx/nginx.conf @@ -0,0 +1,85 @@ +user www-data; +worker_processes auto; +pid /run/nginx.pid; +include /etc/nginx/modules-enabled/*.conf; + +events { + worker_connections 768; + # multi_accept on; +} + +http { + + ## + # Basic Settings + ## + + sendfile on; + tcp_nopush on; + tcp_nodelay on; + keepalive_timeout 65; + types_hash_max_size 2048; + # server_tokens off; + + # server_names_hash_bucket_size 64; + # server_name_in_redirect off; + + include /etc/nginx/mime.types; + default_type application/octet-stream; + + ## + # SSL Settings + ## + + ssl_protocols TLSv1 TLSv1.1 TLSv1.2 TLSv1.3; # Dropping SSLv3, ref: POODLE + ssl_prefer_server_ciphers on; + + ## + # Logging Settings + ## + + access_log /var/log/nginx/access.log; + error_log /var/log/nginx/error.log; + + ## + # Gzip Settings + ## + + gzip on; + + # gzip_vary on; + # gzip_proxied any; + # gzip_comp_level 6; + # gzip_buffers 16 8k; + # gzip_http_version 1.1; + # gzip_types text/plain text/css application/json application/javascript text/xml application/xml application/xml+rss text/javascript; + + ## + # Virtual Host Configs + ## + + include /etc/nginx/conf.d/*.conf; + include /etc/nginx/sites-enabled/*; +} + + +#mail { +# # See sample authentication script at: +# # http://wiki.nginx.org/ImapAuthenticateWithApachePhpScript +# +# # auth_http localhost/auth.php; +# # pop3_capabilities "TOP" "USER"; +# # imap_capabilities "IMAP4rev1" "UIDPLUS"; +# +# server { +# listen localhost:110; +# protocol pop3; +# proxy on; +# } +# +# server { +# listen localhost:143; +# protocol imap; +# proxy on; +# } +#} diff --git a/metrics/metrics-internal/nginx/proxy_params b/metrics/metrics-internal/nginx/proxy_params new file mode 100644 index 000000000..df75bc5d7 --- /dev/null +++ b/metrics/metrics-internal/nginx/proxy_params @@ -0,0 +1,4 @@ +proxy_set_header Host $http_host; +proxy_set_header X-Real-IP $remote_addr; +proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; +proxy_set_header X-Forwarded-Proto $scheme; diff --git a/metrics/metrics-internal/nginx/scgi_params b/metrics/metrics-internal/nginx/scgi_params new file mode 100644 
index 000000000..6d4ce4f3e --- /dev/null +++ b/metrics/metrics-internal/nginx/scgi_params @@ -0,0 +1,17 @@ + +scgi_param REQUEST_METHOD $request_method; +scgi_param REQUEST_URI $request_uri; +scgi_param QUERY_STRING $query_string; +scgi_param CONTENT_TYPE $content_type; + +scgi_param DOCUMENT_URI $document_uri; +scgi_param DOCUMENT_ROOT $document_root; +scgi_param SCGI 1; +scgi_param SERVER_PROTOCOL $server_protocol; +scgi_param REQUEST_SCHEME $scheme; +scgi_param HTTPS $https if_not_empty; + +scgi_param REMOTE_ADDR $remote_addr; +scgi_param REMOTE_PORT $remote_port; +scgi_param SERVER_PORT $server_port; +scgi_param SERVER_NAME $server_name; diff --git a/metrics/metrics-internal/nginx/sites-available/default b/metrics/metrics-internal/nginx/sites-available/default new file mode 100644 index 000000000..7a2aa47a8 --- /dev/null +++ b/metrics/metrics-internal/nginx/sites-available/default @@ -0,0 +1,107 @@ +## +# You should look at the following URL's in order to grasp a solid understanding +# of Nginx configuration files in order to fully unleash the power of Nginx. +# https://www.nginx.com/resources/wiki/start/ +# https://www.nginx.com/resources/wiki/start/topics/tutorials/config_pitfalls/ +# https://wiki.debian.org/Nginx/DirectoryStructure +# +# In most cases, administrators will remove this file from sites-enabled/ and +# leave it as reference inside of sites-available where it will continue to be +# updated by the nginx packaging team. +# +# This file will automatically load configuration files provided by other +# applications, such as Drupal or Wordpress. These applications will be made +# available underneath a path with that package name, such as /drupal8. +# +# Please see /usr/share/doc/nginx-doc/examples/ for more detailed examples. +## + +# Default server configuration +# +server { + listen 80 default_server; + listen [::]:80 default_server; + + # SSL configuration + # + # listen 443 ssl default_server; + # listen [::]:443 ssl default_server; + # + # Note: You should disable gzip for SSL traffic. + # See: https://bugs.debian.org/773332 + # + # Read up on ssl_ciphers to ensure a secure configuration. + # See: https://bugs.debian.org/765782 + # + # Self signed certs generated by the ssl-cert package + # Don't use them in a production server! + # + # include snippets/snakeoil.conf; + + root /var/www/html; + + # Add index.php to the list if you are using PHP + index index.html index.htm index.nginx-debian.html; + + server_name _; + + location / { + # First attempt to serve request as file, then + # as directory, then fall back to displaying a 404. + try_files $uri $uri/ =404; + } + + # pass PHP scripts to FastCGI server + # + #location ~ \.php$ { + # include snippets/fastcgi-php.conf; + # + # # With php-fpm (or other unix sockets): + # fastcgi_pass unix:/var/run/php/php7.4-fpm.sock; + # # With php-cgi (or other tcp sockets): + # fastcgi_pass 127.0.0.1:9000; + #} + + # deny access to .htaccess files, if Apache's document root + # concurs with nginx's one + # + #location ~ /\.ht { + # deny all; + #} +} + + +# Virtual Host configuration for example.com +# +# You can move that to a different file under sites-available/ and symlink that +# to sites-enabled/ to enable it. 
+# +#server { +# listen 80; +# listen [::]:80; +# +# server_name example.com; +# +# root /var/www/example.com; +# index index.html; +# +# location / { +# try_files $uri $uri/ =404; +# } +#} +server { + listen 80; + # root /usr/share/nginx/html; + # index index.html index.htm; + + server_name http://10.138.0.13; + location / { + proxy_pass http://10.138.0.13:3000; +# rewrite ^/(.*) /$1 break; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection 'upgrade'; + proxy_set_header Host $host; +# proxy_cache_pass $http_upgrade; + } +} diff --git a/metrics/metrics-internal/nginx/sites-available/default-back b/metrics/metrics-internal/nginx/sites-available/default-back new file mode 100644 index 000000000..9117e378e --- /dev/null +++ b/metrics/metrics-internal/nginx/sites-available/default-back @@ -0,0 +1,91 @@ +## +# You should look at the following URL's in order to grasp a solid understanding +# of Nginx configuration files in order to fully unleash the power of Nginx. +# https://www.nginx.com/resources/wiki/start/ +# https://www.nginx.com/resources/wiki/start/topics/tutorials/config_pitfalls/ +# https://wiki.debian.org/Nginx/DirectoryStructure +# +# In most cases, administrators will remove this file from sites-enabled/ and +# leave it as reference inside of sites-available where it will continue to be +# updated by the nginx packaging team. +# +# This file will automatically load configuration files provided by other +# applications, such as Drupal or Wordpress. These applications will be made +# available underneath a path with that package name, such as /drupal8. +# +# Please see /usr/share/doc/nginx-doc/examples/ for more detailed examples. +## + +# Default server configuration +# +server { + listen 80 default_server; + listen [::]:80 default_server; + + # SSL configuration + # + # listen 443 ssl default_server; + # listen [::]:443 ssl default_server; + # + # Note: You should disable gzip for SSL traffic. + # See: https://bugs.debian.org/773332 + # + # Read up on ssl_ciphers to ensure a secure configuration. + # See: https://bugs.debian.org/765782 + # + # Self signed certs generated by the ssl-cert package + # Don't use them in a production server! + # + # include snippets/snakeoil.conf; + + root /var/www/html; + + # Add index.php to the list if you are using PHP + index index.html index.htm index.nginx-debian.html; + + server_name _; + + location / { + # First attempt to serve request as file, then + # as directory, then fall back to displaying a 404. + try_files $uri $uri/ =404; + } + + # pass PHP scripts to FastCGI server + # + #location ~ \.php$ { + # include snippets/fastcgi-php.conf; + # + # # With php-fpm (or other unix sockets): + # fastcgi_pass unix:/var/run/php/php7.4-fpm.sock; + # # With php-cgi (or other tcp sockets): + # fastcgi_pass 127.0.0.1:9000; + #} + + # deny access to .htaccess files, if Apache's document root + # concurs with nginx's one + # + #location ~ /\.ht { + # deny all; + #} +} + + +# Virtual Host configuration for example.com +# +# You can move that to a different file under sites-available/ and symlink that +# to sites-enabled/ to enable it. 
+# +#server { +# listen 80; +# listen [::]:80; +# +# server_name example.com; +# +# root /var/www/example.com; +# index index.html; +# +# location / { +# try_files $uri $uri/ =404; +# } +#} diff --git a/metrics/metrics-internal/nginx/sites-enabled/default b/metrics/metrics-internal/nginx/sites-enabled/default new file mode 100755 index 000000000..7cd6f4491 --- /dev/null +++ b/metrics/metrics-internal/nginx/sites-enabled/default @@ -0,0 +1,37 @@ +server { + listen 80; + return 301 https://$host$request_uri; +} + +server { + + listen 443; + server_name internal-metrics.solana.com; + + # ssl_certificate /etc/nginx/cert.crt; + # ssl_certificate_key /etc/nginx/cert.key; + ssl_certificate /home/okcan/metrics-solana-com/certs/fullchain.pem; + ssl_certificate_key /home/okcan/metrics-solana-com/certs/privkey.pem; + + ssl on; + ssl_session_cache builtin:1000 shared:SSL:10m; + ssl_protocols TLSv1 TLSv1.1 TLSv1.2; + ssl_ciphers HIGH:!aNULL:!eNULL:!EXPORT:!CAMELLIA:!DES:!MD5:!PSK:!RC4; + ssl_prefer_server_ciphers on; + + access_log /var/log/nginx/jenkins.access.log; + + location / { + + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # Fix the “It appears that your reverse proxy set up is broken" error. + proxy_pass https://internal-metrics.solana.com:3000; + proxy_read_timeout 90; + + proxy_redirect https://internal-metrics.solana.com:3000 https://internal-metrics.solana.com; + } + } diff --git a/metrics/metrics-internal/nginx/sites-enabled/default.bck b/metrics/metrics-internal/nginx/sites-enabled/default.bck new file mode 100644 index 000000000..7a2aa47a8 --- /dev/null +++ b/metrics/metrics-internal/nginx/sites-enabled/default.bck @@ -0,0 +1,107 @@ +## +# You should look at the following URL's in order to grasp a solid understanding +# of Nginx configuration files in order to fully unleash the power of Nginx. +# https://www.nginx.com/resources/wiki/start/ +# https://www.nginx.com/resources/wiki/start/topics/tutorials/config_pitfalls/ +# https://wiki.debian.org/Nginx/DirectoryStructure +# +# In most cases, administrators will remove this file from sites-enabled/ and +# leave it as reference inside of sites-available where it will continue to be +# updated by the nginx packaging team. +# +# This file will automatically load configuration files provided by other +# applications, such as Drupal or Wordpress. These applications will be made +# available underneath a path with that package name, such as /drupal8. +# +# Please see /usr/share/doc/nginx-doc/examples/ for more detailed examples. +## + +# Default server configuration +# +server { + listen 80 default_server; + listen [::]:80 default_server; + + # SSL configuration + # + # listen 443 ssl default_server; + # listen [::]:443 ssl default_server; + # + # Note: You should disable gzip for SSL traffic. + # See: https://bugs.debian.org/773332 + # + # Read up on ssl_ciphers to ensure a secure configuration. + # See: https://bugs.debian.org/765782 + # + # Self signed certs generated by the ssl-cert package + # Don't use them in a production server! + # + # include snippets/snakeoil.conf; + + root /var/www/html; + + # Add index.php to the list if you are using PHP + index index.html index.htm index.nginx-debian.html; + + server_name _; + + location / { + # First attempt to serve request as file, then + # as directory, then fall back to displaying a 404. 
+ try_files $uri $uri/ =404; + } + + # pass PHP scripts to FastCGI server + # + #location ~ \.php$ { + # include snippets/fastcgi-php.conf; + # + # # With php-fpm (or other unix sockets): + # fastcgi_pass unix:/var/run/php/php7.4-fpm.sock; + # # With php-cgi (or other tcp sockets): + # fastcgi_pass 127.0.0.1:9000; + #} + + # deny access to .htaccess files, if Apache's document root + # concurs with nginx's one + # + #location ~ /\.ht { + # deny all; + #} +} + + +# Virtual Host configuration for example.com +# +# You can move that to a different file under sites-available/ and symlink that +# to sites-enabled/ to enable it. +# +#server { +# listen 80; +# listen [::]:80; +# +# server_name example.com; +# +# root /var/www/example.com; +# index index.html; +# +# location / { +# try_files $uri $uri/ =404; +# } +#} +server { + listen 80; + # root /usr/share/nginx/html; + # index index.html index.htm; + + server_name http://10.138.0.13; + location / { + proxy_pass http://10.138.0.13:3000; +# rewrite ^/(.*) /$1 break; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection 'upgrade'; + proxy_set_header Host $host; +# proxy_cache_pass $http_upgrade; + } +} diff --git a/metrics/metrics-internal/nginx/snippets/fastcgi-php.conf b/metrics/metrics-internal/nginx/snippets/fastcgi-php.conf new file mode 100644 index 000000000..467a9e732 --- /dev/null +++ b/metrics/metrics-internal/nginx/snippets/fastcgi-php.conf @@ -0,0 +1,13 @@ +# regex to split $uri to $fastcgi_script_name and $fastcgi_path +fastcgi_split_path_info ^(.+?\.php)(/.*)$; + +# Check that the PHP script exists before passing it +try_files $fastcgi_script_name =404; + +# Bypass the fact that try_files resets $fastcgi_path_info +# see: http://trac.nginx.org/nginx/ticket/321 +set $path_info $fastcgi_path_info; +fastcgi_param PATH_INFO $path_info; + +fastcgi_index index.php; +include fastcgi.conf; diff --git a/metrics/metrics-internal/nginx/snippets/snakeoil.conf b/metrics/metrics-internal/nginx/snippets/snakeoil.conf new file mode 100644 index 000000000..ad26c3e21 --- /dev/null +++ b/metrics/metrics-internal/nginx/snippets/snakeoil.conf @@ -0,0 +1,5 @@ +# Self signed certificates generated by the ssl-cert package +# Don't use them in a production server! + +ssl_certificate /etc/ssl/certs/ssl-cert-snakeoil.pem; +ssl_certificate_key /etc/ssl/private/ssl-cert-snakeoil.key; diff --git a/metrics/metrics-internal/nginx/uwsgi_params b/metrics/metrics-internal/nginx/uwsgi_params new file mode 100644 index 000000000..09c732cd6 --- /dev/null +++ b/metrics/metrics-internal/nginx/uwsgi_params @@ -0,0 +1,17 @@ + +uwsgi_param QUERY_STRING $query_string; +uwsgi_param REQUEST_METHOD $request_method; +uwsgi_param CONTENT_TYPE $content_type; +uwsgi_param CONTENT_LENGTH $content_length; + +uwsgi_param REQUEST_URI $request_uri; +uwsgi_param PATH_INFO $document_uri; +uwsgi_param DOCUMENT_ROOT $document_root; +uwsgi_param SERVER_PROTOCOL $server_protocol; +uwsgi_param REQUEST_SCHEME $scheme; +uwsgi_param HTTPS $https if_not_empty; + +uwsgi_param REMOTE_ADDR $remote_addr; +uwsgi_param REMOTE_PORT $remote_port; +uwsgi_param SERVER_PORT $server_port; +uwsgi_param SERVER_NAME $server_name; diff --git a/metrics/metrics-internal/nginx/win-utf b/metrics/metrics-internal/nginx/win-utf new file mode 100644 index 000000000..774fd9fc9 --- /dev/null +++ b/metrics/metrics-internal/nginx/win-utf @@ -0,0 +1,125 @@ +# This map is not a full windows-1251 <> utf8 map: it does not +# contain Serbian and Macedonian letters. 
If you need a full map, +# use contrib/unicode2nginx/win-utf map instead. + +charset_map windows-1251 utf-8 { + + 82 E2809A; # single low-9 quotation mark + + 84 E2809E; # double low-9 quotation mark + 85 E280A6; # ellipsis + 86 E280A0; # dagger + 87 E280A1; # double dagger + 88 E282AC; # euro + 89 E280B0; # per mille + + 91 E28098; # left single quotation mark + 92 E28099; # right single quotation mark + 93 E2809C; # left double quotation mark + 94 E2809D; # right double quotation mark + 95 E280A2; # bullet + 96 E28093; # en dash + 97 E28094; # em dash + + 99 E284A2; # trade mark sign + + A0 C2A0; #   + A1 D18E; # capital Byelorussian short U + A2 D19E; # small Byelorussian short u + + A4 C2A4; # currency sign + A5 D290; # capital Ukrainian soft G + A6 C2A6; # borken bar + A7 C2A7; # section sign + A8 D081; # capital YO + A9 C2A9; # (C) + AA D084; # capital Ukrainian YE + AB C2AB; # left-pointing double angle quotation mark + AC C2AC; # not sign + AD C2AD; # soft hypen + AE C2AE; # (R) + AF D087; # capital Ukrainian YI + + B0 C2B0; # ° + B1 C2B1; # plus-minus sign + B2 D086; # capital Ukrainian I + B3 D196; # small Ukrainian i + B4 D291; # small Ukrainian soft g + B5 C2B5; # micro sign + B6 C2B6; # pilcrow sign + B7 C2B7; # · + B8 D191; # small yo + B9 E28496; # numero sign + BA D194; # small Ukrainian ye + BB C2BB; # right-pointing double angle quotation mark + + BF D197; # small Ukrainian yi + + C0 D090; # capital A + C1 D091; # capital B + C2 D092; # capital V + C3 D093; # capital G + C4 D094; # capital D + C5 D095; # capital YE + C6 D096; # capital ZH + C7 D097; # capital Z + C8 D098; # capital I + C9 D099; # capital J + CA D09A; # capital K + CB D09B; # capital L + CC D09C; # capital M + CD D09D; # capital N + CE D09E; # capital O + CF D09F; # capital P + + D0 D0A0; # capital R + D1 D0A1; # capital S + D2 D0A2; # capital T + D3 D0A3; # capital U + D4 D0A4; # capital F + D5 D0A5; # capital KH + D6 D0A6; # capital TS + D7 D0A7; # capital CH + D8 D0A8; # capital SH + D9 D0A9; # capital SHCH + DA D0AA; # capital hard sign + DB D0AB; # capital Y + DC D0AC; # capital soft sign + DD D0AD; # capital E + DE D0AE; # capital YU + DF D0AF; # capital YA + + E0 D0B0; # small a + E1 D0B1; # small b + E2 D0B2; # small v + E3 D0B3; # small g + E4 D0B4; # small d + E5 D0B5; # small ye + E6 D0B6; # small zh + E7 D0B7; # small z + E8 D0B8; # small i + E9 D0B9; # small j + EA D0BA; # small k + EB D0BB; # small l + EC D0BC; # small m + ED D0BD; # small n + EE D0BE; # small o + EF D0BF; # small p + + F0 D180; # small r + F1 D181; # small s + F2 D182; # small t + F3 D183; # small u + F4 D184; # small f + F5 D185; # small kh + F6 D186; # small ts + F7 D187; # small ch + F8 D188; # small sh + F9 D189; # small shch + FA D18A; # small hard sign + FB D18B; # small y + FC D18C; # small soft sign + FD D18D; # small e + FE D18E; # small yu + FF D18F; # small ya +} diff --git a/metrics/metrics-internal/start.sh b/metrics/metrics-internal/start.sh new file mode 100644 index 000000000..d331a0fdd --- /dev/null +++ b/metrics/metrics-internal/start.sh @@ -0,0 +1,119 @@ +#!/bin/bash -ex +# +# (Re)starts the InfluxDB/Chronograf containers +# + +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +: "${INFLUXDB_IMAGE:=influxdb:1.7}" +: "${CHRONOGRAF_IMAGE:=chronograf:1.8.8}" +: "${GRAFANA_IMAGE:=grafana/grafana:8.3.1}" + +docker pull $INFLUXDB_IMAGE +docker pull $CHRONOGRAF_IMAGE +docker pull $GRAFANA_IMAGE + +for container in influxdb_internal chronograf_8888_internal 
chronograf_8889_internal grafana_internal; do + [[ -w /var/lib/$container ]] + [[ -x /var/lib/$container ]] + + ( + set +e + sudo docker kill $container + sudo docker rm -f $container + exit 0 + ) +done + +sudo docker network remove influxdb || true +sudo docker network create influxdb +pwd +rm -rf certs +mkdir -p certs +chmod 700 certs +sudo cp /etc/letsencrypt/live/"$HOST"/fullchain.pem certs/ +sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/ +sudo chmod 0444 certs/* +sudo chown buildkite-agent:buildkite-agent certs + +sudo docker run \ + --detach \ + --name=grafana_internal \ + --net=influxdb \ + --publish 3000:3000 \ + --user root:root \ + --env GF_PATHS_CONFIG=/grafana.ini \ + --volume "$PWD"/certs:/certs:ro \ + --volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \ + --volume /var/lib/grafana:/var/lib/grafana \ + --log-opt max-size=1g \ + --log-opt max-file=5 \ + $GRAFANA_IMAGE + +sudo docker run \ + --detach \ + --name=influxdb_internal \ + --net=influxdb \ + --publish 8086:8086 \ + --user "$(id -u):$(id -g)" \ + --volume "$PWD"/certs:/certs \ + --volume "$PWD"/influxdb.conf:/etc/influxdb/influxdb.conf:ro \ + --volume /var/lib/influxdb:/var/lib/influxdb \ + --log-opt max-size=1g \ + --log-opt max-file=5 \ + --cpus=10 \ + $INFLUXDB_IMAGE -config /etc/influxdb/influxdb.conf + +sleep 20s + +sudo docker run \ + --detach \ + --env AUTH_DURATION=24h \ + --env TLS_CERTIFICATE=/certs/fullchain.pem \ + --env TLS_PRIVATE_KEY=/certs/privkey.pem \ + --env GOOGLE_CLIENT_ID="$GOOGLE_CLIENT_ID_8889" \ + --env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8889" \ + --env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets \ + --env PUBLIC_URL=https://internal-metrics.solana.com:8889 \ + --env TOKEN_SECRET="$TOKEN_SECRET" \ + --env inactivity-duration=48h \ + --name=chronograf_8889_internal \ + --net=influxdb \ + --publish 8889:8888 \ + --user "$(id -u):$(id -g)" \ + --volume "$PWD"/certs:/certs \ + --volume /var/lib/chronograf_8889:/var/lib/chronograf \ + --log-opt max-size=1g \ + --log-opt max-file="5" \ + $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 + +sudo sudo docker run \ + --detach \ + --env AUTH_DURATION=24h \ + --env TLS_CERTIFICATE=/certs/fullchain.pem \ + --env TLS_PRIVATE_KEY=/certs/privkey.pem \ + --env GOOGLE_CLIENT_ID="$GOOGLE_CLIENT_ID_8888" \ + --env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8888" \ + --env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets \ + --env PUBLIC_URL=https://internal-metrics.solana.com:8888 \ + --env TOKEN_SECRET="$TOKEN_SECRET" \ + --env inactivity-duration=48h \ + --name=chronograf_8888_internal \ + --net=influxdb \ + --publish 8888:8888 \ + --user "$(id -u):$(id -g)" \ + --volume "$PWD"/certs:/certs \ + --volume /var/lib/chronograf:/var/lib/chronograf \ + --log-opt max-size=1g \ + --log-opt max-file="5" \ + $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 + +curl -h | sed -ne '/--tlsv/p' +curl --retry 10 --retry-delay 5 -v --head https://"$HOST":8086/ping + +exit 0 diff --git a/metrics/metrics-internal/status.sh b/metrics/metrics-internal/status.sh new file mode 100644 index 000000000..beb57a12a --- /dev/null +++ b/metrics/metrics-internal/status.sh @@ -0,0 +1,33 @@ +#!/bin/bash -ex +# +# Status of the InfluxDB/Chronograf/Grafana/Chronograf_8889 containers +# +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +echo +++ status +( + set -x + pwd + sudo docker ps --no-trunc --size + sudo du -hs /var/lib/{influxdb,chronograf,grafana} + df 
-h + free -h + uptime +) + +# If a container is not in a running state (or has exited), send a notification to Slack and Discord and redeploy it + +for container in influxdb_internal chronograf_8888_internal chronograf_8889_internal grafana_internal; do + if [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" != "running" ] || [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" = "exited" ]; then + curl -X POST -H 'Content-type: application/json' --data '{"text": "'"$container"' container is down in metrics-internal server"}' "$SLACK_WEBHOOK" + curl -X POST -H 'Content-type: application/json' --data '{"content": "'"$container"' container is down in metrics-internal server"}' "$DISCORD_WEBHOOK" + echo "Starting up script" + sudo bash $container.sh + sleep 30 + fi + done diff --git a/metrics/metrics-main/README.md b/metrics/metrics-main/README.md new file mode 100644 index 000000000..cbc0efb9f --- /dev/null +++ b/metrics/metrics-main/README.md @@ -0,0 +1,18 @@ +![image](https://user-images.githubusercontent.com/110216567/184346286-94e0b45f-19e9-4fc9-a1a3-2e50c6f12bf8.png) + +Services: +1. Prometheus +2. AlertManager +3. Chronograf2 (on port 8888) +4. Chronograf_8889 (on port 8889) +5. Grafana (on port 3000) +6. Grafana2 (on port 3001) +7. Kapacitor + +To install all of the services on the metrics-internal server, run the ./start.sh script. + +Install the Buildkite agent to run the pipeline that reports the status of the containers. + +If any of the containers is not in a running state (for example, it has exited), the pipeline redeploys that container. + +**Note:** If you delete or remove a container manually, you can also run the script to redeploy it. diff --git a/metrics/metrics-main/alertmanager-discord.sh b/metrics/metrics-main/alertmanager-discord.sh new file mode 100644 index 000000000..6941dbf22 --- /dev/null +++ b/metrics/metrics-main/alertmanager-discord.sh @@ -0,0 +1,40 @@ +#!/bin/bash -ex +# +# (Re)starts the Alertmanager containers +# + +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +: "${ALERTMANAGER_DISCORD_IMAGE:=benjojo/alertmanager-discord:latest}" + +# remove the container +container=alertmanager-discord +[[ -w /var/lib/$container ]] +[[ -x /var/lib/$container ]] + +( + set +e + sudo docker kill $container + sudo docker rm -f $container + exit 0 +) + +pwd +rm -rf certs +mkdir -p certs +chmod 700 certs +sudo cp /etc/letsencrypt/live/"$HOST"/fullchain.pem certs/ +sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/ +sudo chmod 0444 certs/* + +# (Re) start the Alertmanager container +sudo docker run -it -d \ + --publish 9094:9094 \ + --name=alertmanager-discord \ + --env DISCORD_WEBHOOK="$DISCORD_WEBHOOK_ALERTMANAGER" \ + $ALERTMANAGER_DISCORD_IMAGE diff --git a/metrics/metrics-main/alertmanager.sh b/metrics/metrics-main/alertmanager.sh new file mode 100644 index 000000000..0eba86297 --- /dev/null +++ b/metrics/metrics-main/alertmanager.sh @@ -0,0 +1,44 @@ +#!/bin/bash -ex +# +# (Re)starts the Alertmanager containers +# + +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +: "${ALERTMANAGER_IMAGE:=prom/alertmanager:v0.23.0}" + +# remove the container +container=alertmanager +[[ -w /var/lib/$container ]] +[[ -x /var/lib/$container ]] + +( + set +e + sudo docker kill $container + sudo docker rm -f $container + exit 0 +) + +pwd +rm -rf certs +mkdir -p certs +chmod 700 certs +sudo cp 
/etc/letsencrypt/live/"$HOST"/fullchain.pem certs/ +sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/ +sudo chmod 0444 certs/* +sudo chown buildkite-agent:buildkite-agent certs + + +# (Re) start the Alertmanager container +sudo docker run -it -d \ + --user root:root \ + --publish 9093:9093 \ + --name=alertmanager \ + --volume "$PWD"/alertmanager.yml:/etc/alertmanager/alertmanager.yml \ + --volume /etc/hosts:/etc/hosts \ + $ALERTMANAGER_IMAGE diff --git a/metrics/metrics-main/alertmanager.yml b/metrics/metrics-main/alertmanager.yml new file mode 100644 index 000000000..e15a8f2f1 --- /dev/null +++ b/metrics/metrics-main/alertmanager.yml @@ -0,0 +1,11 @@ +route: + group_by: [AlertMe] + # If an alert isn't caught by a route, send it to the pager. + receiver: discord_webhook + +receivers: +- name: 'discord_webhook' + #pagerduty_configs: + #- service_key: cde8232f1c6d4f09c0884c5b0e5d5f86 + webhook_configs: + - url: 'http://10.128.0.11:9094' diff --git a/metrics/metrics-main/chronograf.sh b/metrics/metrics-main/chronograf.sh new file mode 100644 index 000000000..423c008ed --- /dev/null +++ b/metrics/metrics-main/chronograf.sh @@ -0,0 +1,58 @@ +#!/bin/bash -ex +# +# (Re)starts the Chronograf containers +# + +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +: "${CHRONOGRAF_IMAGE:=chronograf:1.9.4}" + +# remove the container
container=chronograf +[[ -w /var/lib/$container ]] +[[ -x /var/lib/$container ]] + +( + set +e + sudo docker kill $container + sudo docker rm -f $container + exit 0 +) + +pwd +rm -rf certs +mkdir -p certs +chmod 700 certs +sudo cp /etc/letsencrypt/live/"$HOST"/fullchain.pem certs/ +sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/ +sudo chmod 0444 certs/* +sudo chown buildkite-agent:buildkite-agent certs + + + +#(Re) start the container +sudo docker run \ + --detach \ + --env AUTH_DURATION=24h \ + --env inactivity-duration=48h \ + --env GOOGLE_CLIENT_ID="$GOOGLE_CLIENT_ID_8888" \ + --env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8888" \ + --env PUBLIC_URL=https://metrics.solana.com:8888 \ + --env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets,influxdata.com,solana.org \ + --env TLS_CERTIFICATE=/certs/fullchain.pem \ + --env TLS_PRIVATE_KEY=/certs/privkey.pem \ + --env TOKEN_SECRET="$TOKEN_SECRET" \ + --name=chronograf \ + --net=influxdb \ + --publish 8888:8888 \ + --user 0:0 \ + --volume "$PWD"/certs:/certs \ + --volume /var/lib/chronograf:/var/lib/chronograf \ + --log-opt max-size=1g \ + --log-opt max-file=5 \ + chronograf:1.8.8 --influxdb-url=https://metrics.solana.com:8086 diff --git a/metrics/metrics-main/chronograf_8889.sh b/metrics/metrics-main/chronograf_8889.sh new file mode 100644 index 000000000..54b8ef40d --- /dev/null +++ b/metrics/metrics-main/chronograf_8889.sh @@ -0,0 +1,55 @@ +#!/bin/bash -ex +# +# (Re)starts the Chronograf_8889 containers +# + +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +: "${CHRONOGRAF_IMAGE:=chronograf:1.9.4}" + +# remove the container +container=chronograf_8889 +[[ -w /var/lib/$container ]] +[[ -x /var/lib/$container ]] + +( + set +e + sudo docker kill $container + sudo docker rm -f $container + exit 0 +) + +pwd +rm -rf certs +mkdir -p certs +chmod 700 certs +sudo cp /etc/letsencrypt/live/"$HOST"/fullchain.pem certs/ +sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/ +sudo chmod 0444 certs/* +sudo chown buildkite-agent:buildkite-agent certs + +# (Re) start the container +sudo docker 
run \ + --detach \ + --name=chronograf_8889 \ + --env AUTH_DURATION=24h \ + --env GOOGLE_CLIENT_ID="$GOOGLE_CLIENT_ID_8889" \ + --env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8889" \ + --env PUBLIC_URL=https://metrics.solana.com:8889 \ + --env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets,influxdata.com,solana.org \ + --env TOKEN_SECRET="$TOKEN_SECRET" \ + --env TLS_PRIVATE_KEY=/certs/privkey.pem \ + --env TLS_CERTIFICATE=/certs/fullchain.pem \ + --env inactivity-duration=48h \ + --publish 8889:8888 \ + --user "$(id -u):$(id -g)" \ + --volume "$PWD"/certs:/certs \ + --volume /var/lib/chronograf_8889:/var/lib/chronograf \ + --log-opt max-size=1g \ + --log-opt max-file="5" \ + $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD" diff --git a/metrics/metrics-main/first_rules.yml b/metrics/metrics-main/first_rules.yml new file mode 100644 index 000000000..2ac99f36c --- /dev/null +++ b/metrics/metrics-main/first_rules.yml @@ -0,0 +1,101 @@ +groups: + - name: AllInstances + rules: + - alert: InstanceDown + expr: up == 0 + for: 1m + annotations: + title: 'Instance {{ $labels.instance }} down' + description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.' + labels: + severity: critical + + - alert: CpuLoad Warning + expr: node_load15 / (count without (cpu, mode) (node_cpu_seconds_total{mode="system"})) > 1 + for: 10m + labels: + severity: warning + annotations: + title: 'Instance {{ $labels.instance }} Warning' + summary: "CPU load (instance {{ $labels.instance }})" + description: "CPU load (15m) is high\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: CpuLoad Critical + expr: node_load15 / (count without (cpu, mode) (node_cpu_seconds_total{mode="system"})) > 2 + for: 10m + labels: + severity: critical + annotations: + title: 'Instance {{ $labels.instance }} CpuLoad is Critical' + summary: "CPU load (instance {{ $labels.instance }})" + description: "CPU load (15m) is high\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: OutOfMemory + expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 20 + for: 10m + labels: + severity: warning + annotations: + title: 'Instance {{ $labels.instance }} OutOfMemory warning' + summary: "Out of memory (instance {{ $labels.instance }})" + description: "Node memory is filling up (< 20% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: OutOfMemory + expr: (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 10 + for: 10m + labels: + severity: critical + annotations: + title: 'Instance {{ $labels.instance }} OutOfMemory critical' + summary: "Out of memory (instance {{ $labels.instance }})" + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: OutOfDiskSpace>80 + expr: node_filesystem_free_bytes{mountpoint ="/"} / node_filesystem_size_bytes{mountpoint ="/"} * 100 < 20 + for: 10m + labels: + severity: warning + annotations: + title: 'Instance {{ $labels.instance }} Disk space more than 80%' + summary: "Out of disk space (instance {{ $labels.instance }})" + description: "Disk is almost full (< 20% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: OutOfDiskSpace>90 + expr: node_filesystem_free_bytes{mountpoint ="/"} / node_filesystem_size_bytes{mountpoint ="/"} * 100 < 
10 + for: 10m + labels: + severity: critical + annotations: + title: 'Instance {{ $labels.instance }} Disk space more than 90%' + summary: "Out of disk space (instance {{ $labels.instance }})" + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: InfluxdbOutOfDiskSpace>80 + expr: (node_filesystem_avail_bytes{job=~"Influx-Data|Influx-Meta", mountpoint="/var/lib/influxdb"} * 100) / node_filesystem_size_bytes{job=~"Influx-Data|Influx-Meta", mountpoint="/var/lib/influxdb"} < 20 and ON (instance, device, mountpoint) node_filesystem_readonly{job=~"Influx-Data|Influx-Meta", mountpoint="/var/lib/influxdb"} == 0 + for: 10m + labels: + severity: critical + annotations: + title: 'Influxdb Instance {{ $labels.instance }} Disk space more than 80%' + summary: "Out of disk space (instance {{ $labels.instance }})" + description: "Disk is almost full (< 20% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: InfluxdbOutOfMemory>80 + expr: node_memory_MemAvailable_bytes{job=~"Influx-Data|Influx-Meta"} / node_memory_MemTotal_bytes{job=~"Influx-Data|Influx-Meta"} * 100 < 20 + for: 10m + labels: + severity: critical + annotations: + title: 'Influxdb Instance {{ $labels.instance }} OutOfMemory critical' + summary: "Out of memory (instance {{ $labels.instance }})" + description: "Node memory is filling up (< 20% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: InfluxdbServiceInactive + expr: node_systemd_unit_state{job=~"Influx-Data|Influx-Meta",name=~"influxdb-meta.service|influxdb.service",state="active"} == 0 + for: 10m + labels: + severity: critical + annotations: + title: 'Service {{ $labels.name }} is inactive in the Instance {{ $labels.instance }} ' + summary: "Inactive Service (instance {{ $labels.instance }})" + description: "Service is Inactive \n VALUE = {{ $value }}\n LABELS: {{ $labels }}" diff --git a/metrics/metrics-main/grafana-metrics.solana.com.ini b/metrics/metrics-main/grafana-metrics.solana.com.ini new file mode 100644 index 000000000..e9d294b80 --- /dev/null +++ b/metrics/metrics-main/grafana-metrics.solana.com.ini @@ -0,0 +1,454 @@ +##################### Grafana Configuration Example ##################### +# +# Everything has defaults so you only need to uncomment things you want to +# change + +# possible values : production, development +;app_mode = production + +# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty +;instance_name = ${HOSTNAME} + +#################################### Paths #################################### +[paths] +# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used) +;data = /var/lib/grafana + +# Directory where grafana can store logs +;logs = /var/log/grafana + +# Directory where grafana will automatically scan and look for plugins +;plugins = /var/lib/grafana/plugins + +# folder that contains provisioning config files that grafana will apply on startup and while running. 
+;provisioning = conf/provisioning + +#################################### Server #################################### +[server] +# Protocol (http, https, socket) +protocol = https + +# The ip address to bind to, empty will bind to all interfaces +;http_addr = + +# The http port to use +;http_port = 3000 + +# The public facing domain name used to access grafana from a browser +domain = metrics.solana.com + +# Redirect to correct domain if host header does not match domain +# Prevents DNS rebinding attacks +;enforce_domain = false + +# The full public facing url you use in browser, used for redirects and emails +# If you use reverse proxy and sub path specify full url (with sub path) +;root_url = http://metrics.solana.com:3000 + +# Log web requests +;router_logging = false + +# the path relative working path +;static_root_path = public + +# enable gzip +;enable_gzip = false + +# https certs & key file +cert_file = /certs/fullchain.pem +cert_key = /certs/privkey.pem + +# Unix socket path +;socket = + +#################################### Database #################################### +[database] +# You can configure the database connection by specifying type, host, name, user and password +# as separate properties or as on string using the url properties. + +# Either "mysql", "postgres" or "sqlite3", it's your choice +;type = sqlite3 +;host = 127.0.0.1:3306 +;name = grafana +;user = root +# If the password contains # or ; you have to wrap it with triple quotes. Ex """#password;""" +;password = + +# Use either URL or the previous fields to configure the database +# Example: mysql://user:secret@host:port/database +;url = + +# For "postgres" only, either "disable", "require" or "verify-full" +;ssl_mode = disable + +# For "sqlite3" only, path relative to data_path setting +;path = grafana.db + +# Max idle conn setting default is 2 +;max_idle_conn = 2 + +# Max conn setting default is 0 (mean not set) +;max_open_conn = + +# Connection Max Lifetime default is 14400 (means 14400 seconds or 4 hours) +;conn_max_lifetime = 14400 + +# Set to true to log the sql calls and execution times. +log_queries = + +#################################### Session #################################### +[session] +# Either "memory", "file", "redis", "mysql", "postgres", default is "file" +;provider = file + +# Provider config options +# memory: not have any config yet +# file: session dir path, is relative to grafana data_path +# redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=grafana` +# mysql: go-sql-driver/mysql dsn config string, e.g. `user:password@tcp(127.0.0.1:3306)/database_name` +# postgres: user=a password=b host=localhost port=5432 dbname=c sslmode=disable +;provider_config = sessions + +# Session cookie name +;cookie_name = grafana_sess + +# If you use session in https only, default is false +;cookie_secure = false + +# Session life time, default is 86400 +;session_life_time = 86400 + +#################################### Data proxy ########################### +[dataproxy] + +# This enables data proxy logging, default is false +;logging = false + +#################################### Analytics #################################### +[analytics] +# Server reporting, sends usage counters to stats.grafana.org every 24 hours. +# No ip addresses are being tracked, only simple counters to track +# running instances, dashboard and error counts. It is very helpful to us. +# Change this option to false to disable reporting. 
+;reporting_enabled = true + +# Set to false to disable all checks to https://grafana.net +# for new vesions (grafana itself and plugins), check is used +# in some UI views to notify that grafana or plugin update exists +# This option does not cause any auto updates, nor send any information +# only a GET request to http://grafana.com to get latest versions +;check_for_updates = true + +# Google Analytics universal tracking code, only enabled if you specify an id here +;google_analytics_ua_id = + +#################################### Security #################################### +[security] +# default admin user, created on startup +admin_user = $ADMIN_USER_GRAFANA + +# default admin password, can be changed before first start of grafana, or in profile settings +admin_password = $ADMIN_PASSWORD_GRAFANA +# used for signing +;secret_key = SW2YcwTIb9zpOOhoPsMm + +# Auto-login remember days +;login_remember_days = 7 +;cookie_username = grafana_user +;cookie_remember_name = grafana_remember + +# disable gravatar profile images +;disable_gravatar = false + +# data source proxy whitelist (ip_or_domain:port separated by spaces) +;data_source_proxy_whitelist = + +# disable protection against brute force login attempts +;disable_brute_force_login_protection = false + +#################################### Snapshots ########################### +[snapshots] +# snapshot sharing options +;external_enabled = true +;external_snapshot_url = https://snapshots-origin.raintank.io +;external_snapshot_name = Publish to snapshot.raintank.io + +# remove expired snapshot +;snapshot_remove_expired = true + +#################################### Dashboards History ################## +[dashboards] +# Number dashboard versions to keep (per dashboard). Default: 20, Minimum: 1 +;versions_to_keep = 20 + +#################################### Users ############################### +[users] +# disable user signup / registration +;allow_sign_up = true + +# Allow non admin users to create organizations +;allow_org_create = true + +# Set to true to automatically assign new users to the default organization (id 1) +auto_assign_org = true + +# Default role new users will be automatically assigned (if disabled above is set to true) +auto_assign_org_role = Editor + +# Background text for the user field on the login page +;login_hint = email or username + +# Default UI theme ("dark" or "light") +;default_theme = dark + +# External user management, these options affect the organization users view +;external_manage_link_url = +;external_manage_link_name = +;external_manage_info = + +# Viewers can edit/inspect dashboard settings in the browser. But not save the dashboard. +viewers_can_edit = false + +[auth] +# Set to true to disable (hide) the login form, useful if you use OAuth, defaults to false +;disable_login_form = false + +# Set to true to disable the signout link in the side menu. 
useful if you use auth.proxy, defaults to false +;disable_signout_menu = false + +# URL to redirect the user to after sign out +;signout_redirect_url = + +#################################### Anonymous Auth ########################## +[auth.anonymous] +# enable anonymous access +enabled = true + +# specify organization name that should be used for unauthenticated users +org_name = Solana Public + +# specify role for unauthenticated users +org_role = Viewer + +#################################### Github Auth ########################## +[auth.github] +enabled = true +allow_sign_up = true +client_id = $GITHUB_CLIENT_ID +client_secret = $GITHUB_CLIENT_SECRET +scopes = user:email,read:org +auth_url = https://github.com/login/oauth/authorize +token_url = https://github.com/login/oauth/access_token +api_url = https://api.github.com/user +;team_ids = +allowed_organizations = solana-labs + +#################################### Google Auth ########################## +[auth.google] +;enabled = false +;allow_sign_up = true +;client_id = some_client_id +;client_secret = some_client_secret +;scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email +;auth_url = https://accounts.google.com/o/oauth2/auth +;token_url = https://accounts.google.com/o/oauth2/token +;api_url = https://www.googleapis.com/oauth2/v1/userinfo +;allowed_domains = + +#################################### Generic OAuth ########################## +[auth.generic_oauth] +;enabled = false +;name = OAuth +;allow_sign_up = true +;client_id = some_id +;client_secret = some_secret +;scopes = user:email,read:org +;auth_url = https://foo.bar/login/oauth/authorize +;token_url = https://foo.bar/login/oauth/access_token +;api_url = https://foo.bar/user +;team_ids = +;allowed_organizations = + +#################################### Grafana.com Auth #################### +[auth.grafana_com] +;enabled = false +;allow_sign_up = true +;client_id = some_id +;client_secret = some_secret +;scopes = user:email +;allowed_organizations = + +#################################### Auth Proxy ########################## +[auth.proxy] +;enabled = false +;header_name = X-WEBAUTH-USER +;header_property = username +;auto_sign_up = true +;ldap_sync_ttl = 60 +;whitelist = 192.168.1.1, 192.168.2.1 + +#################################### Basic Auth ########################## +[auth.basic] +;enabled = true + +#################################### Auth LDAP ########################## +[auth.ldap] +;enabled = false +;config_file = /etc/grafana/ldap.toml +;allow_sign_up = true + +#################################### SMTP / Emailing ########################## +[smtp] +;enabled = false +;host = localhost:25 +;user = +# If the password contains # or ; you have to wrap it with trippel quotes. Ex """#password;""" +;password = +;cert_file = +;key_file = +;skip_verify = false +;from_address = admin@grafana.localhost +;from_name = Grafana +# EHLO identity in SMTP dialog (defaults to instance_name) +;ehlo_identity = dashboard.example.com + +[emails] +;welcome_email_on_sign_up = false + +#################################### Logging ########################## +[log] +# Either "console", "file", "syslog". Default is console and file +# Use space to separate multiple modes, e.g. "console file" +;mode = console file + +# Either "debug", "info", "warn", "error", "critical", default is "info" +;level = info + +# optional settings to set different levels for specific loggers. 
Ex filters = sqlstore:debug +;filters = + +# For "console" mode only +[log.console] +;level = + +# log line format, valid options are text, console and json +;format = console + +# For "file" mode only +[log.file] +;level = + +# log line format, valid options are text, console and json +;format = text + +# This enables automated log rotate(switch of following options), default is true +;log_rotate = true + +# Max line number of single file, default is 1000000 +;max_lines = 1000000 + +# Max size shift of single file, default is 28 means 1 << 28, 256MB +;max_size_shift = 28 + +# Segment log daily, default is true +;daily_rotate = true + +# Expired days of log file(delete after max days), default is 7 +;max_days = 7 + +[log.syslog] +;level = + +# log line format, valid options are text, console and json +;format = text + +# Syslog network type and address. This can be udp, tcp, or unix. If left blank, the default unix endpoints will be used. +;network = +;address = + +# Syslog facility. user, daemon and local0 through local7 are valid. +;facility = + +# Syslog tag. By default, the process' argv[0] is used. +;tag = + +#################################### Alerting ############################ +[alerting] +# Disable alerting engine & UI features +;enabled = true +# Makes it possible to turn off alert rule execution but alerting UI is visible +;execute_alerts = true + +#################################### Explore ############################# +[explore] +# Enable the Explore section +enabled = false + +#################################### Internal Grafana Metrics ########################## +# Metrics available at HTTP API Url /metrics +[metrics] +# Disable / Enable internal metrics +;enabled = true + +# Publish interval +;interval_seconds = 10 + +# Send internal metrics to Graphite +[metrics.graphite] +# Enable by setting the address setting (ex localhost:2003) +;address = +;prefix = prod.grafana.%(instance_name)s. + +#################################### Distributed tracing ############ +[tracing.jaeger] +# Enable by setting the address sending traces to jaeger (ex localhost:6831) +;address = localhost:6831 +# Tag that will always be included in when creating new spans. ex (tag1:value1,tag2:value2) +;always_included_tag = tag1:value1 +# Type specifies the type of the sampler: const, probabilistic, rateLimiting, or remote +;sampler_type = const +# jaeger samplerconfig param +# for "const" sampler, 0 or 1 for always false/true respectively +# for "probabilistic" sampler, a probability between 0 and 1 +# for "rateLimiting" sampler, the number of spans per second +# for "remote" sampler, param is the same as for "probabilistic" +# and indicates the initial sampling rate before the actual one +# is received from the mothership +;sampler_param = 1 + +#################################### Grafana.com integration ########################## +# Url used to to import dashboards directly from Grafana.com +[grafana_com] +;url = https://grafana.com + +#################################### External image storage ########################## +[external_image_storage] +# Used for uploading images to public servers so they can be included in slack/email messages. 
+# you can choose between (s3, webdav, gcs, azure_blob, local) +;provider = + +[external_image_storage.s3] +;bucket = +;region = +;path = +;access_key = +;secret_key = + +[external_image_storage.webdav] +;url = +;public_url = +;username = +;password = + +[external_image_storage.gcs] +;key_file = +;bucket = +;path = + +[external_image_storage.azure_blob] +;account_name = +;account_key = +;container_name = + +[external_image_storage.local] +# does not require any configuration diff --git a/metrics/metrics-main/grafana.sh b/metrics/metrics-main/grafana.sh new file mode 100644 index 000000000..6e6044410 --- /dev/null +++ b/metrics/metrics-main/grafana.sh @@ -0,0 +1,49 @@ +#!/bin/bash -ex +# +# (Re)starts the Grafana containers +# + +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +: "${GRAFANA_IMAGE:=grafana/grafana:9.4.7}" + +# remove the container +container=grafana +[[ -w /var/lib/$container ]] +[[ -x /var/lib/$container ]] + +( + set +e + sudo docker kill $container + sudo docker rm -f $container + exit 0 +) + +pwd +rm -rf certs +mkdir -p certs +chmod 700 certs +sudo cp /etc/letsencrypt/live/"$HOST"/fullchain.pem certs/ +sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/ +sudo chmod 0444 certs/* + + +# (Re) start the container +sudo docker run \ + --detach \ + --name=grafana \ + --net=influxdb \ + --publish 3000:3000 \ + --user root:root \ + --env GF_PATHS_CONFIG=/grafana.ini \ + --volume "$PWD"/certs:/certs:ro \ + --volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \ + --volume /var/lib/grafana:/var/lib/grafana \ + --log-opt max-size=1g \ + --log-opt max-file=5 \ + $GRAFANA_IMAGE diff --git a/metrics/metrics-main/host.sh b/metrics/metrics-main/host.sh new file mode 100644 index 000000000..291be4928 --- /dev/null +++ b/metrics/metrics-main/host.sh @@ -0,0 +1,6 @@ +# |source| me + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" diff --git a/metrics/metrics-main/kapacitor.conf b/metrics/metrics-main/kapacitor.conf new file mode 100644 index 000000000..e1a51674a --- /dev/null +++ b/metrics/metrics-main/kapacitor.conf @@ -0,0 +1,274 @@ +hostname = "cbb0e482c7a5" +data_dir = "/var/lib/kapacitor" +skip-config-overrides = false +default-retention-policy = "" + +[http] + bind-address = ":9092" + auth-enabled = false + log-enabled = true + write-tracing = false + pprof-enabled = false + https-enabled = false + https-certificate = "/etc/ssl/kapacitor.pem" + https-private-key = "" + shutdown-timeout = "10s" + shared-secret = "" + +[replay] + dir = "/var/lib/kapacitor/replay" + +[storage] + boltdb = "/var/lib/kapacitor/kapacitor.db" + +[task] + dir = "/root/.kapacitor/tasks" + snapshot-interval = "1m0s" + +[load] + enabled = false + dir = "/root/.kapacitor/load" + +[[influxdb]] + enabled = true + name = "default" + default = false + # urls = ["https://metrics.solana.com:8089"] + urls = ["http://35.224.128.87:8086"] + username = "$KAPACITOR_USERNAME" + password = "$KAPACITOR_PASSWORD" + ssl-ca = "" + ssl-cert = "" + ssl-key = "" + insecure-skip-verify = false + timeout = "0s" + disable-subscriptions = false + subscription-protocol = "http" + subscription-mode = "cluster" + kapacitor-hostname = "" + http-port = 0 + udp-bind = "" + udp-buffer = 1000 + udp-read-buffer = 0 + startup-timeout = "5m0s" + subscriptions-sync-interval = "1m0s" + [influxdb.excluded-subscriptions] + _kapacitor = ["autogen"] + + +[fluxtask] + # Configure flux tasks for kapacitor + enabled = true + # The InfluxDB instance name (from the [[influxdb]] 
config section) + # to store historical task run data in + # Not recommended: use "none" to turn off historical task run data storage. + task-run-influxdb = "default" + # Bucket to store historical task run data in. We recommend leaving this empty; by default, data is written to the `kapacitor_fluxtask_logs` bucket or database. + # If you have multiple Kapacitor instances and want to keep your data separate, specify the InfluxDB 2.x bucket or InfluxDB 1.x database to write to. For InfluxDB 1.x, use the `"mydb"` convention--the `"mydb/rp"` convention with the retention policy is not supported. + task-run-bucket="kapacitor_fluxtask_logs" + # The organization name or ID if storing historical task run data + # in InfluxDB 2.x or InfluxDB Cloud + task-run-org = "" + task-run-orgid = "" + # The measurement name for the historical task run data + task-run-measurement = "runs" + +[logging] + file = "STDERR" + level = "DEBUG" + +[config-override] + enabled = true + +[collectd] + enabled = false + bind-address = ":25826" + database = "collectd" + retention-policy = "" + batch-size = 5000 + batch-pending = 10 + batch-timeout = "10s" + read-buffer = 0 + typesdb = "/usr/share/collectd/types.db" + +[opentsdb] + enabled = false + bind-address = ":4242" + database = "opentsdb" + retention-policy = "" + consistency-level = "one" + tls-enabled = false + certificate = "/etc/ssl/influxdb.pem" + batch-size = 1000 + batch-pending = 5 + batch-timeout = "1s" + log-point-errors = true + +[alerta] + enabled = false + url = "" + insecure-skip-verify = false + token = "" + token-prefix = "" + environment = "" + origin = "" + timeout = "0s" + +[hipchat] + enabled = false + url = "" + token = "" + room = "" + global = false + state-changes-only = false + +[[kafka]] + enabled = false + id = "default" + timeout = "0s" + batch-size = 0 + batch-timeout = "0s" + use-ssl = false + ssl-ca = "" + ssl-cert = "" + ssl-key = "" + insecure-skip-verify = false + +[[mqtt]] + enabled = false + name = "default" + default = false + url = "" + ssl-ca = "" + ssl-cert = "" + ssl-key = "" + insecure-skip-verify = false + client-id = "" + username = "" + password = "" + +[opsgenie] + enabled = false + api-key = "" + url = "https://api.opsgenie.com/v1/json/alert" + recovery_url = "https://api.opsgenie.com/v1/json/alert/note" + global = false + +[opsgenie2] + enabled = false + api-key = "" + url = "https://api.opsgenie.com/v2/alerts" + recovery_action = "notes" + global = false + +[pagerduty] + enabled = false + url = "https://events.pagerduty.com/generic/2010-04-15/create_event.json" + service-key = "" + global = false + +[pagerduty2] + enabled = false + url = "https://events.pagerduty.com/v2/enqueue" + routing-key = "" + global = false + +[pushover] + enabled = false + token = "" + user-key = "" + url = "https://api.pushover.net/1/messages.json" + +[[httppost]] + endpoint = "example" + url = "http://example.com" + alert-template = "" + alert-template-file = "" + row-template = "" + row-template-file = "" + [httppost.basic-auth] + username = "" + password = "" + +[smtp] + enabled = false + host = "localhost" + port = 25 + username = "" + password = "" + no-verify = false + global = false + state-changes-only = false + from = "" + idle-timeout = "30s" + +[snmptrap] + enabled = false + addr = "localhost:162" + community = "kapacitor" + retries = 1 + +[sensu] + enabled = false + addr = "" + source = "Kapacitor" + +[[slack]] + enabled = false + default = true + workspace = "" + url = "" + channel = "" + username = "kapacitor" + icon-emoji = "" + 
global = false + state-changes-only = false + ssl-ca = "" + ssl-cert = "" + ssl-key = "" + insecure-skip-verify = false + +[talk] + enabled = false + url = "" + author_name = "" + +[telegram] + enabled = false + url = "https://api.telegram.org/bot" + token = "" + chat-id = "" + parse-mode = "" + disable-web-page-preview = false + disable-notification = false + global = false + state-changes-only = false + +[victorops] + enabled = false + api-key = "" + routing-key = "" + url = "https://alert.victorops.com/integrations/generic/20131114/alert" + global = false + json-data = false + +[reporting] + enabled = true + url = "https://usage.influxdata.com" + +[stats] + enabled = true + stats-interval = "10s" + database = "_kapacitor" + retention-policy = "autogen" + timing-sample-rate = 0.1 + timing-movavg-size = 1000 + +[udf] + +[deadman] + interval = "10s" + threshold = 0.0 + id = "{{ .Group }}:NODE_NAME for task '{{ .TaskName }}'" + message = "{{ .ID }} is {{ if eq .Level \"OK\" }}alive{{ else }}dead{{ end }}: {{ index .Fields \"emitted\" | printf \"%0.3f\" }} points/INTERVAL." + global = false diff --git a/metrics/metrics-main/kapacitor.sh b/metrics/metrics-main/kapacitor.sh new file mode 100644 index 000000000..f4a964458 --- /dev/null +++ b/metrics/metrics-main/kapacitor.sh @@ -0,0 +1,37 @@ +#!/bin/bash -ex +# +# (Re)starts the Kapacitor container +# + +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +: "${KAPACITOR_IMAGE:=kapacitor:1.6.5}" + +# remove the container +container=kapacitor +[[ -w /var/lib/$container ]] +[[ -x /var/lib/$container ]] + +( + set +e + sudo docker kill $container + sudo docker rm -f $container + exit 0 +) + +#running influx kapacitor service +sudo docker run \ + --detach \ + --name=kapacitor \ + --publish 9092:9092 \ + --volume "$PWD"/kapacitor.conf:/etc/kapacitor/kapacitor.conf \ + --volume /var/lib/kapacitor:/var/lib/kapacitor \ + --user "$(id -u):$(id -g)" \ + --log-opt max-size=1g \ + --log-opt max-file=5 \ + $KAPACITOR_IMAGE diff --git a/metrics/metrics-main/prometheus.sh b/metrics/metrics-main/prometheus.sh new file mode 100644 index 000000000..fd6bd2eb9 --- /dev/null +++ b/metrics/metrics-main/prometheus.sh @@ -0,0 +1,45 @@ +#!/bin/bash -ex +# +# (Re)starts the Prometheus containers +# + +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +: "${PROMETHEUS_IMAGE:=prom/prometheus:v2.28.0}" + +# remove the container +container=prometheus +[[ -w /var/lib/$container ]] +[[ -x /var/lib/$container ]] + +( + set +e + sudo docker kill $container + sudo docker rm -f $container + exit 0 +) + +pwd +rm -rf certs +mkdir -p certs +chmod 700 certs +sudo cp /etc/letsencrypt/live/"$HOST"/fullchain.pem certs/ +sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/ +sudo chmod 0444 certs/* + + +# (Re) start prometheus container +sudo docker run -it -d \ + --user root:root \ + --publish 9090:9090 \ + --name=prometheus \ + --volume "$PWD"/prometheus.yml:/etc/prometheus/prometheus.yml \ + --volume "$PWD"/first_rules.yml:/etc/prometheus/first_rules.yml \ + --volume /prometheus/prometheus/data:/prometheus \ + --volume /etc/hosts:/etc/hosts \ + $PROMETHEUS_IMAGE diff --git a/metrics/metrics-main/prometheus.yml b/metrics/metrics-main/prometheus.yml new file mode 100644 index 000000000..19ff6ed23 --- /dev/null +++ b/metrics/metrics-main/prometheus.yml @@ -0,0 +1,74 @@ +# my global config +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. 
Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). + +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + - '35.206.116.166:9093' + - '10.128.0.11:9093' + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: + - "first_rules.yml" + # - "second_rules.yml" + +# A scrape configuration containing exactly one endpoint to scrape: +# Here it's Prometheus itself. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + + - job_name: 'Devnet' + scrape_interval: 15s + scrape_timeout: 14s + static_configs: + - targets: ['devnet-watchtower:9100','devnet-entrypoint:9100','devnet-validator-asia-sg1:9100','devnet-validator-europe-fr2:9100','devnet-validator-us-da11:9100','devnet-validator-us-ny5:9100','devnet-warehouse-us-ny5:9100','devnet-warehouse-asia-ty11:9100'] + + - job_name: 'Testnet' + scrape_interval: 15s + scrape_timeout: 14s + static_configs: + - targets: ['testnet-watchtower:9100','testnet-entrypoint:9100','testnet-validator-us-sv15:9100','testnet-warehouse-us-sv15:9100','testnet-warehouse-asia-ty11:9100'] + + - job_name: 'Mainnet-Beta' + scrape_interval: 15s + scrape_timeout: 14s + static_configs: + - targets: ['mainnet-watchtower:9100','mainnet-entrypoint:9100','beta-validator-us-ny5:9100','edge-validator-us-sv15:9100','validator-asia-sg1:9100','validator-europe-fr2:9100','validator-us-ny5:9100','validator-us-sv15:9100','warehouse-asia-sg1:9100','warehouse-europe-fr2:9100','warehouse-us-ny5:9100','warehouse-eu-ld7-aws:9100','warehouse-us-da11-aws:9100','warehouse-eu-ld7-azure:9100','warehouse-us-da11-azure:9100'] + + - job_name: 'Canaries' + scrape_interval: 15s + scrape_timeout: 14s + static_configs: + - targets: ['edge-validator-us-sv15:9100','beta-validator-us-ny5:9100','canary-am6-1:9100', 'canary-am6-2:9100','canary-da11-1:9100', 'canary-ny5-1:9100', 'canary-ny5-2:9100', 'canary-sg1-1:9100', 'canary-sv15-1:9100'] + + - job_name: 'Shared-Development' + scrape_interval: 15s + scrape_timeout: 14s + static_configs: + - targets: ['dev-server-us-da11:9100','dev-server-us-ny5:9100','dev-server-us-sv15:9100','dev-server-eu-am6:9100','dev-server-asia-sg1:9100','dev-server-us-da11-2:9100','dev-server-us-da11-3:9100','dev-server-us-da11-4:9100','dev-server-us-da11-5:9100','dev-server-asia-hk2:9100','dev-server-asia-sg1-2:9100','dev-server-asia-ty11:9100','dev-server-eu-am6-2:9100','dev-server-asia-sg1-2:9100','dev-server-us-sv15-2:9100','dev-server-us-da11-6:9100','dev-server-us-da11-7:9100','dev-server-eu-ld7-1:9100','dev-server-us-da11-8:9100','dev-server-eu-ld7-2:9100'] + + - job_name: 'Development' + scrape_interval: 15s + scrape_timeout: 14s + static_configs: + - targets: 
['dev-equinix-washington-36:9100','dev-equinix-washington-35:9100','dev-equinix-washington-34:9100','dev-equinix-washington-33:9100','dev-equinix-washington-32:9100','dev-equinix-washington-31:9100','dev-equinix-washington-30:9100','dev-equinix-washington-29:9100','dev-equinix-washington-28:9100','dev-equinix-washington-27:9100','dev-equinix-washington-26:9100','dev-equinix-washington-25:9100','dev-equinix-washington-24:9100','dev-equinix-washington-23:9100','dev-equinix-washington-22:9100','dev-equinix-washington-21:9100','dev-equinix-washington-20:9100','dev-equinix-washington-19:9100','dev-equinix-washington-18:9100','dev-equinix-washington-17:9100','dev-equinix-washington-16:9100','dev-equinix-washington-15:9100','dev-equinix-washington-14:9100','dev-equinix-washington-13:9100','dev-equinix-washington-12:9100','dev-equinix-washington-11:9100','dev-equinix-washington-10:9100','dev-equinix-washington-9:9100','dev-equinix-washington-8:9100','dev-equinix-washington-7:9100','dev-equinix-washington-6:9100','dev-equinix-washington-5:9100','dev-equinix-washington-4:9100','dev-equinix-washington-3:9100','dev-equinix-washington-2:9100','dev-equinix-washington-1:9100','dev-equinix-toronto-35:9100','dev-equinix-toronto-34:9100','dev-equinix-toronto-33:9100','dev-equinix-toronto-32:9100','dev-equinix-toronto-31:9100','dev-equinix-toronto-30:9100','dev-equinix-toronto-29:9100','dev-equinix-toronto-28:9100','dev-equinix-toronto-27:9100','dev-equinix-toronto-26:9100','dev-equinix-tokyo-14:9100','dev-equinix-tokyo-13:9100','dev-equinix-tokyo-12:9100','dev-equinix-tokyo-11:9100','dev-equinix-tokyo-10:9100','dev-equinix-tokyo-9:9100','dev-equinix-singapore-2:9100','dev-equinix-new-york-8:9100','dev-equinix-new-york-7:9100','dev-equinix-new-york-6:9100','dev-equinix-new-york-5:9100','dev-equinix-hong-kong-2:9100','dev-equinix-hong-kong-1:9100','dev-equinix-toronto-12:9100','dev-equinix-toronto-11:9100','dev-equinix-toronto-10:9100','dev-equinix-toronto-9:9100','dev-equinix-toronto-8:9100','dev-equinix-toronto-7:9100','dev-equinix-toronto-6:9100','dev-equinix-toronto-5:9100','dev-equinix-toronto-4:9100','dev-equinix-toronto-3:9100','dev-equinix-toronto-2:9100','dev-equinix-toronto-1:9100','dev-equinix-frankfurt-10:9100','dev-equinix-frankfurt-9:9100','dev-equinix-frankfurt-8:9100','dev-equinix-frankfurt-7:9100','dev-equinix-frankfurt-6:9100','dev-equinix-frankfurt-5:9100','dev-equinix-tokyo-8:9100','dev-equinix-tokyo-7:9100','dev-equinix-tokyo-6:9100','dev-equinix-tokyo-5:9100','dev-equinix-tokyo-4:9100','dev-equinix-tokyo-3:9100','dev-equinix-tokyo-2:9100','dev-equinix-tokyo-1:9100','dev-equinix-singapore-3:9100','dev-equinix-dallas-8:9100','dev-equinix-singapore-1:9100','dev-equinix-dallas-2:9100','dev-equinix-chicago-25:9100','dev-equinix-chicago-24:9100','dev-equinix-chicago-23:9100','dev-equinix-new-york-4:9100','dev-equinix-new-york-3:9100','dev-equinix-new-york-2:9100','dev-equinix-new-york-1:9100','dev-equinix-paris-1:9100','dev-equinix-chicago-22:9100','dev-equinix-chicago-21:9100','dev-equinix-chicago-20:9100','dev-equinix-chicago-19:9100','dev-equinix-chicago-18:9100','dev-equinix-chicago-17:9100','dev-equinix-chicago-16:9100','dev-equinix-chicago-15:9100','dev-equinix-frankfurt-4:9100','dev-equinix-frankfurt-3:9100','dev-equinix-frankfurt-2:9100','dev-equinix-frankfurt-1:9100','dev-equinix-chicago-14:9100','dev-equinix-dallas-7:9100','dev-equinix-dallas-6:9100','dev-equinix-dallas-5:9100','dev-equinix-dallas-4:9100','dev-equinix-dallas-3:9100','dev-equinix-chicago-13:9100','dev-equinix-dallas-1:
9100','dev-equinix-chicago-12:9100','dev-equinix-chicago-11:9100','dev-equinix-amsterdam-21:9100','dev-equinix-amsterdam-20:9100','dev-equinix-amsterdam-19:9100','dev-equinix-amsterdam-18:9100','dev-equinix-amsterdam-17:9100','dev-equinix-toronto-25:9100','dev-equinix-toronto-24:9100','dev-equinix-toronto-23:9100','dev-equinix-toronto-22:9100','dev-equinix-toronto-21:9100','dev-equinix-toronto-20:9100','dev-equinix-toronto-19:9100','dev-equinix-chicago-10:9100','dev-equinix-chicago-9:9100','dev-equinix-chicago-8:9100','dev-equinix-chicago-7:9100','dev-equinix-chicago-6:9100','dev-equinix-chicago-5:9100','dev-equinix-chicago-4:9100','dev-equinix-chicago-3:9100','dev-equinix-chicago-2:9100','dev-equinix-chicago-1:9100','dev-equinix-toronto-18:9100','dev-equinix-toronto-17:9100','dev-equinix-toronto-16:9100','dev-equinix-toronto-15:9100','dev-equinix-toronto-14:9100','dev-equinix-toronto-13:9100','dev-equinix-amsterdam-16:9100','dev-equinix-amsterdam-15:9100','dev-equinix-amsterdam-14:9100','dev-equinix-amsterdam-13:9100','dev-equinix-amsterdam-12:9100','dev-equinix-amsterdam-11:9100','dev-equinix-amsterdam-10:9100','dev-equinix-amsterdam-9:9100','dev-equinix-amsterdam-8:9100','dev-equinix-amsterdam-7:9100','dev-equinix-amsterdam-6:9100','dev-equinix-amsterdam-5:9100','dev-equinix-amsterdam-4:9100','dev-equinix-amsterdam-3:9100','dev-equinix-amsterdam-1:9100'] + + - job_name: 'Influx-Meta' + scrape_interval: 15s + scrape_timeout: 14s + static_configs: + - targets: ['dev-equinix-washington-24:9100','dev-equinix-washington-25:9100','dev-equinix-washington-26:9100'] + + + - job_name: 'Influx-Data' + scrape_interval: 15s + scrape_timeout: 14s + static_configs: + - targets: ['dev-equinix-washington-27:9100','dev-equinix-washington-28:9100','dev-equinix-washington-29:9100','dev-equinix-washington-30:9100','dev-equinix-washington-31:9100','dev-equinix-washington-32:9100','dev-equinix-amsterdam-19:9100','dev-equinix-amsterdam-20:9100','dev-equinix-amsterdam-21:9100','dev-equinix-amsterdam-22:9100','dev-equinix-chicago-17:9100','dev-equinix-chicago-19:9100','dev-equinix-chicago-25:9100','dev-equinix-dallas-1:9100','dev-equinix-frankfurt-1:9100','dev-equinix-toronto-5:9100'] diff --git a/metrics/metrics-main/start.sh b/metrics/metrics-main/start.sh new file mode 100644 index 000000000..5f2bc22bc --- /dev/null +++ b/metrics/metrics-main/start.sh @@ -0,0 +1,147 @@ +#!/bin/bash -ex +# +# (Re)starts the InfluxDB/Chronograf containers +# + +cd "$(dirname "$0")" + +if [[ -z $HOST ]]; then + HOST=metrics.solana.com +fi +echo "HOST: $HOST" + +: "${INFLUXDB_IMAGE:=influxdb:1.7}" +: "${CHRONOGRAF_IMAGE:=chronograf:1.9.4}" +: "${KAPACITOR_IMAGE:=kapacitor:1.6.5}" +: "${GRAFANA_IMAGE:=grafana/grafana:9.4.7}" +: "${PROMETHEUS_IMAGE:=prom/prometheus:v2.28.0}" +: "${ALERTMANAGER_IMAGE:=prom/alertmanager:v0.23.0}" +: "${ALERTMANAGER_DISCORD_IMAGE:=benjojo/alertmanager-discord:latest}" + +docker pull $INFLUXDB_IMAGE +docker pull $CHRONOGRAF_IMAGE +docker pull $KAPACITOR_IMAGE +docker pull $GRAFANA_IMAGE +docker pull $PROMETHEUS_IMAGE +docker pull $ALERTMANAGER_IMAGE +docker pull $ALERTMANAGER_DISCORD_IMAGE + +for container in chronograf chronograf_8889 prometheus alertmanager alertmanager-discord grafana kapacitor; do + [[ -w /var/lib/$container ]] + [[ -x /var/lib/$container ]] + + ( + set +e + docker kill $container + docker rm -f $container + exit 0 + ) +done + +docker network remove influxdb || true +docker network create influxdb +pwd +rm -rf certs +mkdir -p certs +chmod 700 certs +sudo cp 
/etc/letsencrypt/live/"$HOST"/fullchain.pem certs/
+sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/
+sudo chmod 0444 certs/*
+sudo chown buildkite-agent:buildkite-agent certs
+
+sudo docker run -it -d \
+  --user root:root \
+  --publish 9090:9090 \
+  --name=prometheus \
+  --volume /prometheus/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml \
+  --volume /prometheus/prometheus/first_rules.yml:/etc/prometheus/first_rules.yml \
+  --volume /prometheus/prometheus/data:/prometheus \
+  --volume /etc/hosts:/etc/hosts \
+  $PROMETHEUS_IMAGE
+
+sudo docker run -it -d \
+  --user root:root \
+  --publish 9093:9093 \
+  --name=alertmanager \
+  --volume /prometheus/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
+  --volume /etc/hosts:/etc/hosts \
+  $ALERTMANAGER_IMAGE
+
+sudo docker run -it -d \
+  --publish 9094:9094 \
+  --name=alertmanager-discord \
+  --env DISCORD_WEBHOOK="$DISCORD_WEBHOOK_ALERTMANAGER" \
+  $ALERTMANAGER_DISCORD_IMAGE
+
+sudo docker run \
+  --detach \
+  --name=grafana \
+  --net=influxdb \
+  --publish 3000:3000 \
+  --user root:root \
+  --env GF_PATHS_CONFIG=/grafana.ini \
+  --volume "$PWD"/certs:/certs:ro \
+  --volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \
+  --volume /var/lib/grafana:/var/lib/grafana \
+  --log-opt max-size=1g \
+  --log-opt max-file=5 \
+  $GRAFANA_IMAGE
+
+sleep 20s
+
+sudo docker run \
+  --detach \
+  --name=chronograf_8889 \
+  --env AUTH_DURATION=24h \
+  --env GOOGLE_CLIENT_ID="$GOOGLE_CLIENT_ID_8889" \
+  --env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8889" \
+  --env PUBLIC_URL=https://metrics.solana.com:8889 \
+  --env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets,influxdata.com,solana.org \
+  --env TOKEN_SECRET="$TOKEN_SECRET" \
+  --env TLS_PRIVATE_KEY=/certs/privkey.pem \
+  --env TLS_CERTIFICATE=/certs/fullchain.pem \
+  --env inactivity-duration=48h \
+  --publish 8889:8888 \
+  --user "$(id -u):$(id -g)" \
+  --volume "$PWD"/certs:/certs \
+  --volume /var/lib/chronograf_8889:/var/lib/chronograf \
+  --log-opt max-size=1g \
+  --log-opt max-file="5" \
+  $CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INFLUXDB_PASSWORD"
+
+sudo docker run \
+  --detach \
+  --env AUTH_DURATION=24h \
+  --env inactivity-duration=48h \
+  --env GOOGLE_CLIENT_ID="$GOOGLE_CLIENT_ID_8888" \
+  --env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8888" \
+  --env PUBLIC_URL=https://metrics.solana.com:8888 \
+  --env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets,influxdata.com,solana.org \
+  --env TLS_CERTIFICATE=/certs/fullchain.pem \
+  --env TLS_PRIVATE_KEY=/certs/privkey.pem \
+  --env TOKEN_SECRET="$TOKEN_SECRET" \
+  --name=chronograf \
+  --net=influxdb \
+  --publish 8888:8888 \
+  --user 0:0 \
+  --volume "$PWD"/certs:/certs \
+  --volume /var/lib/chronograf:/var/lib/chronograf \
+  --log-opt max-size=1g \
+  --log-opt max-file=5 \
+  $CHRONOGRAF_IMAGE --influxdb-url=https://metrics.solana.com:8086
+
+sudo docker run \
+  --detach \
+  --name=kapacitor \
+  --publish 9092:9092 \
+  --volume "$PWD"/kapacitor.conf:/etc/kapacitor/kapacitor.conf \
+  --volume /var/lib/kapacitor:/var/lib/kapacitor \
+  --user "$(id -u):$(id -g)" \
+  --log-opt max-size=1g \
+  --log-opt max-file=5 \
+  $KAPACITOR_IMAGE
+
+curl -h | sed -ne '/--tlsv/p'
+curl --retry 10 --retry-delay 5 -v --head https://"$HOST":8086/ping
+
+exit 0
diff --git a/metrics/metrics-main/status.sh b/metrics/metrics-main/status.sh
new file mode 100644
index 000000000..58ccea30f
--- /dev/null
+++ b/metrics/metrics-main/status.sh
@@ -0,0 +1,32 @@
+#!/bin/bash -ex
+#
+# Status of the InfluxDB/Chronograf/Grafana/Chronograf_8889 containers
+#
+cd "$(dirname "$0")"
+
+if [[ -z $HOST ]]; then
+  HOST=metrics.solana.com
+fi
+echo "HOST: $HOST"
+
+echo +++ status
+(
+  set -x
+  pwd
+  sudo docker ps --no-trunc --size
+  df -h
+  free -h
+  uptime
+)
+
+# If a container is not running (or has exited), send a notification to Slack and Discord, then redeploy it
+
+for container in chronograf_8889 grafana alertmanager alertmanager-discord prometheus chronograf kapacitor ; do
+  if [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" != "running" ] || [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" = "exited" ]; then
+    curl -X POST -H 'Content-type: application/json' --data '{"text": "'"$container"' container is down in the metrics-main server. Restarting..."}' "$SLACK_WEBHOOK"
+    curl -X POST -H 'Content-type: application/json' --data '{"content": "'"$container"' container is down in the metrics-main server. Restarting..."}' "$DISCORD_WEBHOOK"
+    echo "Starting up script"
+    sudo bash $container.sh
+    sleep 30
+  fi
+done
diff --git a/metrics/publish-metrics-dashboard.sh b/metrics/publish-metrics-dashboard.sh
deleted file mode 100755
index 1ccdf3dc3..000000000
--- a/metrics/publish-metrics-dashboard.sh
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-cd "$(dirname "$0")"
-
-CHANNEL=$1
-if [[ -z $CHANNEL ]]; then
-  echo "usage: $0 [channel]"
-  exit 1
-fi
-
-case $CHANNEL in
-edge)
-  DASHBOARD=cluster-telemetry-edge
-  ;;
-beta)
-  DASHBOARD=cluster-telemetry-beta
-  ;;
-stable)
-  DASHBOARD=cluster-telemetry
-  ;;
-*)
-  echo "Error: Invalid CHANNEL=$CHANNEL"
-  exit 1
-  ;;
-esac
-
-
-if [[ -z $GRAFANA_API_TOKEN ]]; then
-  echo Error: GRAFANA_API_TOKEN not defined
-  exit 1
-fi
-
-DASHBOARD_JSON=scripts/grafana-provisioning/dashboards/cluster-monitor.json
-if [[ ! -r $DASHBOARD_JSON ]]; then
-  echo Error: $DASHBOARD_JSON not found
-fi
-
-(
-  set -x
-  scripts/adjust-dashboard-for-channel.py "$DASHBOARD_JSON" "$CHANNEL"
-)
-
-rm -rf venv
-python3 -m venv venv
-# shellcheck source=/dev/null
-source venv/bin/activate
-
-echo --- Fetch/build grafcli
-(
-  set -x
-  git clone git@github.com:mvines/grafcli.git -b experimental-v5 venv/grafcli
-  cd venv/grafcli
-  python3 setup.py install
-)
-
-echo --- Take a backup of existing dashboard if possible
-(
-  set -x +e
-  grafcli export remote/metrics/$DASHBOARD $DASHBOARD_JSON.org
-  grafcli rm remote/metrics/$DASHBOARD
-  :
-)
-
-echo --- Publish $DASHBOARD_JSON to $DASHBOARD
-(
-  set -x
-  grafcli import "$DASHBOARD_JSON" remote/metrics
-)
-
-exit 0