Add prom&grafana

main
Andrey Ivanov 2021-02-13 04:45:42 -05:00 committed by Andrey Ivanov
parent 9075c6f87c
commit 6f0d85600d
19 changed files with 7073 additions and 42 deletions

View File

@ -1,41 +0,0 @@
# Заготовка для социальной сети
Цель: В результате выполнения ДЗ вы создадите базовый скелет социальной сети, который будет развиваться в дальнейших ДЗ.
###В данном задании тренируются навыки:
- декомпозиции предметной области;
- построения элементарной архитектуры проекта
Требуется разработать создание и просмотр анкет в социальной сети.
###Функциональные требования:
- Авторизация по паролю.
- Страница регистрации, где указывается следующая информация:
- Имя
- Фамилия
- Возраст
- Пол
- Интересы
- Город
- Страницы с анкетой.
###Нефункциональные требования:
- Любой язык программирования
- В качестве базы данных использовать MySQL
- Не использовать ORM
- Программа должна представлять из себя монолитное приложение.
- Не рекомендуется использовать следующие технологии:
- Репликация
- Шардинг
- Индексы
- Кэширование
Верстка не важна. Подойдет самая примитивная.
Разместить приложение на любом хостинге. Например, heroku.
ДЗ принимается в виде исходного кода на github и демонстрации проекта на хостинге.
Критерии оценки: Оценка происходит по принципу зачет/незачет.
###Требования:
- Есть возможность регистрации, создавать персональные страницы, возможность подружиться, список друзей.
- Отсутствуют SQL-инъекции.
- Пароль хранится безопасно.

View File

@ -0,0 +1,11 @@
# Alertmanager configuration: every alert is routed to the single Slack
# receiver defined below. The <webhook-id> and <channel-name> placeholders
# must be replaced with real values before notifications can be delivered.
route:
  receiver: "slack"

receivers:
  - name: "slack"
    slack_configs:
      - api_url: "https://hooks.slack.com/services/<webhook-id>"
        channel: "#<channel-name>"
        username: "Prometheus"
        # Message body: the "description" annotation common to the alerts
        # in the notification.
        text: "{{ .CommonAnnotations.description }}"
        # Send a follow-up message when the alert is resolved.
        send_resolved: true

39
cicd/caddy/Caddyfile Normal file
View File

@ -0,0 +1,39 @@
# Reverse proxy in front of the monitoring stack. TLS is switched off on every
# listener and errors are written to stderr.

# Prometheus: guarded by HTTP basic auth using credentials from the environment.
:9090 {
  tls off
  errors stderr
  basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}
  proxy / prometheus:9090 {
    transparent
  }
}

# Alertmanager: same basic-auth credentials as Prometheus.
:9093 {
  tls off
  errors stderr
  basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}
  proxy / alertmanager:9093 {
    transparent
  }
}

# Pushgateway: same basic-auth credentials as Prometheus.
:9091 {
  tls off
  errors stderr
  basicauth / {$ADMIN_USER} {$ADMIN_PASSWORD}
  proxy / pushgateway:9091 {
    transparent
  }
}

# Grafana: no basicauth directive on this listener; websocket proxying is
# enabled in addition to the transparent preset.
:3000 {
  tls off
  errors stderr
  proxy / grafana:3000 {
    transparent
    websocket
  }
}

3
cicd/config Normal file
View File

@ -0,0 +1,3 @@
# Grafana settings supplied as environment variables.
# Self-service sign-up is disabled.
GF_USERS_ALLOW_SIGN_UP=false
# Admin account credentials.
# NOTE(review): "changeme" looks like a placeholder - replace before real use.
GF_SECURITY_ADMIN_USER=admin
GF_SECURITY_ADMIN_PASSWORD=changeme

View File

@ -58,4 +58,138 @@ services:
APP_DSN_PASS: app
APP_DSN_BASE: app
ports:
- "8080:8080"
- "8080:8080"
# Prometheus server. It only exposes 9090 to the monitor-net network; the
# caddy service is the one that publishes 9090 on the host.
prometheus:
  container_name: prometheus
  image: prom/prometheus:v2.24.1
  restart: unless-stopped
  volumes:
    # Configuration and rule files come from the local ./prometheus directory.
    - "./prometheus:/etc/prometheus"
    # Time-series data is kept in a named volume.
    - "prometheus_data:/prometheus"
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    - "--web.console.libraries=/etc/prometheus/console_libraries"
    - "--web.console.templates=/etc/prometheus/consoles"
    - "--storage.tsdb.retention.time=200h"
    - "--web.enable-lifecycle"
  expose:
    - 9090
  networks:
    - monitor-net
  labels:
    org.label-schema.group: "monitoring"
# Alertmanager. Reads its configuration from the local ./alertmanager
# directory and is reachable on 9093 inside monitor-net.
alertmanager:
  container_name: alertmanager
  image: prom/alertmanager:v0.21.0
  restart: unless-stopped
  volumes:
    - "./alertmanager:/etc/alertmanager"
  command:
    - "--config.file=/etc/alertmanager/config.yml"
    # NOTE(review): no volume is mounted at /alertmanager in this service, so
    # this state presumably does not survive container re-creation - confirm.
    - "--storage.path=/alertmanager"
  expose:
    - 9093
  networks:
    - monitor-net
  labels:
    org.label-schema.group: "monitoring"
# Node exporter for host metrics. The host's /proc, /sys and / are mounted
# read-only and the --path.* flags point the exporter at those mounts.
nodeexporter:
  container_name: nodeexporter
  image: prom/node-exporter:v1.1.0
  restart: unless-stopped
  volumes:
    - "/proc:/host/proc:ro"
    - "/sys:/host/sys:ro"
    - "/:/rootfs:ro"
  command:
    - "--path.procfs=/host/proc"
    - "--path.rootfs=/rootfs"
    - "--path.sysfs=/host/sys"
    # "$$" is the Compose escape for a literal "$" in the regex.
    - "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
  expose:
    - 9100
  networks:
    - monitor-net
  labels:
    org.label-schema.group: "monitoring"
# cAdvisor for per-container metrics. Port 8080 is exposed to monitor-net
# only; nothing is published on the host by this service.
cadvisor:
  container_name: cadvisor
  image: gcr.io/cadvisor/cadvisor:v0.38.7
  restart: unless-stopped
  volumes:
    - "/:/rootfs:ro"
    # The only mount here that is writable.
    - "/var/run:/var/run:rw"
    - "/sys:/sys:ro"
    - "/var/lib/docker:/var/lib/docker:ro"
    #- /cgroup:/cgroup:ro #doesn't work on MacOS only for Linux
  expose:
    - 8080
  networks:
    - monitor-net
  labels:
    org.label-schema.group: "monitoring"
# Grafana. Dashboards and datasources are provisioned from the local
# ./grafana/provisioning tree; application data lives in a named volume.
grafana:
  container_name: grafana
  image: grafana/grafana:7.4.0
  restart: unless-stopped
  volumes:
    - "grafana_data:/var/lib/grafana"
    - "./grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards"
    - "./grafana/provisioning/datasources:/etc/grafana/provisioning/datasources"
  environment:
    # Admin credentials fall back to admin/admin when ADMIN_USER and
    # ADMIN_PASSWORD are not set in the environment running Compose.
    - "GF_SECURITY_ADMIN_USER=${ADMIN_USER:-admin}"
    - "GF_SECURITY_ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}"
    - "GF_USERS_ALLOW_SIGN_UP=false"
  expose:
    - 3000
  # Published directly on host port 3001, in addition to the caddy proxy
  # that listens on 3000.
  ports:
    - "3001:3000"
  networks:
    - monitor-net
  labels:
    org.label-schema.group: "monitoring"
# Pushgateway. Reachable on 9091 inside monitor-net; the caddy service
# publishes that port on the host.
pushgateway:
  container_name: pushgateway
  image: prom/pushgateway:v1.4.0
  restart: unless-stopped
  expose:
    - 9091
  networks:
    - monitor-net
  labels:
    org.label-schema.group: "monitoring"
# Caddy reverse proxy: the host-facing entry point for Grafana (3000),
# Prometheus (9090), Alertmanager (9093) and Pushgateway (9091).
caddy:
  container_name: caddy
  # NOTE(review): the image has no tag, so it resolves to "latest" - consider
  # pinning a version.
  image: stefanprodan/caddy
  restart: unless-stopped
  volumes:
    # Directory holding the Caddyfile.
    - "./caddy:/etc/caddy"
  environment:
    # Basic-auth credentials referenced by the Caddyfile; default admin/admin.
    - "ADMIN_USER=${ADMIN_USER:-admin}"
    - "ADMIN_PASSWORD=${ADMIN_PASSWORD:-admin}"
  ports:
    - "3000:3000"
    - "9090:9090"
    - "9093:9093"
    - "9091:9091"
  networks:
    - monitor-net
  labels:
    org.label-schema.group: "monitoring"
# Bridge network joined by all monitoring services.
networks:
  monitor-net:
    driver: bridge

# Named volumes for Prometheus and Grafana data (default settings).
volumes:
  prometheus_data: {}
  grafana_data: {}

View File

@ -0,0 +1,12 @@
# Grafana dashboard provisioning: load dashboard files from disk.
apiVersion: 1

providers:
  - name: "Prometheus"
    type: file
    orgId: 1
    # Empty folder name.
    folder: ""
    editable: true
    allowUiUpdates: true
    disableDeletion: false
    options:
      # Same container path the compose file mounts the dashboards into.
      path: /etc/grafana/provisioning/dashboards

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,398 @@
{
"id": null,
"title": "Nginx",
"description": "Nginx exporter metrics",
"tags": [
"nginx"
],
"style": "dark",
"timezone": "browser",
"editable": true,
"hideControls": false,
"sharedCrosshair": true,
"rows": [
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 3,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(nginx_connections_processed_total{stage=\"any\"}[5m])) by (stage)",
"hide": false,
"interval": "",
"intervalFactor": 10,
"legendFormat": "requests",
"metric": "",
"refId": "B",
"step": 10
}
],
"timeFrom": null,
"timeShift": null,
"title": "Requests/sec",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 2,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(nginx_connections_current) by (state)",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{state}}",
"metric": "",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "Connections",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
},
{
"aliasColors": {},
"bars": false,
"datasource": "Prometheus",
"decimals": 2,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(nginx_connections_processed_total{stage!=\"any\"}[5m])) by (stage)",
"hide": false,
"interval": "",
"intervalFactor": 10,
"legendFormat": "{{stage}}",
"metric": "",
"refId": "B",
"step": 10
}
],
"timeFrom": null,
"timeShift": null,
"title": "Connections rate",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "Nginx exporter metrics"
},
{
"collapse": false,
"editable": true,
"height": "250px",
"panels": [
{
"aliasColors": {},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 1,
"grid": {
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 4,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": true,
"rightSide": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(container_cpu_usage_seconds_total{name=~\"nginx\"}[5m])) / count(node_cpu_seconds_total{mode=\"system\"}) * 100",
"intervalFactor": 2,
"legendFormat": "nginx",
"refId": "A",
"step": 2
}
],
"timeFrom": null,
"timeShift": null,
"title": "CPU usage",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"show": true
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
]
}
],
"title": "Nginx container metrics"
}
],
"time": {
"from": "now-15m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"templating": {
"list": []
},
"annotations": {
"list": []
},
"refresh": "10s",
"schemaVersion": 12,
"version": 9,
"links": [],
"gnetId": null
}

View File

@ -0,0 +1,11 @@
# Grafana datasource provisioning: a single default Prometheus datasource.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    orgId: 1
    isDefault: true
    editable: true
    # Prometheus is addressed by its compose service name, without auth.
    access: proxy
    url: "http://prometheus:9090"
    basicAuth: false

View File

@ -0,0 +1,22 @@
# Prometheus on EC2 & ECS:
Some helpers for anyone configuring Prometheus on ECS and AWS EC2.
To get started on AWS ECS and EC2:
*For EC2/ECS nodes*:
- Import the ECS task definitions for cadvisor and node-exporter, create a service/task from each, and run them on every host you want to monitor
- Any hosts which have "Monitoring: On" tag will be automatically added in the targets
- Expose ports 9100 and 9191 to your Prometheus host
*For Prometheus host*:
- Copy prometheus.yml configuration present here to base prometheus configuration to enable EC2 service discovery
- `docker compose up -d`
**Note**:
Set `query.staleness-delta` to 1m to make metrics more real-time
### TODO
- Add alerting rules based on ECS

View File

@ -0,0 +1,78 @@
{
"family": "cadvisor",
"containerDefinitions": [
{
"name": "cadvisor",
"image": "google/cadvisor",
"cpu": 10,
"memory": 300,
"portMappings": [
{
"containerPort": 9191,
"hostPort": 9191
}
],
"essential": true,
"privileged": true,
"mountPoints": [
{
"sourceVolume": "root",
"containerPath": "/rootfs",
"readOnly": true
},
{
"sourceVolume": "var_run",
"containerPath": "/var/run",
"readOnly": false
},
{
"sourceVolume": "sys",
"containerPath": "/sys",
"readOnly": true
},
{
"sourceVolume": "var_lib_docker",
"containerPath": "/var/lib/docker",
"readOnly": true
},
{
"sourceVolume": "cgroup",
"containerPath": "/cgroup",
"readOnly": true
}
]
}
],
"volumes": [
{
"name": "root",
"host": {
"sourcePath": "/"
}
},
{
"name": "var_run",
"host": {
"sourcePath": "/var/run"
}
},
{
"name": "sys",
"host": {
"sourcePath": "/sys"
}
},
{
"name": "var_lib_docker",
"host": {
"sourcePath": "/var/lib/docker/"
}
},
{
"name": "cgroup",
"host": {
"sourcePath": "/cgroup"
}
}
]
}

View File

@ -0,0 +1,22 @@
{
"family": "prometheus",
"containerDefinitions": [
{
"portMappings": [
{
"hostPort": 9100,
"containerPort": 9100,
"protocol": "tcp"
}
],
"essential": true,
"name": "node_exporter",
"image": "prom/node-exporter",
"cpu": 0,
"privileged": null,
"memoryReservation": 150
}
],
"volumes": [],
"networkMode": "host"
}

View File

@ -0,0 +1,53 @@
# Prometheus configuration sample for EC2/ECS hosts: static targets for the
# local compose stack plus EC2 service discovery for tagged instances.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: 'docker-host-alpha'

# Load and evaluate rules in these files every 'evaluation_interval'.
rule_files:
  - "targets.rules"
  - "hosts.rules"
  - "containers.rules"

# Scrape jobs. Job names must be unique within this list.
scrape_configs:
  - job_name: 'nodeexporter'
    scrape_interval: 5s
    static_configs:
      - targets: ['nodeexporter:9100']

  - job_name: 'cadvisor'
    scrape_interval: 5s
    static_configs:
      - targets: ['cadvisor:8080']

  - job_name: 'prometheus'
    scrape_interval: 10s
    static_configs:
      - targets: ['localhost:9090']

  # Sample scrape configuration for AWS EC2.
  # These jobs carry an "-ec2" suffix because Prometheus refuses to load a
  # configuration in which two scrape jobs share the same job_name.
  - job_name: 'nodeexporter-ec2'
    ec2_sd_configs:
      - region: us-east-1
        port: 9100
    relabel_configs:
      # Keep only instances whose EC2 tag "Monitoring" has the value "On".
      # The value is quoted so it is never read as a YAML boolean.
      - source_labels: [__meta_ec2_tag_Monitoring]
        regex: 'On'
        action: keep

  - job_name: 'cadvisor-ec2'
    ec2_sd_configs:
      - region: us-east-1
        # Host port published by the cAdvisor ECS task definition (9191).
        port: 9191
    relabel_configs:
      # Keep only instances whose EC2 tag "Monitoring" has the value "On".
      - source_labels: [__meta_ec2_tag_Monitoring]
        regex: 'On'
        action: keep

View File

@ -0,0 +1,70 @@
# Prometheus alerting rules, grouped by what they watch: scrape targets,
# the Docker host, and individual containers. Every rule must hold for 30s
# before it fires.
groups:
  # Fires for any scrape target that reports as down.
  - name: targets
    rules:
      - alert: monitor_service_down
        expr: up == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: 'Monitor service non-operational'
          description: 'Service {{ $labels.instance }} is down.'

  # Host-level load, memory and storage thresholds.
  - name: host
    rules:
      - alert: high_cpu_load
        expr: node_load1 > 1.5
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: 'Server under high load'
          description: 'Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.'

      # Used memory (total minus free, buffers and cache) above 85 %.
      - alert: high_memory_load
        expr: >-
          (sum(node_memory_MemTotal_bytes)
          - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) )
          / sum(node_memory_MemTotal_bytes) * 100 > 85
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: 'Server memory is almost full'
          description: 'Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.'

      # Used space above 85 %.
      # NOTE(review): only filesystems with fstype="aufs" are matched; hosts
      # using another storage driver would never trigger this - confirm.
      - alert: high_storage_load
        expr: >-
          (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"})
          / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: 'Server storage is almost full'
          description: 'Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}.'

  # Rules for a container named "jenkins".
  # NOTE(review): no jenkins service is visible in this compose stack, so
  # jenkins_down would presumably fire permanently - confirm it is wanted.
  - name: containers
    rules:
      - alert: jenkins_down
        expr: absent(container_memory_usage_bytes{name="jenkins"})
        for: 30s
        labels:
          severity: critical
        annotations:
          summary: 'Jenkins down'
          description: 'Jenkins container is down for more than 30 seconds.'

      - alert: jenkins_high_cpu
        expr: >-
          sum(rate(container_cpu_usage_seconds_total{name="jenkins"}[1m]))
          / count(node_cpu_seconds_total{mode="system"}) * 100 > 10
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: 'Jenkins high CPU usage'
          description: 'Jenkins CPU usage is {{ humanize $value}}%.'

      # Threshold is 1200000000 bytes.
      - alert: jenkins_high_memory
        expr: sum(container_memory_usage_bytes{name="jenkins"}) > 1200000000
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: 'Jenkins high memory usage'
          description: 'Jenkins memory consumption is at {{ humanize $value}}.'

View File

@ -0,0 +1,53 @@
# Prometheus configuration for the compose monitoring stack.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

  # Labels attached to every time series and alert sent to external systems
  # (federation, remote storage, Alertmanager).
  external_labels:
    monitor: "docker-host-alpha"

# Rule files, evaluated every 'evaluation_interval'.
rule_files:
  - "alert.rules"

# Scrape jobs; targets are compose service names on the monitoring network.
scrape_configs:
  - job_name: "nodeexporter"
    scrape_interval: 5s
    static_configs:
      - targets: ["nodeexporter:9100"]

  - job_name: "cadvisor"
    scrape_interval: 5s
    static_configs:
      - targets: ["cadvisor:8080"]

  # Prometheus scraping itself.
  - job_name: "prometheus"
    scrape_interval: 10s
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: "pushgateway"
    scrape_interval: 10s
    honor_labels: true
    static_configs:
      - targets: ["pushgateway:9091"]

  # Disabled sample jobs:
  # - job_name: 'nginx'
  #   scrape_interval: 10s
  #   static_configs:
  #     - targets: ['nginxexporter:9113']
  # - job_name: 'aspnetcore'
  #   scrape_interval: 10s
  #   static_configs:
  #     - targets: ['eventlog-proxy:5000', 'eventlog:5000']

# Alerts are delivered to the Alertmanager service over plain HTTP.
alerting:
  alertmanagers:
    - scheme: http
      static_configs:
        - targets:
            - "alertmanager:9093"

41
test/dz001/README.md Normal file
View File

@ -0,0 +1,41 @@
# Заготовка для социальной сети
Цель: В результате выполнения ДЗ вы создадите базовый скелет социальной сети, который будет развиваться в дальнейших ДЗ.
### В данном задании тренируются навыки:
- декомпозиции предметной области;
- построения элементарной архитектуры проекта
Требуется разработать создание и просмотр анкет в социальной сети.
### Функциональные требования:
- Авторизация по паролю.
- Страница регистрации, где указывается следующая информация:
- Имя
- Фамилия
- Возраст
- Пол
- Интересы
- Город
- Страницы с анкетой.
### Нефункциональные требования:
- Любой язык программирования
- В качестве базы данных использовать MySQL
- Не использовать ORM
- Программа должна представлять из себя монолитное приложение.
- Не рекомендуется использовать следующие технологии:
- Репликация
- Шардинг
- Индексы
- Кэширование
Верстка не важна. Подойдет самая примитивная.
Разместить приложение на любом хостинге. Например, heroku.
ДЗ принимается в виде исходного кода на github и демонстрации проекта на хостинге.
Критерии оценки: Оценка происходит по принципу зачет/незачет.
### Требования:
- Есть возможность регистрации, создавать персональные страницы, возможность подружиться, список друзей.
- Отсутствуют SQL-инъекции.
- Пароль хранится безопасно.

2
test/dz003/README.md Normal file
View File

@ -0,0 +1,2 @@
Встроен запуск Prometheus, Grafana и т.д.
Графана доступна на http://localhost:3001/