# Create a directory to hold the downloaded tarball (-p: no error if it already exists)
mkdir -p ~/prometheus
# Download the tarball. -P stores it inside the directory under its original name;
# `-O ~/prometheus` would try to write to the directory itself and fail.
wget https://github.com/prometheus/prometheus/releases/download/v2.29.2/prometheus-2.29.2.linux-amd64.tar.gz -P ~/prometheus
# Unpack into the install root (here /apps)
mkdir -p /apps
tar -zxf ~/prometheus/prometheus-2.29.2.linux-amd64.tar.gz -C /apps
# Version-independent path used by the systemd unit
ln -s /apps/prometheus-2.29.2.linux-amd64 /apps/prometheus
mkdir /apps/prometheus/bin && mv /apps/prometheus/prometheus /apps/prometheus/bin
# Install the systemd unit for Prometheus.
# `>` (not `>>`) so re-running this step overwrites the unit instead of
# appending a second copy of every section to the same file.
# --web.enable-lifecycle is required by the `curl -X POST .../-/reload`
# calls used throughout this runbook.
cat > /usr/lib/systemd/system/prometheus.service << EOF
[Unit]
Description=Prometheus
After=network.target
[Service]
Type=simple
ExecStart=/apps/prometheus/bin/prometheus --config.file=/apps/prometheus/prometheus.yml --web.enable-lifecycle
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
# Make systemd pick up the new unit, then start it now and on boot
systemctl daemon-reload && systemctl enable prometheus && systemctl start prometheus
# Verify: ss -tunlp | grep 9090   or   lsof -i:9090
curl http://192.168.207.38:9090
# 1. Install Alertmanager
# Download the tarball from the official site: https://prometheus.io/download/
# Unpack it in the current directory, then move the tree to /usr/local/alertmanager
tar -xvf alertmanager-0.20.0.linux-amd64.tar.gz
mv alertmanager-0.20.0.linux-amd64 /usr/local/alertmanager
# cat alertmanager.yml
global:
  resolve_timeout: 10m
  smtp_from: monitor@test.com  # sender address used for alert mails
  # NOTE(review): smtp_hello is normally a bare hostname (e.g. 'test.com') - confirm
  smtp_hello: '@test.com'
  smtp_smarthost: mail.test.com:587
  smtp_auth_username: monitor@test.com
  # Quoted so the '#' can never be read as the start of a comment
  smtp_auth_password: 'Monitor#2021'
  smtp_require_tls: false
route:
  group_by: ['alertname']  # labels used to group alerts into one notification
  group_wait: 30s  # wait before sending the first notification for a new group
  group_interval: 5m
  repeat_interval: 48h
  receiver: 'web.hook'
receivers:
  - name: 'web.hook'
    email_configs:
      # Alert recipient(s); no padding whitespace inside the address
      - to: 'monitor@test.com'
        html: '{{ template "test.html" . }}'
        send_resolved: true
    # webhook_configs:
    #   - url: 'http://127.0.0.1:5001/'
inhibit_rules:
  # A firing critical alert mutes the matching warning alert
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
templates:
  - /usr/local/alertmanager/template/*.tmpl  # directory holding the mail templates
#注意:接收邮件的服务器配置信息要准确,包括地址、端口、密码
#告警模板配置说明:
#1、修改alertmanager.yml,配置模板地址,然后在每个receiver引用模板
----
# Path must not carry trailing whitespace inside the quotes, or the file is not found
templates:
  - '/usr/local/alertmanager/template/email.tmpl'
...
...
receivers:
  - name: 'web.hook'
    email_configs:
      - to: 'monitor@test.com'  # alert recipient(s)
        html: '{{ template "test.html" . }}'
        send_resolved: true
    # webhook_configs:
----
#邮件告警模板【存放目录/usr/local/alertmanager/template】
vim template/email.tmpl
alertmanager参考模板
{{/* Mail body referenced from alertmanager.yml as `template "test.html"`. */}}
{{/* The first section lists firing alerts, the second lists resolved alerts. */}}
{{/* Each loop ranges over its own subset (.Alerts.Firing / .Alerts.Resolved); */}}
{{/* ranging over .Alerts would print every alert in both sections. */}}
{{/* `.Add 28800e9` shifts the UTC timestamps by 8 hours (28800e9 ns). */}}
{{ define "test.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}
========= ERROR ==========<br>
告警名称:{{ .Labels.alertname }}<br>
告警级别:{{ .Labels.severity }}<br>
告警机器:{{ .Labels.instance }} {{ .Labels.device }}<br>
告警详情:{{ .Annotations.summary }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
========= END ==========<br>
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}
========= INFO ==========<br>
告警名称:{{ .Labels.alertname }}<br>
告警级别:{{ .Labels.severity }}<br>
告警机器:{{ .Labels.instance }}<br>
告警详情:{{ .Annotations.summary }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
恢复时间:{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
========= END ==========<br>
{{- end }}
{{- end }}
{{- end }}
# Note: a broken mail template makes mail delivery fail - watch the Alertmanager log.
# After editing the config, validate it with amtool (run from /usr/local/alertmanager)
./amtool check-config alertmanager.yml
# Expected output: Checking 'alertmanager.yml' SUCCESS
# Install the systemd unit so Alertmanager starts on boot
cat >/usr/lib/systemd/system/alertmanager.service <<EOF
[Unit]
Description=alertmanager
After=network.target
[Service]
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml --storage.path=/usr/local/alertmanager/data --web.listen-address=:9093 --data.retention=120h
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
# systemd must re-read unit files before a new or changed unit can be enabled or started
systemctl daemon-reload
systemctl enable alertmanager
systemctl restart alertmanager
# Alertmanager listens on port 9093 by default
ss -tunlp | grep 9093
# Like Prometheus, Alertmanager can hot-reload its configuration
# (replace alertmanager_ip with the host running Alertmanager)
curl -X POST http://alertmanager_ip:9093/-/reload
# Deploy Grafana
# 1. Create a directory for the tarball
mkdir -p ~/grafana
# Download the tarball. -P stores it inside the directory;
# `-O ~/grafana` would try to write to the directory itself and fail.
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-9.2.3.linux-amd64.tar.gz -P ~/grafana
tar -zxvf ~/grafana/grafana-enterprise-9.2.3.linux-amd64.tar.gz -C /apps
# Version-independent path used by the rest of this runbook
ln -s /apps/grafana-9.2.3 /apps/grafana
# 2. Point Grafana at a MariaDB database
# NOTE(review): passing the root password on the command line exposes it in
# shell history and the process list - consider `mysql -uroot -p` instead.
mysql -uroot -pMySQL@2022.
MariaDB [(none)]> create database grafana;
# NOTE(review): the host part of the account is an empty string; confirm which
# client hosts this is meant to match (e.g. 'localhost' or '%').
# NOTE(review): WITH GRANT OPTION lets the grafana account hand out its own
# privileges - confirm an application account really needs it.
MariaDB [(none)]> GRANT ALL PRIVILEGES ON grafana.* TO 'grafana'@'' IDENTIFIED BY "MySQL@2022.aaa" WITH GRANT OPTION;
MariaDB [(none)]> flush privileges;
MariaDB [(none)]> exit;
# Edit Grafana's database connection settings
chmod 755 -R /apps/grafana-9.2.3
vim /apps/grafana/conf/defaults.ini
# The keys below live in the [database] section.
type = mysql
host = 127.0.0.1:3306
name = grafana
user = grafana
password = MySQL@2022.aaa
# Leave `url` unset. It is an alternative to the individual keys above; the
# previous value pointed at a different server (192.168.2.110) than `host`
# and embedded a password containing an unescaped '@'.
url =
# Manual start for a quick smoke test only. nohup keeps it alive after logout,
# but it does NOT survive a reboot - boot-time start is handled by systemd.
# Only one instance is launched; a second one would fail to bind port 3000.
nohup /apps/grafana/bin/grafana-server --homepath=/apps/grafana >/apps/grafana/grafana.log 2>&1 &
# Verify
ss -utnlp | grep 3000
ps -ef | grep grafana
# Stop the manual instance before handing Grafana over to systemd,
# otherwise the service cannot bind port 3000.
pkill -f /apps/grafana/bin/grafana-server
# Install the systemd unit for Grafana.
# - `>` overwrites, so re-running does not append a duplicate unit.
# - The heredoc delimiter is quoted so the trailing backslashes reach the unit
#   file unchanged; systemd uses them as line continuations.
# - Exactly one ExecStart= is given (several are only allowed for Type=oneshot);
#   the cfg:default.* overrides are arguments of that single command line.
# - RuntimeDirectory= takes a name relative to /run, not an absolute path;
#   systemd creates /run/grafana (= /var/run/grafana) on every start.
cat > /usr/lib/systemd/system/grafana.service << 'EOF'
[Unit]
Description=Grafana instance
Documentation=http://docs.grafana.org
Wants=network-online.target
After=network-online.target
After=postgresql.service mariadb.service mysqld.service
[Service]
Type=notify
Restart=on-failure
WorkingDirectory=/apps/grafana
RuntimeDirectory=grafana
#RuntimeDirectoryMode=0750
ExecStart=/apps/grafana/bin/grafana-server \
  --config=/apps/grafana/conf/defaults.ini \
  --pidfile=/var/run/grafana/grafana-server.pid \
  --packaging=tar \
  cfg:default.paths.logs=/apps/grafana/data/log \
  cfg:default.paths.data=/apps/grafana/data \
  cfg:default.paths.plugins=/apps/grafana/plugins \
  cfg:default.paths.provisioning=/apps/grafana/provisioning
LimitNOFILE=10000
TimeoutStopSec=20
CapabilityBoundingSet=
DeviceAllow=
LockPersonality=true
MemoryDenyWriteExecute=false
NoNewPrivileges=true
PrivateDevices=true
PrivateTmp=true
ProtectClock=true
ProtectControlGroups=true
ProtectHome=true
ProtectHostname=true
ProtectKernelLogs=true
ProtectKernelModules=true
ProtectKernelTunables=true
ProtectProc=invisible
ProtectSystem=full
RemoveIPC=true
RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX
RestrictNamespaces=true
RestrictRealtime=true
RestrictSUIDSGID=true
SystemCallArchitectures=native
UMask=0027
[Install]
WantedBy=multi-user.target
EOF
# Create the directories referenced by the cfg: overrides.
# /var/run/grafana is not created by hand: /var/run is cleared on reboot and
# RuntimeDirectory=grafana makes systemd recreate it at each start.
mkdir -p /apps/grafana/plugins
mkdir -p /apps/grafana/provisioning
# Start the service now and on boot
systemctl daemon-reload && systemctl start grafana && systemctl enable grafana
#url登录
http://x.x.x.x:3000
#默认密码:admin/admin
# 1. Deploy node_exporter
# Download the tarball from GitHub first
tar -zxf node_exporter-1.4.0.linux-amd64.tar.gz -C /usr/local/
ln -s /usr/local/node_exporter-1.4.0.linux-amd64 /usr/local/node_exporter
# Install the systemd unit
cat > /usr/lib/systemd/system/node_exporter.service << EOF
[Unit]
Description=node_exporter
After=local-fs.target network-online.target network.target
Wants=local-fs.target network-online.target network.target
[Service]
ExecStart=/usr/local/node_exporter/node_exporter
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
# The unit must be loaded and started before the port check below can succeed
systemctl daemon-reload && systemctl start node_exporter && systemctl enable node_exporter
# Verify
ss -utnlp | grep 9100
# node_exporter listens on port 9100 by default
# http://10.18.125.112:9100/metrics
# 2. Hook node_exporter into the Prometheus instance running on the physical host
# Edit the Prometheus config file and add this entry under `scrape_configs:`
  - job_name: 'nginx'
    static_configs:
      - targets: ['127.0.0.1:9100']  # 9100 is the port node_exporter listens on
        labels:
          instance: nginx
# Reload Prometheus
curl -X POST http://localhost:9090/-/reload
#3、导入grafana模板,rancher自带模板,无需导入
#grafana导入node_exporter监控模板
#模板链接:https://grafana.com/grafana/dashboards/1860
#1、mysql_exporter部署
#下载mysql_exporter包
#上传到home目录
cd /home
tar -zxf mysqld_exporter-0.14.0.linux-amd64.tar.gz -C /usr/local/
ln -s /usr/local/mysqld_exporter-0.14.0.linux-amd64 /usr/local/mysqld_exporter
#创建mysqld_exporter用户并授权
mysql -uroot -pMySQL@2022.
MariaDB [mysql]> use mysql;
MariaDB [mysql]> CREATE USER 'mysqld_jk'@'*' IDENTIFIED BY '123456ABCqq' WITH MAX_USER_CONNECTIONS 3;
MariaDB [mysql]> GRANT ALL PRIVILEGES ON *.* TO 'mysqld_jk'@'' IDENTIFIED BY '123456ABCqq';
MariaDB [mysql]> flush privileges;
#注意:
#在创建用户的时候,推荐执行 MAX_USER_CONNECTIONS参数,避免我们监控使用过多的数据库连接数,导致数据库压力过大。
#创建 my.cnf 配置文件【注意账号和密码的保密】
#在和mysqld_exporter文件同级的文件夹中创建my.cnf文件,文件内容如下
vi my.cnf
[client]
user=mysqld_jk
password=123456ABCqq
# Install the systemd unit for mysqld_exporter (`>` overwrites any previous unit)
cat > /usr/lib/systemd/system/mysqld_exporter.service << EOF
[Unit]
Description=mysqld_exporter
Documentation=https://prometheus.io
Wants=network-online.target
After=network.target
[Service]
Type=simple
User=root
ExecStart=/usr/local/mysqld_exporter/mysqld_exporter --config.my-cnf=/usr/local/mysqld_exporter/my.cnf
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
# Load the unit, start it now and enable it on boot
systemctl daemon-reload && systemctl start mysqld_exporter && systemctl enable mysqld_exporter
#systemctl stop mysqld_exporter
# Verify
ps -ef | grep mysqld_exporter
ss -utnlp | grep 9104
# The exporter is exposed on port 9104
# 2. Wire mysqld_exporter, Prometheus and Grafana together
# 1) Add the scrape job under `scrape_configs:` in the Prometheus config
vim /apps/prometheus/prometheus.yml
  - job_name: 'mysqld-exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['192.168.207.132:9104']
# Systemd unit for Prometheus with the lifecycle API enabled.
# `>` (not `>>`): appending would leave two [Service] sections and two
# ExecStart= lines in the same unit file, which systemd rejects.
cat > /usr/lib/systemd/system/prometheus.service << EOF
[Unit]
Description=Prometheus
After=network.target
[Service]
Type=simple
ExecStart=/apps/prometheus/bin/prometheus --config.file=/apps/prometheus/prometheus.yml --web.enable-lifecycle
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
# The new command line only takes effect after systemd re-reads the unit and
# the service is restarted; until then /-/reload is not available.
systemctl daemon-reload && systemctl restart prometheus
# Reload Prometheus (used from now on after every config change)
curl -X POST http://localhost:9090/-/reload
#grafana导入mysql-exporter监控模板
#模板链接:https://grafana.com/grafana/dashboards/7362
# Fine-grained Nginx monitoring with Prometheus, using the built-in stub_status module
# 1. Deploy nginx
# Download the nginx release
wget http://nginx.org/download/nginx-1.20.1.tar.gz
# The tarball contains source code only, so it has to be compiled before
# /usr/local/nginx/sbin/nginx exists. stub_status is not built by default and
# must be enabled at configure time. (A C compiler plus the PCRE and zlib
# development headers are needed for the build.)
mkdir -p /usr/local/src
tar -zxf nginx-1.20.1.tar.gz -C /usr/local/src/
cd /usr/local/src/nginx-1.20.1
./configure --prefix=/usr/local/nginx --with-http_stub_status_module
make && make install
# Start and stop nginx: the binary takes no `start`/`stop` words;
# run it bare to start and send a signal with -s to stop.
/usr/local/nginx/sbin/nginx
/usr/local/nginx/sbin/nginx -s stop
# Check process and port
ps -ef | grep nginx
ss -utlnp | grep ':80 '
# Check that the module is compiled in
# /usr/local/nginx/sbin/nginx -V 2>&1 | grep -o with-http_stub_status_module
# 2. Enable the status endpoint; it needs its own port, here 8099
cat nginx_status.conf
server {
    listen 8099;
    # The port is free to choose; this file must be included from the outer nginx.conf
    location /nginx_status {
        stub_status on;
        access_log off;
        allow 127.0.0.1;
        deny all;
    }
}
# Reload nginx
/usr/local/nginx/sbin/nginx -s reload
# 3. Deploy the collector, nginx-prometheus-exporter
# Download from https://github.com/nginxinc/nginx-prometheus-exporter/releases
# Unpack: tar needs an operation flag (-x extract, -z gunzip, -f archive file)
tar -zxf nginx-prometheus-exporter_0.11.0_linux_amd64.tar.gz -C .
# NOTE(review): confirm the archive really unpacks into this sub-directory;
# if the binary lands directly in the current directory, move it from there.
mv nginx-prometheus-exporter_0.11.0_linux_amd64/nginx-prometheus-exporter /usr/local/bin/
# Configure start on boot: create the unit file with the content below
vim /usr/lib/systemd/system/nginx-prometheus-exporter.service
[Unit]
Description=nginx-prometheus-exporter
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/bin/nginx-prometheus-exporter -nginx.scrape-uri http://127.0.0.1:8099/nginx_status
ExecStop=/bin/kill -s TERM $MAINPID
Restart=on-failure
[Install]
WantedBy=multi-user.target
# Load the unit, start it now and enable it on boot
systemctl daemon-reload && systemctl start nginx-prometheus-exporter && systemctl enable nginx-prometheus-exporter
# 4. Wire nginx exporter, Prometheus and Grafana together
# The exporter exposes its metrics at http://xxx:9113/metrics
# Add the monitored node under `scrape_configs:` in prometheus.yml
  - job_name: 'nginx_status_module'  # collects nginx metrics
    metrics_path: '/metrics'  # path the metrics are pulled from
    scrape_interval: 10s  # scrape period
    static_configs:
      - targets: ['127.0.0.1:9113']  # address of nginx-prometheus-exporter
# Reload Prometheus
curl -X POST http://localhost:9090/-/reload
# Import the Grafana dashboard
# Dashboard link: https://grafana.com/grafana/dashboards/12708
# Install zookeeper_exporter monitoring
# Prometheus ships no official ZooKeeper exporter, so a community exporter from GitHub is used
# NOTE(review): this directory is created but the commands below work in the
# current directory - confirm where the tarball is meant to live.
mkdir ~/zookeeper_exporter
#wget https://github.com/dabealu/zookeeper-exporter/releases/download/v0.1.12/zookeeper-exporter-v0.1.12-linux.tar.gz
# Unpack and copy the binary into place
tar -zxf zookeeper-exporter-v0.1.12-linux.tar.gz -C .
cp zookeeper-exporter-v0.1.12-linux/zookeeper-exporter /usr/local/bin/
# Make it executable
chmod 755 /usr/local/bin/zookeeper-exporter
# Install the systemd unit (`>` overwrites; `>>` would append a duplicate unit on re-run).
# The dabealu/zookeeper-exporter binary downloaded above takes -zk-hosts and
# -listen; --zookeeper / --bind-addr belong to a different exporter and make
# this one exit with a usage error.
# NOTE(review): 192.167.207.213 looks like a typo for a 192.168.x.x address - confirm.
cat > /usr/lib/systemd/system/zookeeper-exporter.service << EOF
[Unit]
Description=prometheus-zookeeper
After=network.target
[Service]
Type=simple
# Set this to the address of the ZooKeeper server(s) to monitor
ExecStart=/usr/local/bin/zookeeper-exporter -zk-hosts 192.167.207.213:2181 -listen 0.0.0.0:9141
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload && systemctl start zookeeper-exporter && systemctl enable zookeeper-exporter
# Exposed on port 9141
# Wire zookeeper_exporter, Prometheus and Grafana together
# The exporter exposes its metrics at http://xxx:9141/metrics
# Add the monitored node under `scrape_configs:` in prometheus.yml
cd /apps/prometheus
vim prometheus.yml
  - job_name: 'zookeeper_exporter'  # collects ZooKeeper metrics
    metrics_path: '/metrics'  # path the metrics are pulled from
    scrape_interval: 10s  # scrape period
    static_configs:
      - targets: ['x.x.x.x:9141']  # address of zookeeper-exporter
# Reload Prometheus
curl -X POST http://localhost:9090/-/reload
# Import the Grafana dashboard
# Dashboard link: https://grafana.com/grafana/dashboards/11442
# 1. Configure Prometheus monitoring
# Install the RocketMQ exporter
# RocketMQ provides an official exporter: https://github.com/apache/rocketmq-exporter
# Build RocketMQ-Exporter
#git clone https://github.com/apache/rocketmq-exporter
# or download the source archive and unpack it, then build
cd rocketmq-exporter
mvn clean install
# Copy the built jar from target/ to the install directory. The shell is
# already inside rocketmq-exporter/, so the path is relative to it, and the
# destination directory has to exist first.
mkdir -p /apps/rocketmq_export
cp target/rocketmq-exporter-0.0.2-SNAPSHOT.jar /apps/rocketmq_export/rocketmq-exporter-0.0.2-SNAPSHOT.jar
# Systemd unit for the RocketMQ exporter.
# ExecStart is not run through a shell: `>/dev/null 2>&1 &` would be passed
# to java as literal arguments, so it is removed (systemd already runs the
# process in the background and captures its output in the journal).
vim /etc/systemd/system/rocketmq_exporter.service
[Unit]
Description=rocketmq_exporter
After=network.target
[Service]
WorkingDirectory=/apps/rocketmq_export
PrivateTmp=true
Restart=always
Type=simple
ExecStart=/bin/java -jar /apps/rocketmq_export/rocketmq-exporter-0.0.2-SNAPSHOT.jar
ExecStop=/usr/bin/kill -15 $MAINPID
[Install]
WantedBy=multi-user.target
# The service name must match the unit file name created above
systemctl daemon-reload && systemctl start rocketmq_exporter && systemctl enable rocketmq_exporter
# 2. Wire rocketmq-exporter, Prometheus and Grafana together
# The exporter exposes its metrics at http://xxx:5557/metrics
# Add the monitored node under `scrape_configs:` in prometheus.yml
cd /apps/prometheus
vim prometheus.yml
  # The job name must be unique in prometheus.yml ('zookeeper_exporter' is
  # already taken) and must match the `up{job="rocketmq"}` alert rule.
  - job_name: 'rocketmq'  # collects RocketMQ metrics
    metrics_path: '/metrics'  # path the metrics are pulled from
    scrape_interval: 10s  # scrape period
    static_configs:
      - targets: ['x.x.x.x:5557']  # address of rocketmq-exporter
# Reload Prometheus
curl -X POST http://localhost:9090/-/reload
# Import the Grafana dashboard
# Dashboard link: https://grafana.com/grafana/dashboards/10477
# Wire redis_exporter, Prometheus and Grafana together
# 1. Deploy redis_exporter
# Download redis_exporter from GitHub
tar -zxf redis/redis_exporter-v1.45.0.linux-amd64.tar.gz -C /usr/local/
ln -s /usr/local/redis_exporter-v1.45.0.linux-amd64 /usr/local/redis_exporter
# Install the systemd unit (`>` overwrites any previous unit)
cat > /usr/lib/systemd/system/redis_exporter.service << EOF
[Unit]
Description=redis_exporter
After=local-fs.target network-online.target network.target
Wants=local-fs.target network-online.target network.target
[Service]
# Address and port of the Redis server to monitor
ExecStart=/usr/local/redis_exporter/redis_exporter --redis.addr X.X.X.X:6379
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload && systemctl start redis_exporter && systemctl enable redis_exporter
# Exposed on port 9121 by default
# Verify
ss -tunlp | grep 9121
ps -ef | grep redis_exporter
# Prometheus configuration
# In the Prometheus install directory, edit prometheus.yml and add under `scrape_configs:`
vim prometheus.yml
  - job_name: 'redis-simple'
    metrics_path: /metrics
    static_configs:
      - targets: ['192.168.207.214:9121']
# Load the new configuration
curl -X POST http://localhost:9090/-/reload
# Import the redis_exporter dashboard into Grafana
# Dashboard link: https://grafana.com/grafana/dashboards/2949
# 1. Deploy process-exporter
# Download the process-exporter package from GitHub
# https://github.com/ncabatoff/process-exporter/tree/master/packaging
# mv /tmp/process-exporter-0.7.10.linux-amd64.tar.gz .
# Unpack and install
tar -zxf process-exporter-0.7.10.linux-amd64.tar.gz -C /usr/local/
ln -s /usr/local/process-exporter-0.7.10.linux-amd64 /usr/local/process-exporter
# process-conf.yaml selects which processes are monitored. It is created at
# the exact path the systemd unit passes via -config.path, not in the
# current directory.
vim /usr/local/process-exporter/process-conf.yaml
process_names:
  - name: "{{.Comm}}"
    cmdline:
      - '.+'
# Install the systemd unit for process-exporter (`>` overwrites any previous unit)
cat > /usr/lib/systemd/system/process_exporter.service << EOF
[Unit]
Description=process_exporter
Documentation=https://github.com/ncabatoff/process-exporter
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/process-exporter/process-exporter -config.path=/usr/local/process-exporter/process-conf.yaml
KillMode=process
Restart=always
[Install]
WantedBy=multi-user.target
EOF
# Load the unit and enable it on boot
systemctl daemon-reload && systemctl enable process_exporter
# Start process-exporter
systemctl daemon-reload && systemctl start process_exporter
# Verify
ps -ef | grep process-exporter
ss -tunlp | grep 9256
# Exposed on port 9256
# Wire process-exporter, Prometheus and Grafana together
# Edit the Prometheus config and reload. The file is prometheus.yml - that is
# what the service's --config.file points at, not prometheus.yaml.
vim /apps/prometheus/prometheus.yml
  - job_name: 'process-exporter'
    static_configs:
      - targets: ['127.0.0.1:9256']
        labels:
          instance: nginx
# Reload Prometheus
curl -X POST http://localhost:9090/-/reload
# Import the process-exporter dashboard into Grafana
# Dashboard link: https://grafana.com/grafana/dashboards/249
# Prepare the Prometheus alert rules for the chosen alert metrics
vim linux_node_host.yaml
# Host alert rules
groups:
  - name: linux
    rules:
      # Target is not answering scrapes
      - alert: node-down(主机宕机)
        expr: up == 0
        for: 1m
        labels:
          # status: critical
          severity: critical
          team: operations
        annotations:
          summary: "{{$labels.instance}}:服务器宕机"
          description: "{{$labels.instance}}:服务器延时超过1分钟"
      # CPU usage = 100 - idle%. The expression aggregates by instance, so
      # `instance` is the only label left for the annotations.
      - alert: node-cpu
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 60
        for: 1m
        labels:
          # status: warning
          severity: warning
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高!"
          description: "{{$labels.instance}} CPU使用大于60%(目前值为:{{humanize $value}} %)"
          # summary: host {{ $labels.nodename }} CPU usage above threshold for 1 minute, currently {{humanize $value}} %
      # Memory usage = (total - (free + buffers + cached)) / total.
      # The previous expression added buffers/cached back and subtracted the
      # result from 100, so it did not measure usage.
      - alert: node-memory
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          # status: warning
          severity: warning
        annotations:
          summary: "{{$labels.instance}} 内存使用率过高!"
          description: "{{$labels.instance}} 内存使用大于80%(目前值为:{{humanize $value}} %)"
      # Disk busy time in percent; fires above 60% as the description states
      # (the previous `100 - x < 60` form fired above 40%).
      - alert: node-IO
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance) * 100 > 60
        for: 1m
        labels:
          # status: severe
          severity: warning
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
          description: "{{$labels.instance}} 流入磁盘IO大于60%(目前值为:{{humanize $value}} %)"
      # Per-filesystem usage; here `mountpoint` is a real label of the series
      - alert: node-disk
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
        for: 1m
        labels:
          # status: severe
          severity: warning
        annotations:
          summary: "{{$labels.instance}} {{$labels.mountpoint}} 磁盘分区使用率过高!"
          description: "{{$labels.instance}} {{$labels.mountpoint }} 磁盘分区使用大于80%(目前值为:{{humanize $value}} %)"
      - alert: node-HostCpuHighIowait
        expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 5
        for: 0m
        labels:
          severity: warning
        annotations:
          # summary: Host CPU high iowait (instance {{ $labels.instance }})
          summary: "{{$labels.instance}} 主机iowait等待值过高!"
          description: "CPU iowait当前值大于5%. .\n VALUE = {{humanize $value}}\n LABELS = {{ $labels }}"
# Configure the rules directory in Prometheus and connect Prometheus to Alertmanager.
# The file is prometheus.yml - the path given to --config.file in the service unit.
vim /apps/prometheus/prometheus.yml
# prometheus.yml
# my global config
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration (connects Prometheus to Alertmanager)
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  # directory holding the alert rules; only *.yml files match
  # - "first_rules.yml"
  # - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
# Load the alert rules
mkdir /apps/prometheus/rules -p
# Copy under a .yml name: rule_files only globs rules/*.yml, so a file
# ending in .yaml would be ignored without any error.
cp linux_node_host.yaml /apps/prometheus/rules/linux_node_host.yml
curl -X POST http://prometheus_ip:9090/-/reload
# Check in the web UI
http://x.x.x.x:9090/rules
# Prepare the Prometheus alert rules for the chosen alert metrics
vim zookeeper.yaml
# ZooKeeper alert rules
groups:
  - name: zookeeper服务监控
    rules:
      # zk_up is 0 when the exporter cannot reach ZooKeeper
      - alert: ZookeeperDown宕机
        expr: zk_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Zookeeper Down (instance {{ $labels.instance }})
          description: "Zookeeper down on instance {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Configure the rules directory in Prometheus and connect Prometheus to Alertmanager.
# The file is prometheus.yml - the path given to --config.file in the service unit.
vim /apps/prometheus/prometheus.yml
# prometheus.yml
# my global config
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration (connects Prometheus to Alertmanager)
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  # directory holding the alert rules; only *.yml files match
  # - "first_rules.yml"
  # - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
# Load the alert rules
mkdir /apps/prometheus/rules -p
# Copy under a .yml name: rule_files only globs rules/*.yml, so a file
# ending in .yaml would be ignored without any error.
cp zookeeper.yaml /apps/prometheus/rules/zookeeper.yml
curl -X POST http://prometheus_ip:9090/-/reload
# Check in the web UI
http://x.x.x.x:9090/rules
# Prepare the Prometheus alert rules for the chosen alert metrics
vim nginx.yaml
# Nginx alert rules
groups:
  - name: nginx服务监控
    rules:
      - alert: nginx服务停止
        expr: nginx_up == 0
        for: 1m
        labels:
          # Use the same severity vocabulary as the other rule files; the
          # Alertmanager inhibit rule matches on 'critical' / 'warning'.
          severity: critical
        annotations:
          # `instance` is used because no `alias` label is configured on the scrape jobs
          summary: " {{ $labels.instance }} nginx服务已停止,当前状态{{ $value }}"
          description: "{{$labels.instance}}: nginx服务停止运行 "
# Configure the rules directory in Prometheus and connect Prometheus to Alertmanager.
# The file is prometheus.yml - the path given to --config.file in the service unit.
vim /apps/prometheus/prometheus.yml
# prometheus.yml
# my global config
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration (connects Prometheus to Alertmanager)
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  # directory holding the alert rules; only *.yml files match
  # - "first_rules.yml"
  # - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
# Load the alert rules
mkdir /apps/prometheus/rules -p
# Copy under a .yml name: rule_files only globs rules/*.yml, so a file
# ending in .yaml would be ignored without any error.
cp nginx.yaml /apps/prometheus/rules/nginx.yml
curl -X POST http://prometheus_ip:9090/-/reload
# Check in the web UI
http://x.x.x.x:9090/rules
# Prepare the Prometheus alert rules for the chosen alert metrics
vim Redis.yaml
# Redis alert rules
# Redis service monitoring
groups:
  - name: Redis服务监控
    rules:
      - alert: Redis服务停止
        expr: redis_up == 0
        for: 1m
        labels:
          # Use the same severity vocabulary as the other rule files; the
          # Alertmanager inhibit rule matches on 'critical' / 'warning'.
          severity: critical
        annotations:
          # `instance` is used because no `alias` label is configured on the scrape jobs
          summary: " {{ $labels.instance }} Redis服务已停止,当前状态{{ $value }}"
          description: "{{$labels.instance}}:Redis 服务停止运行 "
      - alert: Redis服务器内存占用大于80%
        expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Redis out of system memory (instance {{ $labels.instance }})
          description: "Redis is running out of system memory (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Only meaningful when maxmemory is configured: with maxmemory unset the
      # metric is 0 and the division yields +Inf, which would always fire.
      - alert: RedisOutOfConfiguredMaxmemory
        expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80 and on(instance) redis_memory_max_bytes > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Redis out of configured maxmemory (instance {{ $labels.instance }})
          description: "Redis is running out of configured maxmemory (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Configure the rules directory in Prometheus and connect Prometheus to Alertmanager.
# The file is prometheus.yml - the path given to --config.file in the service unit.
vim /apps/prometheus/prometheus.yml
# prometheus.yml
# my global config
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration (connects Prometheus to Alertmanager)
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  # directory holding the alert rules; only *.yml files match
  # - "first_rules.yml"
  # - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
# Load the alert rules
mkdir /apps/prometheus/rules -p
# Copy under a .yml name: rule_files only globs rules/*.yml, so a file
# ending in .yaml would be ignored without any error.
cp Redis.yaml /apps/prometheus/rules/Redis.yml
curl -X POST http://prometheus_ip:9090/-/reload
# Check in the web UI
http://x.x.x.x:9090/rules
# Prepare the Prometheus alert rules for the chosen alert metrics
vim rocketmq.yaml
# RocketMQ alert rules
groups:
  - name: rocketmq
    rules:
      # Requires a scrape job named exactly "rocketmq" in prometheus.yml
      - alert: RocketMQ宕机
        expr: up{job="rocketmq"} == 0
        for: 20s
        labels:
          # Use the same severity vocabulary as the other rule files; the
          # Alertmanager inhibit rule matches on 'critical' / 'warning'.
          severity: critical
        annotations:
          summary: RocketMQ {{ $labels.instance }} is down
# Configure the rules directory in Prometheus and connect Prometheus to Alertmanager.
# The file is prometheus.yml - the path given to --config.file in the service unit.
vim /apps/prometheus/prometheus.yml
# prometheus.yml
# my global config
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration (connects Prometheus to Alertmanager)
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  # directory holding the alert rules; only *.yml files match
  # - "first_rules.yml"
  # - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
# Load the alert rules
mkdir /apps/prometheus/rules -p
# Copy under a .yml name: rule_files only globs rules/*.yml, so a file
# ending in .yaml would be ignored without any error.
cp rocketmq.yaml /apps/prometheus/rules/rocketmq.yml
curl -X POST http://prometheus_ip:9090/-/reload
# Check in the web UI
http://x.x.x.x:9090/rules
# Prepare the Prometheus alert rules for the chosen alert metrics
vim mysql.yaml
# MySQL alert rules
groups:
  - name: MySQL-rules
    rules:
      # mysql_up is reported by mysqld_exporter and is 0 when the database is
      # unreachable. A bare `up == 0` would fire for every down target of any job.
      - alert: MySQL服务停止
        expr: mysql_up == 0
        for: 5s
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: MySQL has stop !!!"
          description: "检测MySQL数据库运行状态"
      - alert: MysqlTooManyConnections(连接数大于80%)
        expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
          description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: MySQL主从IO线程停止
        expr: mysql_slave_status_slave_io_running == 0
        for: 5s
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: MySQL Slave IO Thread has stop !!!"
          description: "检测MySQL主从IO线程运行状态"
      - alert: MySQL主从SQL线程停止
        expr: mysql_slave_status_slave_sql_running == 0
        for: 5s
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: MySQL Slave SQL Thread has stop !!!"
          description: "检测MySQL主从SQL线程运行状态"
      # Replication lag is seconds_behind_master; sql_delay is the configured
      # artificial delay, and `== 30` only matched one exact value.
      - alert: MySQL主从延时大于30s
        expr: mysql_slave_status_seconds_behind_master > 30
        for: 5s
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: MySQL Slave Delay has more than 30s !!!"
          description: "检测MySQL主从延时状态"
      - alert: MysqlInnodbLogWaits
        expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
          description: "MySQL innodb log writes stalling\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Configure the rules directory in Prometheus and connect Prometheus to Alertmanager.
# The file is prometheus.yml - the path given to --config.file in the service unit.
vim /apps/prometheus/prometheus.yml
# prometheus.yml
# my global config
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration (connects Prometheus to Alertmanager)
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  # directory holding the alert rules; only *.yml files match
  # - "first_rules.yml"
  # - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["localhost:9090"]
# Load the alert rules
mkdir /apps/prometheus/rules -p
# Copy under a .yml name: rule_files only globs rules/*.yml, so a file
# ending in .yaml would be ignored without any error.
cp mysql.yaml /apps/prometheus/rules/mysql.yml
curl -X POST http://prometheus_ip:9090/-/reload
# Check in the web UI
http://x.x.x.x:9090/rules