监控告警的部署与配置(Prometheus+Alertmanager+Grafana)

本文详细介绍了如何部署和配置监控告警系统,包括Prometheus的安装,Alertmanager的单机部署,以及Grafana的单机部署。接着,文章讲解了针对不同服务如node、mariadb、nginx、zookeeper、rocketmq、redis和process的监控配置,以及相应的告警规则设定,帮助运维人员实现全面的系统监控和及时的故障预警。
    • Prometheus部署

#新建存放tar目录
mkdir ~/prometheus
#下载tar包
wget https://github.com/prometheus/prometheus/releases/download/v2.29.2/prometheus-2.29.2.linux-amd64.tar.gz -P ~/prometheus
#解压到安装目录,假如为/apps
mkdir /apps
tar -zxf ~/prometheus/prometheus-2.29.2.linux-amd64.tar.gz -C /apps
ln -s /apps/prometheus-2.29.2.linux-amd64 /apps/prometheus
mkdir /apps/prometheus/bin && mv /apps/prometheus/prometheus /apps/prometheus/bin
 
#配置prometheus的systemd启动脚本
cat >> /usr/lib/systemd/system/prometheus.service << EOF
[Unit]
Description=Prometheus
After=network.target
[Service]
Type=simple
ExecStart=/apps/prometheus/bin/prometheus --config.file=/apps/prometheus/prometheus.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF

# 检查ss -tunlp | grep 9090 或lsof -i:9090
 
curl http://192.168.207.38:9090
    • Alertmanager 单机部署

#1、安装Alertmanager
#官网下载https://prometheus.io/download/
tar -xvf alertmanager-0.20.0.linux-amd64.tar.gz  
mv alertmanager-0.20.0.linux-amd64 /usr/local/alertmanager
 
# cat alertmanager.yml
global:
  resolve_timeout: 10m
  smtp_from: monitor@test.com   #发送告警邮件的发件人邮箱
  smtp_hello: '@test.com'
  smtp_smarthost: mail.test.com:587
  smtp_auth_username: monitor@test.com
  smtp_auth_password: Monitor#2021
  smtp_require_tls: false
route:
  group_by: ['alertname']                         # 分组
  group_wait: 30s                                 # 告警等待
  group_interval: 5m                              #
  repeat_interval: 48h
  receiver: 'web.hook'
receivers:
- name: 'web.hook'
  email_configs:
  - to: ' monitor@test.com '                        # 告警接收人,可多个
    html: '{{ template "test.html" . }}'
    send_resolved: true
#  webhook_configs:
#  - url: 'http://127.0.0.1:5001/'
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
templates:
- /usr/local/alertmanager/template/*.tmpl    #邮件告警模板存放目录
 
#注意:接收邮件的服务器配置信息要准确,包括地址、端口、密码
 
#告警模板配置说明:
#1、修改alertmanager.yml,配置模板地址,然后在每个receiver引用模板
 ----
templates:
- '/usr/local/alertmanager/template/email.tmpl'
...
...
receivers:
- name: 'web.hook'
  email_configs:
  - to: ' monitor@test.com '                        # 告警接收人,可多个
    html: '{{ template "test.html" . }}'
    send_resolved: true
#  webhook_configs:
 ----
 
#邮件告警模板【存放目录/usr/local/alertmanager/template】
vim  template/email.tmpl
alertmanager参考模板 
{{ define "test.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}
========= ERROR ==========<br>
告警名称:{{ .Labels.alertname }}<br>
告警级别:{{ .Labels.severity }}<br>
告警机器:{{ .Labels.instance }} {{ .Labels.device }}<br>
告警详情:{{ .Annotations.summary }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
========= END ==========<br>
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}
========= INFO ==========<br>
告警名称:{{ .Labels.alertname }}<br>
告警级别:{{ .Labels.severity }}<br>
告警机器:{{ .Labels.instance }}<br>
告警详情:{{ .Annotations.summary }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
恢复时间:{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
========= END ==========<br>
{{- end }}
{{- end }}
{{- end }}
#注:告警模板如果配置有问题,会导致邮件发送失败,注意观察日志。
 
#修改好配置文件后,可以使用amtool工具检查配置
./amtool check-config alertmanager.yml

#Checking 'alertmanager.yml'  SUCCESS
 
#服务开机配置
cat >/usr/lib/systemd/system/alertmanager.service   <<EOF
[Unit]
Description=alertmanager
 
[Service]
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml --storage.path=/usr/local/alertmanager/data --web.listen-address=:9093 --data.retention=120h  
Restart=on-failure
 
[Install]
WantedBy=multi-user.target
 
EOF
 
systemctl enable alertmanager    
systemctl restart alertmanager 
 
#alertmanager默认运行端口是:9093
ss -tunlp | grep 9093

#alertmanager也可以同prometheus一样热加载配置
curl -X POST http://alertmanager_ip:9093/-/reload
    • Grafana单机部署

#部署grafana
#1、新建tar包目录
mkdir ~/grafana
#下载tar包
wget https://dl.grafana.com/enterprise/release/grafana-enterprise-9.2.3.linux-amd64.tar.gz -P ~/grafana
tar -zxvf ~/grafana/grafana-enterprise-9.2.3.linux-amd64.tar.gz -C /apps
ln -s /apps/grafana-9.2.3 /apps/grafana 
#2、grafana对接数据库mariadb
mysql -uroot -pMySQL@2022.
MariaDB [(none)]> create database grafana;
MariaDB [(none)]> GRANT ALL PRIVILEGES ON grafana.* TO 'grafana'@'%' IDENTIFIED BY "MySQL@2022.aaa" WITH GRANT OPTION;
MariaDB [(none)]> flush privileges;
MariaDB [(none)]> exit;
 
#修改grafana连数据库配置
chmod 755 -R /apps/grafana-9.2.3
vim /apps/grafana/conf/defaults.ini
type = mysql
host = 127.0.0.1:3306
name = grafana
user = grafana
password = MySQL@2022.aaa
url = mysql://grafana:MySQL@2022.aaa@192.168.2.110:3306/grafana

#启动 grafana
/apps/grafana/bin/grafana-server &
#查看 
ss -utnlp | grep 3000
ps -ef | grep grafana
#后台运行并将日志写入文件(重启后不会自动拉起,开机自启请使用下面的systemd服务)
nohup /apps/grafana/bin/grafana-server >/apps/grafana/grafana.log 2>&1 &
#服务启动配置
cat >> /usr/lib/systemd/system/grafana.service << EOF
[Unit]
Description=Grafana instance
Documentation=http://docs.grafana.org
Wants=network-online.target
After=network-online.target
After=postgresql.service mariadb.service mysqld.service
[Service]
Type=notify
Restart=on-failure
WorkingDirectory=/apps/grafana
RuntimeDirectory=/apps/grafana
#RuntimeDirectoryMode=0750
ExecStart=/apps/grafana/bin/grafana-server --config=/apps/grafana/conf/defaults.ini --pidfile=/var/run/grafana/grafana-server.pid --packaging=tar \
  cfg:default.paths.logs=/apps/grafana/data/log \
  cfg:default.paths.data=/apps/grafana/data \
  cfg:default.paths.plugins=/apps/grafana/plugins \
  cfg:default.paths.provisioning=/apps/grafana/provisioning
LimitNOFILE=10000
TimeoutStopSec=20
CapabilityBoundingSet=
DeviceAllow=
LockPersonality=true
MemoryDenyWriteExecute=false
NoNewPrivileges=true
PrivateDevices=true
PrivateTmp=true
ProtectClock=true
ProtectControlGroups=true
ProtectHome=true
ProtectHostname=true
ProtectKernelLogs=true
ProtectKernelModules=true
ProtectKernelTunables=true
ProtectProc=invisible
ProtectSystem=full
RemoveIPC=true
RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX
RestrictNamespaces=true
RestrictRealtime=true
RestrictSUIDSGID=true
SystemCallArchitectures=native
UMask=0027
[Install]
WantedBy=multi-user.target

EOF

#新建目录
mkdir /apps/grafana/plugins   
mkdir /apps/grafana/provisioning 
mkdir /var/run/grafana

#服务启动
systemctl daemon-reload && systemctl start grafana && systemctl enable grafana

#url登录
http://x.x.x.x:3000
#默认密码:admin/admin
    • 监控配置

    • node监控

#1、部署node_exporter
#github下载
tar -zxf node_exporter-1.4.0.linux-amd64.tar.gz -C /usr/local/
ln -s /usr/local/node_exporter-1.4.0.linux-amd64 /usr/local/node_exporter
#服务启动
cat > /usr/lib/systemd/system/node_exporter.service << EOF
[Unit]
Description=node_exporter
After=local-fs.target network-online.target network.target
Wants=local-fs.target network-online.target network.target
 
[Service]
ExecStart=/usr/local/node_exporter/node_exporter 
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF

#查看
ss -utnlp | grep 9100
#node_exporter 默认端口为 9100
# http://10.18.125.112:9100/metrics
 
#2.、物理机部署prometheus的接入node_exporter;
#修改prometheus配置文件
- job_name: 'nginx'
  static_configs:
  - targets: ['127.0.0.1:9100']  #9100为工具启动的端口
    labels:
      instance: nginx
      
#重载prometheus
curl -X POST http://localhost:9090/-/reload
#3、导入grafana模板,rancher自带模板,无需导入
#grafana导入node_exporter监控模板
#模板链接:https://grafana.com/grafana/dashboards/1860
    • mariadb 监控

#1、mysql_exporter部署
#下载mysql_exporter包
#上传到home目录
cd /home    
tar -zxf mysqld_exporter-0.14.0.linux-amd64.tar.gz -C /usr/local/
ln -s /usr/local/mysqld_exporter-0.14.0.linux-amd64 /usr/local/mysqld_exporter

#创建mysqld_exporter用户并授权
mysql -uroot -pMySQL@2022.
MariaDB [mysql]> use mysql;
MariaDB [mysql]> CREATE USER 'mysqld_jk'@'%' IDENTIFIED BY '123456ABCqq' WITH MAX_USER_CONNECTIONS 3;

MariaDB [mysql]> GRANT ALL PRIVILEGES ON *.* TO 'mysqld_jk'@'%' IDENTIFIED BY '123456ABCqq';
MariaDB [mysql]> flush privileges;
#注意:
#在创建用户的时候,推荐执行 MAX_USER_CONNECTIONS参数,避免我们监控使用过多的数据库连接数,导致数据库压力过大。

#创建 my.cnf 配置文件【注意账号和密码的保密】
#在和mysqld_exporter文件同级的文件夹中创建my.cnf文件,文件内容如下
vi my.cnf
[client]
user=mysqld_jk
password=123456ABCqq
#服务启动配置
cat > /usr/lib/systemd/system/mysqld_exporter.service << EOF
[Unit]
Description=mysqld_exporter
Documentation=https://prometheus.io
Wants=network-online.target
After=network.target
[Service]
Type=simple
User=root
ExecStart=/usr/local/mysqld_exporter/mysqld_exporter --config.my-cnf=/usr/local/mysqld_exporter/my.cnf
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload && systemctl start mysqld_exporter && systemctl enable mysqld_exporter
#systemctl stop mysqld_exporter
#查看
ps -ef | grep mysqld_exporter
ss -utnlp | grep 9104

#对外暴露端口为:9104
#2、mysql_exporter、prometheus、grafana结合
#1)创建服务器监控动态配置文件
 vim /apps/prometheus/prometheus.yml
  - job_name: 'mysqld-exporter'
    metrics_path: /metrics
    static_configs:
      - targets: ['192.168.207.132:9104']

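#开机服务(使用 > 覆盖写入,避免追加出重复的配置段;新增--web.enable-lifecycle以支持热加载,写入后需执行 systemctl daemon-reload && systemctl restart prometheus 生效)
cat > /usr/lib/systemd/system/prometheus.service << EOF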
[Unit]
Description=Prometheus
After=network.target
[Service]
Type=simple
ExecStart=/apps/prometheus/bin/prometheus --config.file=/apps/prometheus/prometheus.yml --web.enable-lifecycle
Restart=on-failure
[Install]
WantedBy=multi-user.target

EOF

#重载prometheus
curl -X POST http://localhost:9090/-/reload
#grafana导入mysql-exporter监控模板
#模板链接:https://grafana.com/grafana/dashboards/7362

    • nginx监控

#用Prometheus细化Nginx监控、使用默认stub_status模块监控
#1、nginx部署
#nginx版本下载
wget http://nginx.org/download/nginx-1.20.1.tar.gz
#解压部署
tar -zxf nginx-1.20.1.tar.gz -C /usr/local/
ln -s /usr/local/nginx-1.20.1 /usr/local/nginx
#启动与停止Nginx
/usr/local/nginx/sbin/nginx
/usr/local/nginx/sbin/nginx -s stop
#进程、端口查看
ps -ef | grep nginx
ss -utlnp | grep 80
 
#查看模块
# nginx -V 2>&1 | grep -o with-http_stub_status_module
#2、修改配置,开启监控,需一个监控端口,假设为8099
cat nginx_status.conf
server {
        listen   8099;  
        #端口可以自己重新起一个,配置文件要在外层nginx.conf中添加
        location /nginx_status {
            stub_status on;
            access_log off;
            allow 127.0.0.1;
            deny all;
        }
}
#nginx重载
/usr/local/nginx/sbin/nginx -s reload

#3、部署监控收集端nginx-prometheus-exporter
#下载 https://github.com/nginxinc/nginx-prometheus-exporter/releases
#解压部署
tar -zxf nginx-prometheus-exporter_0.11.0_linux_amd64.tar.gz -C .
mv nginx-prometheus-exporter_0.11.0_linux_amd64/nginx-prometheus-exporter /usr/local/bin/

#配置开机启动
 vim /usr/lib/systemd/system/nginx-prometheus-exporter.service 
[Unit]
Description=nginx-prometheus-exporter
After=network.target
[Service]
Type=simple
ExecStart=/usr/local/bin/nginx-prometheus-exporter -nginx.scrape-uri http://127.0.0.1:8099/nginx_status 
ExecStop=/bin/kill -s TERM $MAINPID
Restart=on-failure
[Install]
WantedBy=multi-user.target
#服务启动
systemctl daemon-reload && systemctl start nginx-prometheus-exporter && systemctl enable nginx-prometheus-exporter
 
#4、nginx_exporter、prometheus、grafana结合
#外暴露监控接口http://xxx:9113/metrics.
#prometheus.yml文件添加被监控的机器节点;
- job_name: 'nginx_status_module' # 采集nginx的指标
  metrics_path: '/metrics' # 拉取指标的接口路径
  scrape_interval: 10s # 采集指标的间隔周期
  static_configs:
  - targets: ['127.0.0.1:9113'] # nginx-prometheus-exporter服务的ip和端口
#重新加载prometheus
curl -X POST http://localhost:9090/-/reload
#导入grafana模板;
#模板链接:https://grafana.com/grafana/dashboards/12708
    • zookeeper监控

#安装zookeeper_exporter监控
#目前prometheus中没有现成的zookeeper插件,所以选择github中验证过的zookeeper插件
mkdir ~/zookeeper_exporter
 
#wget https://github.com/dabealu/zookeeper-exporter/releases/download/v0.1.12/zookeeper-exporter-v0.1.12-linux.tar.gz
#解压,cp到指定目录
tar -zxf zookeeper-exporter-v0.1.12-linux.tar.gz -C .
cp zookeeper-exporter-v0.1.12-linux/zookeeper-exporter /usr/local/bin/
#赋予执行权限
chmod 755 /usr/local/bin/zookeeper-exporter
#服务启动脚本
cat >> /usr/lib/systemd/system/zookeeper-exporter.service << EOF
[Unit]
Description= prometheus-zookeeper
After=network.target
[Service]
Type=simple
#填写监控正式ip地址
ExecStart=/usr/local/bin/zookeeper-exporter --zookeeper 192.167.207.213:2181 --bind-addr :9141
Restart=on-failure
[Install]
WantedBy=multi-user.target

EOF

systemctl daemon-reload && systemctl start zookeeper-exporter && systemctl enable zookeeper-exporter

#默认暴露端口:9141
#zookeeper_exporter、prometheus、grafana结合
#外暴露监控接口http://xxx:9141/metrics.
#prometheus.yml文件添加被监控的机器节点;
cd /apps/prometheus
vim prometheus.yml
- job_name: 'zookeeper_exporter' # 采集zookeeper的指标
  metrics_path: '/metrics' # 拉取指标的接口路径
  scrape_interval: 10s # 采集指标的间隔周期
  static_configs:
  - targets: ['x.x.x.x:9141'] # zookeeper-exporter服务的ip和端口
#重新加载prometheus
curl -X POST http://localhost:9090/-/reload
#导入grafana模板;
#模板链接:https://grafana.com/grafana/dashboards/11442
    • rocketmq监控

#1、配置Prometheus监控
#安装RocketMQ Exporter
#RocketMQ官方已经提供了exporter,官方链接 https://github.com/apache/rocketmq-exporter
#编译 RocketMQ-Exporter
#git clone https://github.com/apache/rocketmq-exporter
#或下载压缩包解压,编译
cd rocketmq-exporter
mvn clean install
#在rocketmq-export编译后的target目录下,cp编译包到指定目录
cp rocketmq-exporter/target/rocketmq-exporter-0.0.2-SNAPSHOT.jar /apps/rocketmq_export/rocketmq-exporter-0.0.2-SNAPSHOT.jar

#服务启动脚本
vim /etc/systemd/system/rocketmq_exporter.service
[Service]
WorkingDirectory=/apps/rocketmq_export
PrivateTmp=true
Restart=always
Type=simple
ExecStart=/bin/java -jar /apps/rocketmq_export/rocketmq-exporter-0.0.2-SNAPSHOT.jar
ExecStop=/usr/bin/kill -15 $MAINPID
[Install]
WantedBy=multi-user.target

systemctl daemon-reload && systemctl start rocketmq_exporter && systemctl enable rocketmq_exporter
 
#2、rocketmq—exporter、prometheus、grafana结合
#外暴露监控接口http://xxx:5557/metrics.
#prometheus.yml文件添加被监控的机器节点;
cd /apps/prometheus
vim prometheus.yml
- job_name: 'rocketmq' # 采集rocketmq的指标
  metrics_path: '/metrics' # 拉取指标的接口路径
  scrape_interval: 10s # 采集指标的间隔周期
  static_configs:
  - targets: ['x.x.x.x:5557'] # rocketmq-exporter服务的ip和端口
#重新加载prometheus
curl -X POST http://localhost:9090/-/reload
#导入grafana模板;
#模板链接:https://grafana.com/grafana/dashboards/10477
    • redis监控

#redis_exporter、prometheus、grafana结合
#1、部署redis_exporter
#GitHub下载redis_exporter
tar -zxf redis/redis_exporter-v1.45.0.linux-amd64.tar.gz -C /usr/local/
ln -s /usr/local/redis_exporter-v1.45.0.linux-amd64 /usr/local/redis_exporter
#配置启动脚本
cat > /usr/lib/systemd/system/redis_exporter.service << EOF
[Unit]
Description=redis_exporter
After=local-fs.target network-online.target network.target
Wants=local-fs.target network-online.target network.target
 
[Service]
#ip地址与端口
ExecStart=/usr/local/redis_exporter/redis_exporter --redis.addr X.X.X.X:6379
Restart=on-failure
[Install]
WantedBy=multi-user.target
EOF

systemctl daemon-reload && systemctl start redis_exporter && systemctl enable redis_exporter
#对外默认暴露端口:9121
#查看
ss -tunlp | grep 9121
ps -ef | grep redis_exporter
#prometheus配置
#进入prometheus安装目录,编辑prometheus.yml
vim prometheus.yml
- job_name: 'redis-simple'
  metrics_path: /metrics
  static_configs:
  - targets: ['192.168.207.214:9121']
      
#配置加载
curl -X POST http://localhost:9090/-/reload
 
#grafana导入redis_exporter监控模板
#模板链接:https://grafana.com/grafana/dashboards/2949
    • process监控

#1、部署process-exporter
#GitHub下载process-exporter包
#https://github.com/ncabatoff/process-exporter/tree/master/packaging
# mv /tmp/process-exporter-0.7.10.linux-amd64.tar.gz .
#解压,安装
tar -zxf process-exporter-0.7.10.linux-amd64.tar.gz -C /usr/local/
ln -s /usr/local/process-exporter-0.7.10.linux-amd64 /usr/local/process-exporter
 
#process-conf.yaml为配置监控进程的配置文件
vim process-conf.yaml
 
process_names:
  - name: "{{.Comm}}"
    cmdline: 
    - '.+'
 
#服务启动配置
cat > /usr/lib/systemd/system/process_exporter.service << EOF
[Unit]
Description=process_exporter
Documentation=https://github.com/ncabatoff/process-exporter
After=network.target
 
[Service]
Type=simple
ExecStart=/usr/local/process-exporter/process-exporter -config.path=/usr/local/process-exporter/process-conf.yaml
KillMode=process
Restart=always
 
[Install]
WantedBy=multi-user.target
EOF
 
#加载并开机自启
 
systemctl daemon-reload && systemctl enable process_exporter
#启动process exporter
systemctl daemon-reload && systemctl start process_exporter
 
 
#查看
ps -ef | grep process-exporter
ss -tunlp | grep 9256
 
#对外暴露端口:9256
 
#process-exporter、prometheus、grafana结合
 
#修改prometheus配置文件并重启
 
vim prometheus.yml
 
- job_name: 'process-exporter'
  static_configs:
  - targets: ['127.0.0.1:9256']  
    labels:
      instance: nginx

      
#重载prometheus
curl -X POST http://localhost:9090/-/reload
#grafana导入process-exporter监控模板
#模板链接:https://grafana.com/grafana/dashboards/249

    • 告警规则配置

    • 主机告警规则

#根据告警指标,准备prometheus的告警规则
vim linux_node_host.yaml
主机告警规则 
groups:
    - name: linux
      rules:
      - alert: node-down(主机宕机)
        expr: up == 0
        for: 1m
        labels:
          #status: critical
          severity: critical
          team: operations
        annotations:
          summary: "{{$labels.instance}}:服务器宕机"
          description: "{{$labels.instance}}:服务器延时超过1分钟"
         
    
      - alert: node-cpu
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 60
        for: 1m
        labels:
          #status: warning
          severity: warning
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高!"
          description: "{{$labels.instance}} CPU使用大于60%(目前值为:{{humanize $value}} %)"
          #summary: 主机 {{ $labels.nodename }} 的 CPU使用率持续1分钟超出阈值,当前为 {{humanize $value}} %
  
      - alert: node-memory
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          #status: warning
          severity: warning
        annotations:
          summary: "{{$labels.instance}} 内存使用率过高!"
          description: "{{$labels.instance}} 内存使用大于80%(目前值为:{{humanize $value}} %)"
          
      - alert: node-IO
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance) * 100 > 60
        for: 1m
        labels:
          #status: 严重告警
          severity: warning
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
          description: "{{$labels.instance}} 流入磁盘IO大于60%(目前值为:{{humanize $value}} %)"
    
      - alert: node-disk
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
        for: 1m
        labels:
          #status: 严重告警
          severity: warning
        annotations:
          summary: "{{$labels.mountpoint}} 磁盘分区使用率过高!"
          description: "{{$labels.mountpoint }} 磁盘分区使用大于80%(目前值为:{{humanize $value}} %)"
          
      - alert: node-HostCpuHighIowait
        expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 5
        for: 0m
        labels:
          severity: warning
        annotations:
          #summary: Host CPU high iowait (instance {{ $labels.instance }})
          summary: "{{$labels.instance}} 主机iowait等待值过高!"
          description: "CPU iowait当前值大于5%. .\n  VALUE = {{humanize $value}}\n  LABELS = {{ $labels }}"
 
#prometheus配置rules存储目录、prometheus 接入alertmanager
vim /apps/prometheus/prometheus.yml
prometheus.yml 
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
 
# Alertmanager configuration  #prometheus 接入alertmanager
alerting:
  alertmanagers:  
    - static_configs:
        - targets:
          - 127.0.0.1:9093
 
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  #告警规则存储目录
  # - "first_rules.yml"
  # - "second_rules.yml"
 
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
 
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
 
    static_configs:
      - targets: ["localhost:9090"]
 
 
#prometheus告警规则加载
mkdir /apps/prometheus/rules -p 
#rule_files只匹配rules/*.yml,拷贝时需使用.yml后缀,否则规则不会被加载
cp  linux_node_host.yaml  /apps/prometheus/rules/linux_node_host.yml
 curl -X POST http://prometheus_ip:9090/-/reload
 
#web查看
http://x.x.x.x:9090/rules
    • zookeeper告警规则

#根据告警指标,准备prometheus的告警规则
vim zookeeper.yaml
主机告警规则 
groups:
- name: zookeeper服务监控
  rules:
  - alert: ZookeeperDown宕机
    expr: zk_up == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Zookeeper Down (instance {{ $labels.instance }})
      description: "Zookeeper down on instance {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
  
#prometheus配置rules存储目录、prometheus 接入alertmanager
vim /apps/prometheus/prometheus.yml
prometheus.yml 
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
 
# Alertmanager configuration  #prometheus 接入alertmanager
alerting:
  alertmanagers:  
    - static_configs:
        - targets:
          - 127.0.0.1:9093
 
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  #告警规则存储目录
  # - "first_rules.yml"
  # - "second_rules.yml"
 
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
 
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
 
    static_configs:
      - targets: ["localhost:9090"]
 
 
#prometheus告警规则加载
mkdir /apps/prometheus/rules -p 
#rule_files只匹配rules/*.yml,拷贝时需使用.yml后缀,否则规则不会被加载
cp  zookeeper.yaml  /apps/prometheus/rules/zookeeper.yml
 curl -X POST http://prometheus_ip:9090/-/reload
 
#web查看
http://x.x.x.x:9090/rules

    • nginx告警规则

#根据告警指标,准备prometheus的告警规则
vim nginx.yaml
主机告警规则 
groups:
- name: nginx服务监控
  rules:
  - alert: nginx服务停止
    expr: nginx_up == 0
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      summary: " {{ $labels.alias }} nginx服务已停止,当前状态{{ $value }}"
      description: "{{$labels.instance}}: nginx服务停止运行 "    
#prometheus配置rules存储目录、prometheus 接入alertmanager
vim /apps/prometheus/prometheus.yml
prometheus.yml 
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
 
# Alertmanager configuration  #prometheus 接入alertmanager
alerting:
  alertmanagers:  
    - static_configs:
        - targets:
          - 127.0.0.1:9093
 
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  #告警规则存储目录
  # - "first_rules.yml"
  # - "second_rules.yml"
 
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
 
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
 
    static_configs:
      - targets: ["localhost:9090"]
 
 
#prometheus告警规则加载
mkdir /apps/prometheus/rules -p 
#rule_files只匹配rules/*.yml,拷贝时需使用.yml后缀,否则规则不会被加载
cp  nginx.yaml  /apps/prometheus/rules/nginx.yml
 curl -X POST http://prometheus_ip:9090/-/reload
 
#web查看
http://x.x.x.x:9090/rules
    • Redis告警规则

#根据告警指标,准备prometheus的告警规则
vim Redis.yaml
主机告警规则 
  Redis服务监控
groups:
- name: Redis服务监控
  rules:
  - alert: Redis服务停止
    expr: redis_up == 0
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      summary: " {{ $labels.alias }} Redis服务已停止,当前状态{{ $value }}"
      description: "{{$labels.instance}}:Redis 服务停止运行 "
      
  - alert: Redis服务器内存占用大于80%
    expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Redis out of system memory (instance {{ $labels.instance }})
      description: "Redis is running out of system memory (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      
  - alert: RedisOutOfConfiguredMaxmemory
    expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Redis out of configured maxmemory (instance {{ $labels.instance }})
      description: "Redis is running out of configured maxmemory (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
      
 
#prometheus配置rules存储目录、prometheus 接入alertmanager
vim /apps/prometheus/prometheus.yml
prometheus.yml 
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
 
# Alertmanager configuration  #prometheus 接入alertmanager
alerting:
  alertmanagers:  
    - static_configs:
        - targets:
          - 127.0.0.1:9093
 
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  #告警规则存储目录
  # - "first_rules.yml"
  # - "second_rules.yml"
 
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
 
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
 
    static_configs:
      - targets: ["localhost:9090"]
 
 
#prometheus告警规则加载
mkdir /apps/prometheus/rules -p 
#rule_files只匹配rules/*.yml,拷贝时需使用.yml后缀,否则规则不会被加载
cp  Redis.yaml  /apps/prometheus/rules/Redis.yml
 curl -X POST http://prometheus_ip:9090/-/reload
 
#web查看
http://x.x.x.x:9090/rules
    • rocketmq告警规则

#根据告警指标,准备prometheus的告警规则
vim rocketmq.yaml
主机告警规则 
groups:
- name: rocketmq
  rules:
  - alert: RocketMQ宕机
    expr: up{job="rocketmq"} == 0
    for: 20s
    labels: 
      severity: '灾难'
    annotations:
      summary: RocketMQ {{ $labels.instance }} is down
 
#prometheus配置rules存储目录、prometheus 接入alertmanager
vim /apps/prometheus/prometheus.yml
prometheus.yml 
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
 
# Alertmanager configuration  #prometheus 接入alertmanager
alerting:
  alertmanagers:  
    - static_configs:
        - targets:
          - 127.0.0.1:9093
 
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  #告警规则存储目录
  # - "first_rules.yml"
  # - "second_rules.yml"
 
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
 
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
 
    static_configs:
      - targets: ["localhost:9090"]
 
 
#prometheus告警规则加载
mkdir /apps/prometheus/rules -p 
#rule_files只匹配rules/*.yml,拷贝时需使用.yml后缀,否则规则不会被加载
cp  rocketmq.yaml  /apps/prometheus/rules/rocketmq.yml
 curl -X POST http://prometheus_ip:9090/-/reload
 
#web查看
http://x.x.x.x:9090/rules
    • mysql告警规则

#根据告警指标,准备prometheus的告警规则
vim mysql.yaml
主机告警规则 
groups:
- name: MySQL-rules
  rules:
  - alert: MySQL服务停止
    expr: mysql_up == 0
    for: 5s 
    labels:
      severity: warning
    annotations:
      summary: "{{$labels.instance}}: MySQL has stop !!!"
      description: "检测MySQL数据库运行状态"
          
  - alert: MysqlTooManyConnections(连接数大于80%)
    expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
      description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
 
  - alert: MySQL主从IO线程停止
    expr: mysql_slave_status_slave_io_running == 0
    for: 5s 
    labels:
      severity: warning
    annotations: 
      summary: "{{$labels.instance}}: MySQL Slave IO Thread has stop !!!"
      description: "检测MySQL主从IO线程运行状态"
 
  - alert: MySQL主从SQL线程停止 
    expr: mysql_slave_status_slave_sql_running == 0
    for: 5s 
    labels:
      severity: warning
    annotations: 
      summary: "{{$labels.instance}}: MySQL Slave SQL Thread has stop !!!"
      description: "检测MySQL主从SQL线程运行状态"
 
  - alert: MySQL主从延时大于30s 
    expr: mysql_slave_status_sql_delay > 30
    for: 5s 
    labels:
      severity: warning
    annotations: 
      summary: "{{$labels.instance}}: MySQL Slave Delay has more than 30s !!!"
      description: "检测MySQL主从延时状态"
          
  - alert: MysqlInnodbLogWaits
    expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
      description: "MySQL innodb log writes stalling\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
#prometheus配置rules存储目录、prometheus 接入alertmanager
vim /apps/prometheus/prometheus.yml
prometheus.yml 
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
 
# Alertmanager configuration  #prometheus 接入alertmanager
alerting:
  alertmanagers:  
    - static_configs:
        - targets:
          - 127.0.0.1:9093
 
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*.yml"  #告警规则存储目录
  # - "first_rules.yml"
  # - "second_rules.yml"
 
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
 
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
 
    static_configs:
      - targets: ["localhost:9090"]
 
 
#prometheus告警规则加载
mkdir /apps/prometheus/rules -p 
#rule_files只匹配rules/*.yml,拷贝时需使用.yml后缀,否则规则不会被加载
cp  mysql.yaml  /apps/prometheus/rules/mysql.yml
 curl -X POST http://prometheus_ip:9090/-/reload
 
#web查看
http://x.x.x.x:9090/rules

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值