官网: https://prometheus.io/docs/alerting/0.21/configuration/

前言

1)查看monitoring的svc
[root@k8s-master-01 manifests]# kubectl get svc -n monitoring 
NAME                    TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)                      AGE
alertmanager-main       ClusterIP   10.110.81.241    <none>        9093/TCP                     45h
alertmanager-operated   ClusterIP   None             <none>        9093/TCP,9094/TCP,9094/UDP   45h
grafana                 ClusterIP   10.111.57.238    <none>        3000/TCP                     45h
kube-state-metrics      ClusterIP   None             <none>        8443/TCP,9443/TCP            45h
node-exporter           ClusterIP   None             <none>        9100/TCP                     45h
prometheus-adapter      ClusterIP   10.105.251.120   <none>        443/TCP                      45h
prometheus-k8s          ClusterIP   10.99.136.60     <none>        9090/TCP                     45h
prometheus-operated     ClusterIP   None             <none>        9090/TCP                     15h
prometheus-operator     ClusterIP   None             <none>        8443/TCP                     45h

2)查看ingress-nginx的svc
[root@k8s-master-01 manifests]# kubectl get svc -n ingress-nginx 
NAME                                 TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)                      AGE
ingress-nginx-controller             NodePort    10.106.104.168   <none>        80:32146/TCP,443:32516/TCP   5d21h
ingress-nginx-controller-admission   ClusterIP   10.98.140.148    <none>        443/TCP                      5d21h



3)查看alertmanager-main的详情
[root@k8s-master-01 manifests]# kubectl describe -n monitoring svc alertmanager-main
Name:              alertmanager-main
Namespace:         monitoring
Labels:            alertmanager=main
Annotations:       <none>
Selector:          alertmanager=main,app=alertmanager
Type:              ClusterIP
IP Families:       <none>
IP:                10.110.81.241
IPs:               10.110.81.241
Port:              web  9093/TCP
TargetPort:        web/TCP
Endpoints:         10.244.1.157:9093,10.244.2.150:9093,10.244.2.151:9093
Session Affinity:  ClientIP
Events:            <none>

4)判断alertmanager=main是svc的标签选择器(Selector),因为通过这个标签可以过滤出来alertmanager-main-0/1/2
[root@k8s-master-01 manifests]# kubectl get pods -n monitoring -l alertmanager=main
NAME                  READY   STATUS    RESTARTS   AGE
alertmanager-main-0   2/2     Running   2          10h
alertmanager-main-1   2/2     Running   6          45h
alertmanager-main-2   2/2     Running   2          10h

5)prometheus的控制器是statefulsets
[root@k8s-master-01 manifests]# kubectl get statefulsets.apps -n monitoring 
NAME                READY   AGE
alertmanager-main   3/3     45h
prometheus-k8s      2/2     15h

6)查看secret
[root@k8s-master-01 manifests]# kubectl get secrets -n monitoring 
NAME                              TYPE                                  DATA   AGE
alertmanager-main                 Opaque                                1      45h
alertmanager-main-token-bq5c6     kubernetes.io/service-account-token   3      45h
default-token-s89hq               kubernetes.io/service-account-token   3      45h
etcd-certs                        Opaque                                3      15h
grafana-datasources               Opaque                                1      45h
grafana-token-2h6h9               kubernetes.io/service-account-token   3      45h
kube-state-metrics-token-h7x7c    kubernetes.io/service-account-token   3      45h
node-exporter-token-bb29z         kubernetes.io/service-account-token   3      45h
prometheus-adapter-token-4cxwm    kubernetes.io/service-account-token   3      45h
prometheus-k8s                    Opaque                                1      15h
prometheus-k8s-tls-assets         Opaque                                0      15h
prometheus-k8s-token-6n527        kubernetes.io/service-account-token   3      45h
prometheus-operator-token-9dd6p   kubernetes.io/service-account-token   3      45h

7)查看secret的详情===你修改alertmanager-main的配置规则其实就是修改secret里面的alertmanager.yaml文件
[root@k8s-master-01 manifests]# kubectl describe secrets -n monitoring alertmanager-main
Name:         alertmanager-main
Namespace:    monitoring
Labels:       <none>
Annotations:  <none>

Type:  Opaque

Data
====
alertmanager.yaml:  686 bytes
[root@k8s-master-01 manifests]# 

8)输出为alertmanager-main yaml的格式,反解析一下
[root@k8s-master-01 manifests]# kubectl get secrets alertmanager-main -n monitoring -o yaml
apiVersion: v1
data:
  alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKImluaGliaXRfcnVsZXMiOgotICJlcXVhbCI6CiAgLSAibmFtZXNwYWNlIgogIC0gImFsZXJ0bmFtZSIKICAic291cmNlX21hdGNoIjoKICAgICJzZXZlcml0eSI6ICJjcml0aWNhbCIKICAidGFyZ2V0X21hdGNoX3JlIjoKICAgICJzZXZlcml0eSI6ICJ3YXJuaW5nfGluZm8iCi0gImVxdWFsIjoKICAtICJuYW1lc3BhY2UiCiAgLSAiYWxlcnRuYW1lIgogICJzb3VyY2VfbWF0Y2giOgogICAgInNldmVyaXR5IjogIndhcm5pbmciCiAgInRhcmdldF9tYXRjaF9yZSI6CiAgICAic2V2ZXJpdHkiOiAiaW5mbyIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAiRGVmYXVsdCIKLSAibmFtZSI6ICJXYXRjaGRvZyIKLSAibmFtZSI6ICJDcml0aWNhbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gIm5hbWVzcGFjZSIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJEZWZhdWx0IgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJXYXRjaGRvZyIKICAtICJtYXRjaCI6CiAgICAgICJzZXZlcml0eSI6ICJjcml0aWNhbCIKICAgICJyZWNlaXZlciI6ICJDcml0aWNhbCI=
kind: Secret
metadata:
  annotations:
    kubectl.kubernetes.io/last-applied-configuration: |
      {"apiVersion":"v1","data":{},"kind":"Secret","metadata":{"annotations":{},"name":"alertmanager-main","namespace":"monitoring"},"stringData":{"alertmanager.yaml":"\"global\":\n  \"resolve_timeout\": \"5m\"\n\"inhibit_rules\":\n- \"equal\":\n  - \"namespace\"\n  - \"alertname\"\n  \"source_match\":\n    \"severity\": \"critical\"\n  \"target_match_re\":\n    \"severity\": \"warning|info\"\n- \"equal\":\n  - \"namespace\"\n  - \"alertname\"\n  \"source_match\":\n    \"severity\": \"warning\"\n  \"target_match_re\":\n    \"severity\": \"info\"\n\"receivers\":\n- \"name\": \"Default\"\n- \"name\": \"Watchdog\"\n- \"name\": \"Critical\"\n\"route\":\n  \"group_by\":\n  - \"namespace\"\n  \"group_interval\": \"5m\"\n  \"group_wait\": \"30s\"\n  \"receiver\": \"Default\"\n  \"repeat_interval\": \"12h\"\n  \"routes\":\n  - \"match\":\n      \"alertname\": \"Watchdog\"\n    \"receiver\": \"Watchdog\"\n  - \"match\":\n      \"severity\": \"critical\"\n    \"receiver\": \"Critical\""},"type":"Opaque"}
  creationTimestamp: "2021-09-01T04:14:48Z"
  managedFields:
  - apiVersion: v1
    fieldsType: FieldsV1
    fieldsV1:
      f:data:
        .: {}
        f:alertmanager.yaml: {}
      f:metadata:
        f:annotations:
          .: {}
          f:kubectl.kubernetes.io/last-applied-configuration: {}
      f:type: {}
    manager: kubectl-client-side-apply
    operation: Update
    time: "2021-09-01T04:14:48Z"
  name: alertmanager-main
  namespace: monitoring
  resourceVersion: "818924"
  selfLink: /api/v1/namespaces/monitoring/secrets/alertmanager-main
  uid: d7efc459-9d10-4f89-85a9-5bf852496b59
type: Opaque



9)反解析出来alertmanager.yaml文件
[root@k8s-master-01 manifests]# echo Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKImluaGliaXRfcnVsZXMiOgotICJlcXVhbCI6CiAgLSAibmFtZXNwYWNlIgogIC0gImFsZXJ0bmFtZSIKICAic291cmNlX21hdGNoIjoKICAgICJzZXZlcml0eSI6ICJjcml0aWNhbCIKICAidGFyZ2V0X21hdGNoX3JlIjoKICAgICJzZXZlcml0eSI6ICJ3YXJuaW5nfGluZm8iCi0gImVxdWFsIjoKICAtICJuYW1lc3BhY2UiCiAgLSAiYWxlcnRuYW1lIgogICJzb3VyY2VfbWF0Y2giOgogICAgInNldmVyaXR5IjogIndhcm5pbmciCiAgInRhcmdldF9tYXRjaF9yZSI6CiAgICAic2V2ZXJpdHkiOiAiaW5mbyIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAiRGVmYXVsdCIKLSAibmFtZSI6ICJXYXRjaGRvZyIKLSAibmFtZSI6ICJDcml0aWNhbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gIm5hbWVzcGFjZSIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJEZWZhdWx0IgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJXYXRjaGRvZyIKICAtICJtYXRjaCI6CiAgICAgICJzZXZlcml0eSI6ICJjcml0aWNhbCIKICAgICJyZWNlaXZlciI6ICJDcml0aWNhbCI= |base64 -d
"global":
  "resolve_timeout": "5m"
"inhibit_rules":
- "equal":
  - "namespace"
  - "alertname"
  "source_match":
    "severity": "critical"
  "target_match_re":
    "severity": "warning|info"
- "equal":
  - "namespace"
  - "alertname"
  "source_match":
    "severity": "warning"
  "target_match_re":
    "severity": "info"
"receivers":
- "name": "Default"
- "name": "Watchdog"
- "name": "Critical"
"route":
  "group_by":
  - "namespace"
  "group_interval": "5m"
  "group_wait": "30s"
  "receiver": "Default"
  "repeat_interval": "12h"
  "routes":
  - "match":
      "alertname": "Watchdog"
    "receiver": "Watchdog"
  - "match":
      "severity": "critical"
    "receiver": "Critical"
[root@k8s-master-01 manifests]# kubectl get pods -n monitoring 
NAME                                   READY   STATUS    RESTARTS   AGE
alertmanager-main-0                    2/2     Running   2          13h
alertmanager-main-1                    2/2     Running   6          2d
alertmanager-main-2                    2/2     Running   2          13h
grafana-5d9d5f67c4-5rxjl               1/1     Running   3          2d
kube-state-metrics-7fddf8779f-db94d    3/3     Running   3          13h
node-exporter-bw5fp                    2/2     Running   6          2d
node-exporter-n4dgm                    2/2     Running   6          2d
node-exporter-w6pzz                    2/2     Running   6          2d
prometheus-adapter-cb548cdbf-6cmdw     1/1     Running   3          2d
prometheus-k8s-0                       3/3     Running   4          13h
prometheus-k8s-1                       3/3     Running   4          13h
prometheus-operator-6478d8fc6d-x5w9l   2/2     Running   6          2d

[root@k8s-master-01 manifests]# kubectl exec -it -n monitoring alertmanager-main-0 -- sh
Defaulting container name to alertmanager.
Use 'kubectl describe pod/alertmanager-main-0 -n monitoring' to see all of the containers in this pod.
/alertmanager $ cd /etc/alertmanager/config/
/etc/alertmanager/config $ ls
alertmanager.yaml


物理机安装:
[root@k8s-master-01 ~]#  wget https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz

[root@k8s-master-01 ~]# mv alertmanager-0.21.0.linux-amd64.tar.gz /opt/

[root@k8s-master-01 opt]# tar xf alertmanager-0.21.0.linux-amd64.tar.gz -C /usr/local/
[root@k8s-master-01 opt]# cd /usr/local/
[root@k8s-master-01 local]# ll
总用量 0
drwxr-xr-x  2 3434 3434 93 617 2020 alertmanager-0.21.0.linux-amd64
drwxr-xr-x. 2 root root 18 828 16:35 bin
drwxr-xr-x. 2 root root  6 411 2018 etc
drwxr-xr-x. 2 root root  6 411 2018 games
drwxr-xr-x. 2 root root  6 411 2018 include
drwxr-xr-x. 2 root root  6 411 2018 lib
drwxr-xr-x. 2 root root  6 411 2018 lib64
drwxr-xr-x. 2 root root  6 411 2018 libexec
drwxr-xr-x. 2 root root  6 411 2018 sbin
drwxr-xr-x. 5 root root 49 411 2018 share
drwxr-xr-x. 2 root root  6 411 2018 src
[root@k8s-master-01 local]# cd alertmanager-0.21.0.linux-amd64/
[root@k8s-master-01 alertmanager-0.21.0.linux-amd64]# ll
总用量 51644
-rwxr-xr-x 1 3434 3434 28871879 617 2020 alertmanager
-rw-r--r-- 1 3434 3434      380 617 2020 alertmanager.yml
-rwxr-xr-x 1 3434 3434 23987848 617 2020 amtool
-rw-r--r-- 1 3434 3434    11357 617 2020 LICENSE
-rw-r--r-- 1 3434 3434      457 617 2020 NOTICE
报警规则
[root@k8s-master-01 alertmanager-0.21.0.linux-amd64]# vim alertmanager.yml 

global:
  resolve_timeout: 5m

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'web.hook'
receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'http://127.0.0.1:5001/'
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']

AlertManager告警简单部署

alertmanager是普罗米修斯(Prometheus)的报警组件,主要用于针对异常数据进行报警。首先创建一个报警的规则,其次创建路由(给谁发报警信息)。

一、AlertManager告警简介

1.简介

告警能力在Prometheus的架构中被划分成两个独立的部分。

如下所示,通过在Prometheus中定义`AlertRule(告警规则)`。

Prometheus会周期性的对告警规则进行计算,如果满足告警触发条件就会向Alertmanager发送告警信息。

在这里插入图片描述

2.告警规则组成

1)告警名称

​ 用户需要为告警规则命名,当然对于命名而言,需要能够直接表达出该告警的主要内容。

2)告警规则

​ 告警规则实际上主要由PromQL进行定义,其实际意义是当表达式(PromQL)查询结果持续多长时间(During)后触发告警。

​ AlertManager作为一个独立的组件,负责接收并处理来自Prometheus Server(也可以是其它的客户端程序)的告警信息

​ AlertManager可以对这些告警信息进行进一步的处理,比如当接收到大量重复告警时能够消除重复的告警信息,同时对告警信息进行分组并且路由到正确的通知方,Prometheus内置了对邮件,Slack等多种通知方式的支持,同时还支持与Webhook的集成,以支持更多定制化的场景。例如,目前AlertManager还不支持钉钉,那用户完全可以通过Webhook与钉钉机器人进行集成,从而通过钉钉接收告警信息,同时AlertManager还提供了静默和告警抑制机制来对告警通知行为进行优化。

3.Alertmanager特性

​ Alertmanager除了提供基本的告警通知能力以外,还主要提供了如:分组、抑制以及静默等告警特性!

在这里插入图片描述

1)分组

分组机制可以将详细的告警信息合并成一个通知,比如在某些情况下,由于系统宕机导致大量的告警被同时触发,在这种情况下,分组机制可以将这些被触发的告警合并为一个告警通知,避免一次性接受大量的告警通知,而无法对问题进行快速定位。

​ 例如,当集群中有数百个正在运行的服务实例,并且为每一个实例设置了告警规则,若此时发生了网络故障,可能导致大量的服务实例无法连接到数据库,结果就会有数百个告警被发送到AlertManager。

​ 而作为用户,可能只希望在一个通知中就能查看哪些服务实例受到影响,这时就可以按照服务所在的集群,或告警名称对告警进行分组,而将这些告警内聚在一起,成为一个通知。

​ 告警分组、告警时间、及告警的接收方式,可通过AlertManager的配置文件进行配置。

2)抑制

​ 抑制,是指当某一告警发出后,可停止重复发送由此告警引发的其他告警的机制。

​ 例如,当集群不可访问时,触发了一次告警,通过配置AlertManager可以忽略与该集群有关的所有告警,这样可避免接收到大量与实际问题无关的告警通知。

​ 抑制机制同样通过AlertManager的配置文件进行配置。

3)静默

​ 静默,指提供了一个简单的机制,可快速根据标签对告警进行静默处理,若收到的告警符合静默配置,AlertManager则不会发送告警通知。

静默设置需在AlertManager的Web页面上进行配置。

二、安装部署

1.容器化部署

1)部署Alert-Manager

  • 安装监控时自动部署了
[root@k8s-master-01 ~]# cd kube-prometheus/manifests/
[root@k8s-master-01 manifests]# cat alertmanager-alertmanager.yaml (定义alertmanager规则)

apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  labels:
    alertmanager: main
  name: main
  namespace: monitoring
spec:
  image: quay.io/prometheus/alertmanager:v0.20.0
  nodeSelector:
    kubernetes.io/os: linux
  replicas: 3
  securityContext:
    fsGroup: 2000
    runAsNonRoot: true    
    runAsUser: 1000
  serviceAccountName: alertmanager-main    #权限
  version: v0.20.0


2)查看部署结果

[root@k8s-master-01 manifests]# kubectl apply -f  alertmanager-alertmanager.yaml


[root@k8s-master-01 manifests]# kubectl get pods -n monitoring 
NAME                                   READY   STATUS    RESTARTS   AGE
alertmanager-main-0                    2/2     Running   2          12h
alertmanager-main-1                    2/2     Running   6          47h
alertmanager-main-2                    2/2     Running   2          12h
grafana-5d9d5f67c4-5rxjl               1/1     Running   3          47h
kube-state-metrics-7fddf8779f-db94d    3/3     Running   3          13h
node-exporter-bw5fp                    2/2     Running   6          47h
node-exporter-n4dgm                    2/2     Running   6          47h
node-exporter-w6pzz                    2/2     Running   6          47h
prometheus-adapter-cb548cdbf-6cmdw     1/1     Running   3          47h
prometheus-k8s-0                       3/3     Running   4          12h
prometheus-k8s-1                       3/3     Running   4          12h
prometheus-operator-6478d8fc6d-x5w9l   2/2     Running   6          47h


3)创建Ingress

  • 增加一个ingress暴露服务
cat > alertmanage-ingress.yaml  <<EOF
kind: Ingress
apiVersion: extensions/v1beta1
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  rules:
    - host: "www.altermanager.cluster.local.com"
      http:
        paths:
          - backend:
              serviceName: alertmanager-main
              servicePort: 9093
            path: /
EOF

4)部署服务

[root@k8s-master-01 manifests]# kubectl apply -f alertmanage-ingress.yaml 

[root@k8s-master-01 manifests]# kubectl get ingress -n monitoring 
NAME             CLASS    HOSTS                                             ADDRESS         PORTS   AGE
alertmanager     <none>   www.altermanager.cluster.local.com                192.168.15.32   80      45s
grafana          <none>   www.grafana.monitoring.cluster.local.com          192.168.15.32   80      47h
prometheus-k8s   <none>   www.prometheus-k8s.monitoring.cluster.local.com   192.168.15.32   80      47h

[root@k8s-master-01 manifests]# kubectl get svc -n ingress-nginx 
NAME                                 TYPE        CLUSTER-IP       EXTERNAL-IP   PORT(S)                      AGE
ingress-nginx-controller             NodePort    10.106.104.168   <none>        80:32146/TCP,443:32516/TCP   6d
ingress-nginx-controller-admission   ClusterIP   10.98.140.148    <none>        443/TCP                      6d

5)测试访问

编写host文件

192.168.15.33 www.grafana.monitoring.cluster.local.com www.prometheus-k8s.monitoring.cluster.local.com www.altermanager.cluster.local.com


http://www.altermanager.cluster.local.com:32146/

在这里插入图片描述

2、物理机部署

1)下载安装包

[root@promethus ~]# wget https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz

2)创建目录并解压

[root@promethus /prometheus]# mkdir alertmanager
[root@promethus /opt]# tar xf alertmanager-0.21.0.linux-amd64.tar.gz -C /prometheus/alertmanager
[root@promethus /prometheus/alertmanager]# mv alertmanager-0.21.0.linux-amd64/* ./

3)授权

[root@promethus /prometheus/alertmanager]# chown -R prometheus.prometheus /prometheus

4)添加环境变量

[root@promethus /prometheus/alertmanager]# vim /etc/profile.d/prometheus.sh

export PATH=/prometheus:$PATH
export PATH=/prometheus/alertmanager:$PATH

5)加入systemd管理


[Unit]
Description=prometheus-alertmanager server daemon

[Service]
ExecStart=/prometheus/alertmanager/alertmanager --config.file=/prometheus/alertmanager/alertmanager.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target


6)启动并加入开机自启

[root@promethus ~]# systemctl start prometheus-alertmanager.service
[root@promethus ~]# systemctl enable prometheus-alertmanager.service

三、配置告警邮件+微信

1.全局配置

global:
  # 当告警的状态由firing变为resolved以后还要等多长时间,才宣布告警解除。这个主要是解决某些监控指标在阈值边缘上波动,一会儿好一会儿不好。
  resolve_timeout: 1h

2.配置邮件告警

1)发件

smtp_smarthost: 'smtp.exmail.qq.com:465'     #smtp服务器主机
smtp_from: 'dukuan@xxx.com'         
smtp_auth_username: 'dukuan@xxx.com'
smtp_auth_password: 'DKxxx'
# HipChat告警配置
# hipchat_auth_token: '123456789'
# hipchat_auth_url: 'https://hipchat.foobar.org/'

2)收件

  #配置告警接收者信息
receivers:
    #配置邮件收件人
  - name: 'team-ops-mails'    #起个名字,代称方便实用
    email_configs:            #邮件接收配置
      - to: 'dukuan@xxx.com'  #配置收件人邮箱

3)邮件告警模板

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta name="viewport" content="width=device-width" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>{{ template "__subject" . }}</title>
<style>
/* -------------------------------------
    GLOBAL
    A very basic CSS reset
------------------------------------- */
* {
  margin: 0;
  font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
  box-sizing: border-box;
  font-size: 14px;
}
img {
  max-width: 100%;
}
body {
  -webkit-font-smoothing: antialiased;
  -webkit-text-size-adjust: none;
  width: 100% !important;
  height: 100%;
  line-height: 1.6em;
  /* 1.6em * 14px = 22.4px, use px to get airier line-height also in Thunderbird, and Yahoo!, Outlook.com, AOL webmail clients */
  /*line-height: 22px;*/
}
/* Let's make sure all tables have defaults */
table td {
  vertical-align: top;
}
/* -------------------------------------
    BODY & CONTAINER
------------------------------------- */
body {
  background-color: #f6f6f6;
}
.body-wrap {
  background-color: #f6f6f6;
  width: 100%;
}
.container {
  display: block !important;
  max-width: 600px !important;
  margin: 0 auto !important;
  /* makes it centered */
  clear: both !important;
}
.content {
  max-width: 600px;
  margin: 0 auto;
  display: block;
  padding: 20px;
}
/* -------------------------------------
    HEADER, FOOTER, MAIN
------------------------------------- */
.main {
  background-color: #fff;
  border: 1px solid #e9e9e9;
  border-radius: 3px;
}
.content-wrap {
  padding: 30px;
}
.content-block {
  padding: 0 0 20px;
}
.header {
  width: 100%;
  margin-bottom: 20px;
}
.footer {
  width: 100%;
  clear: both;
  color: #999;
  padding: 20px;
}
.footer p, .footer a, .footer td {
  color: #999;
  font-size: 12px;
}
/* -------------------------------------
    TYPOGRAPHY
------------------------------------- */
h1, h2, h3 {
  font-family: "Helvetica Neue", Helvetica, Arial, "Lucida Grande", sans-serif;
  color: #000;
  margin: 40px 0 0;
  line-height: 1.2em;
  font-weight: 400;
}
h1 {
  font-size: 32px;
  font-weight: 500;
  /* 1.2em * 32px = 38.4px, use px to get airier line-height also in Thunderbird, and Yahoo!, Outlook.com, AOL webmail clients */
  /*line-height: 38px;*/
}
h2 {
  font-size: 24px;
  /* 1.2em * 24px = 28.8px, use px to get airier line-height also in Thunderbird, and Yahoo!, Outlook.com, AOL webmail clients */
  /*line-height: 29px;*/
}
h3 {
  font-size: 18px;
  /* 1.2em * 18px = 21.6px, use px to get airier line-height also in Thunderbird, and Yahoo!, Outlook.com, AOL webmail clients */
  /*line-height: 22px;*/
}
h4 {
  font-size: 14px;
  font-weight: 600;
}
p, ul, ol {
  margin-bottom: 10px;
  font-weight: normal;
}
p li, ul li, ol li {
  margin-left: 5px;
  list-style-position: inside;
}
/* -------------------------------------
    LINKS & BUTTONS
------------------------------------- */
a {
  color: #348eda;
  text-decoration: underline;
}
.btn-primary {
  text-decoration: none;
  color: #FFF;
  background-color: #348eda;
  border: solid #348eda;
  border-width: 10px 20px;
  line-height: 2em;
  /* 2em * 14px = 28px, use px to get airier line-height also in Thunderbird, and Yahoo!, Outlook.com, AOL webmail clients */
  /*line-height: 28px;*/
  font-weight: bold;
  text-align: center;
  cursor: pointer;
  display: inline-block;
  border-radius: 5px;
  text-transform: capitalize;
}
/* -------------------------------------
    OTHER STYLES THAT MIGHT BE USEFUL
------------------------------------- */
.last {
  margin-bottom: 0;
}
.first {
  margin-top: 0;
}
.aligncenter {
  text-align: center;
}
.alignright {
  text-align: right;
}
.alignleft {
  text-align: left;
}
.clear {
  clear: both;
}
/* -------------------------------------
    ALERTS
    Change the class depending on warning email, good email or bad email
------------------------------------- */
.alert {
  font-size: 16px;
  color: #fff;
  font-weight: 500;
  padding: 20px;
  text-align: center;
  border-radius: 3px 3px 0 0;
}
.alert a {
  color: #fff;
  text-decoration: none;
  font-weight: 500;
  font-size: 16px;
}
.alert.alert-warning {
  background-color: #E6522C;
}
.alert.alert-bad {
  background-color: #D0021B;
}
.alert.alert-good {
  background-color: #68B90F;
}
/* -------------------------------------
    INVOICE
    Styles for the billing table
------------------------------------- */
.invoice {
  margin: 40px auto;
  text-align: left;
  width: 80%;
}
.invoice td {
  padding: 5px 0;
}
.invoice .invoice-items {
  width: 100%;
}
.invoice .invoice-items td {
  border-top: #eee 1px solid;
}
.invoice .invoice-items .total td {
  border-top: 2px solid #333;
  border-bottom: 2px solid #333;
  font-weight: 700;
}
/* -------------------------------------
    RESPONSIVE AND MOBILE FRIENDLY STYLES
------------------------------------- */
@media only screen and (max-width: 640px) {
  body {
    padding: 0 !important;
  }
  h1, h2, h3, h4 {
    font-weight: 800 !important;
    margin: 20px 0 5px !important;
  }
  h1 {
    font-size: 22px !important;
  }
  h2 {
    font-size: 18px !important;
  }
  h3 {
    font-size: 16px !important;
  }
  .container {
    padding: 0 !important;
    width: 100% !important;
  }
  .content {
    padding: 0 !important;
  }
  .content-wrap {
    padding: 10px !important;
  }
  .invoice {
    width: 100% !important;
  }
}
</style>
</head>

<body itemscope itemtype="http://schema.org/EmailMessage">

<table class="body-wrap">
  <tr>
    <td></td>
    <td class="container" width="600">
      <div class="content">
        <table class="main" width="100%" cellpadding="0" cellspacing="0">
          <tr>
            {{ if gt (len .Alerts.Firing) 0 }}
            <td class="alert alert-warning">
            {{ else }}
            <td class="alert alert-good">
            {{ end }}
              {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }}
                {{ .Name }}={{ .Value }} 
              {{ end }}
            </td>
          </tr>
          <tr>
            <td class="content-wrap">
              <table width="100%" cellpadding="0" cellspacing="0">
                <tr>
                  <td class="content-block">
                    <a href='{{ template "__alertmanagerURL" . }}' class="btn-primary">View in {{ template "__alertmanager" . }}</a>
                  </td>
                </tr>
                {{ if gt (len .Alerts.Firing) 0 }}
                <tr>
                  <td class="content-block">
                    <strong>[{{ .Alerts.Firing | len }}] Firing</strong>
                  </td>
                </tr>
                {{ end }}
                {{ range .Alerts.Firing }}
                <tr>
                  <td class="content-block">
                    <strong>Labels</strong><br />
                    {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }}
                    {{ if gt (len .Annotations) 0 }}<strong>Annotations</strong><br />{{ end }}
                    {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }}
                    <a href="{{ .GeneratorURL }}">Source</a><br />
                  </td>
                </tr>
                {{ end }}

                {{ if gt (len .Alerts.Resolved) 0 }}
                  {{ if gt (len .Alerts.Firing) 0 }}
                <tr>
                  <td class="content-block">
                    <br />
                    <hr />
                    <br />
                  </td>
                </tr>
                  {{ end }}
                <tr>
                  <td class="content-block">
                    <strong>[{{ .Alerts.Resolved | len }}] Resolved</strong>
                  </td>
                </tr>
                {{ end }}
                {{ range .Alerts.Resolved }}
                <tr>
                  <td class="content-block">
                    <strong>Labels</strong><br />
                    {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }}
                    {{ if gt (len .Annotations) 0 }}<strong>Annotations</strong><br />{{ end }}
                    {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}<br />{{ end }}
                    <a href="{{ .GeneratorURL }}">Source</a><br />
                  </td>
                </tr>
                {{ end }}
              </table>
            </td>
          </tr>
        </table>

        <div class="footer">
          <table width="100%">
            <tr>
              <td class="aligncenter content-block"><a href='{{ .ExternalURL }}'>Sent by {{ template "__alertmanager" . }}</a></td>
            </tr>
          </table>
        </div></div>
    </td>
    <td></td>
  </tr>
</table>

</body>
</html>

测试创建规则

建立alertManager报警规则

规则

global:
  resolve_timeout: 1h   # 告警解除前的等待时间,默认是:5m

  # 邮件告警配置
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: 'axxxy@qq.com'
  smtp_auth_username: 'axxxy@qq.com'
  smtp_auth_password: 'qxxxxb'
  smtp_require_tls: false

# 配置报警的模板
templates:
  - '/etc/alertmanager/config/*.tmpl'   #一定要以.tmpl结尾

# 路由
route:
  # 匹配的标签的key
  group_by: ['severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'email'
  routes:
    - receiver: 'email'
      match_re:
      	# 匹配的是标签的值
        service: '^(warning|critical)$'

# 接收者
receivers:
  - name: 'email'
    email_configs:
      - to: '12xxxx30@qq.com'

添加报警

[root@kubernetes-master-01 altermanager]# kubectl create secret generic -n monitoring alertmanager-main --from-file=alertmanager.yaml --from-file=email.tmpl --dry-run -oyaml | kubectl replace -f -

3.配置微信告警

[root@kubernetes-master-01 altermanager]# kubectl create secret generic -n monitoring alertmanager-main --from-file=alertmanager.yaml --from-file=email.tmpl --dry-run -oyaml | kubectl replace -f -

1)发件

wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
  wechat_api_secret: 'JJ'  # 应用管理中自建小程序的Secret
  wechat_api_corp_id: 'ww'  # 企业信息中的企业ID

2)收件

#配置微信收件人
  - name: 'wechat'      #起个名字方便使用
    wechat_configs:     #微信接收配置
      - send_resolved: true       # 问题解决了是否通知
        corp_id: 'ww'             # 企业信息中的企业ID
        api_secret: 'JJ'          # 应用管理中自建小程序的Secret
        to_party: '2'             # 通知组id
        to_user:                  # 通知用户账号
        agent_id: '1000002'       # 应用管理自建小程序的AgentId

3)微信告警模板

{{ define "wechat.default.message" }}
{{ if gt (len .Alerts.Firing) 0 -}}
Alerts Firing:
{{ range .Alerts }}
告警级别:{{ .Labels.severity }}
告警类型:{{ .Labels.alertname }}
故障主机: {{ .Labels.instance }}
告警主题: {{ .Annotations.summary }}
告警详情: {{ .Annotations.description }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{- end }}
{{- end }}
{{ if gt (len .Alerts.Resolved) 0 -}}
Alerts Resolved:
{{ range .Alerts }}
告警级别:{{ .Labels.severity }}
告警类型:{{ .Labels.alertname }}
故障主机: {{ .Labels.instance }}
告警主题: {{ .Annotations.summary }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
恢复时间: {{ .EndsAt.Format "2006-01-02 15:04:05" }}
{{- end }}
{{- end }}
告警链接:
{{ template "__alertmanagerURL" . }}
{{- end }}

4.配置钉钉告警

#通过webhook告警,想要通过钉钉等应用告警时配置
- name: magpie.ding
  webhook_configs:
    - url: http://10.252.3.10:9002/ding_message
      send_resolved: false      # 当问题解决了是否也要通知一下
      
  # 自定义告警通知模板
templates:
- '/etc/alertmanager/config/*.tmpl'

5.其它配置

# route用来设置报警的分发策略,是个重点,告警内容从这里进入,寻找自己应该用哪种策略发送出去
route:
  # 告警应该根据哪些标签进行分组
  group_by: ['job', 'alertname', 'cluster', 'service', 'severity']

  # 同一组的告警发出前要等待多少秒,这个是为了把更多的告警一个批次发出去
  group_wait: 30s

  #同一组的多批次告警间隔多少秒后,才能发出
  group_interval: 5m

  # 重复的告警要等待多久后才能再次发出去
  repeat_interval: 12h

  # 一级的receiver,也就是默认的receiver,当告警进来后没有找到任何子节点和自己匹配,就用这个receiver
  receiver: 'wechat'

  # 上述route的配置会被传递给子路由节点,子路由节点进行重新配置才会被覆盖
  # 子路由树
  routes:

  # 用于匹配label。此处列出的所有label都匹配到才算匹配
  - match_re:
      service: ^(foo1|foo2|baz)$
    receiver: 'wechat'

    # 在带有service标签的告警同时有severity标签时,他可以有自己的子路由,同时具有severity != critical的告警则被发送给接收者team-ops-mails,对severity == critical的告警则被发送到对应的接收者即team-ops-pager
    routes:
    - match:
        severity: critical
      receiver: 'wechat'

  # 比如关于数据库服务的告警,如果子路由没有匹配到相应的owner标签,则都默认由team-DB-pager接收
  - match:
      service: database
    receiver: 'wechat'

  # 我们也可以先根据标签service:database将数据库服务告警过滤出来,然后进一步将所有同时带labelkey为database
  - match:
      severity: critical
    receiver: 'wechat'

# 抑制规则,当出现critical告警时 忽略warning
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  # Apply inhibition if the alertname is the same.
  #   equal: ['alertname', 'cluster', 'service']

四、prometheus配置文件详解

1.基础配置

prometheus.yml为主配置文件,该文件大致分为了global全局配置、alerting告警配置、rules_file、scrape_configs被监控端配置。下面是一个基础配置文件说明

# 全局配置
global:
  scrape_interval:     15s # 数据收集频率
  evaluation_interval: 15s # 多久评估一次规则
  scrape_timeout: 10s  # 收集数据的超时时间

#####Alertmanager配置模块

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
      - targets:
        - '127.0.0.1:9093' #配置告警信息接收端口(targets下直接写主机:端口,不能再嵌套列表)

# ###规则文件,支持通配符
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  # - "rules/*.rules"
  # - "*.rules"


# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'    # 被监控资源组的名称

    # metrics_path defaults to '/metrics' #该行可不写,获取数据URI,默认为/metrics
    # scheme defaults to 'http'.          # #默认http方式采集

    static_configs:           ##节点地址与获取Metrics数据端口,多个地址用逗号分隔,也可以写多行
    - targets: ['localhost:9090']
   #- targets: ['localhost:9090','192.168.1.100:9100']
     #- 192.168.1.101:9100
     #- 192.168.1.102:9100
     #- 192.168.1.103:9100

告警规则可以去这里看看:https://awesome-prometheus-alerts.grep.to/

2、标签配置

Prometheus通过标签可以实现查询过滤,并且还支持重新标签实现动态生成标签、过滤、删除无用标签等灵活配置。

在采集数据之前可以使用relabel_configs进行重新标记,存储数据之前可以使用metric_relabel_configs重新标记。

两种重新打标签的方式都支持以下动作:

replace:默认动作,将匹配到的标签内容做替换 
keep:通过正则匹配,仅保留正则匹配到的目标/样本(不匹配的被丢弃)
drop:通过正则匹配,丢弃正则匹配到的目标/样本
labeldrop:删除指定标签,比如一些默认标签并不需要,可以用该动作删除
labelkeep:仅保留指定标签 

配置文件说明

global:  #全局配置,这里的配置项可以单独配置在某个job中
  scrape_interval: 15s  #采集数据间隔,默认15秒
  evaluation_interval: 15s  #告警规则监测频率,比如已经设置了内存使用大于70%就告警的规则,这里就会每15秒执行一次告警规则
  scrape_timeout: 10s   #采集超时时间

scrape_configs:
  - job_name: 'prometheus-server'  #定义一个监控组名称
    # metrics_path defaults to '/metrics'  #获取数据URI默认为/metrics
    # scheme defaults to 'http'  #默认http方式采集
    static_configs:
    - targets: ['localhost:9090','192.168.1.100:9100']  #节点地址与获取Metrics数据端口,多个地址用逗号分隔,也可以写多行。

  - job_name: 'web_node'  #定义另一个监控组
    # metrics_path defaults to '/metrics'  #获取数据URI默认为/metrics
    # scheme defaults to 'http'  #默认http方式采集
    static_configs:
    - targets: ['10.160.2.107:9100','192.168.1.100:9100']  #组内多个被监控主机
      labels:  #自定义标签,通过标签可以进行查询过滤
        server: nginx  #将上面2个主机打上server标签,值为nginx

  - job_name: 'mysql_node'
    static_configs:
    - targets: ['10.160.2.110:9100','192.168.1.111:9100']
    metric_relabel_configs:   #声明要重命名标签
    - action: replace  #指定动作,replace代表替换标签,也是默认动作
      source_labels: ['job']  #指定需要被action所操作的原标签
      regex: (.*)  #原标签里的匹配条件,符合条件的原标签才会被匹配,支持正则
      replacement: $1  #原标签需要被替换的部分,$1代表regex正则的第一个分组
      target_label: idc  #将$1内容赋值给idc标签

    - action: drop  #正则删除标签示例
      regex: "192.168.100.*"  #正则匹配标签值
      source_labels: ["__address__"]  #需要进行正则匹配的原标签

    - action: labeldrop  #直接删除标签示例
      regex: "job"  #直接写标签名即可

五、检查工具

在启动Prometheus之前可以使用promtool工具对配置文件进行检查

promtool check config prometheus.yml

六、启动命令详解

1、启动命令

prometheus --config.file="/usr/local/prometheus-2.16.0.linux-amd64/prometheus.yml" --web.listen-address="0.0.0.0:9090" --storage.tsdb.path="/data/prometheus" --storage.tsdb.retention.time=15d --web.enable-lifecycle &

2、常用参数详解

--config.file="/usr/local/prometheus/prometheus.yml"  #指定配置文件路径

--web.listen-address="0.0.0.0:9090"  #指定服务端口

--storage.tsdb.path="/data/prometheus"  #指定数据存储路径

--storage.tsdb.retention.time=15d  #数据保留时间

--collector.systemd #开启systemd的服务状态监控,开启后在WEB上可以看到多出相关监控项

--collector.systemd.unit-whitelist=(sshd|nginx).service  #对systemd具体要监控的服务名

--web.enable-lifecycle  #开启热加载配置

Logo

权威|前沿|技术|干货|国内首个API全生命周期开发者社区

更多推荐