以下为prometheus.yml文件的配置 

# Defaults applied to every scrape job and to rule evaluation.
global:
  scrape_interval: 10s
  scrape_timeout: 10s
  evaluation_interval: 10s

# Alertmanager endpoint that fired alerts are sent to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '10.11.62.26:9093'

# Rule files loaded by this server.
rule_files:
  - 'alertmanager_rules.yml'
  - 'zookeeper_rules.yml'

# Remote storage endpoints: samples are written to and read back from the
# `prometheus` database behind the local HTTP API on port 8086.
remote_write:
  - url: 'http://localhost:8086/api/v1/prom/write?db=prometheus'
remote_read:
  - url: 'http://localhost:8086/api/v1/prom/read?db=prometheus'
scrape_configs:
  # Targets discovered from the Consul catalog at 10.11.62.13:8500. The
  # `services` list is empty, so discovery is not restricted to named services.
  - job_name: 'MicroService'
    consul_sd_configs:
      - server: '10.11.62.13:8500'
        services: []
    relabel_configs:
      # Default metrics path for every discovered target.
      - source_labels: [__meta_consul_tags]
        regex: '(.*)'
        replacement: '/actuator/prometheus'
        target_label: __metrics_path__
      # When a tag of the form contextPath=<path> is present, prefix the
      # metrics path with <path>. The value is captured only up to the next
      # comma so that tags listed after contextPath cannot leak into the path.
      - source_labels: [__meta_consul_tags]
        regex: '.*contextPath=([^,]+).*'
        replacement: '${1}/actuator/prometheus'
        target_label: __metrics_path__
      # Services whose name starts with "consul" are scraped on port 9107 of
      # __meta_consul_address ...
      - source_labels: [__meta_consul_service, __meta_consul_address]
        regex: '(consul.*);(.*)'
        replacement: '$2:9107'
        target_label: __address__
      # ... and at /metrics instead of the actuator path set above.
      - source_labels: [__meta_consul_service]
        regex: '(consul.*)'
        replacement: '/metrics'
        target_label: __metrics_path__
      # Copy Consul metadata into labels that do not start with "__", so the
      # values remain on the scraped series and can be referenced as
      # $labels.meta_consul_* in alerting rules.
      - source_labels: [__meta_consul_service]
        regex: '(.+)'
        replacement: '${1}'
        target_label: meta_consul_service
      - source_labels: [__meta_consul_service_address]
        regex: '(.+)'
        replacement: '${1}'
        target_label: meta_consul_service_address
      - source_labels: [__meta_consul_service_id]
        regex: '(.+)'
        replacement: '${1}'
        target_label: meta_consul_service_id
      - source_labels: [__meta_consul_service_port]
        regex: '(.+)'
        replacement: '${1}'
        target_label: meta_consul_service_port
      # Use the host part of the address (port stripped) as the instance label.
      # The pattern needs a capture group for ${1} to expand to anything, and
      # because relabel regexes are anchored at both ends it must also consume
      # the optional ":port" suffix in order to match at all.
      - source_labels: [__address__]
        regex: '([^:]+)(?::\d+)?'
        replacement: '${1}'
        target_label: instance
#  - job_name: 'kong'
#    metrics_path: '/metrics'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
#    static_configs:
#      - targets: ['10.11.62.27:8001','10.11.62.28:8001','10.11.62.4:8001','10.11.62.5:8001']
#  - job_name: 'zk'
#    metrics_path: '/metrics'
    # metrics_path defaults to '/metrics'
    #     # scheme defaults to 'http'.
#    static_configs:
#      - targets: ['10.11.62.9:9141','10.11.62.4:9141','10.11.62.5:9141']
#  - job_name: 'kafka'
#    metrics_path: '/metrics'
    # metrics_path defaults to '/metrics'
    #     # scheme defaults to 'http'.
#    static_configs:
#      - targets: ['10.11.62.32:9308']
#  - job_name: node_exporter
#    static_configs:
#     - targets: ['10.11.62.32:9100']
  - job_name: 'sysmonitor'
    file_sd_configs:
    - refresh_interval: 1m
      files:
      - ./conf.d/*.json
  - job_name: 'zk'
    file_sd_configs:
    - refresh_interval: 1m
      files:
      - ./conf.d/kafka/*.json
  - job_name: 'kong'
    file_sd_configs:
    - refresh_interval: 1m
      files:
      - ./conf.d/kong/*.json

conf.d目录下文件列表如下

kafka目录下的文件

内容如下

[
  {
    "targets": ["10.11.62.4:9141"],
    "labels": {
      "instance": "10.11.62.4",
      "env": "product",
      "name": "kfzx-gw-1"
    }
  },
  {
    "targets": ["10.11.62.5:9141"],
    "labels": {
      "instance": "10.11.62.5",
      "env": "product",
      "name": "kfzx-gw-2"
    }
  },
  {
    "targets": ["10.11.62.9:9141"],
    "labels": {
      "instance": "10.11.62.9",
      "env": "product",
      "name": "kfzx-cache-2"
    }
  }
]

同样在zookeeper上也有类似的内容

alertmanager_rules.yml内容如下

groups:
# Fires when a target of any job other than MicroService fails its scrape
# (up == 0).
- name: 服务器系统监控
  rules:
    - alert: '服务器系统监控'
      expr: 'up{job!="MicroService"} == 0'
      # Time the condition must hold before the alert moves to firing.
      for: 1s
      labels:
        # Custom labels attached to the alert.
        course: 'instancemonitor'
        hostname: '{{ $labels.name }}'
        # Copy of the instance label; presumably meant for matching in
        # Alertmanager inhibit rules — confirm against the Alertmanager config.
        inhibit_instance: '{{ $labels.instance }}'
      annotations:
        # Custom one-line summary.
        summary: 'Instance {{ $labels.instance }} down'
        severity: '紧急'
        # Custom detailed description.
        # NOTE(review): the text mentions 15s while `for` is 1s — confirm
        # which one is intended.
        description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 15s.'
# Fires when a target of the MicroService job fails its scrape (up == 0).
- name: 服务状态监控
  rules:
    - alert: '微服务状态监控'
      expr: 'up{job="MicroService"} == 0'
      # Time the condition must hold before the alert moves to firing.
      for: 1s
      labels:
        # Custom labels built from the meta_consul_* labels of the series.
        course: '{{ $labels.meta_consul_service }}'
        hostname: '{{ $labels.meta_consul_service_id }}'
        # Service address; presumably meant for matching in Alertmanager
        # inhibit rules — confirm against the Alertmanager config.
        inhibit_instance: '{{ $labels.meta_consul_service_address }}'
      annotations:
        # Custom one-line summary.
        summary: '微服务{{ $labels.meta_consul_service }}异常'
        severity: '紧急'
        # Custom detailed description.
        description: '端口为{{ $labels.meta_consul_service_port }}的微服务{{ $labels.meta_consul_service }} 在 {{ $labels.meta_consul_service_address }} 上的实例异常 .'
# Fires when the used/max ratio of the "PS Old Gen" heap pool, expressed as a
# percentage, stays above 95.
- name: 微服务PS Old Gen监控
  rules:
    - alert: '微服务PS Old Gen监控'
      expr: '(jvm_memory_used_bytes{area="heap",id="PS Old Gen"})*100/(jvm_memory_max_bytes{area="heap",id="PS Old Gen"}) > 95'
      # Time the condition must hold before the alert moves to firing.
      for: 5m
      labels:
        # Custom labels built from the meta_consul_* labels of the series.
        course: '{{ $labels.meta_consul_service }}'
        hostname: '{{ $labels.meta_consul_service_id }}'
      annotations:
        # Custom one-line summary.
        summary: '微服务{{ $labels.meta_consul_service }} PS Old Gen 超过95%'
        severity: '紧急'
        # Custom detailed description; {{ $value }} is the computed percentage.
        description: '端口为{{ $labels.meta_consul_service_port }}的微服务{{ $labels.meta_consul_service }}在{{ $labels.meta_consul_service_address }}上的实例 PS Old Gen 持续增长达到超过95%以上,当前值为{{ $value }}%'
# Host resource alerts (CPU, filesystem, memory) computed from node_* metrics.
- name: 服务器资源监控
  rules:
  # CPU busy percentage: 100 minus the average idle rate across cores.
  - alert: CPUUsage
    # `name` is kept in the aggregation so that {{ $labels.name }} below is
    # populated; labels not listed in `by (...)` are dropped from the result.
    expr: 100 - ((avg by (instance,job,env,name)(irate(node_cpu_seconds_total{mode="idle"}[30s]))) *100) > 75
    for: 1s  # time the condition must hold before the alert moves to firing
    labels:
      course: "serverMonitor"   # custom label
      hostname: "{{ $labels.name }}"
    annotations:
      summary: "服务器 {{ $labels.instance }} cpu 使用率超过75%" # custom summary
      severity: "一般"
      description: "服务器{{ $labels.instance }}在30秒内cpu使用率持续在75%以上,当前值 {{ $value }}%"
  # Used percentage of every filesystem except tmpfs.
  - alert: FilesystemUsage
    expr: (1 - (node_filesystem_free_bytes{fstype!="tmpfs"}) / node_filesystem_size_bytes{fstype!="tmpfs"}) * 100 >75
    for: 1s  # time the condition must hold before the alert moves to firing
    labels:
      course: "serverMonitor"   # custom label
      hostname: "{{ $labels.name }}"
    annotations:
      summary: "服务器 {{ $labels.instance }} 磁盘使用率超过75%" # custom summary
      severity: "一般"
      description: "服务器{{ $labels.instance }}磁盘使用率在75%以上,当前值 {{ $value }}%"
  # Memory used percentage, derived from MemAvailable / MemTotal.
  - alert: MemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes)))* 100 > 90
    for: 20s  # time the condition must hold before the alert moves to firing
    labels:
      course: "serverMonitor"   # custom label
      hostname: "{{ $labels.name }}"
    annotations:
      summary: "服务器 {{ $labels.instance }} 内存使用率超过90%" # custom summary
      severity: "一般"
      # The description states the same 90% threshold as the expression and
      # the summary.
      description: "服务器{{ $labels.instance }}内存使用率在90%以上,当前值 {{ $value }}%"

 

在alertmanager服务中的配置文件中增加抑制规则

 

# While the server-down alert is firing, mute the microservice-status alert
# that carries the same inhibit_instance value.
inhibit_rules:
- source_match:
    alertname: '服务器系统监控'
  target_match:
    alertname: '微服务状态监控'
  # `equal` lists label names whose values must match between the source and
  # target alert. Both alerting rules set `inhibit_instance`. Naming a label
  # that neither alert carries would compare as equal (missing == missing) and
  # mute every target alert regardless of host.
  equal: ['inhibit_instance']

 

神坑,请记住:如果要在 relabel_configs 中通过 target_label 生成标签,供告警规则和 alertmanager 模板引用(如 {{ $labels.meta_consul_service_address }}),切记 target_label 的标签名一定不能以下划线开头——以双下划线 `__` 开头的标签会在 relabel 完成后被 Prometheus 丢弃,告警中就取不到这个值。

# Copy the Consul service address into a label whose name does not start with
# an underscore. All keys of one list item share the same indentation.
- source_labels: [__meta_consul_service_address]
  regex: '(.+)'
  replacement: ${1}
  target_label: meta_consul_service_address

 

模板配置就按照普通的模板配置就行如微信模板如下:

{{/* WeChat message body: renders one start/end delimited section per alert in .Alerts, showing its severity, alert name, instance, summary, description and start time. */}}
{{ define "wechat.default.message" }}
{{ range .Alerts }}
========start==========
告警程序:prometheus_alert
告警级别:{{ .Annotations.severity }}
告警类型:{{ .Labels.alertname }}
故障主机: {{ .Labels.instance }}
告警主题: {{ .Annotations.summary }}
告警详情: {{ .Annotations.description }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
========end==========
{{ end }}
{{ end }}

Logo

权威|前沿|技术|干货|国内首个API全生命周期开发者社区

更多推荐