### Prometheus + Grafana + Alertmanager + PrometheusAlert: Sending Alerts to a Feishu Monitoring Channel (Hands-On), Part 1
Test environment machines (deployed on Kubernetes)
https://blog.csdn.net/qq_42995639/article/details/105997023?spm=1001.2014.3001.5501 (how to quickly deploy a k8s cluster)
Machine plan: the lab k8s cluster consists of one master node and two worker nodes:
master: 10.1.1.122  CentOS 7.7 1908
Node1:  10.1.1.123  CentOS 7.7 1908
Node2:  10.1.1.124  CentOS 7.7 1908
1. Installing and configuring the node-exporter component
```bash
[root@master prometheus]# kubectl apply -f node_export.yaml
daemonset.apps/node-exporter created
[root@master prometheus]# cat node_export.yaml
```

```yaml
apiVersion: apps/v1
kind: DaemonSet            # runs an identical pod on every node
metadata:
  name: node-exporter
  namespace: prometheus    # namespace
  labels:
    name: node-exporter
spec:
  selector:
    matchLabels:
      name: node-exporter
  template:
    metadata:
      labels:
        name: node-exporter
    spec:
      hostPID: true        # share the host's PID namespace
      hostIPC: true        # share the host's IPC namespace
      hostNetwork: true    # all containers in the pod use the host network directly
      containers:
      - name: node-exporter
        image: prom/node-exporter:latest
        ports:
        - containerPort: 9100
        resources:         # resource requests
          requests:
            cpu: 0.15      # scheduling requires at least 0.15 CPU cores free
        securityContext:
          privileged: true # run in privileged mode
        args:              # exporter arguments
        - --path.procfs
        - /host/proc       # read proc from the host mount defined below
        - --path.sysfs
        - /host/sys        # read sys from the host mount defined below
        - --collector.filesystem.ignored-mount-points
        - '^/(sys|proc|dev|host|etc)($|/)'
        volumeMounts:      # mount points (the volumes themselves are defined under volumes below)
        - name: dev
          mountPath: /host/dev   # the host's /dev
        - name: proc
          mountPath: /host/proc  # the host's /proc (source of the host's process/system info)
        - name: sys
          mountPath: /host/sys   # the host's /sys
        - name: rootfs
          mountPath: /rootfs     # the host's root filesystem
      tolerations:         # tolerate the master taint so the pod is scheduled there too
      - key: "node-role.kubernetes.io/master"
        operator: "Exists"
        effect: "NoSchedule"
      volumes:             # host paths backing the volumeMounts above
      - name: proc
        hostPath:
          path: /proc
      - name: dev
        hostPath:
          path: /dev
      - name: sys
        hostPath:
          path: /sys
      - name: rootfs
        hostPath:
          path: /
```
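Before curling the metrics endpoint, it is worth confirming the DaemonSet actually landed on every node. A quick check (assuming the `prometheus` namespace and `name: node-exporter` label from the manifest above):

```bash
# With hostNetwork: true, each pod's IP is its node's IP and port 9100 is bound on the host
kubectl get pods -n prometheus -l name=node-exporter -o wide
```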
Scrape data through node-exporter (you can test whether metrics come back):

```bash
curl http://<node-ip>:9100/metrics | grep node_cpu_seconds   # returns the CPU metrics
```

- `# HELP` explains the meaning of the metric; here, the time the node's CPU has spent in each mode, in seconds.
- `# TYPE` states the metric's data type; here it is `counter`.
- `node_cpu_seconds_total{cpu="0",mode="idle"}` is the total CPU time consumed by the `idle` mode on cpu0. CPU time consumed only ever increases, which is exactly what the `counter` type expresses.
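Because the metric is a counter, a usable CPU-utilisation figure comes from its rate rather than its raw value. As a sketch (assuming the Prometheus server from step 2 below is already running and exposed on NodePort 30006, per the Service defined later), the same expression the NodeCPUUsage alert rule uses can be tried against the HTTP API:

```bash
# Per-instance CPU utilisation: 100 minus the idle-mode rate over 5 minutes
curl -s 'http://<node-ip>:30006/api/v1/query' \
  --data-urlencode 'query=100 - avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100'
```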
2. Installing and configuring the Prometheus server
1) Create the prometheus-config ConfigMap
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: prometheus
data:
  prometheus.yml: |
    rule_files:
    - /etc/config/rules/*.rules   # alerting rule files
    scrape_configs:
    - job_name: prometheus
      static_configs:
      - targets:
        - localhost:9090
    - job_name: kubernetes-apiservers
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - action: keep
        regex: default;kubernetes;https
        source_labels:
        - __meta_kubernetes_namespace
        - __meta_kubernetes_service_name
        - __meta_kubernetes_endpoint_port_name
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    - job_name: kubernetes-nodes-kubelet
      kubernetes_sd_configs:
      - role: node   # discover the cluster's nodes
      relabel_configs:
      # Copy every node label (.+) to a new label of the same name, keeping the value
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    - job_name: kubernetes-nodes-cadvisor
      kubernetes_sd_configs:
      - role: node
      relabel_configs:
      # Copy every node label (.+) to a new label of the same name, keeping the value
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)
      # The endpoint actually scraped is https://NodeIP:10250/metrics/cadvisor,
      # so override the default metrics URL path here
      - target_label: __metrics_path__
        replacement: /metrics/cadvisor
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        insecure_skip_verify: true
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    - job_name: kubernetes-service-endpoints
      kubernetes_sd_configs:
      - role: endpoints   # discover pods as targets through each Service's Endpoints
      relabel_configs:
      # Skip Services without the annotation prometheus.io/scrape: "true"
      - action: keep
        regex: true
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_scrape
      # Rewrite the scrape scheme from the annotation
      - action: replace
        regex: (https?)
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_scheme
        target_label: __scheme__
      # Rewrite the metrics URL path from the annotation
      - action: replace
        regex: (.+)
        source_labels:
        - __meta_kubernetes_service_annotation_prometheus_io_path
        target_label: __metrics_path__
      # Rewrite the target address from the annotation
      - action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        source_labels:
        - __address__
        - __meta_kubernetes_service_annotation_prometheus_io_port
        target_label: __address__
      # Copy every Kubernetes Service label (.+) to a new label of the same name
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      # Add a namespace label
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      # Add a Service name label
      - action: replace
        source_labels:
        - __meta_kubernetes_service_name
        target_label: kubernetes_name
    - job_name: kubernetes-pods
      kubernetes_sd_configs:
      - role: pod   # discover all pods as targets
      relabel_configs:
      # Skip pods without the annotation prometheus.io/scrape: "true"
      - action: keep
        regex: true
        source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_scrape
      # Rewrite the metrics URL path from the annotation
      - action: replace
        regex: (.+)
        source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_path
        target_label: __metrics_path__
      # Rewrite the target address from the annotation
      - action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        source_labels:
        - __address__
        - __meta_kubernetes_pod_annotation_prometheus_io_port
        target_label: __address__
      # Copy every Kubernetes pod label (.+) to a new label of the same name
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      # Add a namespace label
      - action: replace
        source_labels:
        - __meta_kubernetes_namespace
        target_label: kubernetes_namespace
      # Add a pod name label
      - action: replace
        source_labels:
        - __meta_kubernetes_pod_name
        target_label: kubernetes_pod_name
    alerting:
      alertmanagers:
      - static_configs:
        - targets: ["0.0.0.0:9093"]   # placeholder; point at the real Alertmanager address once it is deployed
```

```bash
[root@master /]# kubectl apply -f prometheus-configmap.yaml
```
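For the `kubernetes-service-endpoints` job above to scrape a Service, the Service must carry the annotations the `keep`/`replace` rules look for. A sketch using a hypothetical Service named `my-app` in the `default` namespace (name, namespace, and port are placeholders):

```bash
# Mark the hypothetical Service "my-app" for scraping; the path and port values
# rewrite __metrics_path__ and __address__ through the relabel rules above
kubectl -n default annotate service my-app \
  prometheus.io/scrape=true \
  prometheus.io/path=/metrics \
  prometheus.io/port=8080
```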
2) Create the prometheus Deployment, plus its Service and RBAC objects
```yaml
---
apiVersion: apps/v1
kind: Deployment
metadata:
  annotations: {}
  labels:
    k8s-app: prometheus
    k8s.kuboard.cn/name: prometheus
  name: prometheus
  namespace: prometheus
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      k8s-app: prometheus
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        k8s-app: prometheus
    spec:
      containers:
      - args:   # sidecar: watches the config volume and triggers a Prometheus reload on change
        - '--volume-dir=/etc/config'
        - '--webhook-url=http://localhost:9090/-/reload'
        image: 'jimmidyson/configmap-reload:v0.1'
        imagePullPolicy: IfNotPresent
        name: prometheus-server-configmap-reload
        resources:
          limits:
            cpu: 10m
            memory: 100Mi
          requests:
            cpu: 10m
            memory: 100Mi
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /etc/config
          name: config-volume
          readOnly: true
        - mountPath: /etc/localtime
          name: timezone
      - args:
        - '--config.file=/etc/config/prometheus.yml'
        - '--storage.tsdb.path=/data'
        - '--web.console.libraries=/etc/prometheus/console_libraries'
        - '--web.console.templates=/etc/prometheus/consoles'
        - '--web.enable-lifecycle'
        image: 'prom/prometheus:v2.20.0'
        imagePullPolicy: IfNotPresent
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /-/healthy
            port: 9090
            scheme: HTTP
          initialDelaySeconds: 30
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 30
        name: prometheus-server
        ports:
        - containerPort: 9090
          protocol: TCP
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /-/ready
            port: 9090
            scheme: HTTP
          initialDelaySeconds: 30
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 30
        resources:
          limits:
            cpu: 500m
            memory: 800Mi
          requests:
            cpu: 200m
            memory: 400Mi
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /etc/config
          name: config-volume
        - mountPath: /data
          name: prometheus-data
        - mountPath: /etc/config/rules
          name: prometheus-rules
        - mountPath: /etc/localtime
          name: timezone
      dnsPolicy: ClusterFirst
      initContainers:
      - command:   # the TSDB hostPath must be writable by the nobody user Prometheus runs as
        - chown
        - '-R'
        - '65534:65534'
        - /data
        image: 'busybox:latest'
        imagePullPolicy: Always
        name: init-chown-data
        resources: {}
        terminationMessagePath: /dev/termination-log
        terminationMessagePolicy: File
        volumeMounts:
        - mountPath: /data
          name: prometheus-data
      nodeName: gov-master69   # pin to the node that holds the hostPath data dir; set to your own node name
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      serviceAccount: prometheus
      serviceAccountName: prometheus
      terminationGracePeriodSeconds: 30
      volumes:
      - configMap:
          defaultMode: 420
          name: prometheus-config
        name: config-volume
      - configMap:
          defaultMode: 420
          name: prometheus-rules
        name: prometheus-rules
      - hostPath:
          path: /data/k8s-prometheus/data/
          type: DirectoryOrCreate
        name: prometheus-data
      - hostPath:
          path: /usr/share/zoneinfo/Asia/Shanghai
          type: ''
        name: timezone
---
apiVersion: v1
kind: Service
metadata:
  annotations: {}
  name: prometheus
  namespace: prometheus
spec:
  clusterIP: 10.98.247.115   # adjust for your cluster, or omit to let Kubernetes assign one
  clusterIPs:
  - 10.98.247.115
  externalTrafficPolicy: Cluster
  ipFamilies:
  - IPv4
  ipFamilyPolicy: SingleStack
  ports:
  - name: http
    nodePort: 30006
    port: 9090
    protocol: TCP
    targetPort: 9090
  selector:
    k8s-app: prometheus
  sessionAffinity: None
  type: NodePort
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  - nodes/metrics
  - services
  - endpoints
  - pods
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - get
- nonResourceURLs:
  - "/metrics"
  verbs:
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: prometheus
```

```bash
[root@master /]# kubectl apply -f prometheus-deploy.yaml
```
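A quick sanity check after the apply (`<node-ip>` is any cluster node; the NodePort 30006 comes from the Service above):

```bash
# Both containers (prometheus-server plus the configmap-reload sidecar) should be Running
kubectl get pods,svc -n prometheus
# The readiness endpoint answers once the server is up
curl -s http://<node-ip>:30006/-/ready
```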
3) Create the prometheus-rules ConfigMap (this must exist before, or together with, the Deployment; otherwise Prometheus errors at startup because it cannot find the rules files referenced in rule_files)
```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: prometheus
data:
  general.rules: |
    groups:
    - name: general.rules
      rules:
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "Instance {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
  node.rules: |
    groups:
    - name: node.rules
      rules:
      - alert: NodeFilesystemUsage
        expr: |
          100 - (node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 > 90
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }}: partition {{ $labels.mountpoint }} usage too high"
          description: "{{ $labels.instance }}: partition {{ $labels.mountpoint }} usage is above 90% (current value: {{ $value }})"
      - alert: NodeMemoryUsage
        expr: |
          100 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) /
          node_memory_MemTotal_bytes * 100 > 90
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} memory usage too high"
          description: "{{ $labels.instance }} memory usage is above 90% (current value: {{ $value }})"
      - alert: NodeCPUUsage
        expr: |
          100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) * 100) > 60
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage too high"
          description: "{{ $labels.instance }} CPU usage is above 60% (current value: {{ $value }})"
      - alert: KubeNodeNotReady
        expr: |
          kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1m
        labels:
          severity: error
        annotations:
          message: '{{ $labels.node }} has been NotReady for more than 1 minute.'
```
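After applying the rules ConfigMap, a reload makes Prometheus pick the files up; the manual reload endpoint is available because the Deployment passes `--web.enable-lifecycle`. A sketch, reusing NodePort 30006 (note the kubelet can take up to about a minute to project ConfigMap changes into the pod, so the reload may need a retry):

```bash
kubectl apply -f prometheus-rules.yaml
# Force an immediate config/rules reload instead of waiting
curl -X POST http://<node-ip>:30006/-/reload
# The groups should then show up under Status -> Rules in the UI, or via the API
curl -s http://<node-ip>:30006/api/v1/rules
```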