12. K8S集群的监控
1. Prometheus 安装1.1 主要组件Prometheus server:主服务,接受外部 http 请求,收集、存储与查询数据等Prometheus targets: 静态收集的目标服务数据Service discovery:动态发现服务Prometheus alerting:报警通知Push gateway:数据收集代理服务器(类似于 zabbix proxy)Data visual
1. Prometheus 安装
1.1 主要组件
- Prometheus server:主服务,接受外部 http 请求,收集、存储与查询数据等
- Prometheus targets: 静态收集的目标服务数据
- Service discovery:动态发现服务
- Prometheus alerting:报警通知
- Push gateway:数据收集代理服务器(类似于 zabbix proxy)
- Data visualization and export: 数据可视化与数据导出(访问客户端)
1.2 Prometheus Server安装
官方文档:https://prometheus.io/docs/prometheus/latest/installation/
下载:https://prometheus.io/download/
1.2.1 通过容器启动
docker pull prom/prometheus:v2.31.1
docker run -it -d --restart=always -p 9090:9090 prom/prometheus:v2.31.1
1.2.2 在线安装
# apt search prometheus
# apt-cache madison prometheus
prometheus | 2.15.2+ds-2 | http://mirrors.aliyun.com/ubuntu focal/universe amd64 Packages
prometheus | 2.15.2+ds-2 | http://mirrors.aliyun.com/ubuntu focal/universe Sources
#apt install prometheus
1.2.3 Operator 部署
官方部署文档:https://github.com/prometheus-operator/kube-prometheus
# git clone -b release-0.9 https://github.com/prometheus-operator/kube-prometheus.git #注意版本与k8s版本的对应关系
# cd kube-prometheus/
# kubectl apply -f manifests/setup
# kubectl apply -f manifests/
可以视情况修改manifests和manifests/setup(grep -R image: *.yaml)中yaml文件中的镜像源。
1.2.3.1 验证Prometheus
kubectl port-forward --help #在哪个node上执行就访问哪个node,临时暴露端口
kubectl --namespace monitoring port-forward --address 0.0.0.0 svc/prometheus-k8s 9090:9090
1.2.3.2 SVC暴露Prometheus
# vim manifests/prometheus-service.yaml
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 2.29.1
prometheus: k8s
name: prometheus-k8s
namespace: monitoring
spec:
type: NodePort
ports:
- name: web
port: 9090
nodePort: 39090
targetPort: web
selector:
app: prometheus
app.kubernetes.io/component: prometheus
app.kubernetes.io/name: prometheus
app.kubernetes.io/part-of: kube-prometheus
prometheus: k8s
sessionAffinity: ClientIP
# kubectl apply -f manifests/prometheus-service.yaml
service/prometheus-k8s configured
1.2.3.3 验证Grafana
# kubectl --namespace monitoring port-forward --address 0.0.0.0 svc/grafana 3000:3000
1.2.3.4 SVC暴露Grafana
# vim manifests/grafana-service.yaml
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
app.kubernetes.io/version: 8.1.1
name: grafana
namespace: monitoring
spec:
type: NodePort
ports:
- name: http
port: 3000
targetPort: http
nodePort: 33000
selector:
app.kubernetes.io/component: grafana
app.kubernetes.io/name: grafana
app.kubernetes.io/part-of: kube-prometheus
1.2.4 官方二进制部署
1.2.4.1 下载官方二进制包
# mkdir /apps
# tar xvf prometheus-2.31.1.linux-amd64.tar.gz -C /apps
# ln -sv /apps/prometheus-2.31.1.linux-amd64 /apps/prometheus
'/apps/prometheus' -> '/apps/prometheus-2.31.1.linux-amd64'
# cd /apps/prometheus
# ll
prometheus* #prometheus 服务可执行程序
prometheus.yml #配置文件
promtool* #测试工具,用于检测配置 prometheus 配置文件、检测 metrics 数据等
# ./promtool check config prometheus.yml
Checking prometheus.yml
SUCCESS: 0 rule files found
1.2.4.2 创建service文件
# vim /etc/systemd/system/prometheus.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/apps/prometheus/
ExecStart=/apps/prometheus/prometheus --config.file=/apps/prometheus/prometheus.yml
[Install]
WantedBy=multi-user.target
1.2.4.3 启动 prometheus 服务
# systemctl daemon-reload && systemctl restart prometheus && systemctl enable prometheus
1.2.4.4 安装Node_exporter
1.2.4.4.1 Node节点上安装node_exporter
https://prometheus.io/download/
# wget https://github.com/prometheus/node_exporter/releases/download/v1.2.2/node_exporter-1.2.2.linux-amd64.tar.gz
# mkdir /apps
# tar xf node_exporter-1.2.2.linux-amd64.tar.gz -C /apps
# ln -sv /apps/node_exporter-1.2.2.linux-amd64 /apps/node_exporter
1.2.4.4.2 配置node_exporter的service文件
# vim /etc/systemd/system/node-exporter.service
[Unit]
Description=Prometheus Node Exporter
After=network.target
[Service]
ExecStart=/apps/node_exporter/node_exporter --web.listen-address=":9110"
[Install]
WantedBy=multi-user.target
1.2.4.4.3 启动node_exporter服务
# systemctl daemon-reload && systemctl restart node-exporter && systemctl enable node-exporter.service
1.2.4.4.4 验证服务
1.2.4.5 配置Prometheus采集node指标数据
配置 Prometheus 通过 node exporter 采集 node 节点的监控指标数据。
# vim /apps/prometheus/prometheus.yml
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.数据收集间隔时间,如果不配置默认为一分钟
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. 规则扫描间隔时间,如果不配置默认为一分钟
# scrape_timeout is set to the global default (10s).超时时间
# Alertmanager configuration 报警通知配置
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files: #规则配置
# - "first_rules.yml"
# - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs: #数据采集目标配置
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: "prometheus"
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ["localhost:9090"]
- job_name: 'promethues-node'
static_configs:
- targets: ['172.16.244.111:9110','172.16.244.112:9110']
# systemctl restart prometheus.service
1.2.4.6 安装blackbox exporter
https://prometheus.io/download/#blackbox_exporter
blackbox_exporter 是 Prometheus 官方提供的一个 exporter,可以通过 HTTP, HTTPS, DNS, TCP 和 ICMP 对被监控节点进行监控和数据采集。
- HTTP/HTTPS:URL/API 可用性检测
- TCP:端口监听检测
- ICMP:主机存活检测
- DNS:域名解析
1.2.4.6.1 node节点上部署
# wget https://github.com/prometheus/blackbox_exporter/releases/download/v0.19.0/blackbox_exporter-0.19.0.linux-amd64.tar.gz
# tar xf /root/blackbox_exporter-0.19.0.linux-amd64.tar.gz -C /apps
# ln -sv /apps/blackbox_exporter-0.19.0.linux-amd64 /apps/blackbox_exporter
1.2.4.6.2 配置文件
# cat blackbox.yml #一般不用动,因为blackbox是通过prometheus server端来配置对哪个服务进行什么样的监控,server端来发送监控请求,client端将监控数据收集到后推送给server端,因此client端的配置模块只是定义了一些启动的模块。
modules:
http_2xx:
prober: http
http_post_2xx:
prober: http
http:
method: POST
tcp_connect:
prober: tcp
pop3s_banner:
prober: tcp
tcp:
query_response:
- expect: "^+OK"
tls: true
tls_config:
insecure_skip_verify: false
ssh_banner:
prober: tcp
tcp:
query_response:
- expect: "^SSH-2.0-"
- send: "SSH-2.0-blackbox-ssh-check"
irc_banner:
prober: tcp
tcp:
query_response:
- send: "NICK prober"
- send: "USER prober prober prober :prober"
- expect: "PING :([^ ]+)"
send: "PONG ${1}"
- expect: "^:[^ ]+ 001"
icmp:
prober: icmp
1.2.4.6.3 配置service文件
# vim /etc/systemd/system/blackbox-exporter.service
[Unit]
Description=Prometheus Blackbox Exporter
After=network.target
[Service]
Type=simple
User=root
Group=root
ExecStart=/apps/blackbox_exporter/blackbox_exporter \
--config.file=/apps/blackbox_exporter/blackbox.yml \
--web.listen-address=:9115
Restart=on-failure
[Install]
WantedBy=multi-user.target
1.2.4.6.4 启动 blackbox_exporter 并验证
systemctl daemon-reload && systemctl restart blackbox-exporter.service && systemctl enable blackbox-exporter.service
1.2.4.7 blackbox exporter 实现 URL 监控
# vim /apps/prometheus/prometheus.yml
# 网站监控
- job_name: 'http_status'
metrics_path: /probe
params:
module: [http_2xx]
static_configs:
- targets: ['http://www.xiaomi.com', 'http://www.baidu.com']
labels:
instance: http_status
group: web
relabel_configs:
- source_labels: [__address__] #relabel 通过将 __address__(当前目标地址)写入 __param_target 标签来创建一个 label。
target_label: __param_target #监控目标 www.xiaomi.com,作为__address__的 value
- source_labels: [__param_target] #监控目标
target_label: url #将监控目标与 url 创建一个 label
- target_label: __address__
replacement: 172.16.244.111:9115
# /apps/prometheus/promtool check config /apps/prometheus/prometheus.yml
Checking /apps/prometheus/prometheus.yml
SUCCESS: 0 rule files found
# systemctl restart prometheus.service
1.2.4.8 blackbox exporter 实现 ICMP 监控
# vim /apps/prometheus/prometheus.yml
# icmp 检测
- job_name: 'ping_status'
metrics_path: /probe
params:
module: [icmp]
static_configs:
- targets: ['172.31.0.2',"223.6.6.6"]
labels:
instance: 'ping_status'
group: 'icmp'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: ip #将 ip 与__param_target 创建一个 label
- target_label: __address__
replacement: 172.16.244.111:9115
# /apps/prometheus/promtool check config /apps/prometheus/prometheus.yml
Checking /apps/prometheus/prometheus.yml
SUCCESS: 0 rule files found
# systemctl restart prometheus.service
1.2.4.9 blackbox exporter 实现端口监控
# vim /apps/prometheus/prometheus.yml
# 端口监控
- job_name: 'port_status'
metrics_path: /probe
params:
module: [tcp_connect]
static_configs:
- targets: ['172.16.244.101:9100','172.16.244.132:80','172.16.244.202:6443']
labels:
instance: 'port_status'
group: 'port'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: ip
- target_label: __address__
replacement: 172.16.244.111:9115
# /apps/prometheus/promtool check config /apps/prometheus/prometheus.yml
Checking /apps/prometheus/prometheus.yml
SUCCESS: 0 rule files found
# systemctl restart prometheus.service
1.2.5 安装Grafana
https://grafana.com/docs/
grafana 是一个开源的可视化工具,可以调用 prometheus、mysql 等数据源进行更绚丽的前端可视化。
1.2.5.1 安装并启动 grafana
# sudo apt-get install -y adduser libfontconfig1
# dpkg -i grafana-enterprise_7.5.11_amd64.deb
配置文件:
# vim /etc/grafana/grafana.ini
[server]
# Protocol (http, https, socket)
protocol = http
# The ip address to bind to, empty will bind to all interfaces
http_addr = 0.0.0.0
# The http port to use
http_port = 3000
# systemctl restart grafana-server
# systemctl enable grafana-server
1.2.5.2 添加数据源
1.2.5.3 导入模板
- node exporter 模板(ID:8919)
- blackbox exporter 模板(ID:9719)
2. PromQL 语句
Prometheus 提供一个函数式的表达式语言 PromQL (Prometheus Query Language),可以使用户实时地查找和聚合时间序列数据,表达式计算结果可以在图表中展示,也可以在 Prometheus 表达式浏览器中以表格形式展示,或者作为数据源,以 HTTP API 的方式提供给外部系统使用。
node_memory_MemTotal_bytes #查询 node 节点总内存大小
node_memory_MemFree_bytes #查询 node 节点剩余可用内存
node_memory_MemTotal_bytes{instance="172.31.7.111:9100"} #查询指定节点的总内存
node_memory_MemFree_bytes{instance="172.31.7.111:9100"} #查询指定节点的可用内存
node_disk_io_time_seconds_total{device="sda"} #查询指定磁盘的每秒磁盘 io
node_filesystem_free_bytes{device="/dev/sda1",fstype="xfs",mountpoint="/"} #查看指定磁盘的磁盘剩余空间
node_load1 0.1 #CPU 1min 负载
2.1 数据类型
- 瞬时向量 (instant vector):是一组时间序列,每个时间序列包含单个数据样本,比如 node_memory_MemTotal_bytes 查询节点总内存就是一个瞬时向量,该表达式的返回值中只会包含该时间序列中的最新的一个样本值,而相应的这样的表达式称之为瞬时向量表达式。
- 范围向量(range vector):是指在任何一个时间范围内,抓取的所有度量指标数据.比如最近一天的网卡流量趋势图。
- 标量(scalar):是一个浮点数类型的数据值,使用 node_load1 获取到的是一个瞬时向量,但是可以使用内置函数 scalar() 将瞬时向量转换为标量。
- 字符串(string):字符串类型的数据,目前使用较少
2.2 匹配器
- = :选择与提供的字符串完全相同的标签。
- != :选择与提供的字符串不相同的标签。
- =~ :选择正则表达式与提供的字符串(或子字符串)相匹配的标签。
- !~ :选择正则表达式与提供的字符串(或子字符串)不匹配的标签。
#查询格式:<metric name>{<label name>=<label value>, ...}
node_load1{instance="172.16.244.100:9100"}
node_load1{job="promethues-node"}
node_load1{job="promethues-node",instance="172.16.244.100:9100"}
node_load1{job="promethues-node",instance!="172.16.244.100:9100"}
2.3 时间范围
s - 秒
m - 分钟
h - 小时
d - 天
w - 周
y - 年
node_memory_MemTotal_bytes{} # 瞬时向量表达式,选择当前最新的数据
node_memory_MemTotal_bytes{}[5m] # 区间向量表达式,选择以当前时间为基准,5 分钟内的数据
node_memory_MemTotal_bytes{instance="172.31.7.111:9100"}[5m]
2.4 运算符
+ 加法
- 减法
* 乘法
/ 除法
% 模
^ 幂运算
node_memory_MemFree_bytes/1024/1024 #将内存进行单位转换
node_disk_read_bytes_total{device="sda"} + node_disk_written_bytes_total{device="sda"} #计算磁盘每秒读写数据量
2.5 聚合运算
sum (求和)
min (最小值)
max (最大值)
avg (平均值)
stddev (标准差)
stdvar (标准方差)
count (计数)
count_values (对 value 进行计数)
bottomk (样本值最小的 k 个元素)
topk (样本值最大的 k 个元素)
quantile (分布统计)
max(node_memory_MemFree_bytes) #某个指标数据的最大值
sum(http_requests_total) #计算 http_requests_total 最近的请求总量
3. 对Pod的监控
cadvisor 由谷歌开源,cadvisor 不仅可以搜集一台机器上所有运行的容器信息,还提供基础查询界面和 http 接口,方便其他组件如 Prometheus 进行数据抓取,cAdvisor 可以对节点机器上的资源及容器进行实时监控和性能数据采集,包括 CPU 使用情况、内存使用情况、网络吞吐量及文件系统使用情况。
k8s 1.12 之前 cadvisor 集成在 node 节点上的 kubelet 服务中,从 1.12 版本开始分离为两个组件,因此需要在 node 节点单独部署 cadvisor
https://github.com/google/cadvisor
3.1 cAdvisor 镜像准备
# docker load -i cadvisor-v0.39.2.tar.gz
# docker tag gcr.io/cadvisor/cadvisor:v0.39.2 harbor.k8s.local/k8s/cadvisor:v0.39.2
# docker push harbor.k8s.local/k8s/cadvisor:v0.39.2
3.2 启动 cAdvisor容器
docker run -it -d \
--restart=always \
--volume=/:/rootfs:ro \
--volume=/var/run:/var/run:ro \
--volume=/sys:/sys:ro \
--volume=/var/lib/docker/:/var/lib/docker:ro \
--volume=/dev/disk/:/dev/disk:ro \
--publish=8080:8080 \
--detach=true \
--name=cadvisor \
--privileged \
--device=/dev/kmsg \
harbor.k8s.local/k8s/cadvisor:v0.39.2
- kubernetes 的kustomize部署
官方文档:https://github.com/google/cadvisor/tree/master/deploy/kubernetes
3.3 验证页面
3.4 Prometheus采集cAdvisor数据
# vim /apps/prometheus/prometheus.yml
- job_name: 'prometheus-containers'
static_configs:
- targets: ["172.16.244.111:8080","172.16.244.112:8080","172.16.244.113:8080"]
3.5 添加Grafana模板
容器模板ID:395 893
4. Prometheus告警
prometheus—>触发阈值—>超出持续时间—>alertmanager—>分组|抑制|静默—>媒体类型 —>邮件|钉钉|微信等。
- 分组(group): 将类似性质的警报合并为单个通知,比如网络通知、主机通知、服务通知。
- 静默(silences): 是一种简单的特定时间静音的机制,例如:服务器要升级维护可以先设置这个时间段告警静默。
- 抑制(inhibition): 当警报发出后,停止重复发送由此警报引发的其他警报,即合并一个故障引起的多个报警事件,可以消除冗余告警
4.1 下载报警组件 alertmanager
# tar xf alertmanager-0.23.0.linux-amd64.tar.gz -C /apps
# ln -sv /apps/alertmanager-0.23.0.linux-amd64 /apps/alertmanager
# vim /etc/systemd/system/alertmanager.service
[Unit]
Description=Prometheus Server
Documentation=https://prometheus.io/docs/introduction/overview/
After=network.target
[Service]
Restart=on-failure
WorkingDirectory=/apps/alertmanager
ExecStart=/apps/alertmanager/alertmanager
[Install]
WantedBy=multi-user.target
4.2 配置alertmanager
官方配置文档:https://prometheus.io/docs/alerting/configuration/
global:
smtp_from: #发件人邮箱地址
smtp_smarthost: #邮箱 smtp 地址。
smtp_auth_username: #发件人的登陆用户名,默认和发件人地址一致。
smtp_auth_password: #发件人的登陆密码,有时候是授权码。
smtp_require_tls: #是否需要 tls 协议。默认是 true。
wechat_api_url: #企业微信 API 地址。
wechat_api_secret: #企业微信 API secret
wechat_api_corp_id: #企业微信 corp id 信息。
resolve_timeout: #在指定时间内没有产生新的事件就发送恢复通知
4.2.1 配置示例
# pwd
/apps/alertmanager
# cat alertmanager.yml
global:
resolve_timeout: 5m #在指定时间内没有产生新的事件就发送恢复通知
smtp_smarthost: 'smtp.126.com:465'
smtp_from: 'xiaoyizi@126.com'
smtp_auth_username: 'xiaoyizi@126.com'
smtp_auth_password: 'TJNTDNDFLAKXOFM'
smtp_hello: '@126.com'
smtp_require_tls: false
route: #route 用来设置报警的分发策略
group_by: ['alertname'] #采用哪个标签来作为分组依据
group_wait: 10s #组告警等待时间。也就是告警产生后等待 10s,如果有同组告警一起发出
group_interval: 2s #两组告警的间隔时间
repeat_interval: 2m #重复告警的间隔时间,减少相同邮件的发送频率
receiver: 'web.hook' #设置接收人
receivers:
- name: 'web.hook'
#webhook_configs:
#- url: 'http://127.0.0.1:5001/'
email_configs:
- to: 'xiaoyizi@126.com'
inhibit_rules: #抑制的规则
- source_match: #源匹配级别,当匹配成功发出通知,但是其他的通知将被抑制
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
4.3 启动并验证
# systemctl daemon-reload && systemctl restart alertmanager && systemctl enable alertmanager
# lsof -i:9093
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
alertmana 39441 root 8u IPv6 232407 0t0 TCP *:9093 (LISTEN)
#命令行查看目前是否有告警产生
root@deploy:/apps/alertmanager# ./amtool alert --alertmanager.url=http://172.16.244.100:9093
Alertname Starts At Summary State
4.4 Prometheus 报警配置
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- 172.16.244.100:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
- "rules.yml"
4.5 创建报警规则文件
# vim /apps/prometheus/rules.yml
groups:
- name: alertmanager_pod.rules
rules:
- alert: Pod_all_cpu_usage
expr: (sum by(name)(rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 1
for: 2m
labels:
severity: critical
service: pods
annotations:
description: 容器 {{ $labels.name }} CPU 资源利用率大于 10% , (current value is {{ $value }})
summary: Dev CPU 负载告警
- alert: Pod_all_memory_usage
expr: sort_desc(avg by(name)(irate(container_memory_usage_bytes{name!=""}[5m]))*100) > 10 #内存大于 10%
#expr: sort_desc(avg by(name)(irate(node_memory_MemFree_bytes {name!=""}[5m]))) > 2 #内存大于 2G
for: 2m
labels:
severity: critical
annotations:
description: 容器 {{ $labels.name }} Memory 资源利用率大于 2G , (current value is {{ $value }})
summary: Dev Memory 负载告警
- alert: Pod_all_network_receive_usage
expr: sum by(name)(irate(container_network_receive_bytes_total{container_name="POD"}[1m])) > 1
for: 2m
labels:
severity: critical
annotations:
description: 容器 {{ $labels.name }} network_receive 资源利用率大于 50M ,(current value is {{ $value }})
- alert: node 内存可用大小
expr: node_memory_MemFree_bytes > 1 #写错,做测试
for: 2m
labels:
severity: critical
annotations:
description: 容器可用内存小于 100k
4.6 验证规则
root@deploy:/apps/prometheus# ./promtool check rules rules.yml
Checking rules.yml
SUCCESS: 4 rules found
4.7 重启Prometheus并验证规则
# systemctl restart prometheus
4.8 邮件验证
更多推荐
所有评论(0)