[Kubernetes] Deploying a Complete Monitoring and Alerting Stack on k8s
Notes: prometheus, alertmanager, grafana, dingtalk, and node_exporter are all deployed on k8s
k8s version: v1.20.15
All monitored applications are started with docker and are running normally
1. YAML Preparation
alertmanager-all-in-one.yaml
apiVersion: v1
data:
config.yml: |
global:
resolve_timeout: 30s
route:
group_by: ['docker', 'node', 'prometheus', 'instance']
group_interval: 30s
group_wait: 5m
repeat_interval: 1h
receiver: webhook
routes:
- match:
severity: 'Critical'
receiver: 'webhook'
- match_re:
severity: ^(Warning|Disaster)$
receiver: 'webhook'
receivers:
- name: 'webhook'
webhook_configs:
- url: http://dingtalk.prom-test.svc.cluster.local:8060/dingtalk/webhook/send
send_resolved: true
kind: ConfigMap
metadata:
name: alertmanager
namespace: prom-test
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: alertmanager
namespace: prom-test
labels:
app: alertmanager
spec:
selector:
matchLabels:
app: alertmanager
template:
metadata:
labels:
app: alertmanager
spec:
nodeName: node-5
volumes:
- name: config
configMap:
name: alertmanager
containers:
- name: alertmanager
image: prom/alertmanager
imagePullPolicy: IfNotPresent
args:
- "--config.file=/etc/alertmanager/config.yml"
- "--log.level=debug"
ports:
- containerPort: 9093
name: http
volumeMounts:
- mountPath: "/etc/alertmanager"
name: config
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 2
memory: 2Gi
---
apiVersion: v1
kind: Service
metadata:
name: alertmanager
namespace: prom-test
spec:
type: NodePort
ports:
- name: web
port: 9093
targetPort: http
nodePort: 30333
selector:
app: alertmanager
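Once this manifest is applied, the Alertmanager instance can be sanity-checked through the NodePort exposed above. A small sketch (replace <node-ip> with any node's address; amtool is an optional extra tool, not part of the manifests):

# Health check through the NodePort defined in the Service above; expect HTTP 200
curl -i http://<node-ip>:30333/-/healthy
# Optional: validate the Alertmanager configuration file locally before applying the ConfigMap
amtool check-config config.yml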
prometheus-all-in-one.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-config
namespace: prom-test
data:
prometheus.yml: |
global:
scrape_interval: 15s
evaluation_interval: 15s
# alertmanager
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager:9093
# Alert rule files (relative path)
rule_files:
- "*-rule.yml"
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
- job_name: 'coredns'
static_configs:
- targets: ['10.96.0.10:9153']
- job_name: 'kubernetes-apiserver'
static_configs:
- targets: ['10.96.0.1']
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: 'kubernetes-sd-node-exporter'
kubernetes_sd_configs:
- role: node
relabel_configs:
- source_labels: [__address__]
regex: '(.*):10250'
replacement: '${1}:9100'
target_label: __address__
action: replace
- source_labels: [__address__]
regex: '(.*):(9.*)'
replacement: '$1'
target_label: ip
action: replace
- job_name: 'kubernetes-sd-cadvisor'
kubernetes_sd_configs:
- role: node
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- target_label: __address__
replacement: 10.96.0.1
- source_labels: [__meta_kubernetes_node_name]
regex: (.+)
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
- job_name: "huaweicloud-linux"
static_configs:
- targets: ["xxx.xxx.xx.xx:9100","xxx.xxx.xx.xx:9100"]
- job_name: "icmp_ping"
metrics_path: /probe
params:
module: [icmp] # use the icmp module
static_configs:
- targets: ["xxx.xxx.xx.xx"]
relabel_configs:
- source_labels: [__address__]
regex: (.*)(:80)?
target_label: __param_target
replacement: ${1}
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
regex: (.*)
target_label: ping
replacement: ${1}
- source_labels: []
regex: .*
target_label: __address__
replacement: xxx.xxx.xx.xx:9115
- job_name: "http_get_status"
metrics_path: /probe
params:
module: [http_2xx] # Look for a HTTP 200 response.
static_configs:
- targets:
- "https://xxx.xxx.com.cn:{port}"
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: "xxx.xxx.x.xx:9115" # 指向实际的Blackbox exporter.
- target_label: region
replacement: "xxx.xxx.x.xx:9115"
- job_name: 'tcp_port_status'
metrics_path: /probe
params:
module: [tcp_connect]
static_configs:
- targets: ["https://xxx.xxx.com.cn:xxxx","xxx.xxx.xx.xx:{port}","xxx.xxx.xx.xx:{port}"]
labels:
instance: 'port_status'
group: 'tcp'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: xxx.xxx.xx.xx:9115
- job_name: "nacos-server"
metrics_path: '/nacos/actuator/prometheus'
static_configs:
- targets: ["xxx.xxx.xx.xx:8848"]
- job_name: 'Rabbitmq'
scrape_interval: 5s
static_configs:
- targets:
- xxx.xxx.xx.xx:9419
labels:
instance: RabbitMQ-xxx.xxx.xx.xx
- job_name: 'redis'
static_configs:
- targets: ['xxx.xxx.xx.xx:9121']
- job_name: 'mysql-service'
static_configs:
- targets: ["xxx.xxx.xx.xx:9104"]
node-rule.yml: |
groups:
- name: node
rules:
- alert: CPU_High
expr: floor((1 - (sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by (instance) / sum(increase(node_cpu_seconds_total[1m])) by (instance))) * 100) > 1
for: 3s
labels:
status: Critical
annotations:
summary: "in prometheus {{ $labels.instance }}: High CPU usage"
description: "{{ $labels.instance }} of job {{ $labels.job }} CPU usage is {{ $value }}%"
- alert: Free_High
expr: floor(100 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100) > 0
for: 3s
labels:
status: Critical
annotations:
summary: "in prometheus {{ $labels.instance }}: High Free usage"
description: "{{ $labels.instance }} of job {{ $labels.job }} Free usage is {{ $value }}%"
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: prometheus
namespace: prom-test
labels:
app: prometheus
spec:
selector:
matchLabels:
app: prometheus
template:
metadata:
labels:
app: prometheus
spec:
serviceAccountName: prometheus # reference the ServiceAccount defined below
initContainers:
- name: "change-permission-of-directory"
image: busybox:latest
command: ["/bin/sh"]
args: ["-c", "chown -R 65534:65534 /prometheus"]
securityContext:
privileged: true
volumeMounts:
- mountPath: "/etc/prometheus"
name: config-volume
- mountPath: "/prometheus" # 挂载到容器的路径,挂载点
name: data
containers:
- image: prometheus:v2.33.2
name: prometheus
args:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus" # 指定tsdb数据路径
- "--web.enable-lifecycle" # 支持热更新,直接执行localhost:9090/-/reload立即生效
- "--web.console.libraries=/usr/share/prometheus/console_libraries"
- "--web.console.templates=/usr/share/prometheus/consoles"
ports:
- containerPort: 9090
name: http
volumeMounts:
- mountPath: "/etc/prometheus"
name: config-volume
- mountPath: "/prometheus"
name: data
resources:
requests:
cpu: 200m
memory: 512Mi
limits:
cpu: 4
memory: 4Gi
volumes:
- name: data
persistentVolumeClaim:
claimName: prometheus-pvc
- configMap:
name: prometheus-config
name: config-volume
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus
namespace: prom-test
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups:
- ""
resources:
- nodes
- services
- endpoints
- pods
- nodes/proxy
verbs:
- get
- list
- watch
- apiGroups:
- "extensions"
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- configmaps
- nodes/metrics
verbs:
- get
- nonResourceURLs: # non-resource URLs; here, the endpoint used to scrape metrics
- /metrics
verbs:
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus
namespace: prom-test
---
apiVersion: v1
kind: Service
metadata:
name: prometheus
namespace: prom-test
labels:
app: prometheus
spec:
selector:
app: prometheus
type: NodePort
ports:
- name: web
port: 9090
targetPort: http
nodePort: 30650
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: prometheus-pvc
namespace: prom-test
#annotations:
#volume.beta.kubernetes.io/storage-class: "nfs"
spec:
storageClassName: huaweicloud-nfs # must name the StorageClass this PVC binds to
accessModes:
- ReadWriteMany
resources:
requests:
storage: 10Gi # adjust to your environment
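Because the Deployment passes --web.enable-lifecycle, changes to the ConfigMap (scrape jobs or alert rules) can be picked up without restarting the pod. A minimal sketch using the NodePort 30650 defined above; note that kubelet may take up to a minute to project the updated ConfigMap into the pod:

# Re-apply the updated ConfigMap, then trigger a hot reload of Prometheus
kubectl apply -f prometheus-all-in-one.yaml
curl -X POST http://<node-ip>:30650/-/reload
# Optional: validate the rendered config first (promtool ships with the official Prometheus image)
kubectl -n prom-test exec $(kubectl -n prom-test get pod -l app=prometheus -o name) -- \
  promtool check config /etc/prometheus/prometheus.yml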
grafana-all-in-one.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: grafana
namespace: prom-test
spec:
selector:
matchLabels:
app: grafana
template:
metadata:
labels:
app: grafana
spec:
volumes:
- name: storage
persistentVolumeClaim:
claimName: grafana-pvc
securityContext:
runAsUser: 0
containers:
- name: grafana
image: grafana:8.5.0
imagePullPolicy: IfNotPresent
ports:
- containerPort: 3000
name: grafana
env:
- name: GF_SECURITY_ADMIN_USER
value: admin
- name: GF_SECURITY_ADMIN_PASSWORD
value: admin@123
readinessProbe:
failureThreshold: 10
httpGet:
path: /api/health
port: 3000
scheme: HTTP
initialDelaySeconds: 60
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 30
# livenessProbe:
# failureThreshold: 3
# httpGet:
# path: /api/health
# port: 3000
# scheme: HTTP
# periodSeconds: 10
# successThreshold: 1
# timeoutSeconds: 1
resources:
limits:
cpu: 500m
memory: 512Mi
requests:
cpu: 150m
memory: 512Mi
volumeMounts:
- mountPath: /var/lib/grafana
name: storage
---
apiVersion: v1
kind: Service
metadata:
name: grafana
namespace: prom-test
spec:
type: ClusterIP
ports:
- port: 3000
selector:
app: grafana
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: grafana-pvc
namespace: prom-test
#annotations:
#volume.beta.kubernetes.io/storage-class: "nfs"
spec:
storageClassName: huaweicloud-nfs
accessModes:
- ReadWriteMany
resources:
requests:
storage: 10Gi
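After Grafana starts, the in-cluster Prometheus service can be added as a data source either in the UI or through Grafana's HTTP API. A sketch, assuming you port-forward the service locally and use the admin credentials set in the Deployment above:

# Forward the Grafana service locally, then register Prometheus as a data source via the API
kubectl -n prom-test port-forward svc/grafana 3000:3000 &
curl -s -u admin:admin@123 -H "Content-Type: application/json" \
  -X POST http://localhost:3000/api/datasources \
  -d '{"name":"Prometheus","type":"prometheus","url":"http://prometheus.prom-test.svc.cluster.local:9090","access":"proxy","isDefault":true}'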
dingtalk-all-in-one.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: dingtalk-config
namespace: prom-test
data:
config.yml: |-
templates:
- /etc/prometheus-webhook-dingtalk/dingding.tmpl
targets:
webhook:
# token from the robot created in DingTalk; replace it with your own
url: https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxxx
mention:
all: true
dingding.tmpl: |-
{{ define "dingtalk.to.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= **监控告警** =========
**告警集群:** k8s
**告警类型:** {{ $alert.Labels.alertname }}
**告警级别:** {{ $alert.Labels.severity }}
**告警状态:** {{ .Status }}
**故障主机:** {{ $alert.Labels.instance }} {{ $alert.Labels.device }}
**告警主题:** {{ .Annotations.summary }}
**告警详情:** {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
**主机标签:** {{ range .Labels.SortedPairs }} </br> [{{ .Name }}: {{ .Value | markdown | html }} ]
{{- end }} </br>
**故障时间:** {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = **end** = =========
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
========= **故障恢复** =========
**告警集群:** k8s
**告警主题:** {{ $alert.Annotations.summary }}
**告警主机:** {{ .Labels.instance }}
**告警类型:** {{ .Labels.alertname }}
**告警级别:** {{ $alert.Labels.severity }}
**告警状态:** {{ .Status }}
**告警详情:** {{ $alert.Annotations.message }}{{ $alert.Annotations.description}}
**故障时间:** {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
**恢复时间:** {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = **end** = =========
{{- end }}
{{- end }}
{{- end }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: dingtalk
namespace: prom-test
spec:
replicas: 1
selector:
matchLabels:
app: dingtalk
template:
metadata:
name: dingtalk
labels:
app: dingtalk
spec:
nodeName: node-5
containers:
- name: dingtalk
image: timonwong/prometheus-webhook-dingtalk
imagePullPolicy: IfNotPresent
ports:
- containerPort: 8060
volumeMounts:
- name: config
mountPath: /etc/prometheus-webhook-dingtalk
volumes:
- name: config
configMap:
name: dingtalk-config
---
apiVersion: v1
kind: Service
metadata:
name: dingtalk
namespace: prom-test
labels:
app: dingtalk
annotations:
prometheus.io/scrape: 'false'
spec:
selector:
app: dingtalk
ports:
- name: dingtalk
port: 8060
protocol: TCP
targetPort: 8060
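Before wiring Alertmanager to it, the webhook endpoint can be exercised directly with a hand-crafted, Alertmanager-style payload (the alert below is made up purely for testing); if everything is in order, a message should appear in the DingTalk group:

# Forward the dingtalk service locally and post a fake alert to the webhook target
kubectl -n prom-test port-forward svc/dingtalk 8060:8060 &
curl -s -H "Content-Type: application/json" -X POST http://localhost:8060/dingtalk/webhook/send \
  -d '{"status":"firing","alerts":[{"status":"firing","labels":{"alertname":"TestAlert","severity":"Critical","instance":"test-host"},"annotations":{"summary":"webhook connectivity test"},"startsAt":"2023-09-01T00:00:00Z"}]}'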
node-exporter.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: node-exporter
namespace: prom-test
labels:
app: node-exporter
spec:
selector:
matchLabels:
app: node-exporter
template:
metadata:
labels:
app: node-exporter
spec:
hostPID: true
hostIPC: true
hostNetwork: true
nodeSelector:
kubernetes.io/os: linux
containers:
- name: node-exporter
image: node-exporter:v1.3.0
# The containerd config sets state = "/data/containerd/run", so data/containerd/ is excluded here
# docker's default directory is /var/lib/docker/
args:
- --web.listen-address=$(HOSTIP):9100
- --path.procfs=/host/proc
- --path.sysfs=/host/sys
- --path.rootfs=/host/root
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|data/containerd/|var/lib/docker/.+)($|/)
- --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
ports:
- containerPort: 9100
env:
- name: HOSTIP
valueFrom:
fieldRef:
fieldPath: status.hostIP
resources:
requests:
cpu: 500m
memory: 200Mi
limits:
cpu: 500m
memory: 200Mi
securityContext:
runAsNonRoot: true
runAsUser: 65534
volumeMounts:
- name: proc
mountPath: /host/proc
- name: sys
mountPath: /host/sys
- name: root
mountPath: /host/root
mountPropagation: HostToContainer
readOnly: true
# master nodes carry taints, but they need to be monitored too; tolerating all taints lets the DaemonSet schedule an exporter on the masters as well
tolerations:
- operator: "Exists"
volumes:
- name: proc
hostPath:
path: /proc
- name: dev
hostPath:
path: /dev
- name: sys
hostPath:
path: /sys
- name: root
hostPath:
path: /
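Because the DaemonSet runs with hostNetwork: true and binds to each node's host IP, the exporters can be verified directly against any node:

# node_exporter listens on the host network, port 9100, of every node
curl -s http://<node-ip>:9100/metrics | grep -m 5 node_cpu_seconds_total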
2. Deployment
Note: the Prometheus ConfigMap also contains scrape jobs for targets outside the k8s cluster (mysql, redis, and so on), and the webhook url in the dingtalk ConfigMap must be replaced with your own. You can deploy prometheus, alertmanager, grafana, dingtalk, and node_exporter first and adjust the configuration to your own needs afterwards.
Here all the YAML files are placed in the same directory.
# First create a namespace named prom-test (or change it to your own)
[root@master /home/yaml/prometheus-all]# kubectl create ns prom-test
# Deploy everything
[root@master /home/yaml/prometheus-all]# kubectl apply -f .
......
# Check the pods in the namespace; all of them should be Running
[root@master /home/yaml/prometheus-all]# kubectl get po -n prom-test
NAME READY STATUS RESTARTS AGE
alertmanager-64977b58cc-bslk7 1/1 Running 0 2d4h
dingtalk-68698c87b5-rnck8 1/1 Running 0 2d4h
grafana-79647d8956-8j9cq 1/1 Running 0 8d
node-exporter-478z9 1/1 Running 0 9d
node-exporter-6nrhl 1/1 Running 0 9d
node-exporter-94v9c 1/1 Running 0 9d
node-exporter-9z55c 1/1 Running 0 9d
node-exporter-bpm95 1/1 Running 0 9d
node-exporter-rpjnl 1/1 Running 0 9d
prometheus-56744b95c7-p8kjx 1/1 Running 0 2d4h
# Check the services in the namespace
[root@master /home/yaml/prometheus-all]# kubectl get svc -n prom-test
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
alertmanager NodePort 10.99.144.191 <none> 9093:30333/TCP 2d4h
dingtalk ClusterIP 10.101.207.48 <none> 8060/TCP 2d4h
grafana NodePort 10.102.61.131 <none> 3000:31430/TCP 8d
prometheus NodePort 10.103.228.196 <none> 9090:30650/TCP 2d4h
# Except for dingtalk, any service whose TYPE is not NodePort can be changed with the following command
kubectl patch svc grafana -n prom-test -p '{"spec": {"type": "NodePort"}}'
Each dashboard is reachable at {k8s-master ip}:{NodePort}.
3. Exporter Deployment
blackbox_exporter
# Download the release package
curl -LO https://github.com/prometheus/blackbox_exporter/releases/download/v0.22.0/blackbox_exporter-0.22.0.linux-amd64.tar.gz
tar xf blackbox_exporter-0.22.0.linux-amd64.tar.gz -C /usr/local/
ln -sv /usr/local/blackbox_exporter-0.22.0.linux-amd64 /usr/local/blackbox_exporter
# Start in the background
cd /usr/local/blackbox_exporter
./blackbox_exporter &
# Verify that metrics are being exposed
[root@master /usr/local/blackbox_exporter-0.22.0.linux-amd64]# curl localhost:9115/metrics
# HELP blackbox_exporter_build_info A metric with a constant '1' value labeled by version, revision, branch, and goversion from which blackbox_exporter was built.
# TYPE blackbox_exporter_build_info gauge
blackbox_exporter_build_info{branch="HEAD",goversion="go1.18.5",revision="0bbd65d1264722f7afb87a72ec4128b9214e5840",version="0.22.0"} 1
# HELP blackbox_exporter_config_last_reload_success_timestamp_seconds Timestamp of the last successful configuration reload.
# TYPE blackbox_exporter_config_last_reload_success_timestamp_seconds gauge
blackbox_exporter_config_last_reload_success_timestamp_seconds 1.6945018367674305e+09
# HELP blackbox_exporter_config_last_reload_successful Blackbox exporter config loaded successfully.
# TYPE blackbox_exporter_config_last_reload_successful gauge
blackbox_exporter_config_last_reload_successful 1
# HELP blackbox_module_unknown_total Count of unknown modules requested by probes
......
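Starting the exporter with & is fine for a quick test, but it will not survive a crash or a reboot. An optional sketch of a systemd unit for the same binary (paths match the symlink created above):

# Optional: manage blackbox_exporter with systemd instead of a backgrounded shell job
cat > /etc/systemd/system/blackbox_exporter.service <<'EOF'
[Unit]
Description=Blackbox Exporter
After=network.target

[Service]
ExecStart=/usr/local/blackbox_exporter/blackbox_exporter --config.file=/usr/local/blackbox_exporter/blackbox.yml
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload && systemctl enable --now blackbox_exporter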
# Now the various probe jobs can be added to the Prometheus configuration
# ICMP probe: checks whether hosts are alive
- job_name: "icmp_ping"
metrics_path: /probe
params:
module: [icmp] # use the icmp module
static_configs:
- targets: ["xxx.xxx.xx.xx"]
relabel_configs:
- source_labels: [__address__]
regex: (.*)(:80)?
target_label: __param_target
replacement: ${1}
- source_labels: [__param_target]
target_label: instance
- source_labels: [__param_target]
regex: (.*)
target_label: ping
replacement: ${1}
- source_labels: []
regex: .*
target_label: __address__
replacement: xxx.xxx.xx.xx:9115
# HTTP probe
- job_name: "http_get_status"
metrics_path: /probe
params:
module: [http_2xx] # Look for a HTTP 200 response.
static_configs:
- targets:
- "https://xxx.xxx.com.cn:xxxx"
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: "xxx.xxx.xx.xx:9115" # 指向实际的Blackbox exporter.
- target_label: region
replacement: "xxx.xxx.xx.xx:9115"
# TCP port probe
- job_name: 'tcp_port_status'
metrics_path: /probe
params:
module: [tcp_connect]
static_configs:
- targets: ["https://xxx.xxx.com.cn:xxxx","xxx.xxx.xx.xx:xxxx","xxx.xxx.xx.xx:xxxx"]
labels:
instance: 'port_status'
group: 'tcp'
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
replacement: xxx.xxx.xx.xx:9115
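Each module can also be exercised against blackbox_exporter directly before reloading Prometheus, which makes it easy to tell probe problems from relabeling problems; probe_success should be 1 for a healthy target:

# Manually probe a target with a given module and check the probe_success metric
curl -s 'http://localhost:9115/probe?module=icmp&target=xxx.xxx.xx.xx' | grep probe_success
curl -s 'http://localhost:9115/probe?module=http_2xx&target=https://xxx.xxx.com.cn:{port}' | grep probe_success
curl -s 'http://localhost:9115/probe?module=tcp_connect&target=xxx.xxx.xx.xx:{port}' | grep probe_success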
mysqld_exporter
# Download the release package yourself
# Extract it
tar -zvxf mysqld_exporter-0.14.0.linux-amd64.tar.gz
cd mysqld_exporter-0.14.0.linux-amd64
# Configure my.cnf
vim my.cnf
[client]
host=127.0.0.1
port=3306
user=exporter
password=xxxxxxxxxxxxxx
# Grant database privileges (run inside MySQL)
CREATE USER 'exporter'@'%' IDENTIFIED BY 'xxxxxxxxxxxxxx' WITH MAX_USER_CONNECTIONS 3;
GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'%';
flush privileges;
# Start the exporter
./mysqld_exporter --config.my-cnf=my.cnf &
# Verify that metrics are being exposed
[root@data-server ~]# curl localhost:9104/metrics
# HELP go_gc_duration_seconds A summary of the GC invocation durations.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 4.0662e-05
go_gc_duration_seconds{quantile="0.25"} 4.0662e-05
go_gc_duration_seconds{quantile="0.5"} 4.0662e-05
go_gc_duration_seconds{quantile="0.75"} 4.0662e-05
go_gc_duration_seconds{quantile="1"} 4.0662e-05
go_gc_duration_seconds_sum 4.0662e-05
go_gc_duration_seconds_count 1
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 8
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
......
# prometheus.yml configuration
- job_name: 'mysql-service'
static_configs:
- targets: ["xxx.xxx.xx.xx:9104"]
redis_exporter
Package downloaded from GitHub: redis_exporter-v1.44.0.linux-amd64.tar.gz
# Extract the package
tar -zxvf redis_exporter-v1.44.0.linux-amd64.tar.gz
# Start the exporter
./redis_exporter -redis.addr 127.0.0.1:6379 -redis.password xxxxxxxxxxx -web.listen-address 127.0.0.1:9121
# Verify that metrics are being exposed
curl localhost:9121/metrics
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 9.382e-06
go_gc_duration_seconds{quantile="0.25"} 2.6133e-05
go_gc_duration_seconds{quantile="0.5"} 3.2812e-05
go_gc_duration_seconds{quantile="0.75"} 6.6195e-05
go_gc_duration_seconds{quantile="1"} 0.000299789
go_gc_duration_seconds_sum 0.195450594
go_gc_duration_seconds_count 4333
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 8
......
# prometheus.yml configuration
- job_name: 'redis'
static_configs:
- targets: ['192.168.x.xxx:9121']
rabbitmq_exporter
Download the package yourself: rabbitmq_exporter_1.0.0-RC19_linux_amd64.tar.gz
# Extract
tar -zxvf rabbitmq_exporter_1.0.0-RC19_linux_amd64.tar.gz
cd rabbitmq_exporter
# Edit the configuration file (the # notes below are explanatory only and must be removed from the real file, since JSON does not allow comments)
vim config.json
{
"rabbit_url": "http://127.0.0.1:15672", # 自己的rabbitmq地址
"rabbit_user": "admin", # 自己的rabbitmq用户
"rabbit_pass": "xxxxxxxxxxxxx", # 自己的rabbitmq密码
"publish_port": "9419",
"publish_addr": "",
"output_format": "TTY",
"ca_file": "ca.pem",
"cert_file": "client-cert.pem",
"key_file": "client-key.pem",
"insecure_skip_verify": false,
"exlude_metrics": [],
"include_queues": ".*",
"skip_queues": "^$",
"skip_vhost": "^$",
"include_vhost": ".*",
"rabbit_capabilities": "no_sort,bert",
"enabled_exporters": [
"exchange",
"node",
"overview",
"queue"
],
"timeout": 30,
"max_queues": 0
}
# Start the exporter
./rabbitmq_exporter -config-file=/usr/local/rabbitmq_exporter/config.json
# Verify that metrics are being exposed
curl localhost:9419/metrics
# HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 3.6221e-05
go_gc_duration_seconds{quantile="0.25"} 5.6432e-05
go_gc_duration_seconds{quantile="0.5"} 6.2474e-05
go_gc_duration_seconds{quantile="0.75"} 7.4399e-05
go_gc_duration_seconds{quantile="1"} 0.000156892
go_gc_duration_seconds_sum 3.721871902
go_gc_duration_seconds_count 50404
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 12
# HELP go_info Information about the Go environment.
# TYPE go_info gauge
go_info{version="go1.18.4"} 1
......
# prometheus.yml configuration
- job_name: 'Rabbitmq'
scrape_interval: 5s
static_configs:
- targets:
- xxx.xxx.xx.xx:9419
labels:
instance: RabbitMQ-xxx.xxx.xx.xx
Nacos monitoring
This assumes the Nacos environment is already set up.
# Expose metrics in application.properties; set it to prometheus to expose only the Prometheus endpoint, or to * to expose everything
management.endpoints.web.exposure.include=prometheus
# Verify that metrics are being exposed
curl localhost:8848/nacos/actuator/prometheus
# HELP process_start_time_seconds Start time of the process since unix epoch.
# TYPE process_start_time_seconds gauge
process_start_time_seconds 1.693794903445E9
# HELP system_load_average_1m The sum of the number of runnable entities queued to available processors and the number of runnable entities running on the available processors averaged over a period of time
# TYPE system_load_average_1m gauge
system_load_average_1m 0.0
# HELP process_files_open_files The open file descriptor count
# TYPE process_files_open_files gauge
process_files_open_files 281.0
......
# prometheus.yml configuration
- job_name: "nacos-server"
metrics_path: '/nacos/actuator/prometheus'
static_configs:
- targets: ["xxx.xxx.xx.xx:8848"]
kafka_exporter
This assumes Kafka is already deployed (started with docker here).
# Download and extract kafka_exporter
wget https://github.com/danielqsj/kafka_exporter/releases/download/v1.2.0/kafka_exporter-1.2.0.linux-amd64.tar.gz
tar -zxvf kafka_exporter-1.2.0.linux-amd64.tar.gz
cd kafka_exporter-1.2.0.linux-amd64
# Start the exporter
./kafka_exporter --kafka.server=xxx.xxx.xx.xx:9092 &
# Verify that metrics are being exposed
curl localhost:9308/metrics
# HELP go_gc_duration_seconds A summary of the GC invocation durations.
# TYPE go_gc_duration_seconds summary
go_gc_duration_seconds{quantile="0"} 0
go_gc_duration_seconds{quantile="0.25"} 0
go_gc_duration_seconds{quantile="0.5"} 0
go_gc_duration_seconds{quantile="0.75"} 0
go_gc_duration_seconds{quantile="1"} 0
go_gc_duration_seconds_sum 0
go_gc_duration_seconds_count 0
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines 11
# HELP go_memstats_alloc_bytes Number of bytes allocated and still in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes 3.083232e+06
# HELP go_memstats_alloc_bytes_total Total number of bytes allocated, even if freed.
# TYPE go_memstats_alloc_bytes_total counter
go_memstats_alloc_bytes_total 3.100496e+06
......
# prometheus.yml configuration
- job_name: "kafka"
static_configs:
- targets: ['xxx.xxx.xx.xx:9308']
4. Miscellaneous
(Screenshot: Prometheus targets page)
(Screenshot: Grafana Prometheus data source)
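Besides the web UI, the scrape targets can also be inspected from the command line through Prometheus' HTTP API; a small sketch, assuming jq is available:

# List every active scrape target with its job, instance, and health state
curl -s http://<node-ip>:30650/api/v1/targets | \
  jq '.data.activeTargets[] | {job: .labels.job, instance: .labels.instance, health: .health}'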