k8s监控
wget https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml 。配置参数 --kubelet-insecure-tls 表示不验证 Kubelet 提供的服务证书的 CA(仅用于测试目的),生产环境请配置证书。
k8s监控 Prometheus + kube-state-metrics + Grafana 部署
★为注意事项
★yaml 文件用存储类动态供应 PVC,请根据各自环境修改。★★Grafana 官网的插件和仪表盘面板里的有些指标已被弃用,需自行修改 Prometheus 查询语句,或使用与所用指标版本对应的 exporter 即可
先来个建个命名空间监控的部署都在这个空间
# cat namespace.yaml
# Namespace that holds every monitoring component deployed below.
apiVersion: v1
kind: Namespace
metadata:
  name: prometheus
# kubectl apply -f namespace.yaml
namespace/prometheus created
部署node-exporter(获取节点指标)
# kubectl apply -f node-exporter.yaml
daemonset.apps/node-exporter created
# cat node-exporter.yaml
# DaemonSet: one node-exporter pod per node, exposing node metrics on :9100.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: "node-exporter"
  name: node-exporter
  namespace: prometheus
spec:
  selector:
    matchLabels:
      app: "node-exporter"
  template:
    metadata:
      labels:
        app: "node-exporter"
    spec:
      # Host namespaces so the exporter sees the real node, not the pod sandbox.
      hostNetwork: true
      hostPID: true
      hostIPC: true
      containers:
        - name: prometheus-node-exporter
          image: "prom/node-exporter:v1.0.1"
          imagePullPolicy: "IfNotPresent"
          args:
            - --path.procfs=/host/proc
            - --path.sysfs=/host/sys
            # FIX: the double quotes previously embedded in this value were
            # passed verbatim to the binary (Kubernetes args get no shell
            # quote-stripping), so the regex contained literal '"' characters
            # and never matched. The quotes are removed.
            - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|host|etc)($|/)
          ports:
            - name: metrics
              containerPort: 9100
          resources:
            limits:
              memory: "300Mi"
              cpu: 300m
            requests:
              memory: "100Mi"
              cpu: 100m
          volumeMounts:
            - name: dev
              mountPath: /host/dev
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
            - name: rootfs
              mountPath: /rootfs
      volumes:
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: dev
          hostPath:
            path: /dev
        - name: rootfs
          hostPath:
            path: /
部署alertmanager,后续报警使用—★告警分组和接受方式根据需求自行配置,实验只做简单的-所有报警发往邮箱
# kubectl apply -f alertmanager.yaml
configmap/prometheus-alertmanager created
persistentvolumeclaim/prometheus-alertmanager created
service/prometheus-alertmanager created
deployment.apps/prometheus-alertmanager created
# cat alertmanager.yaml
# Alertmanager configuration: single email receiver via QQ SMTP.
apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    app: alertmanager
  name: prometheus-alertmanager
  namespace: prometheus
data:
  alertmanager.yml: |
    global:
      smtp_smarthost: 'smtp.qq.com:465'
      smtp_from: '发件人@qq.com'
      smtp_auth_username: '发件人@qq.com'
      smtp_auth_password: '邮箱生成的密码'
      smtp_require_tls: false
      resolve_timeout: 5m
    # NOTE(review): 10s group/repeat intervals are test-only values and will
    # re-send the same alert every 10 seconds — raise them (e.g. repeat_interval: 4h)
    # before production use.
    route:
      group_by: ['alertname']
      group_interval: 10s
      group_wait: 10s
      receiver: 'mail'
      repeat_interval: 10s
    receivers:
      - name: 'mail'
        email_configs:
          - to: '502369651@qq.com'
            send_resolved: true
---
# Persistent storage for Alertmanager state (silences, notification log),
# dynamically provisioned by the 'nfs' StorageClass.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    app: alertmanager
  name: prometheus-alertmanager
  namespace: prometheus
spec:
  storageClassName: nfs
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: "2Gi"
---
# NodePort Service exposing the Alertmanager web UI (port 9093).
apiVersion: v1
kind: Service
metadata:
  labels:
    app: alertmanager
  name: prometheus-alertmanager
  namespace: prometheus
spec:
  type: NodePort
  ports:
    - name: http
      port: 9093
      protocol: TCP
      targetPort: 9093
  selector:
    app: alertmanager
---
# Alertmanager Deployment: main container plus a configmap-reload sidecar.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: alertmanager
  name: prometheus-alertmanager
  namespace: prometheus
spec:
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      containers:
        - name: prometheus-alertmanager
          image: "quay.io/prometheus/alertmanager:v0.21.0"
          imagePullPolicy: "IfNotPresent"
          env:
            # Pod IP injected via downward API for the cluster advertise address.
            - name: POD_IP
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.podIP
          args:
            - --config.file=/etc/config/alertmanager.yml
            - --storage.path=/data
            - --cluster.advertise-address=$(POD_IP):9094
            # NOTE(review): external-url is localhost, so links in alert emails
            # will not resolve from outside — replace with the NodePort URL if needed.
            - --web.external-url=http://localhost:9093
          ports:
            - containerPort: 9093
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9093
            initialDelaySeconds: 30
            timeoutSeconds: 30
          resources:
            limits:
              memory: "256Mi"
              cpu: 100m
            requests:
              memory: "256Mi"
              cpu: 100m
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config
            - name: storage-volume
              mountPath: "/data"
        # Sidecar: watches /etc/config and POSTs /-/reload on ConfigMap changes.
        - name: prometheus-alertmanager-configmap-reload
          image: "jimmidyson/configmap-reload:v0.4.0"
          imagePullPolicy: "IfNotPresent"
          args:
            - --volume-dir=/etc/config
            - --webhook-url=http://127.0.0.1:9093/-/reload
          resources:
            limits:
              memory: "100Mi"
              cpu: 100m
            requests:
              memory: "20Mi"
              cpu: 10m
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config
              readOnly: true
      volumes:
        - name: config-volume
          configMap:
            name: prometheus-alertmanager
        - name: storage-volume
          persistentVolumeClaim:
            claimName: prometheus-alertmanager
部署kube-state-metrics (它侦听Kubernetes API服务器并生成有关对象状态的指标)
# kubectl apply -f kube-state-metrics.yaml
serviceaccount/kube-state-metrics created
clusterrole.rbac.authorization.k8s.io/kube-state-metrics unchanged
clusterrolebinding.rbac.authorization.k8s.io/kube-state-metrics unchanged
service/kube-state-metrics created
deployment.apps/kube-state-metrics created
# cat kube-state-metrics.yaml
# ServiceAccount under which kube-state-metrics talks to the API server.
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    app: kube-state-metrics
  name: kube-state-metrics
  namespace: prometheus
---
# Read-only (list/watch) access to the object kinds kube-state-metrics exports.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    app: kube-state-metrics
  name: kube-state-metrics
rules:
  - apiGroups: [""]
    resources:
      - configmaps
      - secrets
      - nodes
      - pods
      - services
      - resourcequotas
      - replicationcontrollers
      - limitranges
      - persistentvolumeclaims
      - persistentvolumes
      - namespaces
      - endpoints
    verbs: ["list", "watch"]
  # NOTE(review): the 'extensions' group no longer serves these resources on
  # k8s >= 1.16; the rule is kept for older clusters and is harmless elsewhere.
  - apiGroups: ["extensions"]
    resources: ["daemonsets", "deployments", "replicasets"]
    verbs: ["list", "watch"]
  - apiGroups: ["apps"]
    resources: ["statefulsets", "daemonsets", "deployments", "replicasets"]
    verbs: ["list", "watch"]
  - apiGroups: ["batch"]
    resources: ["cronjobs", "jobs"]
    verbs: ["list", "watch"]
  - apiGroups: ["autoscaling"]
    resources: ["horizontalpodautoscalers"]
    verbs: ["list", "watch"]
  - apiGroups: ["authentication.k8s.io"]
    resources: ["tokenreviews"]
    verbs: ["create"]
  - apiGroups: ["authorization.k8s.io"]
    resources: ["subjectaccessreviews"]
    verbs: ["create"]
  - apiGroups: ["policy"]
    resources: ["poddisruptionbudgets"]
    verbs: ["list", "watch"]
  - apiGroups: ["certificates.k8s.io"]
    resources: ["certificatesigningrequests"]
    verbs: ["list", "watch"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses", "volumeattachments"]
    verbs: ["list", "watch"]
  - apiGroups: ["admissionregistration.k8s.io"]
    resources: ["mutatingwebhookconfigurations", "validatingwebhookconfigurations"]
    verbs: ["list", "watch"]
  - apiGroups: ["networking.k8s.io"]
    resources: ["networkpolicies", "ingresses"]
    verbs: ["list", "watch"]
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    verbs: ["list", "watch"]
---
# Binds the ClusterRole above to the kube-state-metrics ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    app: kube-state-metrics
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
  - kind: ServiceAccount
    name: kube-state-metrics
    namespace: prometheus
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    # Makes this Service discoverable by the annotation-driven
    # 'kubernetes-service-endpoints' scrape job.
    prometheus.io/scrape: 'true'
  labels:
    app: kube-state-metrics
  name: kube-state-metrics
  namespace: prometheus
spec:
  type: NodePort
  ports:
    - name: http-metrics  # Kubernetes object metrics (8080)
      port: 8080
      targetPort: http-metrics
    - name: telemetry  # kube-state-metrics self metrics (8081)
      port: 8081
      targetPort: telemetry
  selector:
    app: kube-state-metrics
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: kube-state-metrics
  name: kube-state-metrics
  namespace: prometheus
spec:
  selector:
    matchLabels:
      app: kube-state-metrics
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
        - name: kube-state-metrics
          image: quay.io/coreos/kube-state-metrics:v2.0.0-alpha.3
          ports:
            - containerPort: 8080
              name: http-metrics
            - containerPort: 8081
              name: telemetry
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8080
          # Readiness is checked against the telemetry port.
          readinessProbe:
            httpGet:
              path: /
              port: 8081
            initialDelaySeconds: 5
            timeoutSeconds: 5
部署完以上可根据svc访问相应web
node-exporter通过节点Ip+9100端口访问web查看指标
alertmanager 告警web
kube-state-metrics访问web查看指标 8080是k8s对象指标 8081是它自身的指标
部署prometheus-server…
# kubectl apply -f prometheus-server.yaml
serviceaccount/prometheus-server created
configmap/prometheus-server created
persistentvolumeclaim/prometheus-server created
clusterrole.rbac.authorization.k8s.io/prometheus-server unchanged
clusterrolebinding.rbac.authorization.k8s.io/prometheus-server unchanged
service/prometheus-server created
deployment.apps/prometheus-server created
# cat prometheus-server.yaml
# ServiceAccount used by the Prometheus server for API-server discovery/scraping.
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    app: prometheus-server
  name: prometheus-server
  namespace: prometheus
---
# Prometheus configuration; the rule files start as empty placeholders and
# are filled in later with `kubectl edit`.
apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    app: prometheus-server
  name: prometheus-server
  namespace: prometheus
data:
  alerting_rules.yml: |
    {}
  recording_rules.yml: |
    {}
  prometheus.yml: |
    global:
      evaluation_interval: 10s
      scrape_interval: 10s
      scrape_timeout: 5s
    rule_files:
      - /etc/config/recording_rules.yml
      - /etc/config/alerting_rules.yml
    scrape_configs:
      - job_name: prometheus
        static_configs:
          - targets: ['localhost:9090']
---
# TSDB storage for Prometheus, dynamically provisioned by the 'nfs' StorageClass.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    app: prometheus-server
  name: prometheus-server
  namespace: prometheus
spec:
  storageClassName: nfs
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: "10Gi"
---
# Read access Prometheus needs for service discovery and scraping.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    app: prometheus-server
  name: prometheus-server
rules:
  # NOTE(review): 'ingresses' is not a core-group ("") resource; that entry is
  # inert here — ingress access is actually granted by the next rule.
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - nodes/metrics
      - services
      - endpoints
      - pods
      - ingresses
      - configmaps
    verbs: ["get", "list", "watch"]
  - apiGroups: ["extensions", "networking.k8s.io"]
    resources: ["ingresses/status", "ingresses"]
    verbs: ["get", "list", "watch"]
  # Allows scraping non-resource endpoints such as the API server's /metrics.
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# Binds the ClusterRole above to the prometheus-server ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    app: prometheus-server
  name: prometheus-server
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-server
subjects:
  - kind: ServiceAccount
    name: prometheus-server
    namespace: prometheus
---
# NodePort Service exposing the Prometheus web UI (port 9090).
apiVersion: v1
kind: Service
metadata:
  labels:
    app: prometheus-server
  name: prometheus-server
  namespace: prometheus
spec:
  type: NodePort
  ports:
    - name: http
      port: 9090
      protocol: TCP
      targetPort: 9090
  selector:
    app: prometheus-server
---
# Prometheus server Deployment: reload sidecar plus the server container.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: prometheus-server
  name: prometheus-server
  namespace: prometheus
spec:
  selector:
    matchLabels:
      app: prometheus-server
  template:
    metadata:
      labels:
        app: prometheus-server
    spec:
      serviceAccountName: prometheus-server
      containers:
        # Sidecar: watches /etc/config and POSTs /-/reload on ConfigMap changes.
        - name: prometheus-server-configmap-reload
          image: "jimmidyson/configmap-reload:v0.4.0"
          imagePullPolicy: "IfNotPresent"
          args:
            - --volume-dir=/etc/config
            - --webhook-url=http://127.0.0.1:9090/-/reload
          resources:
            limits:
              memory: "100Mi"
              cpu: 100m
            requests:
              memory: "20Mi"
              cpu: 10m
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config
              readOnly: true
        - name: prometheus-server
          image: "prom/prometheus:v2.22.2"
          imagePullPolicy: "IfNotPresent"
          args:
            - --storage.tsdb.retention.time=15d
            - --config.file=/etc/config/prometheus.yml
            - --storage.tsdb.path=/data
            - --web.console.libraries=/etc/prometheus/console_libraries
            - --web.console.templates=/etc/prometheus/consoles
            # Required so the sidecar's /-/reload webhook is accepted.
            - --web.enable-lifecycle
          ports:
            - containerPort: 9090
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
          resources:
            limits:
              memory: "1024Mi"
              cpu: 500m
            requests:
              memory: "100Mi"
              cpu: 100m
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config
            - name: storage-volume
              mountPath: /data
              subPath: ""
      volumes:
        - name: config-volume
          configMap:
            name: prometheus-server
        - name: storage-volume
          persistentVolumeClaim:
            claimName: prometheus-server
★★此时普罗米修斯只收集自身目标的指标
★★需知k8s组件提供指标端口的。前提是要开放以下端口,有些是默认端口有些则需自己设置开放
scheduler端口为10251,ControllerManager端口为10252,kube-proxy端口为10249,etcd端口为2379,kubelet端口为10255。kubelet还会在 /metrics/cadvisor、/metrics/resource 和 /metrics/probes 端点中公开度量值。
例如以下etcd 通过访问查看都有哪些指标curl --cert /etc/kubernetes/pki/etcd/server.crt --key /etc/kubernetes/pki/etcd/server.key --cacert /etc/kubernetes/pki/etcd/ca.crt https://192.168.133.10:2379/metrics。k8s集群内使用curl https带上客户端证书、私钥、根证书双向认证 又或者curl --cert --key -k https://xxxx/xxx 单向认证
# kubectl get configmaps -n prometheus
NAME DATA AGE
prometheus-alertmanager 1 44m
prometheus-server 3 5m46s
# kubectl edit configmaps prometheus-server -n prometheus
……
apiVersion: v1
data:
  # FIX: a comment after a block-scalar indicator needs a space before '#';
  # the original '|#...' form glues the text onto the header and is invalid YAML.
  alerting_rules.yml: | # alerting rules file
    {}
  prometheus.yml: | # main Prometheus configuration
    global:
      evaluation_interval: 10s
      scrape_interval: 10s
      scrape_timeout: 5s
    rule_files:
      - /etc/config/recording_rules.yml
      - /etc/config/alerting_rules.yml
    scrape_configs:
      - job_name: prometheus
        static_configs:
          - targets: ['localhost:9090']
  recording_rules.yml: | # recording rules file
    {}
……
开始修改configmaps对象普罗米修斯配置文件收集其他目标
……
# FIX(review): every inline comment below previously had no space before '#'
# (e.g. "job_name: prometheus#..."). Without a preceding space, '#...' is part
# of the YAML scalar value, not a comment — each one has been separated.
alerting_rules.yml: | # test alert
  groups:
    - name: test
      rules:
        - alert: ceshi
          # Fires when node memory usage (from node-exporter) exceeds 5%.
          expr: (node_memory_MemTotal_bytes-(node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes))/node_memory_MemTotal_bytes*100>5
          for: 20s
          labels:
            test: ceshi
          annotations:
            # FIX: the target label is 'instance', not 'isinstance'.
            summary: "{{ $labels.instance }}"
            # FIX: text said ">20%" while the expression threshold is 5%.
            description: "{{ $labels.instance }} >5% {{ $value }}"
recording_rules.yml: |
  {}
prometheus.yml: |
  global:
    evaluation_interval: 10s
    scrape_interval: 10s
    scrape_timeout: 5s
  rule_files:
    - /etc/config/recording_rules.yml
    - /etc/config/alerting_rules.yml
  alerting:  # deliver alerts to Alertmanager, discovered via pod SD
    alertmanagers:
      - kubernetes_sd_configs:
          - role: pod
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: keep
            source_labels: [__meta_kubernetes_pod_label_app]
            regex: alertmanager
          - action: keep
            source_labels: [__meta_kubernetes_namespace]
            regex: prometheus
          # Empty regex matches only the empty string: drops pod ports that
          # carry no container port number.
          - action: drop
            source_labels: [__meta_kubernetes_pod_container_port_number]
            regex:
  scrape_configs:
    - job_name: prometheus  # Prometheus' own metrics
      static_configs:
        - targets: ['localhost:9090']
    - job_name: kube-etcd  # etcd metrics over mTLS
      static_configs:
        - targets: ['192.168.133.10:2379','192.168.133.11:2379','192.168.133.12:2379']
      scheme: https
      # NOTE(review): these etcd client-cert paths are not mounted by the
      # prometheus-server Deployment as written — mount them from a Secret,
      # otherwise this scrape will fail. TODO confirm.
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/etcd/ca.crt
        cert_file: /var/run/secrets/kubernetes.io/etcd/server.crt
        key_file: /var/run/secrets/kubernetes.io/etcd/server.key
    - job_name: kube-schedule  # kube-scheduler metrics (10251)
      static_configs:
        - targets: ['192.168.133.10:10251','192.168.133.11:10251','192.168.133.12:10251']
    - job_name: 'kube-control-manager'  # kube-controller-manager metrics (10252)
      static_configs:
        - targets: ['192.168.133.10:10252','192.168.133.11:10252','192.168.133.12:10252']
    - job_name: 'kube-kubelet'  # kubelet /metrics on read-only port 10255
      kubernetes_sd_configs:
        - role: node
      relabel_configs:
        - source_labels: [__address__]
          regex: '(.*):10250'
          replacement: '${1}:10255'
          target_label: __address__
          action: replace
    - job_name: 'kube-kubelet-resource'  # kubelet /metrics/resource
      kubernetes_sd_configs:
        - role: node
      metrics_path: /metrics/resource
      relabel_configs:
        - source_labels: [__address__]
          regex: '(.*):10250'
          replacement: '${1}:10255'
          target_label: __address__
          action: replace
    - job_name: 'kube-kubelet-probes'  # kubelet /metrics/probes
      kubernetes_sd_configs:
        - role: node
      metrics_path: /metrics/probes
      relabel_configs:
        - source_labels: [__address__]
          regex: '(.*):10250'
          replacement: '${1}:10255'
          target_label: __address__
          action: replace
    - job_name: kubernetes-cadvisor  # container metrics via API-server proxy
      kubernetes_sd_configs:
        - role: node
      relabel_configs:
        - action: labelmap
          regex: __meta_kubernetes_node_label_(.+)
        - replacement: kubernetes.default.svc:443
          target_label: __address__
        - regex: (.+)
          replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
          source_labels:
            - __meta_kubernetes_node_name
          target_label: __metrics_path__
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    - job_name: 'kube-proxy'  # kube-proxy metrics (10249)
      kubernetes_sd_configs:
        - role: node
      relabel_configs:
        - source_labels: [__address__]
          regex: '(.*):10250'
          replacement: '${1}:10249'
          target_label: __address__
          action: replace
    - job_name: kubernetes-apiservers  # API server metrics
      kubernetes_sd_configs:
        - role: endpoints
      relabel_configs:
        - action: keep
          regex: default;kubernetes;https
          source_labels:
            - __meta_kubernetes_namespace
            - __meta_kubernetes_service_name
            - __meta_kubernetes_endpoint_port_name
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
    - job_name: node-exporter  # node exporter on every node (9100)
      kubernetes_sd_configs:
        - role: node
      relabel_configs:
        - source_labels: [__address__]
          regex: '(.*):10250'
          replacement: '${1}:9100'
          target_label: __address__
          action: replace
    # Annotation-driven scraping; covers e.g. DNS and kube-state-metrics.
    - job_name: kubernetes-service-endpoints
      kubernetes_sd_configs:
        - role: endpoints
      relabel_configs:
        - action: keep
          regex: true
          source_labels:
            - __meta_kubernetes_service_annotation_prometheus_io_scrape
        - action: replace
          regex: (https?)
          source_labels:
            - __meta_kubernetes_service_annotation_prometheus_io_scheme
          target_label: __scheme__
        - action: replace
          regex: (.+)
          source_labels:
            - __meta_kubernetes_service_annotation_prometheus_io_path
          target_label: __metrics_path__
        - action: replace
          regex: ([^:]+)(?::\d+)?;(\d+)
          replacement: $1:$2
          source_labels:
            - __address__
            - __meta_kubernetes_service_annotation_prometheus_io_port
          target_label: __address__
        - action: labelmap
          regex: __meta_kubernetes_service_label_(.+)
        - action: replace
          source_labels:
            - __meta_kubernetes_namespace
          target_label: kubernetes_namespace
        - action: replace
          source_labels:
            - __meta_kubernetes_service_name
          target_label: kubernetes_name
        - action: replace
          source_labels:
            - __meta_kubernetes_pod_node_name
          target_label: kubernetes_node
……
结果示例:
部署grafana
# kubectl apply -f grafana.yaml
persistentvolumeclaim/grafana created
service/grafana created
deployment.apps/grafana created
# cat grafana.yaml
# Persistent storage for Grafana data (/var/lib/grafana).
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  labels:
    app: grafana
  name: grafana
  namespace: prometheus
spec:
  storageClassName: nfs
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: "5Gi"
---
# NodePort Service exposing the Grafana web UI (port 3000).
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: prometheus
  labels:
    app: grafana
spec:
  type: NodePort
  ports:
    - name: http
      port: 3000
      targetPort: 3000
  selector:
    app: grafana
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: prometheus
  labels:
    app: grafana
spec:
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
        - name: grafana
          image: grafana/grafana:7.3.2
          imagePullPolicy: "IfNotPresent"
          ports:
            - containerPort: 3000
          resources:
            limits:
              cpu: 100m
              memory: 100Mi
            requests:
              cpu: 100m
              memory: 100Mi
          readinessProbe:
            httpGet:
              path: /login
              port: 3000
          volumeMounts:
            # Dashboards, plugins, and the SQLite DB survive pod restarts.
            - name: storage-volume
              mountPath: /var/lib/grafana
      volumes:
        - name: storage-volume
          persistentVolumeClaim:
            claimName: grafana
# kubectl get pod -n prometheus
NAME READY STATUS RESTARTS AGE
grafana-7bcdc55776-g7s8j 1/1 Running 0 82s
kube-state-metrics-948d5fb47-ww2t9 1/1 Running 0 10h
node-exporter-9w7fl 1/1 Running 0 11h
node-exporter-flxn4 1/1 Running 0 11h
node-exporter-zdnf2 1/1 Running 0 11h
node-exporter-zmdjz 1/1 Running 0 11h
prometheus-alertmanager-664fdcb8fd-2kpr5 2/2 Running 0 11h
prometheus-server-7fffd7b6df-jxdp5 2/2 Running 0 2m5s
示例 登录grafana添加数据源
网址输入IP加端口 我这里输入的是SVC名称+端口,使用K8S-DNS解析。K8S创建的SVC和Pod资源都会有对应的DNS条目的。
★★★★★头有点铁部署镜像的都是新版… grafana很多仪表盘面板、插件里查询语句中的指标有些都弃用了或者版本兼容有些框不适应。。。。。。适当修改查询语句或者换版本镜像即可
更多推荐
所有评论(0)