Deploying Prometheus on Kubernetes
1. Create the main Prometheus configuration file
configmap-main.yaml
apiVersion: v1
data:
  prometheus.yml: |-
    # my global config
    global:
      scrape_interval: 60s # Set the scrape interval to every 60 seconds. The default is every 1 minute.
      evaluation_interval: 30s # Evaluate rules every 30 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).

    # Alertmanager configuration
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - svc-alertmanager.monitoring.svc:9093

    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      - "/etc/prometheus/rules/mysql_rules.yml"
      - "/etc/prometheus/rules/general.yml"
      - "/etc/prometheus/rules/node.yml"
      - "/etc/prometheus/rules/kube-state-metrics.yml"
      # - "second_rules.yml"

    # A scrape configuration containing exactly one endpoint to scrape:
    # Here it's Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: "prometheus"
        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.
        static_configs:
          - targets: ["localhost:9090"]

      - job_name: "kube-state-metrics"
        metrics_path: "/metrics"
        static_configs:
          - targets: ["svc-kube-state-metrics.monitoring.svc:8080"]

      - job_name: "shuan-services"
        metrics_path: "/actuator/prometheus"
        file_sd_configs:
          - files: ["/etc/prometheus/target/services.yml"]
            refresh_interval: 5s

      - job_name: "shuan-mysqld-services"
        metrics_path: "/metrics"
        file_sd_configs:
          - files: ["/etc/prometheus/target/mysqld-services.yml"]
            refresh_interval: 30s

      - job_name: "shuan-redis-services"
        metrics_path: "/metrics"
        file_sd_configs:
          - files: ["/etc/prometheus/target/redis-services.yml"]
            refresh_interval: 30s

      - job_name: "shuan-nodes"
        metrics_path: "/metrics"
        file_sd_configs:
          - files: ["/etc/prometheus/target/nodes.yml"]
            refresh_interval: 30s

      - job_name: "kubernetes-apiservers"
        kubernetes_sd_configs:
          - role: endpoints
        # Default to scraping over https. If required, just disable this or change to `http`.
        scheme: https
        # This TLS & authorization config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate from the discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # If your node certificates are self-signed or use a different CA from the
          # master CA, disable certificate verification by uncommenting the line
          # below. Note that certificate verification is an integral part of a
          # secure infrastructure, so this should only be done in a controlled
          # environment.
          #
          # insecure_skip_verify: true
        authorization:
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        # Keep only the default/kubernetes service endpoints for the https port. This
        # will add targets for each API server that Kubernetes adds an endpoint to
        # the default/kubernetes service.
        relabel_configs:
          - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
            action: keep
            regex: default;kubernetes;https

      # Scrape config for nodes (kubelet). Note that this config scrapes each
      # kubelet directly on its node address rather than proxying through the
      # apiserver, so Prometheus must be able to reach the nodes.
      - job_name: "kubernetes-nodes"
        # Default to scraping over https. If required, just disable this or change to `http`.
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # Kubelet serving certificates are usually self-signed, so certificate
          # verification is disabled here. Only do this in a controlled environment.
          insecure_skip_verify: true
        authorization:
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)

      - job_name: "kubernetes-cadvisor"
        scheme: https
        # Starting with Kubernetes 1.7.3 the cAdvisor metrics are exposed under
        # /metrics/cadvisor. The Kubernetes CIS Benchmark recommends against enabling
        # the insecure HTTP servers of Kubernetes, therefore the cAdvisor metrics on
        # the secure handler are used.
        metrics_path: /metrics/cadvisor
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # Same certificate-verification caveat as in the kubernetes-nodes job above.
          insecure_skip_verify: true
        authorization:
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)

    remote_write:
      - url: "http://svc-influxdb.monitoring.svc:8086/api/v1/prom/write?db=prometheus&u=admin&p=123456"
    remote_read:
      - url: "http://svc-influxdb.monitoring.svc:8086/api/v1/prom/read?db=prometheus&u=admin&p=123456"
kind: ConfigMap
metadata:
  name: prometheus-conf
  namespace: monitoring
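The four file_sd_configs jobs above read their targets from files under /etc/prometheus/target, which is backed by the file-sd-confs volume defined in statefulset.yaml below, so targets can be added or removed without rebuilding the image. As a sketch, a target file in Prometheus's file_sd format could look like the following; the addresses and labels here are placeholders, not part of the original setup:
services.yml (hypothetical example)
- targets:
    - "svc-demo-app.default.svc:8080"
  labels:
    env: "prod"
    service: "demo-app"
Prometheus re-reads each file every refresh_interval, so edits take effect without a reload.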
2. Create the Prometheus alerting rule files
configmap-rule.yaml
apiVersion: v1
data:
  general.yml: |-
    groups:
      - name: general
        rules:
          # Alert for any instance that is unreachable for >5 minutes.
          - alert: InstanceDown
            expr: up == 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Instance {{ $labels.instance }} down"
              description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
  kube-state-metrics.yml: |-
    groups:
      - name: kube-state-metrics
        rules:
          # Alert names must be valid identifiers (letters, digits, underscores,
          # colons); names containing spaces fail to load.
          - alert: PodRestart
            expr: changes(kube_pod_container_status_restarts_total{job="kube-state-metrics",namespace!~"^kube.*"}[30m]) == 1
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Pod restart"
              description: "{{ $labels.pod }} has restarted"
          - alert: StatefulSetReplicasMismatch
            expr: ( kube_statefulset_status_replicas_ready{job="kube-state-metrics",namespace!~"^kube.*"} != kube_statefulset_status_replicas{job="kube-state-metrics",namespace!~"^kube.*"} ) and ( changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace!~"^kube.*"}[5m]) == 0 )
            for: 15m
            labels:
              severity: warning
            annotations:
              summary: "StatefulSetReplicasMismatch"
              description: "Ready replicas of {{ $labels.statefulset }} do not match the desired replicas"
          - alert: KubePodCrashLooping
            expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics",namespace!~"^kube.*"}[15m]) * 60 * 5 > 0
            for: 15m
            labels:
              severity: warning
            annotations:
              summary: "KubePodCrashLooping"
              description: "{{ $labels.pod }} is crash looping"
  mysql_rules.yml: |-
    groups:
      - name: mysql
        rules:
          - alert: MysqlTooManyThreadsConnected
            expr: mysql_global_status_threads_connected > 180
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "mysql_global_status_threads_connected is more than 180 on {{ $labels.instance }}"
              description: "{{ $labels.instance }} has too many connected threads (current value: {{ $value }})"
          - alert: MysqlMaxUsedConnectionsHigh
            expr: mysql_global_status_max_used_connections > mysql_global_variables_max_connections * 0.85
            for: 1m
            labels:
              severity: warning
            annotations:
              summary: "Instance {{ $labels.instance }} has used more than 85% of its max connections limit"
  node.yml: |-
    groups:
      - name: node
        rules:
          - alert: HighCpuUsage
            expr: ((1 - sum(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) / sum(rate(node_cpu_seconds_total[5m])) by (instance)) * 100) > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High CPU usage on {{ $labels.instance }}"
              description: "CPU of {{ $labels.instance }} is more than 80% used"
          - alert: HighCpuIowait
            expr: (avg by (instance) (irate(node_cpu_seconds_total{mode='iowait',job=~"shuan-nodes"}[5m])) * 100) > 20
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "High CPU iowait on {{ $labels.instance }}"
              description: "CPU iowait of {{ $labels.instance }} is above 20%, current value: {{ $value }}%"
          - alert: HighMemoryUsage
            expr: ((1 - (node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes) / node_memory_MemTotal_bytes) * 100) > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High memory usage on {{ $labels.instance }}"
              description: "Memory of {{ $labels.instance }} is more than 80% used"
          - alert: HighDiskUsage
            expr: ((1 - node_filesystem_avail_bytes{fstype=~"ext4|xfs",job="shuan-nodes"} / node_filesystem_size_bytes{fstype=~"ext4|xfs",job="shuan-nodes"}) * 100) > 80
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High disk usage of {{ $labels.mountpoint }} on {{ $labels.instance }}"
              description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} is more than 80% used"
          - alert: HighDiskIo
            expr: (avg(irate(node_disk_io_time_seconds_total{job="shuan-nodes"}[5m])) by (instance) * 100) > 90
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High disk I/O on {{ $labels.instance }}"
              description: "Disk I/O utilization of {{ $labels.instance }} is above 90%, current value: {{ $value }}"
          - alert: HostDiskReadRate
            expr: sum by (instance) (rate(node_disk_read_bytes_total[5m])) / 1024 / 1024 > 50
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Disk read rate too high"
              description: "Disk read rate of {{ $labels.instance }} exceeds 50MB/s, current rate: {{ $value }}MB/s"
          - alert: HostDiskWriteRate
            expr: sum by (instance) (rate(node_disk_written_bytes_total[5m])) / 1024 / 1024 > 50
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Disk write rate too high"
              description: "Disk write rate of {{ $labels.instance }} exceeds 50MB/s, current rate: {{ $value }}MB/s"
          # Aggregating by (instance, device) keeps the device label available to
          # the description templates below.
          - alert: HighNetworkReceive
            expr: (sum by (instance, device) (rate(node_network_receive_bytes_total{device=~'ens.*'}[5m])) / 1024 / 1024) > 20
            for: 10m
            labels:
              severity: warning
            annotations:
              summary: "High network receive on {{ $labels.instance }}"
              description: "Network receive on {{ $labels.device }} of {{ $labels.instance }} is above 20MB/s, current value: {{ $value }}MB/s"
          - alert: HighNetworkTransmit
            expr: (sum by (instance, device) (rate(node_network_transmit_bytes_total{device=~'ens.*'}[5m])) / 1024 / 1024) > 20
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "High network transmit on {{ $labels.instance }}"
              description: "Network transmit on {{ $labels.device }} of {{ $labels.instance }} is above 20MB/s, current value: {{ $value }}MB/s"
kind: ConfigMap
metadata:
  name: prometheus-rules-conf
  namespace: monitoring
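It is worth validating the rule files before loading them into the ConfigMap. A quick check, assuming you have the promtool binary that ships with Prometheus and have saved the four files locally under the same names:
promtool check rules general.yml kube-state-metrics.yml mysql_rules.yml node.yml
This catches syntax errors and invalid alert names before Prometheus refuses to start.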
3. Create the Prometheus RBAC file
rbac.yaml
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/metrics
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups:
      - extensions
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring
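After applying rbac.yaml, the granted permissions can be spot-checked by impersonating the ServiceAccount with kubectl auth can-i:
kubectl auth can-i list pods --as=system:serviceaccount:monitoring:prometheus
kubectl auth can-i get /metrics --as=system:serviceaccount:monitoring:prometheus
Both commands should print yes once the ClusterRoleBinding is in place.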
4. Create the Prometheus workload file
statefulset.yaml
apiVersion: apps/v1
kind: StatefulSet
metadata:
  labels:
    app: prometheus
  name: prometheus
  namespace: monitoring
spec:
  serviceName: prometheus
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      volumes:
        - name: host-time
          hostPath:
            path: /etc/localtime
        - name: prometheus-conf
          configMap:
            name: prometheus-conf
            defaultMode: 0760
        - name: prometheus-rules-conf
          configMap:
            name: prometheus-rules-conf
            defaultMode: 0760
      imagePullSecrets:
        - name: shu-cn
      securityContext: # run the container as the root user
        runAsUser: 0
      serviceAccountName: prometheus # bind the account granted read access to the cluster; see rbac.yaml
      containers:
        - image: shu.cn/prometheus:v2.37.5
          name: prometheus
          args: # extra flags passed to the container at startup
            - "--config.file=/etc/prometheus/prometheus.yml" # prometheus.yml is mounted via a volume
            - "--storage.tsdb.path=/prometheus" # local TSDB directory; mount a volume here if data should survive pod rescheduling
            - "--storage.tsdb.retention.time=5d" # keep local data for 5 days
            - "--web.enable-admin-api" # enable the admin HTTP API, which includes features such as deleting time series
            - "--web.enable-lifecycle" # enable hot reload: POST to localhost:9090/-/reload to apply config changes immediately
          imagePullPolicy: IfNotPresent
          resources:
            requests:
              cpu: 100m
              memory: 1Gi
            limits:
              cpu: 500m
              memory: 2Gi
          ports:
            - containerPort: 9090
              name: tcp
          volumeMounts:
            - name: host-time
              mountPath: /etc/localtime
            - name: prometheus-conf
              mountPath: /etc/prometheus/prometheus.yml
              subPath: prometheus.yml
            - name: file-sd-confs
              mountPath: /etc/prometheus/target
              subPath: target
            - name: prometheus-rules-conf
              mountPath: /etc/prometheus/rules
  volumeClaimTemplates:
    - metadata:
        name: file-sd-confs
      spec:
        storageClassName: "nfs"
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 1Gi
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: svc-prometheus
  name: svc-prometheus
  namespace: monitoring
spec:
  selector:
    app: prometheus
  type: NodePort
  ports:
    - port: 9090
      protocol: TCP
      targetPort: 9090
      nodePort: 38090
      name: tcp
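Because the container starts with --web.enable-lifecycle, configuration changes can be applied without restarting the pod: update the ConfigMap, wait for the kubelet to sync the mounted files (this can take up to a minute), then POST to the reload endpoint. A sketch using the NodePort exposed above, with <node-IP> as a placeholder for a real node address:
kubectl apply -f configmap-main.yaml
curl -X POST http://<node-IP>:38090/-/reload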
5. Deploy
kubectl apply -f rbac.yaml
kubectl apply -f configmap-rule.yaml
kubectl apply -f configmap-main.yaml
kubectl apply -f statefulset.yaml
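A quick sanity check after applying (the pod name prometheus-0 follows from the StatefulSet name and ordinal):
kubectl -n monitoring get statefulset,pods,svc
kubectl -n monitoring logs prometheus-0 | tail
The log should include a line like "Server is ready to receive web requests" once startup completes.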
6. Access Prometheus
http://<node-IP>:38090
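Besides the web UI, the HTTP query API can confirm that targets are being scraped; for example (again replacing <node-IP> with any node address):
curl 'http://<node-IP>:38090/api/v1/query?query=up'
Each target from the scrape configs should appear with the value 1 once its exporter is reachable.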
7. Reference links for related components
Grafana for Prometheus
Deploying node_exporter for Prometheus
DingTalk alerting for Prometheus on k8s
kube-state-metrics for Prometheus on k8s
Deploying InfluxDB 1.8 for Prometheus