Prometheus: monitoring an external Kubernetes cluster
Obtain a Kubernetes token and write it to the k8s-token.conf file; Prometheus uses it to connect to the external cluster. Note: in this lab, Prometheus is deployed from a binary.
node-exporter
vim node-exporter-ds.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: prom
  labels:
    app: prometheus
    component: node-exporter
spec:
  selector:
    matchLabels:
      app: prometheus
      component: node-exporter
  template:
    metadata:
      name: prometheus-node-exporter
      labels:
        app: prometheus
        component: node-exporter
    spec:
      tolerations:
      - key: node-role.kubernetes.io/etcd
        effect: NoExecute
        operator: "Exists"
      - key: node-role.kubernetes.io/controlplane
        effect: NoSchedule
        operator: "Exists"
      containers:
      - image: prom/node-exporter:v0.18.1
        name: prometheus-node-exporter
        ports:
        - name: prom-node-exp
          containerPort: 9100
          hostPort: 9100
      hostNetwork: true
      hostPID: true
vim node-exporter-svc.yaml
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: prometheus-node-exporter
  namespace: prom
  labels:
    app: prometheus
    component: node-exporter
spec:
  clusterIP: None
  ports:
  - name: prometheus-node-exporter
    port: 9100
    protocol: TCP
  selector:
    app: prometheus
    component: node-exporter
  type: ClusterIP
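Apply both manifests and confirm each node answers on hostPort 9100; the curl address below is one of the node IPs used later in prometheus.yml:
kubectl apply -f node-exporter-ds.yaml -f node-exporter-svc.yaml
kubectl -n prom get pods -o wide
curl -s http://10.10.95.21:9100/metrics | head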
kube-state-metrics
vim kube-state-metrics-rbac.yaml
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: prom
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
- apiGroups: [""]
  resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
  verbs: ["list", "watch"]
- apiGroups: ["extensions"]
  resources: ["daemonsets", "deployments", "replicasets"]
  verbs: ["list", "watch"]
# kube-state-metrics v2.x lists workloads through the apps API group,
# so grant the same workload resources there as well.
- apiGroups: ["apps"]
  resources: ["statefulsets", "daemonsets", "deployments", "replicasets"]
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources: ["cronjobs", "jobs"]
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources: ["horizontalpodautoscalers"]
  verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: prom
vim kube-state-metrics-deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: prom
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kube-state-metrics
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        image: harbor.com.cn/library/kube-state-metrics:2.4.2
        # image: gcr.io/google-containers/kube-state-metrics-amd64:v1.9.5
        ports:
        - containerPort: 8080
vim kube-state-metrics-svc.yaml
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: kube-state-metrics
  namespace: prom
  labels:
    app: kube-state-metrics
spec:
  ports:
  - name: kube-state-metrics
    port: 8080
    protocol: TCP
  selector:
    app: kube-state-metrics
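Apply the three manifests and spot-check the metrics endpoint; the in-cluster DNS name below assumes the usual service.namespace form and is reachable from any pod:
kubectl apply -f kube-state-metrics-rbac.yaml -f kube-state-metrics-deploy.yaml -f kube-state-metrics-svc.yaml
kubectl -n prom get pods,svc
curl -s http://kube-state-metrics.prom:8080/metrics | head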
blackbox
vim cm.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    app: blackbox-exporter
  name: blackbox-exporter
  namespace: kube-system
data:
  blackbox.yml: |-
    modules:
      http_2xx:
        prober: http
        timeout: 2s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2"]
          valid_status_codes: [200,301,302]
          method: GET
          preferred_ip_protocol: "ip4"
      tcp_connect:
        prober: tcp
        timeout: 2s
vim dp.yaml
kind: Deployment
apiVersion: apps/v1
metadata:
  name: blackbox-exporter
  namespace: kube-system
  labels:
    app: blackbox-exporter
  annotations:
    deployment.kubernetes.io/revision: "1"
spec:
  replicas: 1
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      volumes:
      - name: config
        configMap:
          name: blackbox-exporter
          defaultMode: 420
      containers:
      - name: blackbox-exporter
        image: harbor.com.cn/library/blackbox-exporter:v0.15.1
        imagePullPolicy: IfNotPresent
        args:
        - --config.file=/etc/blackbox_exporter/blackbox.yml
        - --log.level=info
        - --web.listen-address=:9115
        ports:
        - name: blackbox-port
          containerPort: 9115
          protocol: TCP
        resources:
          limits:
            cpu: 500m
            memory: 512Mi
          requests:
            cpu: 200m
            memory: 256Mi
        volumeMounts:
        - name: config
          mountPath: /etc/blackbox_exporter
        readinessProbe:
          tcpSocket:
            port: 9115
          initialDelaySeconds: 5
          timeoutSeconds: 5
          periodSeconds: 10
          successThreshold: 1
          failureThreshold: 3
vim svc.yaml
kind: Service
apiVersion: v1
metadata:
  name: blackbox-exporter
  namespace: kube-system
spec:
  selector:
    app: blackbox-exporter
  ports:
  - port: 9115
    protocol: TCP
    targetPort: 9115
vim ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: blackbox-exporter
  namespace: kube-system
  annotations:
    kubernetes.io/ingress.class: "nginx"
spec:
  rules:
  - host: blackbox.com.cn
    http:
      paths:
      - path: /
        pathType: ImplementationSpecific
        backend:
          service:
            name: blackbox-exporter
            port:
              number: 9115
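With all four manifests applied, the probe endpoint can be exercised through the ingress, assuming blackbox.com.cn resolves to the ingress controller; the target URL here is just an example:
kubectl apply -f cm.yaml -f dp.yaml -f svc.yaml -f ingress.yaml
curl -s 'http://blackbox.com.cn/probe?module=http_2xx&target=http://kubernetes.default.svc'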
cadvisor
docker run \
--volume=/:/rootfs:ro \
--volume=/var/run:/var/run:rw \
--volume=/sys:/sys:ro \
--volume=/var/lib/docker:/var/lib/docker:ro \
--volume=/dev/disk/:/dev/disk:ro \
--publish=8080:8080 \
--detach=true \
--name=cadvisor \
--privileged=true \
google/cadvisor:latest
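A quick sanity check that cAdvisor is up and serving metrics on the Docker host:
docker ps --filter name=cadvisor
curl -s http://localhost:8080/metrics | head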
prometheus
Obtain the k8s token and write it to k8s-token.conf; Prometheus uses it to connect to the external cluster.
# Create a service account
kubectl create serviceaccount dashboard-admin -n kube-system
# Bind it to the cluster-admin role
kubectl create clusterrolebinding dashboard-cluster-admin --clusterrole=cluster-admin --serviceaccount=kube-system:dashboard-admin
# View the token
kubectl -n kube-system get secrets | grep dashboard-admin
kubectl -n kube-system describe secrets dashboard-admin-token-bqnz5
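The token itself can then be written to the file referenced by prometheus.yml (the secret suffix bqnz5 is specific to this lab; substitute the one from your cluster):
kubectl -n kube-system get secret dashboard-admin-token-bqnz5 -o jsonpath='{.data.token}' | base64 -d > /prometheus/prometheus/k8s-token.conf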
vim prometheus.yml
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets: ['192.168.51.159:9093']
      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/prometheus/prometheus/rules/rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
    - targets: ['localhost:9090']

  - job_name: 'vcloud'
    metrics_path: /actuator/prometheus
    file_sd_configs:
    - files:
        - /prometheus/prometheus/file_config/vcloud/*.json
      refresh_interval: 10s
    relabel_configs:
    - source_labels: [appname]
      action: replace
      target_label: appname

  - job_name: 'rabbitmq'
    scrape_interval: 60s
    scrape_timeout: 60s
    static_configs:
    - targets: ['192.168.51.109:9090']

  - job_name: 'etcd'
    scheme: https
    tls_config:
      ca_file: /prometheus/prometheus/ca.crt
      cert_file: /prometheus/prometheus/server.crt
      key_file: /prometheus/prometheus/server.key
    static_configs:
    - targets:
      - '10.10.95.11:2379'
      - '10.10.95.12:2379'
      - '10.10.95.13:2379'

  - job_name: 'k8s-master'
    static_configs:
    - targets: ['10.10.95.11:9100', '10.10.95.12:9100', '10.10.95.13:9100']

  - job_name: 'k8s-node'
    static_configs:
    - targets: ['10.10.95.21:9100', '10.10.95.22:9100', '10.10.95.23:9100']

  # In the jobs below, the bearer_token_file/tls_config pair inside
  # kubernetes_sd_configs authenticates service discovery against the external
  # apiserver; the pair at job level authenticates the scrape itself.
  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
    - role: endpoints
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    scheme: https
    relabel_configs:
    - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
      action: keep
      regex: default;kubernetes;https

  - job_name: 'kubernetes-kubelet'
    kubernetes_sd_configs:
    - role: node
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    scheme: https
    relabel_configs:
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.+)
    - source_labels: [__meta_kubernetes_node_name]
      regex: (.+)
      target_label: __address__
      replacement: ${1}:10250

  - job_name: kubernetes-nodes-cadvisor
    metrics_path: /metrics
    scheme: https
    kubernetes_sd_configs:
    - role: node
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    relabel_configs:
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.*)
    # Scrape each node's cadvisor metrics through the apiserver proxy
    - action: replace
      regex: (.*)
      source_labels: ["__address__"]
      target_label: __address__
      replacement: 10.10.95.18:6443
    - action: replace
      source_labels: [__meta_kubernetes_node_name]
      target_label: __metrics_path__
      regex: (.*)
      replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

  - job_name: 'container'
    static_configs:
    - targets: ['10.10.95.11:8080', '10.10.95.12:8080', '10.10.95.13:8080', '10.10.95.21:8080', '10.10.95.22:8080', '10.10.95.23:8080']
      labels:
        group: container

  - job_name: 'blackbox_http_service_probe'
    metrics_path: /probe
    kubernetes_sd_configs:
    - role: service
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    params:
      module: [http_2xx]
    relabel_configs:
    - source_labels: [__meta_kubernetes_service_annotation_blackbox_scheme]
      action: keep
      regex: http
    - source_labels: [__address__, __meta_kubernetes_service_annotation_blackbox_port, __meta_kubernetes_service_annotation_blackbox_path]
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+);(.+)
      replacement: $1:$2$3
      target_label: __param_target
    - action: replace
      target_label: __address__
      replacement: blackbox.com.cn:80
    - source_labels: [__param_target]
      target_label: instance
    - action: labelmap
      regex: __meta_kubernetes_service_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_service_name]
      action: replace
      target_label: kubernetes_service_name

  - job_name: 'blackbox_http_pod_probe'
    metrics_path: /probe
    kubernetes_sd_configs:
    - role: pod
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    params:
      module: [http_2xx]
    relabel_configs:
    - source_labels: [__meta_kubernetes_pod_annotation_blackbox_scheme]
      action: keep
      regex: http
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_blackbox_port, __meta_kubernetes_pod_annotation_blackbox_path]
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+);(.+)
      replacement: $1:$2$3
      target_label: __param_target
    - action: replace
      target_label: __address__
      replacement: blackbox.com.cn:80
    - source_labels: [__param_target]
      target_label: instance
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_pod_name]
      action: replace
      target_label: kubernetes_pod_name

  - job_name: 'blackbox_tcp_pod_probe'
    metrics_path: /probe
    kubernetes_sd_configs:
    - role: pod
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    params:
      module: [tcp_connect]
    relabel_configs:
    - source_labels: [__meta_kubernetes_pod_annotation_blackbox_scheme]
      action: keep
      regex: tcp
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_blackbox_port]
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __param_target
    - action: replace
      target_label: __address__
      replacement: blackbox.com.cn:80
    - source_labels: [__param_target]
      target_label: instance
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_pod_name]
      action: replace
      target_label: kubernetes_pod_name

  - job_name: 'k8s-pods'
    kubernetes_sd_configs:
    - role: pod
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    relabel_configs:
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      action: keep
      regex: true
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      action: replace
      target_label: __metrics_path__
      regex: (.+)
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __address__
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_pod_name]
      action: replace
      target_label: kubernetes_pod_name

  - job_name: 'ingress-nginx-endpoints'
    honor_timestamps: true
    metrics_path: /metrics
    scheme: http
    kubernetes_sd_configs:
    - role: pod
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    relabel_configs:
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      action: keep
      regex: true
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme]
      action: replace
      target_label: __scheme__
      regex: (https?)
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      action: replace
      target_label: __metrics_path__
      regex: (.+)
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
      action: replace
      target_label: __address__
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
    - source_labels: [__meta_kubernetes_service_name]
      regex: prometheus-server
      action: drop
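Before starting (or restarting) Prometheus, the configuration can be validated with promtool, which ships alongside the prometheus binary:
./promtool check config prometheus.yml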
vim rules.yml
groups:
- name: test-rules
  rules:
  - alert: InstanceDown
    expr: up == 0
    for: 2m
    labels:
      severity: Disaster
    annotations:
      summary: "Instance {{ $labels.instance }} down."
  - alert: rabbitmqDown
    expr: rabbitmq_running == 0
    for: 2m
    labels:
      severity: Disaster
    annotations:
      summary: "Instance {{ $labels.node }} down."
- name: http_status
  rules:
  - alert: BlackboxSlowPing
    expr: probe_icmp_duration_seconds > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox slow ping (instance {{ $labels.instance }})"
      description: "Blackbox ping took more than 2s (current value: {{ $value }})"
  - alert: BlackboxSlowRequests
    expr: probe_http_duration_seconds > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox slow requests (instance {{ $labels.instance }})"
      description: "Blackbox request took more than 2s (current value: {{ $value }})"
  - alert: ProbeFailed
    expr: probe_success == 0
    for: 1m
    labels:
      severity: error
    annotations:
      summary: "Probe failed (instance {{ $labels.instance }})"
      description: "Probe failed (current value: {{ $value }})"
  - alert: StatusCode
    expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
    for: 1m
    labels:
      severity: error
    annotations:
      summary: "Status Code (instance {{ $labels.instance }})"
      description: "HTTP status code is not 200-399 (current value: {{ $value }})"
- name: node_alerts
  rules:
  - alert: NodeMemoryWarning
    expr: node_memory_MemAvailable_bytes/1024/1024 <= 2014
    for: 30s
    labels:
      severity: Disaster
    annotations:
      summary: "Node {{ $labels.instance }} has less than 2014M of available memory"
  - alert: NodeMemoryDisaster
    expr: ((node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes) * 100 >= 80
    for: 30s
    labels:
      severity: Critical
    annotations:
      summary: "Node {{ $labels.instance }} memory usage is above 80%"
  - alert: NodeCPUUsage
    expr: 100 * (1 - sum by (instance)(increase(node_cpu_seconds_total{mode="idle"}[5m])) / sum by (instance)(increase(node_cpu_seconds_total[5m]))) > 80
    for: 2m
    labels:
      team: node
    annotations:
      summary: "{{ $labels.instance }}: High CPU usage detected"
      description: "{{ $labels.instance }}: CPU usage is above 80% (current value is: {{ $value }})"
  - alert: NodeFilesystemUsage
    expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype=~"ext4|xfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype=~"ext4|xfs"}) > 80
    for: 2m
    labels:
      team: node
    annotations:
      summary: "{{ $labels.instance }}: High filesystem usage detected"
      description: "{{ $labels.instance }}: Filesystem usage is above 80% (current value is: {{ $value }})"
  - alert: InstanceDown
    expr: up == 0
    for: 30s
    labels:
      severity: Disaster
    annotations:
      summary: "Instance {{ $labels.instance }} down."
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 30 seconds."
  - alert: ticket-test
    expr: sum(jvm_memory_used_bytes{application="vcloud-ticket-api",area="heap"}) / sum(jvm_memory_max_bytes{application="vcloud-ticket-api",area="heap"}) * 100 > 90
    for: 30s
    labels:
      severity: Disaster
    annotations:
      summary: "JVM Instance {{ $labels.instance }} memory usage > 90%"
      description: "{{ $labels.instance }} has been in status [heap usage > 90%] for more than 1 minute. Current usage: {{ $value }}%"
vim /prometheus/prometheus/file_config/vcloud/test.json
[
  {
    "targets": ["192.168.51.201:8010"],
    "labels": {
      "appname": "api01"
    }
  },
  {
    "targets": ["192.168.51.202:8010"],
    "labels": {
      "appname": "api02"
    }
  }
]
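Because file_sd_configs re-reads matching files on its own (refresh_interval: 10s above), editing test.json requires no restart; the discovered targets can be confirmed through the HTTP API:
curl -s 'http://localhost:9090/api/v1/targets?state=active' | grep vcloud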
Spug push assistant (used as the alert webhook below)
alertmanager
vim alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'mail.com.cn:587'
  smtp_from: 'admin@com.cn'
  smtp_auth_username: 'admin@com.cn'
  smtp_auth_password: '123456'
  # smtp_require_tls: false
templates:
  - '/prometheus/alertmanager/templates/*.tmpl'
route:
  group_by: ['service', 'alertname', 'cluster']
  group_interval: 5m
  group_wait: 10s
  repeat_interval: 5m
  receiver: default-receiver
  routes:
  # match does exact matching only; a regex needs match_re
  - match_re:
      severity: ^(Critical|Warning|Disaster)$
    receiver: 'web.hook'
receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'https://push.spug.cc/send/xxxxxxxxx'
- name: 'default-receiver'
  email_configs:
  - to: 'name1@com.cn,name2@com.cn'
    # html: '{{ template "dingding.to.html" . }}'
    headers: { Subject: 'Prometheus alert email' }
    send_resolved: true
- name: 'email'
  email_configs:
  - to: 'name1@com.cn,name2@com.cn'
    # html: '{{ template "dingding.to.html" . }}'
    headers: { Subject: 'Prometheus alert email' }
    send_resolved: true
./alertmanager --config.file=alertmanager.yml --web.listen-address=:9093
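amtool, bundled with the Alertmanager release, can validate the file before starting:
./amtool check-config alertmanager.yml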
Monitoring k8s Service resources
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    blackbox_path: "/"
    blackbox_port: "80"
    blackbox_scheme: "http"
  name: $APPNAME
  namespace: $NAMESPACE
spec:
  selector:
    appname: $APPNAME
  ports:
  - port: 80
    protocol: TCP
    targetPort: 80
# Backend services
annotations:
  blackbox_path: "/actuator/prometheus"
  blackbox_port: "80"
  blackbox_scheme: "http"
# Frontend services
annotations:
  blackbox_path: "/"
  blackbox_port: "80"
  blackbox_scheme: "http"
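Existing Services can also be opted in without editing their manifests, via kubectl annotate; the placeholder names follow the template above:
kubectl -n $NAMESPACE annotate service $APPNAME blackbox_scheme=http blackbox_port=80 blackbox_path=/actuator/prometheus --overwrite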