node-exporter

vim node-exporter-ds.yaml

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: prom
  labels:
    app: prometheus
    component: node-exporter
spec:
  selector:
    matchLabels:
      app: prometheus
      component: node-exporter
  template:
    metadata:
      name: prometheus-node-exporter
      labels:
        app: prometheus
        component: node-exporter
    spec:
      tolerations:
      - key: node-role.kubernetes.io/etcd
        effect: NoExecute
        operator: "Exists"
      - key: node-role.kubernetes.io/controlplane
        effect: NoSchedule
        operator: "Exists"
      containers:
      - image: prom/node-exporter:v0.18.1
        name: prometheus-node-exporter
        ports:
        - name: prom-node-exp
          containerPort: 9100
          hostPort: 9100
      hostNetwork: true
      hostPID: true

vim node-exporter-svc.yaml

apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: prometheus-node-exporter
  namespace: prom
  labels:
    app: prometheus
    component: node-exporter
spec:
  clusterIP: None
  ports:
    - name: prometheus-node-exporter
      port: 9100
      protocol: TCP
  selector:
    app: prometheus
    component: node-exporter
  type: ClusterIP
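
Apply both manifests and check that one exporter pod lands on every node (the tolerations let it run on the etcd and control-plane nodes too). The node IP in the curl is one of the lab nodes, adjust as needed:

kubectl apply -f node-exporter-ds.yaml -f node-exporter-svc.yaml
kubectl -n prom get pods -o wide -l component=node-exporter
curl -s http://10.10.95.21:9100/metrics | head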

kube-state-metrics

vim kube-state-metrics-rbac.yaml

---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics
  namespace: prom
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
- apiGroups: [""]
  resources: ["nodes", "pods", "services", "resourcequotas", "replicationcontrollers", "limitranges", "persistentvolumeclaims", "persistentvolumes", "namespaces", "endpoints"]
  verbs: ["list", "watch"]
- apiGroups: ["extensions"]
  resources: ["daemonsets", "deployments", "replicasets"]
  verbs: ["list", "watch"]
- apiGroups: ["apps"]
  resources: ["statefulsets"]
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources: ["cronjobs", "jobs"]
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources: ["horizontalpodautoscalers"]
  verbs: ["list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: prom

vim kube-state-metrics-deploy.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: prom
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kube-state-metrics
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
      - name: kube-state-metrics
        image: harbor.com.cn/library/kube-state-metrics:2.4.2
#        image: gcr.io/google-containers/kube-state-metrics-amd64:v1.9.5
        ports:
        - containerPort: 8080

vim kube-state-metrics-svc.yaml

apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: kube-state-metrics
  namespace: prom
  labels:
    app: kube-state-metrics
spec:
  ports:
  - name: kube-state-metrics
    port: 8080
    protocol: TCP
  selector:
    app: kube-state-metrics
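
Apply the three files and verify that the pod answers on 8080; a quick sketch using a temporary port-forward:

kubectl apply -f kube-state-metrics-rbac.yaml -f kube-state-metrics-deploy.yaml -f kube-state-metrics-svc.yaml
kubectl -n prom get pods -l app=kube-state-metrics
kubectl -n prom port-forward svc/kube-state-metrics 8080:8080 &
curl -s http://127.0.0.1:8080/metrics | head
kill %1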

blackbox

vim cm.yaml

apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    app: blackbox-exporter
  name: blackbox-exporter
  namespace: kube-system
data:
  blackbox.yml: |-
    modules:
      http_2xx:
        prober: http
        timeout: 2s
        http:
          valid_http_versions: ["HTTP/1.1", "HTTP/2"]
          valid_status_codes: [200,301,302]
          method: GET
          preferred_ip_protocol: "ip4"
      tcp_connect:
        prober: tcp
        timeout: 2s

vim dp.yaml

kind: Deployment
apiVersion: apps/v1
metadata:
  name: blackbox-exporter
  namespace: kube-system
  labels:
    app: blackbox-exporter
  annotations:
    deployment.kubernetes.io/revision: "1"
spec:
  replicas: 1
  selector:
    matchLabels:
      app: blackbox-exporter
  template:
    metadata:
      labels:
        app: blackbox-exporter
    spec:
      volumes:
      - name: config
        configMap:
          name: blackbox-exporter
          defaultMode: 420
      containers:
      - name: blackbox-exporter
        image: harbor.com.cn/library/blackbox-exporter:v0.15.1
        imagePullPolicy: IfNotPresent
        args:
        - --config.file=/etc/blackbox_exporter/blackbox.yml
        - --log.level=info
        - --web.listen-address=:9115
        ports:
        - name: blackbox-port
          containerPort: 9115
          protocol: TCP
        resources:
          limits:
            cpu: 500m
            memory: 512Mi
          requests:
            cpu: 200m
            memory: 256Mi
        volumeMounts:
        - name: config
          mountPath: /etc/blackbox_exporter
        readinessProbe:
          tcpSocket:
            port: 9115
          initialDelaySeconds: 5
          timeoutSeconds: 5
          periodSeconds: 10
          successThreshold: 1
          failureThreshold: 3

vim svc.yaml

kind: Service
apiVersion: v1
metadata:
  name: blackbox-exporter
  namespace: kube-system
spec:
  selector:
    app: blackbox-exporter
  ports:
  - port: 9115
    protocol: TCP
    targetPort: 9115

vim ingress.yaml

apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: blackbox-exporter
  namespace: kube-system
  annotations:
    kubernetes.io/ingress.class: "nginx"
spec:
  rules:
  - host: blackbox.com.cn
    http:
      paths:
      - path: /
        pathType: ImplementationSpecific
        backend:
          service:
            name: blackbox-exporter
            port:
              number: 9115
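
Apply the four files, then fire a test probe through the ingress (assumes blackbox.com.cn resolves to the ingress controller; the target URL is only an example):

kubectl apply -f cm.yaml -f dp.yaml -f svc.yaml -f ingress.yaml
curl -s 'http://blackbox.com.cn/probe?module=http_2xx&target=http://example.com' | grep probe_success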

cadvisor

docker run \
  --volume=/:/rootfs:ro \
  --volume=/var/run:/var/run:rw \
  --volume=/sys:/sys:ro \
  --volume=/var/lib/docker:/var/lib/docker:ro \
  --volume=/dev/disk/:/dev/disk:ro \
  --publish=8080:8080 \
  --detach=true \
  --name=cadvisor \
  --privileged=true \
  google/cadvisor:latest
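
Run the container on each of the six nodes that the 'container' job below scrapes, then spot-check the endpoint locally:

curl -s http://localhost:8080/metrics | head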

Note: in this lab, Prometheus itself is deployed from the binary tarball, outside the cluster.

prometheus

Obtain a Kubernetes token and write it to the k8s-token.conf file; Prometheus uses it to connect to the external Kubernetes cluster.

# Create the service account
kubectl create serviceaccount dashboard-admin -n kube-system

# Bind the cluster-admin role
kubectl create clusterrolebinding dashboard-cluster-admin --clusterrole=cluster-admin --serviceaccount=kube-system:dashboard-admin

# View the token (describe prints it; a bare get does not)
kubectl -n kube-system get secrets | grep dashboard-admin
kubectl -n kube-system describe secrets dashboard-admin-token-bqnz5
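
To write the raw token into the file Prometheus reads (a sketch, using the secret name generated above):

kubectl -n kube-system get secret dashboard-admin-token-bqnz5 \
  -o jsonpath='{.data.token}' | base64 -d > /prometheus/prometheus/k8s-token.conf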

vim prometheus.yml

global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets: ['192.168.51.159:9093']
      # - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/prometheus/prometheus/rules/rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['localhost:9090']


  - job_name: 'vcloud'
    metrics_path: /actuator/prometheus
    file_sd_configs:
    - files:
      - /prometheus/prometheus/file_config/vcloud/*.json
      refresh_interval: 10s
    relabel_configs:
      - source_labels: [appname]
        action: replace
        target_label: appname


  - job_name: 'rabbitmq'
    scrape_interval: 60s
    scrape_timeout: 60s
    static_configs:
    - targets: ['192.168.51.109:9090']

  - job_name: 'etcd'
    scheme: https
    tls_config:
      ca_file: /prometheus/prometheus/ca.crt
      cert_file: /prometheus/prometheus/server.crt
      key_file: /prometheus/prometheus/server.key
    static_configs:
    - targets:
      - '10.10.95.11:2379'
      - '10.10.95.12:2379'
      - '10.10.95.13:2379'

  - job_name: 'k8s-master'
    static_configs:
    - targets: ['10.10.95.11:9100', '10.10.95.12:9100', '10.10.95.13:9100']

  - job_name: 'k8s-node'
    static_configs:
    - targets: ['10.10.95.21:9100', '10.10.95.22:9100', '10.10.95.23:9100']

  - job_name: 'kubernetes-apiservers'
    kubernetes_sd_configs:
    - role: endpoints
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    scheme: https
    relabel_configs:
    - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
      action: keep
      regex: default;kubernetes;https

  - job_name: 'kubernetes-kubelet'
    kubernetes_sd_configs:
    - role: node
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    scheme: https
    relabel_configs:
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.+)
    - source_labels: [__meta_kubernetes_node_name]
      regex: (.+)
      target_label: __address__
      replacement: ${1}:10250


  - job_name: kubernetes-nodes-cadvisor
    metrics_path: /metrics
    scheme: https
    kubernetes_sd_configs:
    - role: node
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    relabel_configs:
    - action: labelmap
      regex: __meta_kubernetes_node_label_(.*)
    - action: replace
      regex: (.*)
      source_labels: ["__address__"]
      target_label: __address__
      replacement: 10.10.95.18:6443
    - action: replace
      source_labels: [__meta_kubernetes_node_name]
      target_label: __metrics_path__
      regex: (.*)
      replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor

  - job_name: 'container'
    static_configs:
    - targets: ['10.10.95.11:8080', '10.10.95.12:8080', '10.10.95.13:8080','10.10.95.21:8080', '10.10.95.22:8080', '10.10.95.23:8080']
      labels:
        group: container

  - job_name: 'blackbox_http_service_probe'
    metrics_path: /probe
    kubernetes_sd_configs:
    - role: service
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    params:
      module: [http_2xx]
    relabel_configs:
    - source_labels: [__meta_kubernetes_service_annotation_blackbox_scheme]
      action: keep
      regex: http
    - source_labels: [__address__, __meta_kubernetes_service_annotation_blackbox_port,  __meta_kubernetes_service_annotation_blackbox_path]
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+);(.+)
      replacement: $1:$2$3
      target_label: __param_target
    - action: replace
      target_label: __address__
      replacement: blackbox.com.cn:80
    - source_labels: [__param_target]
      target_label: instance
    - action: labelmap
      regex: __meta_kubernetes_service_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_service_name]
      action: replace
      target_label: kubernetes_service_name

  - job_name: 'blackbox_http_pod_probe'
    metrics_path: /probe
    kubernetes_sd_configs:
    - role: pod
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    params:
      module: [http_2xx]
    relabel_configs:
    - source_labels: [__meta_kubernetes_pod_annotation_blackbox_scheme]
      action: keep
      regex: http
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_blackbox_port,  __meta_kubernetes_pod_annotation_blackbox_path]
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+);(.+)
      replacement: $1:$2$3
      target_label: __param_target
    - action: replace
      target_label: __address__
      replacement: blackbox.com.cn:80
    - source_labels: [__param_target]
      target_label: instance
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_pod_name]
      action: replace
      target_label: kubernetes_pod_name
  
  - job_name: 'blackbox_tcp_pod_probe'
    metrics_path: /probe
    kubernetes_sd_configs:
    - role: pod
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    params:
      module: [tcp_connect]
    relabel_configs:
    - source_labels: [__meta_kubernetes_pod_annotation_blackbox_scheme]
      action: keep
      regex: tcp
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_blackbox_port]
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __param_target
    - action: replace
      target_label: __address__
      replacement: blackbox.com.cn:80
    - source_labels: [__param_target]
      target_label: instance
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_pod_name]
      action: replace
      target_label: kubernetes_pod_name


  - job_name: 'k8s-pods'
    kubernetes_sd_configs:
    - role: pod  
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    relabel_configs:
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      action: keep
      regex: true
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      action: replace
      target_label: __metrics_path__
      regex: (.+)
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
      action: replace
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
      target_label: __address__
    - action: labelmap
      regex: __meta_kubernetes_pod_label_(.+)
    - source_labels: [__meta_kubernetes_namespace]
      action: replace
      target_label: kubernetes_namespace
    - source_labels: [__meta_kubernetes_pod_name]
      action: replace
      target_label: kubernetes_pod_name

  - job_name: 'ingress-nginx-endpoints'
    honor_timestamps: true
    metrics_path: /metrics
    scheme: http
    kubernetes_sd_configs:
    - role: pod
      api_server: https://10.10.95.18:6443
      bearer_token_file: /prometheus/prometheus/k8s-token.conf
      tls_config:
        insecure_skip_verify: true
    bearer_token_file: /prometheus/prometheus/k8s-token.conf
    tls_config:
      insecure_skip_verify: true
    relabel_configs:
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      action: keep
      regex: true
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme]
      action: replace
      target_label: __scheme__
      regex: (https?)
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      action: replace
      target_label: __metrics_path__
      regex: (.+)
    - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
      action: replace
      target_label: __address__
      regex: ([^:]+)(?::\d+)?;(\d+)
      replacement: $1:$2
    - source_labels: [__meta_kubernetes_service_name]
      regex: prometheus-server
      action: drop
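
Before (re)starting Prometheus, validate the configuration with promtool, which ships alongside the prometheus binary:

./promtool check config prometheus.yml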

vim rules.yml

groups:
- name: test-rules
  rules:
  - alert: InstanceDown
    expr: up == 0
    for: 2m
    labels:
      severity: Disaster
    annotations:
      summary: "Instance {{ $labels.instance }} down."

  - alert: rabbitmqDown
    expr: rabbitmq_running == 0
    for: 2m
    labels:
      severity: Disaster
    annotations:
      summary: "Instance {{ $labels.node }} down."

- name: http_status
  rules:
  - alert: BlackboxSlowPing
    expr: probe_icmp_duration_seconds > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox slow ping (instance {{ $labels.instance }})"
      description: "Blackbox ping took more than 2s (current value: {{ $value }})"
  
  - alert: BlackboxSlowRequests
    expr: probe_http_duration_seconds > 2 
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Blackbox slow requests (instance {{ $labels.instance }})"
      description: "Blackbox request took more than 2s (current value: {{ $value }})"

  - alert: ProbeFailed
    expr: probe_success == 0
    for: 1m
    labels:
      severity: error
    annotations:
      summary: "Probe failed (instance {{ $labels.instance }})"
      description: "Probe failed (current value: {{ $value }})"

  - alert: StatusCode
    expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
    for: 1m
    labels:
      severity: error
    annotations:
      summary: "Status Code (instance {{ $labels.instance }})"
      description: "HTTP status code is not 200-399 (current value: {{ $value }})"

- name: node_alerts
  rules:
  - alert: NodeMemoryWarning
    expr: node_memory_MemAvailable_bytes/1024/1024 <= 2014
    for: 30s
    labels:
      severity: Disaster
    annotations:
      summary: "机器 {{ $labels.instance }} 可用内存低于2014M"

  - alert: NodeMemoryDisaster
    expr: ((node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes)) / node_memory_MemTotal_bytes) * 100 >= 80
    for: 30s
    labels:
      severity: Critical
    annotations:
      summary: "机器 {{ $labels.instance }} 内存使用率超过80%"

  - alert: NodeCPUUsage
    expr: 100 * (1 - sum by (instance)(increase(node_cpu_seconds_total{mode="idle"}[5m])) / sum by (instance)(increase(node_cpu_seconds_total[5m]))) > 80
    for: 2m
    labels:
      team: node
    annotations:
      summary: "{{$labels.instance}}: High CPU usage detected"
      description: "{{$labels.instance}}: CPU usage is above 80% (current value is: {{ $value }}"

  - alert: NodeFilesystemUsage
    expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype=~"ext4|xfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype=~"ext4|xfs"}) > 80
    for: 2m
    labels:
      team: node
    annotations:
      summary: "{{$labels.instance}}: High Filesystem usage detected"
      description: "{{$labels.instance}}: Filesystem usage is above 80% (current value is: {{ $value }}"

  - alert: InstanceDown
    expr: up == 0
    for: 30s
    labels:
      severity: Disaster
    annotations:
      summary: "Instance {{ $labels.instance }} down."
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than"


  - alert: ticket-test
    expr: sum(jvm_memory_used_bytes{application="vcloud-ticket-api",area="heap"}) / sum(jvm_memory_max_bytes{application="vcloud-ticket-api",area="heap"}) * 100 > 90
    for: 30s
    labels:
      severity: Disaster
    annotations:
      summary: "JVM Instance {{ $labels.instance }} memory usage > 90%"
      description: "{{ $labels.instance }} heap usage has been above 90% for more than 30 seconds (current usage: {{ $value }}%)"

vim test.json

[
  {
    "targets": [
      "192.168.51.201:8010"
    ],
    "labels": {
      "appname": "api01"
    }
  },

  {
    "targets": [
      "192.168.51.202:8010"
    ],
    "labels": {
      "appname": "api02"
    }
  }
]
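
Place the file in the directory watched by the vcloud job's file_sd_configs; Prometheus rereads it on its own (refresh_interval: 10s), so no reload is needed:

mkdir -p /prometheus/prometheus/file_config/vcloud
cp test.json /prometheus/prometheus/file_config/vcloud/
curl -s http://localhost:9090/api/v1/targets | grep vcloud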

Spug push assistant

https://push.spug.cc/

alertmanager

vim alertmanager.yml

global:
  resolve_timeout: 5m
  smtp_smarthost: 'mail.com.cn:587'
  smtp_from: 'admin@com.cn'
  smtp_auth_username: 'admin@com.cn'
  smtp_auth_password: '123456'
#  smtp_require_tls: false
templates:
  - '/prometheus/alertmanager/templates/*.tmpl'
route:
  group_by: ['service', 'alertname', 'cluster']
  group_interval: 5m
  group_wait: 10s
  repeat_interval: 5m
  receiver: default-receiver
  routes:
  - match_re:
      severity: ^(Critical|Warning|Disaster)$
    receiver: 'web.hook'
receivers:
- name: 'web.hook'
  webhook_configs:
  - url: 'https://push.spug.cc/send/xxxxxxxxx'
- name: 'default-receiver'
  email_configs:
  - to: 'name1@com.cn,name2@com.cn'
#    html: '{{ template "dingding.to.html" . }}'
    headers: { Subject: 'Prometheus alert email' }
    send_resolved: true
- name: 'email'
  email_configs:
  - to: 'name1@com.cn,name2@com.cn'
#    html: '{{ template "dingding.to.html" . }}'
    headers: { Subject: 'Prometheus alert email' }
    send_resolved: true

./alertmanager --config.file=alertmanager.yml --web.listen-address=:9093
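
The same tarball ships amtool, which can validate the file before Alertmanager is started:

./amtool check-config alertmanager.yml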

Monitoring Kubernetes Service resources

---
apiVersion: v1   
kind: Service
metadata:
  annotations:
    blackbox_path: "/"
    blackbox_port: "80"
    blackbox_scheme: "http"
  name: $APPNAME
  namespace: $NAMESPACE
spec:
  selector:
    appname: $APPNAME
  ports:
  - port: 80
    protocol: TCP
    targetPort: 80
# Backend services (Spring Boot): probe the actuator endpoint
  annotations:
    blackbox_path: "/actuator/prometheus"
    blackbox_port: "80"
    blackbox_scheme: "http"

# Frontend services: probe the root path
  annotations:
    blackbox_path: "/"
    blackbox_port: "80"
    blackbox_scheme: "http"