1. Create the Prometheus main configuration file

configmap-main.yaml

apiVersion: v1
data:
  prometheus.yml: |-
    # my global config
    global:
      scrape_interval: 60s # Set the scrape interval to every 60 seconds. Default is every 1 minute.
      evaluation_interval: 30s # Evaluate rules every 30 seconds. The default is every 1 minute.
      # scrape_timeout is set to the global default (10s).

    # Alertmanager configuration
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
               - svc-alertmanager.monitoring.svc:9093

    # Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
    rule_files:
      - "/etc/prometheus/rules/mysql_rules.yml"
      - "/etc/prometheus/rules/general.yml"
      - "/etc/prometheus/rules/node.yml"
      - "/etc/prometheus/rules/kube-state-metrics.yml"
      # - "second_rules.yml"

    # The scrape configurations. The first job scrapes Prometheus itself.
    scrape_configs:
      # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
      - job_name: "prometheus"

        # metrics_path defaults to '/metrics'
        # scheme defaults to 'http'.

        static_configs:
          - targets: ["localhost:9090"]

      - job_name: "kube-state-metrics"
        metrics_path: "/metrics"

        static_configs:
          - targets: [ "svc-kube-state-metrics.monitoring.svc:8080" ]

      - job_name: "shuan-services"
        metrics_path: "/actuator/prometheus"
        file_sd_configs:
          - files: ["/etc/prometheus/target/services.yml"]
            refresh_interval: 5s

      - job_name: "shuan-mysqld-services"
        metrics_path: "/metrics"
        file_sd_configs:
          - files: [ "/etc/prometheus/target/mysqld-services.yml" ]
            refresh_interval: 30s

      - job_name: "shuan-redis-services"
        metrics_path: "/metrics"
        file_sd_configs:
          - files: [ "/etc/prometheus/target/redis-services.yml" ]
            refresh_interval: 30s

      - job_name: "shuan-nodes"
        metrics_path: "/metrics"
        file_sd_configs:
          - files: [ "/etc/prometheus/target/nodes.yml" ]
            refresh_interval: 30s

      - job_name: "kubernetes-apiservers"

        kubernetes_sd_configs:
          - role: endpoints

        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https

        # This TLS & authorization config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # If your node certificates are self-signed or use a different CA to the
          # master CA, then disable certificate verification below. Note that
          # certificate verification is an integral part of a secure infrastructure
          # so this should only be disabled in a controlled environment. You can
          # disable certificate verification by uncommenting the line below.
          #
          # insecure_skip_verify: true
        authorization:
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        # Keep only the default/kubernetes service endpoints for the https port.
        # This adds one target for each API server that Kubernetes registers as
        # an endpoint of the default/kubernetes service.
        relabel_configs:
          - source_labels:
              [
                __meta_kubernetes_namespace,
                __meta_kubernetes_service_name,
                __meta_kubernetes_endpoint_port_name,
              ]
            action: keep
            regex: default;kubernetes;https

      # Scrape config for nodes (kubelet).
      #
      # The node role discovers one target per cluster node, addressed at the
      # kubelet's HTTPS port, so each kubelet is scraped directly.
      - job_name: "kubernetes-nodes"

        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https

        # This TLS & authorization config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # Kubelet serving certificates are typically self-signed or issued by
          # a different CA than the cluster CA, so certificate verification is
          # disabled here. Certificate verification is an integral part of a
          # secure infrastructure, so only do this in a controlled environment.
          insecure_skip_verify: true
        authorization:
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        kubernetes_sd_configs:
          - role: node

        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)

      - job_name: "kubernetes-cadvisor"

        # Default to scraping over https. If required, just disable this or change to
        # `http`.
        scheme: https

        # Starting Kubernetes 1.7.3 the cAdvisor metrics are under /metrics/cadvisor.
        # Kubernetes CIS Benchmark recommends against enabling the insecure HTTP
        # servers of Kubernetes, therefore the cAdvisor metrics on the secure handler
        # are used.
        metrics_path: /metrics/cadvisor

        # This TLS & authorization config is used to connect to the actual scrape
        # endpoints for cluster components. This is separate to discovery auth
        # configuration because discovery & scraping are two separate concerns in
        # Prometheus. The discovery auth config is automatic if Prometheus runs inside
        # the cluster. Otherwise, more config options have to be provided within the
        # <kubernetes_sd_config>.
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # As in the kubernetes-nodes job, kubelet certificates are typically
          # self-signed, so certificate verification is disabled here; only do
          # this in a controlled environment.
          insecure_skip_verify: true
        authorization:
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token

        kubernetes_sd_configs:
          - role: node

        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)

    remote_write:
      - url: "http://svc-influxdb.monitoring.svc:8086/api/v1/prom/write?db=prometheus&u=admin&p=123456"

    remote_read:
      - url: "http://svc-influxdb.monitoring.svc:8086/api/v1/prom/read?db=prometheus&u=admin&p=123456"
kind: ConfigMap
metadata:
  name: prometheus-conf
  namespace: monitoring
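
The shuan-* jobs above discover their targets through file_sd from files under /etc/prometheus/target/. A minimal sketch of what services.yml could contain (the target addresses and the env label are hypothetical placeholders):

# /etc/prometheus/target/services.yml (example only; addresses are placeholders)
- targets:
    - "svc-order.default.svc:8080"
    - "svc-user.default.svc:8080"
  labels:
    env: "prod"

Prometheus re-reads these files on each refresh_interval (5s for services.yml), so targets can be added or removed without touching the main configuration.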

2. Create the Prometheus alerting rules configuration file

configmap-rule.yaml

apiVersion: v1
data:
  general.yml: |-
    groups:
    - name: general
      rules:
      # Alert for any instance that is unreachable for >5 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
  kube-state-metrics.yml: |-
    groups:
    - name: kube-state-metrics
      rules:
      - alert: PodRestart
        expr: changes(kube_pod_container_status_restarts_total{job="kube-state-metrics",namespace!~"^kube.*"}[30m]) == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Pod restart"
          description: "{{ $labels.pod }} has restarted"

      - alert: StatefulSetReplicasMismatch
        expr: ( kube_statefulset_status_replicas_ready{job="kube-state-metrics",namespace!~"^kube.*"} != kube_statefulset_status_replicas{job="kube-state-metrics",namespace!~"^kube.*"} ) and ( changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics",namespace!~"^kube.*"}[5m]) == 0 )
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "StatefulSetReplicasMismatch"
          description: "Replicas of {{ $labels.statefulset }} is Mismatch "


      - alert: KubePodCrashLooping
        expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics",namespace!~"^kube.*"}[15m]) * 60 * 5 > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "KubePodCrashLooping"
          description: "{{ $labels.pod }} is CrashLooping "
  mysql_rules.yml: |-
    groups:
    - name: mysql
      rules:
      - alert: MysqlThreadsConnectedHigh
        expr: mysql_global_status_threads_connected > 180
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "mysql_global_status_threads_connected is more than 180 on {{ $labels.instance }}"
          description: "{{ $labels.instance }} has too many connected threads (current value: {{ $value }})"
      - alert: MysqlMaxConnectionsUsageHigh
        expr: mysql_global_status_max_used_connections > mysql_global_variables_max_connections * 0.85
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.instance }} has used more than 85% of its max connections limit"
  node.yml: |-
    groups:
    - name: node
      rules:
      # Alert when CPU usage on an instance exceeds 80% for more than 5 minutes.
      - alert: HighCpuUsage
        expr: ((1 - sum(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance) / sum(rate(node_cpu_seconds_total[5m])) by (instance)) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU of {{ $labels.instance }} is used more than 80%"

      - alert: HighCpuIowait
        expr: (avg by (instance)(irate(node_cpu_seconds_total{mode='iowait',job=~"shuan-nodes"}[5m])) * 100) > 20
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU iowait on {{ $labels.instance }}"
          description: "CPU iowait of {{ $labels.instance }} is above 20%, current value: {{ $value }}%"

      - alert: HighMemoryUsage
        expr: ((1 - (node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes) / node_memory_MemTotal_bytes) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory of {{ $labels.instance }} is used more than 80%"

      - alert: HighDiskUsage
        expr: ((1 - node_filesystem_avail_bytes{fstype=~"ext4|xfs",job="shuan-nodes"} / node_filesystem_size_bytes{fstype=~"ext4|xfs",job="shuan-nodes"}) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High disk usage of {{ $labels.mountpoint }} on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} on {{ $labels.instance }} is used more than 80%"

      - alert: HighDiskIo
        expr: (avg(irate(node_disk_io_time_seconds_total{job="shuan-nodes"}[5m])) by (instance) * 100) > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High disk IO on {{ $labels.instance }}"
          description: "Disk IO utilization of {{ $labels.instance }} is above 90%, current value: {{ $value }}"

      - alert: HostDiskReadRate
        expr: sum by (instance) (rate(node_disk_read_bytes_total[5m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘读速率过高"
          description: "{{ $labels.instance }} 磁盘读速率超过50MB/s, 当前速率:{{ $value }}MB/s"

      - alert: HostDiskWriteRate
        expr: sum by (instance) (rate(node_disk_written_bytes_total[5m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "磁盘写速率过高"
          description: "{{ $labels.instance }} 磁盘读速率超过50MB/s, 当前速率:{{ $value }}MB/s"

      - alert: HighNetworkReceive
        expr: (sum(rate(node_network_receive_bytes_total{device=~'ens.*'}[5m])) by (instance) / 1024 / 1024) > 20
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High network receive on {{ $labels.instance }}"
          description: "Network receive of {{ $labels.instance }} exceeds 20MB/s, current value: {{ $value }}MB/s"

      - alert: HighNetworkTransmit
        expr: (sum(rate(node_network_transmit_bytes_total{device=~'ens.*'}[5m])) by (instance) / 1024 / 1024) > 20
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High network transmit on {{ $labels.instance }}"
          description: "Network transmit of {{ $labels.instance }} exceeds 20MB/s, current value: {{ $value }}MB/s"

kind: ConfigMap
metadata:
  name: prometheus-rules-conf
  namespace: monitoring
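
Alert and recording rule names must be valid metric names (letters, digits, underscores, colons), which is why the alerts above use names like HighCpuUsage rather than free-form phrases. promtool, which ships with Prometheus, can validate the files before they are loaded; assuming the rule files are saved locally under the same names as in the ConfigMap:

promtool check rules general.yml kube-state-metrics.yml mysql_rules.yml node.yml

The same check can also be run against the mounted files inside the running pod with kubectl exec, pointing at /etc/prometheus/rules/.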


3. Create the Prometheus RBAC file

rbac.yaml

apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/metrics
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups:
  - networking.k8s.io
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: monitoring
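
Once rbac.yaml is applied, the binding can be verified by impersonating the service account; both commands should print "yes":

kubectl auth can-i list nodes --as=system:serviceaccount:monitoring:prometheus
kubectl auth can-i get /metrics --as=system:serviceaccount:monitoring:prometheus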

4. Create the Prometheus workload YAML file

statefulset.yaml

apiVersion: apps/v1
kind: StatefulSet
metadata:
  labels:
    app: prometheus
  name: prometheus
  namespace: monitoring
spec:
  serviceName: prometheus
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      volumes:
        - name: host-time
          hostPath:
            path: /etc/localtime
        - name: prometheus-conf
          configMap:
            name: prometheus-conf
            defaultMode: 0760
        - name: prometheus-rules-conf
          configMap:
            name: prometheus-rules-conf
            defaultMode: 0760
      imagePullSecrets:
        - name: shu-cn
      securityContext: # run as root
        runAsUser: 0
      serviceAccountName: prometheus # this account is granted read access to cluster resources; see rbac.yaml
      containers:
      - image: shu.cn/prometheus:v2.37.5
        name: prometheus
        args:  # extra arguments passed to the container at startup
          - "--config.file=/etc/prometheus/prometheus.yml" # prometheus.yml is mounted via a volume
          - "--storage.tsdb.path=/prometheus"              # the /prometheus directory is mounted via a volume
          - "--storage.tsdb.retention.time=5d"             # keep local data for 5 days
          - "--web.enable-admin-api"                       # enable the admin HTTP API (includes deleting time series)
          - "--web.enable-lifecycle"                       # enable hot reload via POST to localhost:9090/-/reload
        imagePullPolicy: IfNotPresent
        resources:
          requests:
            cpu: 100m
            memory: 1Gi
          limits:
            cpu: 500m
            memory: 2Gi
        ports:
        - containerPort: 9090
          name: tcp
        volumeMounts:
        - name: host-time
          mountPath: /etc/localtime
        - name: prometheus-conf
          mountPath: /etc/prometheus/prometheus.yml
          subPath: prometheus.yml
        - name: file-sd-confs
          mountPath: /etc/prometheus/target
          subPath: target
        - name: prometheus-rules-conf
          mountPath: /etc/prometheus/rules

  volumeClaimTemplates:
  - metadata:
      name: file-sd-confs
    spec:
      storageClassName: "nfs"
      accessModes:
        - ReadWriteOnce
      resources:
        requests:
          storage: 1Gi


---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: svc-prometheus
  name: svc-prometheus
  namespace: monitoring
spec:
  selector:
    app: prometheus
  type: NodePort
  ports:
  - port: 9090
    protocol: TCP
    targetPort: 9090
    nodePort: 38090
    name: tcp
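
Note that the file-sd-confs volume starts out empty, so the shuan-* jobs have no targets until files in the format sketched in step 1 are placed under /etc/prometheus/target/, for example:

kubectl cp nodes.yml monitoring/prometheus-0:/etc/prometheus/target/nodes.yml

(prometheus-0 is the pod created by this single-replica StatefulSet; nodes.yml here is a locally prepared target file.)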


5. Deploy

kubectl apply -f rbac.yaml
kubectl apply -f configmap-rule.yaml
kubectl apply -f configmap-main.yaml
kubectl apply -f statefulset.yaml
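
A quick status check after applying the manifests (the volumeClaimTemplate requires an existing StorageClass named "nfs"):

kubectl get statefulset,pod,pvc,svc -n monitoring

Because --web.enable-lifecycle is set, later configuration changes can be applied without restarting the pod: after updating a ConfigMap and waiting for the kubelet to sync it into the container, trigger a reload with a POST to the lifecycle endpoint, for example through a port-forward:

kubectl port-forward -n monitoring prometheus-0 9090:9090 &
curl -X POST http://localhost:9090/-/reload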

6. Access Prometheus

http://<node-ip>:38090
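
The Status -> Targets page shows the health of every scrape job; the same information is available from the HTTP API:

curl http://<node-ip>:38090/api/v1/targets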

7. Reference links for related components

Prometheus: Grafana
Prometheus: node_exporter deployment
Deploying Prometheus on k8s: DingTalk alerting
Deploying Prometheus on k8s: kube-state-metrics
Prometheus: InfluxDB 1.8 deployment
