一、kube-prometheus安装

# 克隆仓库
$ git clone -b release-0.5 https://gitee.com/lonelyZhe/kube-prometheus.git

# 执行命令
$ cd kube-prometheus
$ kubectl create -f manifests/setup
# 再执行,仅安装prometheus
$ kubectl create -f manifests/
# 可选择性安装grafana
$ kubectl create -f manifests/grafana/
# 可选择性安装alertmanager
$ kubectl create -f manifests/alertmanager/

  添加grafana监控Node基础信息的中文Dashboard:8919

二、Prometheus基本使用

# 测试是否成功启动,ip为pod的ip,port一般为9090,或者使用nodePort形式测试
$ curl "http://<ip>:<port>/api/v1/query?query=node_cpu_seconds_total"

# 监控cpu空闲率,30秒内空闲率
$ query=(avg(irate(node_cpu_seconds_total{mode='idle'}[30s])) by (instance) *100)

# 监控cpu使用率
# 全部结点
$ instance:node_cpu_utilisation:rate1m
# 指定结点
$ instance:node_cpu_utilisation:rate1m{instance="m1"}

# 监控内存空闲值,单位为MiB(字节数除以1024*1024)
# 全部结点
$ node_memory_MemFree_bytes / (1024*1024)
# 指定结点
$ node_memory_MemFree_bytes{instance="m1"} / (1024*1024)

# 监控磁盘已用值,单位Byte
$ query=sum(max by (device) (node_filesystem_size_bytes{job="node-exporter", instance="m1", fstype!=""}-node_filesystem_avail_bytes{job="node-exporter", instance="m1", fstype!=""}))&start=1618988010&end=1618988010&step=1

# 监控磁盘空闲值,单位Byte
$ query=sum(max by (device) (node_filesystem_avail_bytes{job="node-exporter", instance="m1", fstype!=""}))&start=1618988010&end=1618991610&step=1
解决node-exporter端口9100被占用的问题

  修改node-exporter-daemonset.yaml文件中的全部9100端口,service中的targetPort也要改成相匹配的。

解决Grafana访问前缀问题

  设置环境变量

# Container env entries for Grafana: enable serving from a sub-path.
- name: GF_SERVER_SERVE_FROM_SUB_PATH
  value: 'true'
# Root URL including the /k8s/grafana/ prefix.
- name: GF_SERVER_ROOT_URL
  value: 'https://localhost:3000/k8s/grafana/'

三、GPU监控

  1. 首先给拥有GPU的结点打标签

$ kubectl label node node225 hardware-type=NVIDIAGPU

  2. dcgm-exporter.yaml

# DaemonSet running dcgm-exporter in the monitoring namespace on nodes
# labelled hardware-type=NVIDIAGPU.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: dcgm-exporter
  namespace: monitoring
  labels:
    app.kubernetes.io/name: dcgm-exporter
    app.kubernetes.io/version: '2.1.0'
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: dcgm-exporter
      app.kubernetes.io/version: '2.1.0'
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      name: dcgm-exporter
      labels:
        app.kubernetes.io/name: dcgm-exporter
        app.kubernetes.io/version: '2.1.0'
    spec:
      # Schedule only onto nodes that carry the GPU label.
      nodeSelector:
        hardware-type: NVIDIAGPU
      containers:
        - name: dcgm-exporter
          image: '192.168.185.99:8080/cgf_ml/dcgm-exporter:latest'
          imagePullPolicy: Always
          env:
            # Listen address; the port matches containerPort below.
            - name: DCGM_EXPORTER_LISTEN
              value: ':9400'
            # NOTE(review): presumably enables Kubernetes pod mapping of GPU
            # metrics - confirm against the dcgm-exporter documentation.
            - name: DCGM_EXPORTER_KUBERNETES
              value: 'true'
          ports:
            - name: metrics
              containerPort: 9400
          # The container runs as root (UID 0).
          securityContext:
            runAsNonRoot: false
            runAsUser: 0
          volumeMounts:
            # Host kubelet pod-resources directory, mounted read-only.
            - name: pod-gpu-resources
              mountPath: /var/lib/kubelet/pod-resources
              readOnly: true
      volumes:
        - name: pod-gpu-resources
          hostPath:
            path: /var/lib/kubelet/pod-resources

---

# Service in the monitoring namespace exposing port 9400 (named "metrics")
# of the pods selected by the dcgm-exporter name/version labels.
apiVersion: v1
kind: Service
metadata:
  name: dcgm-exporter
  namespace: monitoring
  labels:
    app.kubernetes.io/name: dcgm-exporter
    app.kubernetes.io/version: '2.1.0'
spec:
  ports:
    # No targetPort is set on this port entry.
    - name: metrics
      port: 9400
  selector:
    app.kubernetes.io/name: dcgm-exporter
    app.kubernetes.io/version: '2.1.0'

  3. dcgm-service-monitor.yaml

# ServiceMonitor that selects Services labelled as dcgm-exporter 2.1.0 and
# scrapes /metrics on their port named "metrics".
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: dcgm-exporter
  namespace: monitoring
  labels:
    app.kubernetes.io/name: dcgm-exporter
    app.kubernetes.io/version: '2.1.0'
spec:
  endpoints:
    - path: /metrics
      port: metrics
  selector:
    matchLabels:
      app.kubernetes.io/name: dcgm-exporter
      app.kubernetes.io/version: '2.1.0'

  4. 添加grafana监控GPU的Dashboard:12239

四、Mysql监控

  创建mysql用户

mysql> CREATE USER 'exporter'@'%' IDENTIFIED BY '123456';
mysql> GRANT SELECT, PROCESS, SUPER, REPLICATION CLIENT, RELOAD ON *.* TO 'exporter'@'%';

  1. mysqld-exporter.yaml

# Single-replica Deployment of mysqld-exporter in the monitoring namespace.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mysql-exporter
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mysql-exporter
  template:
    metadata:
      labels:
        app: mysql-exporter
    spec:
      containers:
        - name: mysql-exporter
          image: '192.168.185.99:8080/cgf_ml/mysqld-exporter:latest'
          imagePullPolicy: Always
          ports:
            - name: mysqlexporter
              containerPort: 9104
              protocol: TCP
          env:
            # Connect with the dedicated 'exporter' MySQL account created for
            # monitoring rather than with root.
            # NOTE(review): the password is stored in plaintext in this
            # manifest; consider sourcing DATA_SOURCE_NAME from a Secret.
            - name: DATA_SOURCE_NAME
              value: 'exporter:123456@(mysql-service.cgf-ml.svc.cluster.local:3306)/'
          # Requests equal limits for both CPU and memory.
          resources:
            requests:
              cpu: 200m
              memory: 200Mi
            limits:
              cpu: 200m
              memory: 200Mi
---
# Service in the monitoring namespace forwarding port 9104 (named
# "mysqlexporter") to port 9104 of pods labelled app=mysql-exporter.
apiVersion: v1
kind: Service
metadata:
  name: mysql-exporter
  namespace: monitoring
  labels:
    app: mysql-exporter
spec:
  selector:
    app: mysql-exporter
  ports:
    - name: mysqlexporter
      port: 9104
      targetPort: 9104

  2. mysqld-service-monitor.yaml

# ServiceMonitor that selects Services labelled app=mysql-exporter and
# scrapes /metrics on their "mysqlexporter" port every 15 seconds.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: mysql-exporter
  namespace: monitoring
  labels:
    k8s-app: mysql-exporter
spec:
  selector:
    matchLabels:
      app: mysql-exporter
  endpoints:
    - port: mysqlexporter
      path: /metrics
      interval: 15s

  3. 添加grafana监控Mysql的Dashboard:7362

Logo

K8S/Kubernetes社区为您提供最前沿的新闻资讯和知识内容

更多推荐