Enable the prometheus module on the Ceph cluster

ceph mgr module enable prometheus
[root@cephnode01 my-cluster]# ceph mgr services
{
    "dashboard": "https://cephnode01:8443/",
    "prometheus": "http://cephnode01:9283/"
}
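
A quick sanity check: the prometheus endpoint reported by ceph mgr services above should already be serving metrics over plain HTTP.

# Optional check, from any host that can reach the mgr (URL taken from the output above)
curl -s http://cephnode01:9283/metrics | head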

Install ceph_exporter on the 3 Ceph nodes

 yum install golang git librados2-devel librbd1-devel -y
[root@cephnode02 ~]# cat /etc/profile.d/go.sh
export GOROOT=/usr/lib/golang
export GOBIN=$GOROOT/bin
export GOPATH=/home/golang
export PATH=$PATH:$GOROOT/bin:$GOPATH/bin
source /etc/profile.d/go.sh
go get -u github.com/digitalocean/ceph_exporter
cd /root/go/bin
nohup ./ceph_exporter &
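
ceph_exporter listens on port 9128 by default, which is what the ceph_class job below scrapes. Note that depending on the GOPATH/GOBIN values in go.sh, the binary may land in $GOBIN or $GOPATH/bin rather than /root/go/bin. A quick check on each of the three nodes:

# Optional check that ceph_exporter is up and exporting ceph_* metrics
curl -s http://localhost:9128/metrics | grep -c '^ceph_'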

############################################################

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-system 
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  prometheus.yml: |
    rule_files:
    - /etc/config/rules/*.rules

    scrape_configs:
    - job_name: prometheus
      static_configs:
      - targets:
        - localhost:9090
   
    - job_name: 'ceph'
      static_configs:
      - targets:
          - 10.1.234.131:9283
          - 10.1.234.132:9283
          - 10.1.234.133:9283
   
    - job_name: 'ceph_class'
      static_configs:
      - targets: ['10.1.234.131:9128','10.1.234.132:9128','10.1.234.133:9128']
        labels:
          instance: ceph_class
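
Here port 9283 is the ceph-mgr prometheus module and 9128 is ceph_exporter. Apply the ConfigMap and check the targets; the file name prometheus-configmap.yaml is an assumption, and depending on how your Prometheus pod mounts the ConfigMap you may need to restart or reload it:

kubectl apply -f prometheus-configmap.yaml
kubectl get configmap prometheus-config -n kube-system
# The ceph and ceph_class jobs should then show as UP under Status -> Targets in the Prometheus UI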

[root@k8s-master1 prome]# cat alertmanager-configmap.yaml 
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  alertmanager.yml: |
    global:
      # Declare an alert resolved if it has not been re-sent within 2 minutes
      resolve_timeout: 2m
      # SMTP settings
      smtp_smarthost: 'smtp.163.com:25'
      smtp_from: '18802676921@163.com'
      smtp_auth_username: '18802676921@163.com'
      smtp_auth_password: '123qqq...A'
      # Custom notification template directory or file.
      #templates:
      #  - '/usr/local/prometheus/alertmanager/template/wechat.tmpl'
      # Root of the routing tree; every incoming alert enters here.
    route:

      # Group incoming alerts that share these labels.
      # For example, cluster=A and alertname=LatencyHigh would end up in the same group.
      group_by: ['alertname_wechat']

      # How long to wait after a group is created before sending the batched alerts, i.e. the delay on the first notification.
      # This lets more alerts be batched into the first notification.
      group_wait: 10s

      # After the first notification is sent, how long to wait before sending batched alerts for new alerts in the group
      group_interval: 10s

      # Default receiver
      receiver: 'wechat'

      # If an alert has already been sent successfully, how long to wait before re-sending it
      repeat_interval: 1h
    receivers:
    # Email (SMTP) receiver
    - name: 'email'
      email_configs:
      - to: '582167559@qq.com'
        send_resolved: true
    - name: 'wechat'
      wechat_configs:
      - corp_id: 'wwab37c47350318435'
        to_party: '2'
        agent_id: '1000002'
        api_secret: 'ti3TXKv7sdZs6r7EUZdgpRoUgjR1ne97R8KSYTtPpDY'
        send_resolved: true
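
Apply the Alertmanager ConfigMap the same way; if amtool is available (it ships with Alertmanager), it can validate the file before you apply it:

kubectl apply -f alertmanager-configmap.yaml
kubectl get configmap alertmanager-config -n kube-system
# Optional, on a machine where amtool is installed:
# amtool check-config alertmanager.yml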

Alerting rules

- alert: CephClusterSpaceUsage
  expr: ceph_cluster_used_bytes / ceph_cluster_capacity_bytes * 100 > 70
  for: 2m
  labels:
    product: ceph
  annotations:
    summary: "{{$labels.instance}}: Not enough capacity in Ceph detected"
    description: "{{$labels.instance}}: Available capacity is used up to 70% (current value is: {{ $value }})"
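
Since Prometheus 2.x, rule files must be wrapped in groups. A minimal sketch of how the alert above fits into a file matching the /etc/config/rules/*.rules glob configured earlier (the file name ceph.rules and group name ceph are assumptions):

# /etc/config/rules/ceph.rules
groups:
- name: ceph
  rules:
  - alert: CephClusterSpaceUsage
    expr: ceph_cluster_used_bytes / ceph_cluster_capacity_bytes * 100 > 70
    for: 2m
    labels:
      product: ceph
    annotations:
      summary: "{{$labels.instance}}: Not enough capacity in Ceph detected"
      description: "{{$labels.instance}}: Available capacity is used up to 70% (current value is: {{ $value }})"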

For Grafana, import dashboard template 917.
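
If Grafana still needs a Prometheus data source for that dashboard, a minimal provisioning sketch is shown below (the file path follows Grafana's provisioning convention; the URL prometheus.kube-system.svc:9090 is an assumption, point it at wherever your Prometheus is reachable):

# /etc/grafana/provisioning/datasources/prometheus.yaml
apiVersion: 1
datasources:
- name: Prometheus
  type: prometheus
  access: proxy
  url: http://prometheus.kube-system.svc:9090
  isDefault: true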

 
