Monitoring a Ceph cluster with Prometheus on Kubernetes
Enable the prometheus module on the Ceph cluster
ceph mgr module enable prometheus
[root@cephnode01 my-cluster]# ceph mgr services
{
    "dashboard": "https://cephnode01:8443/",
    "prometheus": "http://cephnode01:9283/"
}
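To verify that the mgr module is actually exporting data, you can hit the endpoint shown above directly (a quick check, assuming the active mgr runs on cephnode01 and listens on the default port 9283):
# Pull the first few metric lines from the mgr's built-in exporter
curl -s http://cephnode01:9283/metrics | head -n 20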
Install ceph_exporter on all 3 Ceph nodes
yum install golang git librados2-devel librbd1-devel -y
[root@cephnode02 ~]# cat /etc/profile.d/go.sh
export GOROOT=/usr/lib/golang
export GOBIN=$GOROOT/bin
export GOPATH=/home/golang
export PATH=$PATH:$GOROOT/bin:$GOPATH/bin
source /etc/profile.d/go.sh
go get -u github.com/digitalocean/ceph_exporter
cd /root/go/bin
nohup ./ceph_exporter &
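A quick sanity check that the exporter is serving data (a minimal sketch, assuming ceph_exporter listens on its default port 9128, the same port used in the scrape targets below):
# Count the ceph_* metric lines exposed by ceph_exporter on this node
curl -s http://localhost:9128/metrics | grep -c '^ceph_'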
############################################################
Prometheus ConfigMap (scrape config for the Ceph mgr and ceph_exporter targets):
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  prometheus.yml: |
    rule_files:
    - /etc/config/rules/*.rules
    scrape_configs:
    - job_name: prometheus
      static_configs:
      - targets:
        - localhost:9090
    - job_name: 'ceph'
      static_configs:
      - targets:
        - 10.1.234.131:9283
        - 10.1.234.132:9283
        - 10.1.234.133:9283
    - job_name: 'ceph_class'
      static_configs:
      - targets: ['10.1.234.131:9128','10.1.234.132:9128','10.1.234.133:9128']
        labels:
          instance: ceph_class
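Load the manifest and confirm it landed in the cluster (a minimal sketch; the file name prometheus-configmap.yaml is an assumption, the ConfigMap name and namespace come from the manifest above):
kubectl apply -f prometheus-configmap.yaml
kubectl -n kube-system get configmap prometheus-config
After Prometheus reloads its configuration, the ceph and ceph_class jobs should show up under Status -> Targets in the Prometheus UI.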
[root@k8s-master1 prome]# cat alertmanager-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: kube-system
  labels:
    kubernetes.io/cluster-service: "true"
    addonmanager.kubernetes.io/mode: EnsureExists
data:
  alertmanager.yml: |
    global:
      # Declare an alert resolved if it has not fired again within 2 minutes
      resolve_timeout: 2m
      # SMTP settings
      smtp_smarthost: 'smtp.163.com:25'
      smtp_from: '18802676921@163.com'
      smtp_auth_username: '18802676921@163.com'
      smtp_auth_password: '123qqq...A'
    # Directory or file holding custom notification templates.
    #templates:
    #  - '/usr/local/prometheus/alertmanager/template/wechat.tmpl'
    # Root of the routing tree; every incoming alert starts here.
    route:
      # Alerts that share these labels are grouped together.
      # For example, cluster=A and alertname=LatencyHigh end up in the same group.
      group_by: ['alertname_wechat']
      # How long to wait after a group is created before sending the batched alerts,
      # i.e. the delay on the first notification, so more alerts can be batched together.
      group_wait: 10s
      # After the first notification, how long to wait before sending the next batch
      group_interval: 10s
      # Default receiver
      receiver: 'wechat'
      # If an alert was already sent successfully, how long to wait before re-sending it
      repeat_interval: 1h
    receivers:
    # SMTP (email) receiver
    - name: 'email'
      email_configs:
      - to: '582167559@qq.com'
        send_resolved: true
    - name: 'wechat'
      wechat_configs:
      - corp_id: 'wwab37c47350318435'
        to_party: '2'
        agent_id: '1000002'
        api_secret: 'ti3TXKv7sdZs6r7EUZdgpRoUgjR1ne97R8KSYTtPpDY'
        send_resolved: true
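Apply it the same way and check that the ConfigMap exists (name and namespace as defined in the manifest above):
kubectl apply -f alertmanager-configmap.yaml
kubectl -n kube-system describe configmap alertmanager-config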
Alerting rule (cluster capacity usage):
- alert: 集群空间使用率
  expr: ceph_cluster_used_bytes / ceph_cluster_capacity_bytes * 100 > 70
  for: 2m
  labels:
    product: ceph
  annotations:
    summary: "{{$labels.instance}}: Not enough capacity in Ceph detected"
    description: "{{$labels.instance}}: cluster capacity usage is above 70% (current value: {{ $value }})"
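Before loading the rule it is worth validating it with promtool (a minimal sketch, assuming the rule is wrapped in a standard groups: block and saved under /etc/config/rules/ to match the rule_files glob in the Prometheus config above; the file name ceph.rules is an assumption):
# Validate the rule file syntax wherever promtool is available, e.g. inside the Prometheus container
promtool check rules /etc/config/rules/ceph.rules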
For Grafana, use dashboard template 917.