在这里插入图片描述

一、准备
1、k8s 1.23
2、helm 3.8
3、minio最新版本 (请自行安装,本人使用docker部署单节点)
4、kube-prometheus-stack 版本为:35.0.0 (helm安装)
5、kube-thanos版本为:10.3.6 (helm安装)
6、准备两套k8s, 分别使用 *.lady.cn(监控集群)和 *.kids.cn(被监控集群)

二、目标
lady.cn 部署以下组件

  • grafana
  • prometheus
  • alertmanager
  • query-frontend
  • query #查询 (通过sidecar、storegateway、storegateway-kids)
  • compactor #对对象存储中的数据块做压缩(compaction)和降采样
  • storegateway #为query提供查询objstore
  • sidecar #在kube-prometheus-stack安装时已安装, 用于数据上传和query查询
  • ruler # 告警
  • storegateway-kids #被监控集群的objstore(需要用yaml手动部署)

kids.cn 部署以下组件

  • grafana #可不安装
  • alertmanager #可不安装
  • prometheus
  • query-frontend #可不安装
  • query #查询本地 sidecar、storegateway
  • compactor #对对象存储中的数据块做压缩(compaction)和降采样
  • storegateway
  • sidecar

三、minio:已在独立服务器上部署minio,作为S3对象存储

172.16.0.39:9000  admin /  Thanos@654321

四、部署kube-prometheus-stack(分别在两个集群中部署)

# Add the prometheus-community Helm repository (hosts kube-prometheus-stack).
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts

# Refresh the local repository index.
helm repo update

# Release/chart name and the pinned chart version.
pro=kube-prometheus-stack
chart_version=35.0.0

mkdir -p "/data/$pro"
cd "/data/$pro"

# Download the chart archive into the current directory.
helm pull "prometheus-community/$pro" --version="$chart_version"

# Extract only values.yaml from the archive.
tar zxvf "$pro-$chart_version.tgz" --strip-components 1 "$pro/values.yaml"

# Generate the install script. The heredoc is deliberately unquoted so that
# $pro and $chart_version are expanded now; the absolute paths let start.sh be
# run from any working directory. "\\" writes a literal line continuation.
cat > "/data/$pro/start.sh" << EOF
helm upgrade --create-namespace --wait --install $pro /data/$pro/$pro-$chart_version.tgz \\
  -f /data/$pro/values.yaml \\
  -n monitoring
EOF
  • 修改配置values.yaml
# Overrides to apply inside the chart's values.yaml. The sections below are
# kept in one YAML document on purpose: a `---` separator inside a values file
# starts a new document, and the sections after it may be silently ignored.
kubeTargetVersionOverride: "1.23.4"  # Kubernetes version of the target cluster

alertmanager:
  # Optional example: route alerts to a webhook receiver.
  # config:
  #   route:
  #     receiver: 'ding2wechat'
  #     routes:
  #       - match:
  #           alertname: Watchdog
  #         receiver: 'ding2wechat'
  #   receivers:
  #     - name: 'ding2wechat'
  #       webhook_configs:
  #         - url: 'http://dingtalk-webhook:8080/dingtalk/ding2wechat/send'
  ingress:
    enabled: true
    hosts:
      - alertmanager.lady.cn  # adjust per cluster

grafana:
  ingress:
    enabled: true
    hosts:
      - grafana.lady.cn  # adjust per cluster
  # NOTE(review): the chart may already provision its own default "Prometheus"
  # datasource; confirm that only one datasource ends up marked isDefault.
  additionalDataSources:
    - name: Prometheus
      type: prometheus
      url: http://thanos-query-frontend:9090/  # integrate with query-frontend
      access: proxy
      isDefault: true

prometheus:
  thanosService:
    enabled: true
  thanosServiceExternal:
    enabled: true   # turn on
    type: NodePort  # adjust: use LoadBalancer when a load balancer is available
  # Secret holding the Thanos bucket config, including the object store
  # (MinIO) settings.
  extraSecret:
    name: bucket-config
    data:
      objstore.yml: |
        type: S3
        config:
          bucket: "lady-bucket"                      #minio的桶名,注意修改
          endpoint: "172.16.0.39:9000"               #minio的地址
          access_key: "Thanos"                       #minio的帐号
          secret_key: "Thanos@654321"                #minio的密码
          insecure: true                             #不验证tls证书
  ingress:
    enabled: true
    hosts:
      - prometheus.lady.cn  # adjust per cluster
  prometheusSpec:
    # Set because the Thanos sidecar is enabled in kube-prometheus-stack.
    disableCompaction: true
    externalLabels:
      cluster: lady.cn  # cluster label used to tell the clusters apart
    secrets:
      - etcd-client-cert  # etcd client certificate (etcd runs outside the cluster)
    thanos:
      # The Thanos sidecar is configured from the Secret defined above.
      objectStorageConfig:
        name: bucket-config
        key: objstore.yml

kubeControllerManager:
  endpoints:
    - 192.168.11.100  # adjust per cluster
  service:
    port: 10257  # the port must be set explicitly

kubeScheduler:
  endpoints:
    - 192.168.11.100  # adjust per cluster
  service:
    port: 10259  # the port must be set explicitly

kubeEtcd:
  endpoints:
    - 192.168.11.100  # adjust per cluster

kubeProxy:
  endpoints:
    - 192.168.11.100  # adjust per cluster
  • 持久化 ---- grafana、prometheus、alertmanager(实验环境可不设置,生产环境需要配置持久化)
# Alertmanager persistence: 20Gi volume claim.
# NOTE(review): presumably nested under alertmanager.alertmanagerSpec — confirm
# against the chart's values.yaml.
    storage:
      volumeClaimTemplate:
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 20Gi
# Prometheus persistence: 50Gi volume claim.
# NOTE(review): presumably nested under prometheus.prometheusSpec — confirm
# against the chart's values.yaml.
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 50Gi

启动

bash /data/kube-prometheus-stack/start.sh

本图是thanos-sidecar上传数据到minio的结果
在这里插入图片描述

五、kube-thanos安装
1、下载charts

# Add the Bitnami Helm repository (hosts the thanos chart).
helm repo add bitnami https://charts.bitnami.com/bitnami

# Refresh the local repository index.
helm repo update

# Release/chart name and the pinned chart version.
pro=thanos
chart_version=10.3.6

mkdir -p "/data/$pro"
cd "/data/$pro"

# Download the chart archive into the current directory.
helm pull "bitnami/$pro" --version="$chart_version"

# Extract only values.yaml from the archive.
tar zxvf "$pro-$chart_version.tgz" --strip-components 1 "$pro/values.yaml"

# Generate the install script. The heredoc is deliberately unquoted so that
# $pro and $chart_version are expanded now; the absolute paths let start.sh be
# run from any working directory. "\\" writes a literal line continuation.
cat > "/data/$pro/start.sh" << EOF
helm upgrade --wait --create-namespace --install $pro /data/$pro/$pro-$chart_version.tgz \\
  -f /data/$pro/values.yaml \\
  -n monitoring
EOF

2、 配置values.yaml

# Must match prometheus.extraSecret.name in the kube-prometheus-stack
# values.yaml.
existingObjstoreSecret: "bucket-config"
query:
  # Replica label used for deduplication; adjust per cluster.
  # NOTE(review): presumably this has to equal the replica external label that
  # Prometheus attaches to its series — confirm.
  replicaLabel:
    - lady_replica
  dnsDiscovery:
    # Thanos discovery Service created by kube-prometheus-stack.
    sidecarsService: "kube-prometheus-stack-thanos-discovery"
    # Namespace kube-prometheus-stack is deployed in.
    sidecarsNamespace: "monitoring"
  # Declared once: a repeated `ingress` key is a duplicate mapping key.
  ingress:
    enabled: true
    hostname: thanos.lady.cn  # adjust per cluster
queryFrontend:  # queried by Grafana (see the figure below)
  enabled: true
  extraFlags:
    - --query-frontend.compress-responses  # compress HTTP responses
    - --query-range.split-interval=12h  # split requests by time interval
    - --query-range.max-retries-per-request=5
    # Log queries that take longer than the given duration.
    - --query-frontend.log-queries-longer-than=10s
    - --labels.split-interval=12h  # split requests by time interval
    - --labels.max-retries-per-request=5
    # Align start and end with the step for better cacheability.
    - --query-range.align-range-with-step
    # Limit on the query time range; 0 disables the limit (1h would allow
    # only one-hour ranges).
    - --query-range.max-query-length=0
    # Most recent allowed cacheable result for range queries, so that results
    # that are still changing are not cached.
    - --query-range.response-cache-max-freshness=1m
    - |-
      --query-range.response-cache-config="config":
        max_size: "200MB"
        max_size_items: 0
        validity: 0s
      type: IN-MEMORY
    - |-
      --labels.response-cache-config="config":
        max_size: "200MB"
        max_size_items: 0
        validity: 0s
      type: IN-MEMORY
  ingress:
    enabled: true
    hostname: thanos-frontend.lady.cn
compactor:
  enabled: true
  persistence:
    enabled: true  # set to true in production to persist data
storegateway:
  enabled: true
  persistence:
    enabled: true  # set to true in production to persist data
ruler:
  enabled: true
  replicaLabel: lady_replica  # replica label; adjust per cluster
  alertmanagers:
    # Alertmanager Service of kube-prometheus-stack. The URL scheme must not
    # be empty, so it is spelled out explicitly.
    - http://kube-prometheus-stack-alertmanager:9093
  # Rule ConfigMap generated by kube-prometheus-stack.
  existingConfigmap: "prometheus-kube-prometheus-stack-prometheus-rulefiles-0"
  persistence:
    enabled: true  # set to true in production to persist data
  ingress:
    enabled: true
    hostname: thanos-ruler.lady.cn  # adjust per cluster

注: 需要修改一下charts的源码

# Unpack the chart next to start.sh so the repackaged archive replaces the
# downloaded one.
cd /data/thanos
tar zxvf thanos-10.3.6.tgz

# Make the ruler load *.yaml instead of *.yml rule files:
#   --rule-file=/conf/rules/*.yml  ->  --rule-file=/conf/rules/*.yaml
sed -i 's#--rule-file=/conf/rules/\*\.yml#--rule-file=/conf/rules/*.yaml#' \
  thanos/templates/ruler/statefulset.yaml

# Confirm the substitution was applied before repackaging.
grep -n -- '--rule-file=' thanos/templates/ruler/statefulset.yaml

# Repackage the chart and install it.
helm package thanos
bash /data/thanos/start.sh

3、query图,包含了sidecar、store、rule
在这里插入图片描述

在这里插入图片描述

  • grafana配置新的数据源为 http://thanos-query-frontend:9090/
    在这里插入图片描述

在这里插入图片描述

在lady集群中增加thanos-storegateway-kids 和thanos-query-kids来收集kids集群的数据

cat > /data/thanos/query-kids.yaml << 'EOF'
---
# Manually managed Endpoints: the backend for the selector-less Service of the
# same name, pointing outside this cluster.
apiVersion: v1
kind: Endpoints
metadata:
  namespace: monitoring
  name: thanos-query-kids
subsets:
  - ports:
      - protocol: TCP
        name: grpc
        port: 30901
      - protocol: TCP
        name: http
        port: 30902
    addresses:
      # Adjust: this address points to the kids.cn cluster.
      - ip: 192.168.11.101
---
# Service without a selector: its backends come from the Endpoints object of
# the same name rather than from pods.
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: thanos-query-kids
  labels:
    app.kubernetes.io/instance: thanos-query-kids
spec:
  type: ClusterIP
  ports:
    - protocol: TCP
      name: grpc
      port: 30901
      targetPort: grpc
    - protocol: TCP
      name: http
      port: 30902
      targetPort: http
EOF

kubectl apply -f /data/thanos/query-kids.yaml
cat > /data/thanos/storegateway-kids.yaml << 'EOF'
# Object-store configuration for the kids.cn bucket; mounted at /conf by the
# thanos-storegateway-kids StatefulSet. stringData keeps the content readable
# and editable (the bucket name must be adapted); it carries the same
# configuration as the former base64-encoded `data` entry.
apiVersion: v1
kind: Secret
metadata:
  labels:
    app: kube-prometheus-stack-prometheus
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/instance: kube-prometheus-stack
    app.kubernetes.io/part-of: kube-prometheus-stack
  name: bucket-config-kids
  namespace: monitoring
type: Opaque
stringData:
  objstore.yml: |
    type: S3
    config:
      bucket: "kids-bucket"                      #minio的桶名
      endpoint: "172.16.0.39:9000"               #minio的地址
      access_key: "Thanos"                       #minio的帐号
      secret_key: "Thanos@654321"                #minio的密码
      insecure: true                             #不验证tls证书
---
# Thanos Store Gateway that reads the bucket configured in the
# bucket-config-kids Secret.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: thanos-storegateway-kids
  namespace: monitoring
  labels:
    app.kubernetes.io/name: thanos
    app.kubernetes.io/instance: thanos
    app.kubernetes.io/component: storegateway-kids
spec:
  # NOTE(review): this Service is not defined in this manifest — presumably it
  # is created by the thanos chart; confirm.
  serviceName: thanos-storegateway-headless
  replicas: 1
  updateStrategy:
    type: RollingUpdate
  selector:
    matchLabels:
      app.kubernetes.io/name: thanos
      app.kubernetes.io/instance: thanos
      app.kubernetes.io/component: storegateway-kids
  template:
    metadata:
      labels:
        app.kubernetes.io/name: thanos
        app.kubernetes.io/instance: thanos
        app.kubernetes.io/component: storegateway-kids
    spec:
      # NOTE(review): this ServiceAccount is not defined in this manifest —
      # presumably it is created by the thanos chart; confirm.
      serviceAccount: thanos-storegateway
      serviceAccountName: thanos-storegateway
      automountServiceAccountToken: true
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
      securityContext:
        fsGroup: 1001
      affinity:
        # Soft preference: spread pods with these labels across nodes.
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 1
              podAffinityTerm:
                topologyKey: kubernetes.io/hostname
                namespaces:
                  - monitoring
                labelSelector:
                  matchLabels:
                    app.kubernetes.io/name: thanos
                    app.kubernetes.io/instance: thanos
                    app.kubernetes.io/component: storegateway-kids
      containers:
        - name: storegateway
          image: docker.io/bitnami/thanos:0.25.2-scratch-r5
          imagePullPolicy: IfNotPresent
          args:
            - store
            - --log.level=info
            - --log.format=logfmt
            - --grpc-address=0.0.0.0:10901
            - --http-address=0.0.0.0:10902
            - --data-dir=/data
            - --objstore.config-file=/conf/objstore.yml
          ports:
            - name: http
              containerPort: 10902
              protocol: TCP
            - name: grpc
              containerPort: 10901
              protocol: TCP
          livenessProbe:
            httpGet:
              scheme: HTTP
              port: http
              path: /-/healthy
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 30
            successThreshold: 1
            failureThreshold: 6
          readinessProbe:
            httpGet:
              scheme: HTTP
              port: http
              path: /-/ready
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 30
            successThreshold: 1
            failureThreshold: 6
          securityContext:
            runAsUser: 1001
            runAsNonRoot: true
            readOnlyRootFilesystem: true
            allowPrivilegeEscalation: false
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - name: objstore-config
              mountPath: /conf
            - name: data
              mountPath: /data
      volumes:
        # Supplies /conf/objstore.yml referenced by --objstore.config-file.
        - name: objstore-config
          secret:
            secretName: bucket-config-kids
            defaultMode: 420  # decimal form of octal 0644
        # emptyDir: the data directory does not outlive the pod.
        - name: data
          emptyDir: {}
---
# ClusterIP Service in front of the thanos-storegateway-kids pods; exposes the
# HTTP port as 9090 and the gRPC port as 10901.
apiVersion: v1
kind: Service
metadata:
  name: thanos-storegateway-kids
  namespace: monitoring
  labels:
    app.kubernetes.io/name: thanos
    app.kubernetes.io/instance: thanos
    app.kubernetes.io/component: storegateway-kids
spec:
  type: ClusterIP
  sessionAffinity: None
  internalTrafficPolicy: Cluster
  ipFamilyPolicy: SingleStack
  ipFamilies:
    - IPv4
  selector:
    app.kubernetes.io/name: thanos
    app.kubernetes.io/instance: thanos
    app.kubernetes.io/component: storegateway-kids
  ports:
    - name: http
      protocol: TCP
      port: 9090
      targetPort: http
    - name: grpc
      protocol: TCP
      port: 10901
      targetPort: grpc
EOF

kubectl apply -f /data/thanos/storegateway-kids.yaml

修改lady集群中的thanos-query(注意: 通过 kubectl edit 做的改动可能在后续 helm upgrade 时被覆盖,升级后请检查并按需重新添加)

kubectl edit -n monitoring deployments.apps thanos-query

        # Store endpoints, each resolved through a DNS SRV lookup.
        - --store=dnssrv+_grpc._tcp.kube-prometheus-stack-thanos-discovery.monitoring.svc.cluster.local
        - --store=dnssrv+_grpc._tcp.thanos-storegateway.monitoring.svc.cluster.local
        - --store=dnssrv+_grpc._tcp.thanos-ruler.monitoring.svc.cluster.local
        # Added: the two entries below point to kids.cn.
        - --store=dnssrv+_grpc._tcp.thanos-query-kids.monitoring.svc.cluster.local
        - --store=dnssrv+_grpc._tcp.thanos-storegateway-kids.monitoring.svc.cluster.local

验证
lady集群

kubectl get pod -n monitoring 
NAME                                                        READY   STATUS    RESTARTS      AGE
alertmanager-kube-prometheus-stack-alertmanager-0           2/2     Running   2 (44h ago)   2d1h
kube-prometheus-stack-grafana-799446c5b9-8h2kh              3/3     Running   3 (44h ago)   2d1h
kube-prometheus-stack-kube-state-metrics-6c5d86887c-hr7l7   1/1     Running   1 (44h ago)   2d1h
kube-prometheus-stack-operator-5bbb5f4f64-dk5dr             1/1     Running   1 (44h ago)   2d1h
kube-prometheus-stack-prometheus-node-exporter-r6pcz        1/1     Running   1 (44h ago)   2d1h
prometheus-kube-prometheus-stack-prometheus-0               3/3     Running   3 (44h ago)   2d1h
thanos-compactor-66ccd948d-g72zt                            1/1     Running   2 (44h ago)   2d
thanos-query-5df6c68bc5-vptrq                               1/1     Running   0             53m
thanos-query-frontend-59df69d5c-gndz4                       1/1     Running   1 (44h ago)   2d
thanos-ruler-0                                              1/1     Running   1 (44h ago)   2d
thanos-storegateway-0                                       1/1     Running   2 (44h ago)   2d
thanos-storegateway-kids-0                                  1/1     Running   0             155m

kids集群

kubectl get pod -n monitoring 
NAME                                                        READY   STATUS    RESTARTS   AGE
alertmanager-kube-prometheus-stack-alertmanager-0           2/2     Running   0          44h
kube-prometheus-stack-grafana-799446c5b9-fdgng              3/3     Running   0          44h
kube-prometheus-stack-kube-state-metrics-6c5d86887c-m7tw5   1/1     Running   0          44h
kube-prometheus-stack-operator-5bbb5f4f64-rxxn6             1/1     Running   0          44h
kube-prometheus-stack-prometheus-node-exporter-fqtjl        1/1     Running   0          44h
prometheus-kube-prometheus-stack-prometheus-0               3/3     Running   0          44h
thanos-compactor-66ccd948d-7tfzd                            1/1     Running   0          43h
thanos-query-f6ffddfb4-8qhdj                                1/1     Running   0          23h
thanos-query-frontend-59df69d5c-pwbxs                       1/1     Running   0          43h
thanos-storegateway-0                                       1/1     Running   0          43h

thanos-query-frontend配置https://blog.csdn.net/qq_34556414/article/details/124997111

如何使用 Thanos 实现 Prometheus 多集群监控 https://blog.csdn.net/xxxxaayy/article/details/104989792

Logo

K8S/Kubernetes社区为您提供最前沿的新闻资讯和知识内容

更多推荐