k8s配置telegraf自定义监控

【代码】k8s配置telegraf自定义监控。

一只行走鸟

856人浏览 · 2023-03-27 13:56:51

一只行走鸟 · 2023-03-27 13:56:51 发布

以下内容介绍如何在 k8s 中使用 telegraf 监控 kafka 集群。

配置configmap

configmap.yaml

# ConfigMap carrying the main Telegraf configuration under the single key
# `telegraf.conf`. The value is TOML, embedded as a literal block scalar.
apiVersion: v1
kind: ConfigMap
metadata:
  name: telegraf-deployment
data:
  telegraf.conf: |+
    [global_tags]
      dc = "bigdata"

    [agent]
      interval = "10s"
      round_interval = true
      metric_batch_size = 1000
      metric_buffer_limit = 10000
      collection_jitter = "0s"
      flush_interval = "10s"
      flush_jitter = "0s"
      precision = ""
      debug = true
      quiet = false
      # Log file path
      logfile = "/var/log/telegraf/telegraf.log"
      hostname = ""
      omit_hostname = false
      log_with_timezone = "Asia/Shanghai"

    # Prometheus-format output; the listen port must match the port
    # exposed by the Service in front of Telegraf (9123).
    [[outputs.prometheus_client]]
      listen = ":9123"

    [[inputs.cpu]]
      percpu = true
      totalcpu = true
      collect_cpu_time = false
      report_active = false

telegraf-kafka.yaml

# ConfigMap carrying a Telegraf drop-in configuration (`kafka.conf`) that
# reads Kafka broker JMX metrics from the Jolokia endpoints listed in `urls`.
# Every measurement name below is prefixed with `kafka_` via `name_prefix`.
apiVersion: v1
kind: ConfigMap
metadata:
  name: kafka-deployment
data:
  kafka.conf: |+
    [[inputs.jolokia2_agent]]
      name_prefix = "kafka_"
      urls = ["http://kafka-service-1.bigdata:8779/jolokia", "http://kafka-service-2.bigdata:8779/jolokia", "http://kafka-service-3.bigdata:8779/jolokia"]

      # ---- JVM ----
      [[inputs.jolokia2_agent.metric]]
        name  = "jvm_memory"
        mbean = "java.lang:type=Memory"

      [[inputs.jolokia2_agent.metric]]
        name  = "old_gc_time"
        mbean = "java.lang:type=GarbageCollector,name=G1 Old Generation"
        paths = ["CollectionCount", "CollectionTime"]

      # ---- Broker-wide throughput ----
      # Byte-in rate from clients
      [[inputs.jolokia2_agent.metric]]
        name  = "clients_byte_in_rate"
        mbean = "kafka.server:type=BrokerTopicMetrics,name=BytesInPerSec"
        paths = ["MeanRate", "OneMinuteRate"]

      # Byte-out rate to clients
      [[inputs.jolokia2_agent.metric]]
        name  = "clients_byte_out_rate"
        mbean = "kafka.server:type=BrokerTopicMetrics,name=BytesOutPerSec"
        paths = ["MeanRate", "OneMinuteRate"]

      # Message-in rate
      [[inputs.jolokia2_agent.metric]]
        name  = "message_in_rate"
        mbean = "kafka.server:type=BrokerTopicMetrics,name=MessagesInPerSec"
        paths = ["MeanRate", "OneMinuteRate"]

      # ---- Produce request metrics (request=Produce) ----
      # Produce requests per second (the measurement name is kept as-is so
      # existing dashboards keep working, although only Produce is queried)
      [[inputs.jolokia2_agent.metric]]
        name  = "producer_consumer_follower_request_per_sec"
        mbean = "kafka.network:type=RequestMetrics,name=RequestsPerSec,request=Produce"

      # Total time spent on the request
      [[inputs.jolokia2_agent.metric]]
        name  = "request_total_time"
        mbean = "kafka.network:type=RequestMetrics,name=TotalTimeMs,request=Produce"
        paths = ["Max", "Mean"]

      # Time the request waits in the request queue
      [[inputs.jolokia2_agent.metric]]
        name  = "request_waits_time_in_queue"
        mbean = "kafka.network:type=RequestMetrics,name=RequestQueueTimeMs,request=Produce"
        paths = ["Max", "Mean"]

      # Time the request is processed at the leader
      [[inputs.jolokia2_agent.metric]]
        name  = "request_time_at_leader"
        mbean = "kafka.network:type=RequestMetrics,name=LocalTimeMs,request=Produce"
        paths = ["Max", "Mean"]

      # Time the request waits for the follower
      [[inputs.jolokia2_agent.metric]]
        name  = "request_time_at_follower"
        mbean = "kafka.network:type=RequestMetrics,name=RemoteTimeMs,request=Produce"
        paths = ["Max", "Mean"]

      # Time the request waits in the response queue
      [[inputs.jolokia2_agent.metric]]
        name  = "request_time_at_response_queue"
        mbean = "kafka.network:type=RequestMetrics,name=ResponseQueueTimeMs,request=Produce"
        paths = ["Max", "Mean"]

      # Time to send the response
      [[inputs.jolokia2_agent.metric]]
        name  = "time_to_send_response"
        mbean = "kafka.network:type=RequestMetrics,name=ResponseSendTimeMs,request=Produce"
        paths = ["Max", "Mean"]

      # ---- Broker health ----
      # Average fraction of time the network processors are idle
      [[inputs.jolokia2_agent.metric]]
        name  = "average_time_of_network_idle"
        mbean = "kafka.network:type=SocketServer,name=NetworkProcessorAvgIdlePercent"

      # Network request queue size
      [[inputs.jolokia2_agent.metric]]
        name  = "request_channel_queue_size"
        mbean = "kafka.network:type=RequestChannel,name=RequestQueueSize"

      # Average fraction of time the request handler threads are idle
      [[inputs.jolokia2_agent.metric]]
        name  = "average_time_request_idle"
        mbean = "kafka.server:type=KafkaRequestHandlerPool,name=RequestHandlerAvgIdlePercent"
        paths = ["MeanRate", "OneMinuteRate"]

      # Log flush rate and time
      [[inputs.jolokia2_agent.metric]]
        name  = "log_flush_rate_time"
        mbean = "kafka.log:type=LogFlushStats,name=LogFlushRateAndTimeMs"
        paths = ["MeanRate", "OneMinuteRate", "Max", "Mean"]

      # ISR shrink rate
      [[inputs.jolokia2_agent.metric]]
        name  = "isr_shrink_rate"
        mbean = "kafka.server:type=ReplicaManager,name=IsrShrinksPerSec"
        paths = ["MeanRate", "OneMinuteRate"]

      # Number of under-replicated partitions (|ISR| < |all replicas|)
      [[inputs.jolokia2_agent.metric]]
        name  = "under_replicated_partitions"
        mbean = "kafka.server:type=ReplicaManager,name=UnderReplicatedPartitions"

      # Partition count for every broker
      [[inputs.jolokia2_agent.metric]]
        name  = "total_partitions_count"
        mbean = "kafka.server:type=ReplicaManager,name=PartitionCount"

      # Leader partition count for every broker
      [[inputs.jolokia2_agent.metric]]
        name  = "leader_partitions_count"
        mbean = "kafka.server:type=ReplicaManager,name=LeaderCount"

      # Unclean leader election rate
      [[inputs.jolokia2_agent.metric]]
        name  = "unclean_leader_elections_per_sec"
        mbean = "kafka.controller:type=ControllerStats,name=UncleanLeaderElectionsPerSec"

      # ---- Per-topic metrics (tagged with the `topic` key of the MBean) ----
      [[inputs.jolokia2_agent.metric]]
        name     = "topic_bytes_in"
        mbean    = "kafka.server:type=BrokerTopicMetrics,name=BytesInPerSec,topic=*"
        tag_keys = ["topic"]
        paths    = ["MeanRate", "OneMinuteRate"]

      [[inputs.jolokia2_agent.metric]]
        name     = "topic_bytes_out"
        mbean    = "kafka.server:type=BrokerTopicMetrics,name=BytesOutPerSec,topic=*"
        tag_keys = ["topic"]
        paths    = ["MeanRate", "OneMinuteRate"]

      [[inputs.jolokia2_agent.metric]]
        name     = "topic_bytes_rejected"
        mbean    = "kafka.server:type=BrokerTopicMetrics,name=BytesRejectedPerSec,topic=*"
        tag_keys = ["topic"]
        paths    = ["MeanRate", "OneMinuteRate"]

      [[inputs.jolokia2_agent.metric]]
        name     = "topic_failed_fetch_request"
        mbean    = "kafka.server:type=BrokerTopicMetrics,name=FailedFetchRequestsPerSec,topic=*"
        tag_keys = ["topic"]
        paths    = ["MeanRate", "OneMinuteRate"]

      [[inputs.jolokia2_agent.metric]]
        name     = "topic_failed_produce_request"
        mbean    = "kafka.server:type=BrokerTopicMetrics,name=FailedProduceRequestsPerSec,topic=*"
        tag_keys = ["topic"]
        paths    = ["MeanRate", "OneMinuteRate"]

      [[inputs.jolokia2_agent.metric]]
        name     = "topic_messages_in"
        mbean    = "kafka.server:type=BrokerTopicMetrics,name=MessagesInPerSec,topic=*"
        tag_keys = ["topic"]
        paths    = ["MeanRate", "OneMinuteRate"]

      [[inputs.jolokia2_agent.metric]]
        name     = "topic_total_fetch_request"
        mbean    = "kafka.server:type=BrokerTopicMetrics,name=TotalFetchRequestsPerSec,topic=*"
        tag_keys = ["topic"]
        paths    = ["MeanRate", "OneMinuteRate"]

      # Produce totals get their own measurement name so they are not merged
      # into the same series as the fetch totals.
      [[inputs.jolokia2_agent.metric]]
        name     = "topic_total_produce_request"
        mbean    = "kafka.server:type=BrokerTopicMetrics,name=TotalProduceRequestsPerSec,topic=*"
        tag_keys = ["topic"]
        paths    = ["MeanRate", "OneMinuteRate"]

      # ---- Consumer fetch request metrics (request=FetchConsumer) ----
      # Consumer fetch requests per second
      [[inputs.jolokia2_agent.metric]]
        name  = "consumer_request"
        mbean = "kafka.network:type=RequestMetrics,name=RequestsPerSec,request=FetchConsumer"

      # Total time spent on the request
      [[inputs.jolokia2_agent.metric]]
        name  = "consumer_request_total_time"
        mbean = "kafka.network:type=RequestMetrics,name=TotalTimeMs,request=FetchConsumer"
        paths = ["Max", "Mean"]

      # Time the request waits in the request queue
      [[inputs.jolokia2_agent.metric]]
        name  = "consumer_request_waits_time_in_queue"
        mbean = "kafka.network:type=RequestMetrics,name=RequestQueueTimeMs,request=FetchConsumer"
        paths = ["Max", "Mean"]

      # Time the request is processed at the leader
      [[inputs.jolokia2_agent.metric]]
        name  = "consumer_request_time_at_leader"
        mbean = "kafka.network:type=RequestMetrics,name=LocalTimeMs,request=FetchConsumer"
        paths = ["Max", "Mean"]

      # Time the request waits for the follower
      [[inputs.jolokia2_agent.metric]]
        name  = "consumer_request_time_at_follower"
        mbean = "kafka.network:type=RequestMetrics,name=RemoteTimeMs,request=FetchConsumer"
        paths = ["Max", "Mean"]

      # Time the request waits in the response queue
      [[inputs.jolokia2_agent.metric]]
        name  = "consumer_request_time_at_response_queue"
        mbean = "kafka.network:type=RequestMetrics,name=ResponseQueueTimeMs,request=FetchConsumer"
        paths = ["Max", "Mean"]

      # Time to send the response
      [[inputs.jolokia2_agent.metric]]
        name  = "consumer_time_to_send_response"
        mbean = "kafka.network:type=RequestMetrics,name=ResponseSendTimeMs,request=FetchConsumer"
        paths = ["Max", "Mean"]

配置svc和deployment文件

telegraf-svc.yaml

# Service in front of the Telegraf pods (selected by `app: telegraf`).
# The port is named `http`; the ServiceMonitor refers to it by that name.
apiVersion: v1
kind: Service
metadata:
  name: telegraf-service
  namespace: prom-ha
  labels:
    app: telegraf
spec:
  ports:
    - name: http
      targetPort: 9123
      port: 9123
  selector:
    app: telegraf

telegraf.yaml

# Single-replica Telegraf Deployment. The main config and the Kafka drop-in
# are each mounted as one file (via subPath) from their own ConfigMap.
# NOTE(review): no metadata.namespace is set here, while the Service that
# selects these pods is declared in `prom-ha` — presumably this manifest is
# applied with `-n prom-ha`; confirm.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: telegraf
  name: telegraf
spec:
  minReadySeconds: 5
  replicas: 1
  selector:
    matchLabels:
      app: telegraf
  template:
    metadata:
      name: telegraf
      labels:
        app: telegraf
    spec:
      # Container processes run as uid 0 with fsGroup 0.
      securityContext:
        fsGroup: 0
        runAsUser: 0
      volumes:
        - name: telegraf-config
          configMap:
            name: telegraf-deployment
            items:
              - path: telegraf.conf
                key: telegraf.conf
        - name: kafka-config
          configMap:
            name: kafka-deployment
            items:
              - path: kafka.conf
                key: kafka.conf
      containers:
        - name: telegraf
          image: telegraf-jq:1.25.3
          command:
            - /usr/bin/telegraf
          # Load the main config file plus every file in telegraf.d.
          args:
            - --config
            - /etc/telegraf/telegraf.conf
            - --config-directory
            - /etc/telegraf/telegraf.d
          resources:
            limits:
              cpu: "1"
              memory: "512Mi"
            requests:
              cpu: "256m"
              memory: "256Mi"
          volumeMounts:
            - name: telegraf-config
              mountPath: /etc/telegraf/telegraf.conf
              subPath: telegraf.conf
              readOnly: false
            - name: kafka-config
              mountPath: /etc/telegraf/telegraf.d/kafka.conf
              subPath: kafka.conf
              readOnly: false

配置servicemonitor

telegraf-service-monitor.yaml

# ServiceMonitor that has the Prometheus Operator scrape the `http` port of
# Services labelled `app: telegraf` in the `prom-ha` namespace.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app: telegraf
  name: telegraf
  namespace: prom-ha
spec:
  endpoints:
    - honorLabels: true
      interval: 1m
      path: /metrics
      port: http
      scheme: http
      params:
        target:
          - 'telegraf-service.prom-ha:9123'
      relabelings:
        # Prometheus exposes the URL parameter `target` as the label
        # `__param_target`; the source label must use that exact name,
        # otherwise `instances` is never populated.
        - sourceLabels: [__param_target]
          targetLabel: instances
  namespaceSelector:
    matchNames:
      - prom-ha
  selector:
    matchLabels:
      app: telegraf