k8s配置telegraf自定义监控
【代码】k8s配置telegraf自定义监控。
·
以下内容演示如何在 Kubernetes 中使用 Telegraf 监控 Kafka 集群:通过 Jolokia 采集 Kafka 的 JMX 指标,并以 Prometheus 格式对外暴露。
配置configmap
configmap.yaml
---
# ConfigMap holding the main Telegraf agent configuration.
# The single key "telegraf.conf" is TOML stored verbatim in a literal block
# scalar; the "|+" header keeps any trailing newlines.
# NOTE(review): no namespace is set here, so the object lands in whatever
# namespace is active at apply time — confirm it matches the Deployment.
kind: ConfigMap
apiVersion: v1
metadata:
  name: telegraf-deployment
data:
  # Agent settings: 10s collection/flush interval, debug logging written to
  # a log file. Metrics are exposed through the prometheus_client output on
  # port 9123; the only input configured in this file is the cpu plugin.
  telegraf.conf: |+
    [global_tags]
    dc = "bigdata"
    [agent]
    interval = "10s"
    round_interval = true
    metric_batch_size = 1000
    metric_buffer_limit = 10000
    collection_jitter = "0s"
    flush_interval = "10s"
    flush_jitter = "0s"
    precision = ""
    debug = true
    quiet = false
    logfile = "/var/log/telegraf/telegraf.log" #日志目录
    hostname = ""
    omit_hostname = false
    log_with_timezone = "Asia/Shanghai"
    [[outputs.prometheus_client]]
    listen = ":9123"
    [[inputs.cpu]]
    percpu = true
    totalcpu = true
    collect_cpu_time = false
    report_active = false
telegraf-kafka.yaml
---
# ConfigMap holding the Kafka-specific Telegraf input configuration.
# The key "kafka.conf" is TOML for the jolokia2_agent input: it polls the
# Jolokia endpoints of three Kafka brokers and prefixes every measurement
# with "kafka_". Each [[inputs.jolokia2_agent.metric]] table maps one JMX
# MBean to a measurement name; names must be unique per MBean so that series
# from different MBeans do not end up under the same measurement.
# NOTE(review): no namespace is set here, so the object lands in whatever
# namespace is active at apply time — confirm it matches the Deployment.
apiVersion: v1
kind: ConfigMap
metadata:
  name: kafka-deployment
data:
  kafka.conf: |+
    [[inputs.jolokia2_agent]]
    name_prefix = "kafka_"
    urls = ["http://kafka-service-1.bigdata:8779/jolokia", "http://kafka-service-2.bigdata:8779/jolokia","http://kafka-service-3.bigdata:8779/jolokia"]
    [[inputs.jolokia2_agent.metric]]
    name = "jvm_memory"
    mbean = "java.lang:type=Memory"
    [[inputs.jolokia2_agent.metric]]
    name = "old_gc_time"
    mbean = "java.lang:type=GarbageCollector,name=G1 Old Generation"
    paths = ["CollectionCount","CollectionTime"]
    #Byte in rate from clients
    [[inputs.jolokia2_agent.metric]]
    name = "clients_byte_in_rate"
    mbean = "kafka.server:type=BrokerTopicMetrics,name=BytesInPerSec"
    paths = ["MeanRate","OneMinuteRate"]
    #Byte out rate to clients
    [[inputs.jolokia2_agent.metric]]
    name = "clients_byte_out_rate"
    mbean = "kafka.server:type=BrokerTopicMetrics,name=BytesOutPerSec"
    paths = ["MeanRate","OneMinuteRate"]
    #Message in rate
    [[inputs.jolokia2_agent.metric]]
    name = "message_in_rate"
    mbean = "kafka.server:type=BrokerTopicMetrics,name=MessagesInPerSec"
    paths = ["MeanRate","OneMinuteRate"]
    #producer consumer follower request in per sec
    [[inputs.jolokia2_agent.metric]]
    name = "producer_consumer_follower_request_per_sec"
    mbean = "kafka.network:type=RequestMetrics,name=RequestsPerSec,request=Produce"
    #Request total time
    [[inputs.jolokia2_agent.metric]]
    name = "request_total_time"
    mbean = "kafka.network:type=RequestMetrics,name=TotalTimeMs,request=Produce"
    paths = ["Max","Mean"]
    #The time the request waits in the request queue
    [[inputs.jolokia2_agent.metric]]
    name = "request_waits_time_in_queue"
    mbean = "kafka.network:type=RequestMetrics,name=RequestQueueTimeMs,request=Produce"
    paths = ["Max","Mean"]
    #Time the request is processed at the leader
    [[inputs.jolokia2_agent.metric]]
    name = "request_time_at_leader"
    mbean = "kafka.network:type=RequestMetrics,name=LocalTimeMs,request=Produce"
    paths = ["Max","Mean"]
    #Time the request waits for the follower
    [[inputs.jolokia2_agent.metric]]
    name = "request_time_at_follower"
    mbean = "kafka.network:type=RequestMetrics,name=RemoteTimeMs,request=Produce"
    paths = ["Max","Mean"]
    #Time the request waits in the response queue
    [[inputs.jolokia2_agent.metric]]
    name = "request_time_at_response_queue"
    mbean = "kafka.network:type=RequestMetrics,name=ResponseQueueTimeMs,request=Produce"
    paths = ["Max","Mean"]
    #Time to send the response
    [[inputs.jolokia2_agent.metric]]
    name = "time_to_send_response"
    mbean = "kafka.network:type=RequestMetrics,name=ResponseSendTimeMs,request=Produce"
    paths = ["Max","Mean"]
    #The average fraction of time the network processors are idle
    [[inputs.jolokia2_agent.metric]]
    name = "average_time_of_network_idle"
    mbean = "kafka.network:type=SocketServer,name=NetworkProcessorAvgIdlePercent"
    #Network request queue
    [[inputs.jolokia2_agent.metric]]
    name = "request_channel_queue_size"
    mbean = "kafka.network:type=RequestChannel,name=RequestQueueSize"
    #The average fraction of time the request handler threads are idle
    [[inputs.jolokia2_agent.metric]]
    name = "average_time_request_idle"
    mbean = "kafka.server:type=KafkaRequestHandlerPool,name=RequestHandlerAvgIdlePercent"
    paths = ["MeanRate","OneMinuteRate"]
    #Log flush rate and time
    [[inputs.jolokia2_agent.metric]]
    name = "log_flush_rate_time"
    mbean = "kafka.log:type=LogFlushStats,name=LogFlushRateAndTimeMs"
    paths = ["MeanRate","OneMinuteRate","Max","Mean"]
    #ISR shrink rate
    [[inputs.jolokia2_agent.metric]]
    name = "isr_shrink_rate"
    mbean = "kafka.server:type=ReplicaManager,name=IsrShrinksPerSec"
    paths = ["MeanRate","OneMinuteRate"]
    #of under replicated partitions (|ISR| < |all replicas|)
    [[inputs.jolokia2_agent.metric]]
    name = "under_replicated_partitions"
    mbean = "kafka.server:type=ReplicaManager,name=UnderReplicatedPartitions"
    #Partitions Count for every Broker
    [[inputs.jolokia2_agent.metric]]
    name = "total_partitions_count"
    mbean = "kafka.server:type=ReplicaManager,name=PartitionCount"
    #LeaderPartitions Count for every Broker
    [[inputs.jolokia2_agent.metric]]
    name = "leader_partitions_count"
    mbean = "kafka.server:type=ReplicaManager,name=LeaderCount"
    #Unclean Leader Elections rate
    [[inputs.jolokia2_agent.metric]]
    name = "unclean_leader_elections_per_sec"
    mbean = "kafka.controller:type=ControllerStats,name=UncleanLeaderElectionsPerSec"
    #topic
    [[inputs.jolokia2_agent.metric]]
    name = "topic_bytes_in"
    mbean = "kafka.server:type=BrokerTopicMetrics,name=BytesInPerSec,topic=*"
    tag_keys = ["topic"]
    paths = ["MeanRate","OneMinuteRate"]
    [[inputs.jolokia2_agent.metric]]
    name = "topic_bytes_out"
    mbean = "kafka.server:type=BrokerTopicMetrics,name=BytesOutPerSec,topic=*"
    tag_keys = ["topic"]
    paths = ["MeanRate","OneMinuteRate"]
    [[inputs.jolokia2_agent.metric]]
    name = "topic_bytes_rejected"
    mbean = "kafka.server:type=BrokerTopicMetrics,name=BytesRejectedPerSec,topic=*"
    tag_keys = ["topic"]
    paths = ["MeanRate","OneMinuteRate"]
    [[inputs.jolokia2_agent.metric]]
    name = "topic_failed_fetch_request"
    mbean = "kafka.server:type=BrokerTopicMetrics,name=FailedFetchRequestsPerSec,topic=*"
    tag_keys = ["topic"]
    paths = ["MeanRate","OneMinuteRate"]
    [[inputs.jolokia2_agent.metric]]
    name = "topic_failed_produce_request"
    mbean = "kafka.server:type=BrokerTopicMetrics,name=FailedProduceRequestsPerSec,topic=*"
    tag_keys = ["topic"]
    paths = ["MeanRate","OneMinuteRate"]
    [[inputs.jolokia2_agent.metric]]
    name = "topic_messages_in"
    mbean = "kafka.server:type=BrokerTopicMetrics,name=MessagesInPerSec,topic=*"
    tag_keys = ["topic"]
    paths = ["MeanRate","OneMinuteRate"]
    [[inputs.jolokia2_agent.metric]]
    name = "topic_total_fetch_request"
    mbean = "kafka.server:type=BrokerTopicMetrics,name=TotalFetchRequestsPerSec,topic=*"
    tag_keys = ["topic"]
    paths = ["MeanRate","OneMinuteRate"]
    [[inputs.jolokia2_agent.metric]]
    name = "topic_total_produce_request"
    mbean = "kafka.server:type=BrokerTopicMetrics,name=TotalProduceRequestsPerSec,topic=*"
    tag_keys = ["topic"]
    paths = ["MeanRate","OneMinuteRate"]
    #consumer request in per sec
    [[inputs.jolokia2_agent.metric]]
    name = "consumer_request"
    mbean = "kafka.network:type=RequestMetrics,name=RequestsPerSec,request=FetchConsumer"
    #Request total time
    [[inputs.jolokia2_agent.metric]]
    name = "consumer_request_total_time"
    mbean = "kafka.network:type=RequestMetrics,name=TotalTimeMs,request=FetchConsumer"
    paths = ["Max","Mean"]
    #The time the request waits in the request queue
    [[inputs.jolokia2_agent.metric]]
    name = "consumer_request_waits_time_in_queue"
    mbean = "kafka.network:type=RequestMetrics,name=RequestQueueTimeMs,request=FetchConsumer"
    paths = ["Max","Mean"]
    #Time the request is processed at the leader3
    [[inputs.jolokia2_agent.metric]]
    name = "consumer_request_time_at_leader"
    mbean = "kafka.network:type=RequestMetrics,name=LocalTimeMs,request=FetchConsumer"
    paths = ["Max","Mean"]
    #Time the request waits for the follower
    [[inputs.jolokia2_agent.metric]]
    name = "consumer_request_time_at_follower"
    mbean = "kafka.network:type=RequestMetrics,name=RemoteTimeMs,request=FetchConsumer"
    paths = ["Max","Mean"]
    #Time the request waits in the response queue
    [[inputs.jolokia2_agent.metric]]
    name = "consumer_request_time_at_response_queue"
    mbean = "kafka.network:type=RequestMetrics,name=ResponseQueueTimeMs,request=FetchConsumer"
    paths = ["Max","Mean"]
    #Time to send the response
    [[inputs.jolokia2_agent.metric]]
    name = "consumer_time_to_send_response"
    mbean = "kafka.network:type=RequestMetrics,name=ResponseSendTimeMs,request=FetchConsumer"
    paths = ["Max","Mean"]
配置 Service 和 Deployment 文件
telegraf-svc.yaml
---
# Service exposing the Telegraf pods' Prometheus endpoint inside the cluster.
# Pods are selected by the label app=telegraf; the port is named "http" so
# it can be referenced by name.
kind: Service
apiVersion: v1
metadata:
  name: telegraf-service
  namespace: prom-ha
  labels:
    app: telegraf
spec:
  ports:
    # 9123 is the port the prometheus_client output in telegraf.conf listens on.
    - name: http
      port: 9123
      targetPort: 9123
  selector:
    app: telegraf
telegraf.yaml
---
# Deployment running a single Telegraf replica.
# Two ConfigMaps are mounted as individual files via subPath:
#   telegraf-deployment -> /etc/telegraf/telegraf.conf
#   kafka-deployment    -> /etc/telegraf/telegraf.d/kafka.conf
# NOTE(review): no namespace is set here, while the Service "telegraf-service"
# is declared in prom-ha — presumably this is applied with "-n prom-ha";
# confirm, since a Service only selects pods in its own namespace.
kind: Deployment
apiVersion: apps/v1
metadata:
  name: telegraf
  labels:
    app: telegraf
spec:
  replicas: 1
  minReadySeconds: 5
  selector:
    matchLabels:
      app: telegraf
  template:
    metadata:
      name: telegraf
      labels:
        app: telegraf
    spec:
      # The container runs as root (uid 0, fsGroup 0).
      securityContext:
        runAsUser: 0
        fsGroup: 0
      volumes:
        - name: telegraf-config
          configMap:
            name: telegraf-deployment
            items:
              - key: telegraf.conf
                path: telegraf.conf
        - name: kafka-config
          configMap:
            name: kafka-deployment
            items:
              - key: kafka.conf
                path: kafka.conf
      containers:
        - name: telegraf
          image: telegraf-jq:1.25.3
          # Load the main config file plus every file in the telegraf.d directory.
          command:
            - /usr/bin/telegraf
          args:
            - --config
            - /etc/telegraf/telegraf.conf
            - --config-directory
            - /etc/telegraf/telegraf.d
          resources:
            requests:
              cpu: "256m"
              memory: "256Mi"
            limits:
              cpu: "1"
              memory: "512Mi"
          volumeMounts:
            # subPath mounts place a single file at the target path instead
            # of replacing the whole directory.
            - name: telegraf-config
              mountPath: /etc/telegraf/telegraf.conf
              subPath: telegraf.conf
              readOnly: false
            - name: kafka-config
              mountPath: /etc/telegraf/telegraf.d/kafka.conf
              subPath: kafka.conf
              readOnly: false
配置servicemonitor
telegraf-service-monitor.yaml
---
# ServiceMonitor (Prometheus Operator CRD) that scrapes the Telegraf Service.
# It selects Services labelled app=telegraf in the prom-ha namespace and
# scrapes the port named "http" at /metrics once per minute.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: telegraf
  namespace: prom-ha
  labels:
    app: telegraf
spec:
  namespaceSelector:
    matchNames:
      - prom-ha
  selector:
    matchLabels:
      app: telegraf
  endpoints:
    - port: http
      scheme: http
      path: /metrics
      interval: 1m
      # Keep labels already present on the scraped metrics when they
      # conflict with target labels.
      honorLabels: true
      params:
        target:
          - 'telegraf-service.prom-ha:9123'
      relabelings:
        # Prometheus exposes each URL parameter as the label __param_<name>.
        # The parameter declared above is "target", so the source label must
        # be __param_target; "__param_targets" never exists and would leave
        # the "instances" label unset.
        - sourceLabels: [__param_target]
          targetLabel: instances
更多推荐
已为社区贡献1条内容
所有评论(0)