TKE K8s 巡检脚本
脚本摘要(已截断,完整脚本见下文):
#!/bin/bash
# k8s集群日常巡检
# 异常数变量
unhealthy=$(kubectl get cs | awk 'NR == 1 {next}{if($2 != "Healthy") print $1}' | wc -l) # 组件状态异常数
kubeletError=$(systemctl status kubelet.service | grep ^"$(date | awk '{print …
·
ansible巡检报告 - 未搞定
https://www.cnblogs.com/lixingli/p/12302809.html
ansible巡检集群记录
ansible prod_rbac -m shell -a " ansible node -m shell -a 'df -hT | grep vda'"
ansible prod_rbac -m shell -a "/root/code/xunjian.sh"
ansible rbac -m shell -a "ke diag | head -10" # kubeeye安装文档 https://www.kubernetes.org.cn/8955.html
#!/bin/bash
# Daily Kubernetes cluster inspection.
#
# This section gathers three counters and then reports on the control-plane
# components and on the local kubelet:
#   unhealthy    - components that `kubectl get cs` lists with a status other
#                  than "Healthy"
#   kubeletError - today's kubelet log lines shown by `systemctl status` that
#                  mention "error" (case-insensitive)
#   errorPod     - pods whose STATUS column is not "Running"; consumed by the
#                  pod status check further down the script

# Day stamp used to pick out today's log lines. An explicit format is used
# because plain `date` pads single-digit days with a space, which awk then
# collapses to "Jan 5" and never matches a zero-padded "Jan 05" prefix.
# NOTE(review): assumes the log prefix is journald's default short format
# ("Mon DD hh:mm:ss ...") — confirm on hosts with a custom log format.
today=$(date '+%b %d')

# Names of components not reporting "Healthy" (header row skipped), one per
# line. Captured once so the count and the listing below always agree.
unhealthyNames=$(kubectl get cs | awk 'NR > 1 && $2 != "Healthy" {print $1}')
unhealthy=$(printf '%s' "${unhealthyNames}" | grep -c '')

# Today's kubelet log lines, captured once for both the count and the report.
kubeletToday=$(systemctl status kubelet.service | grep "^${today}")
kubeletError=$(printf '%s\n' "${kubeletToday}" | grep -ci error)

# Number of pods (all namespaces, header excluded) not in the "Running" state.
errorPod=$(kubectl get pods --all-namespaces | grep -v NAMESPACE | awk '$4 != "Running"' | wc -l)

# Report controller-manager, scheduler and etcd status.
echo -e "----------Controller-manager、Scheduler、Etcd-0检测中--------------------------------------"
if (( unhealthy >= 1 )); then
  echo -e "\033[31m${unhealthyNames} Unhealthy\033[0m"
else
  echo -e "\033[32mcontroller-manager、scheduler、etcd-0无异常\033[0m"
fi

# Report kubelet errors logged today.
echo -e "\n \n---------- Kubelet状态检测中 ----------"
if (( kubeletError >= 1 )); then
  echo -e "\033[31mkubelet错误日志:\033[0m"
  # Drop the first nine whitespace-separated fields (presumably the syslog and
  # klog prefixes), keep only messages mentioning "error", then de-duplicate.
  printf '%s\n' "${kubeletToday}" \
    | awk '{for (i = 10; i <= NF; i++) printf("%s ", $i); print ""}' \
    | grep -i error | sort -n | uniq
else
  echo -e "\033[32mkubelet无日志报错\033[0m"
fi
# Pod status check: list every pod (all namespaces) whose STATUS column is not
# "Running". Relies on errorPod, the count of such pods computed earlier in
# the script.
printf '\n \n---------- Pods运行状态检测中 ----------\n'
if (( errorPod < 1 )); then
  printf '\033[32mPods无异常\033[0m\n'
else
  printf '\033[31mErrorPod:\033[0m\n'
  # Skip the header line and print the full row of each non-Running pod.
  kubectl get pods --all-namespaces | awk '!/NAMESPACE/ && $4 != "Running"'
fi
# Node resource check.
#
# For every node returned by `kubectl get nodes` this reads:
#   memRq / memLim    - memory requests / limits percentages from the "memory"
#                       row of `kubectl describe node`
#   cpuUsed / memUsed - CPU% / MEMORY% columns of `kubectl top nodes`
# and prints a red warning line when CPU usage > 60%, memory usage > 80%, or
# memory requests > 95%. memWarn counts the warnings; a green all-clear line is
# printed when it stays at 0.
echo -e "\n \n---------- Nodes资源使用状态检测中 ----------"
memWarn=0
mapfile -t nodeNames < <(kubectl get nodes | awk 'NR > 1 {print $1}')
for i in "${nodeNames[@]}"; do
  memRq='' memLim='' cpuUsed='' memUsed=''

  # One `describe` call per node; only the first row whose first field is
  # exactly "memory" and that carries percentages is used, so a stray match
  # elsewhere in the output cannot produce a multi-line value.
  read -r memRq memLim < <(
    kubectl describe node "$i" \
      | awk '$1 == "memory" && /%/ && !seen++ {
          gsub(/[^0-9]/, "", $3); gsub(/[^0-9]/, "", $5)
          print $3 + 0, $5 + 0
        }'
  )

  # One `top` call per node; the first data row holds CPU% ($3) and MEMORY% ($5).
  read -r cpuUsed memUsed < <(
    kubectl top nodes "$i" \
      | awk 'NR == 2 {
          gsub(/[^0-9]/, "", $3); gsub(/[^0-9]/, "", $5)
          print $3 + 0, $5 + 0
        }'
  )

  # When a command produced nothing (e.g. metrics unavailable) fall back to 0
  # so the arithmetic tests below stay syntactically valid.
  memRq=${memRq:-0}
  memLim=${memLim:-0}
  cpuUsed=${cpuUsed:-0}
  memUsed=${memUsed:-0}

  if (( cpuUsed > 60 || memUsed > 80 )); then
    memWarn=$(( memWarn + 1 ))
    echo -e "\033[31m$i\tCPU使用率:$cpuUsed%\t内存使用率:$memUsed%\033[0m"
  fi
  if (( memRq > 95 )); then
    memWarn=$(( memWarn + 1 ))
    echo -e "\033[31m$i\tMem_Requests:$memRq%\tMem_Limits:$memLim%\033[0m"
  fi
done
if (( memWarn == 0 )); then
  echo -e "\033[32m无节点CPU、内存使用异常\033[0m"
fi
# Pod restart check.
#
# Writes "<pod name> <restart count>" for every row of
# `kubectl get pods --all-namespaces` whose fifth column compares greater than
# 0 to /opt/podsnew.txt, diffs it against the snapshot from the previous run
# (/opt/podsold.txt), reports any difference, and finally promotes the new
# snapshot to be the baseline for the next run.
echo -e "\n \n---------- Pods自动重启检测中 ----------"
kubectl get pods --all-namespaces | awk '$5 > 0 {print $2, $5}' >/opt/podsnew.txt

# First run: there is no previous snapshot yet, so seed it from the current one
# instead of letting diff fail on a missing file.
[[ -f /opt/podsold.txt ]] || cp /opt/podsnew.txt /opt/podsold.txt

# Run diff once so the count and the printed report come from the same data.
restartDiff=$(diff /opt/podsold.txt /opt/podsnew.txt)
rebootNum=$(printf '%s' "${restartDiff}" | grep -c '')
if (( rebootNum > 1 )); then
  echo -e "\033[31m有以下pod重启:\033[0m"
  printf '%s\n' "${restartDiff}"
else
  echo -e "\033[32m无自动重启pod\033[0m"
fi

# mv -f replaces the old snapshot in place; no separate rm is needed.
mv -f /opt/podsnew.txt /opt/podsold.txt
更多推荐
已为社区贡献59条内容
所有评论(0)