ansible巡检报告 - 未搞定

https://www.cnblogs.com/lixingli/p/12302809.html

ansible巡检集群记录


# On every host in the `prod_rbac` inventory group, run a nested ansible call
# that executes `df -hT | grep vda` on the `node` group.
ansible prod_rbac -m shell -a " ansible node -m shell -a 'df -hT | grep vda'"
# Run /root/code/xunjian.sh on the `prod_rbac` hosts
# (presumably the inspection script pasted below - confirm).
ansible prod_rbac -m shell -a "/root/code/xunjian.sh"
# Show the first 10 lines of `ke diag` output on the `rbac` hosts
# (`ke` is presumably the KubeEye CLI - see the linked guide).
ansible rbac -m shell -a "ke diag | head -10"  # KubeEye installation guide: https://www.kubernetes.org.cn/8955.html
#!/bin/bash
# Daily health inspection for a Kubernetes cluster.
#
# This section gathers the anomaly counters used by the checks in this script
# and reports on control-plane component status and kubelet log errors.
# Requires kubectl (with cluster access) and systemctl on the local host.

# --- Anomaly counters --------------------------------------------------------

# Names of components whose STATUS column is not "Healthy" (header row skipped).
# Captured once so the count and the report always describe the same snapshot.
# NOTE: `kubectl get cs` (ComponentStatus) is deprecated since Kubernetes v1.19.
unhealthyNames=$(kubectl get cs | awk 'NR > 1 && $2 != "Healthy" {print $1}')
# Number of unhealthy components.
unhealthy=$(printf '%s' "${unhealthyNames}" | grep -c '')

# Journal lines shown by `systemctl status` start with "<Mon> <DD>" where the
# day is zero-padded ("Jan 05"). Plain `date` space-pads the day and awk then
# collapses it to "Jan 5", which never matches on days 1-9, so build the
# prefix explicitly with %d.
kubeletDay=$(date '+%b %d')
# Today's kubelet journal lines, captured once and reused for count and report.
# NOTE: `systemctl status` only shows the most recent journal lines.
kubeletToday=$(systemctl status kubelet.service | grep -- "^${kubeletDay}")
# Number of today's kubelet log lines that mention "error" (case-insensitive).
kubeletError=$(printf '%s\n' "${kubeletToday}" | grep -ic error)

# Number of pods whose STATUS column is not "Running".
errorPod=$(kubectl get pods --all-namespaces | grep -v NAMESPACE | awk '$4 != "Running"' | wc -l)

# --- controller-manager / scheduler / etcd status ----------------------------
echo -e "----------Controller-manager、Scheduler、Etcd-0检测中--------------------------------------"
if (( unhealthy >= 1 )); then
  echo -e "\033[31m${unhealthyNames} Unhealthy\033[0m"
else
  echo -e "\033[32mcontroller-manager、scheduler、etcd-0无异常\033[0m"
fi

# --- kubelet status ----------------------------------------------------------
echo -e "\n \n----------       Kubelet状态检测中       ----------"
if (( kubeletError >= 1 )); then
  echo -e "\033[31mkubelet错误日志:\033[0m"
  # Keep only fields 10..NF of each line (dropping the leading timestamp/host
  # columns) so identical messages collapse under uniq.
  printf '%s\n' "${kubeletToday}" \
    | awk '{for (i=10;i<=NF;i++)printf("%s ", $i);print ""}' \
    | grep -i error | sort -n | uniq
else
  echo -e "\033[32mkubelet无日志报错\033[0m"
fi

# Report pods that are not in the Running state.
# Relies on errorPod, the non-Running pod count computed earlier in the script.
echo -e "\n \n----------      Pods运行状态检测中       ----------"
if (( ${errorPod} >=1 )); then
  # Red heading first, then every pod row whose STATUS column is not "Running".
  echo -e "\033[31mErrorPod:\033[0m"
  kubectl get pods --all-namespaces \
    | grep -v NAMESPACE \
    | awk '$4 != "Running"'
else
  echo -e "\033[32mPods无异常\033[0m"
fi

# Check per-node resource usage: live CPU/memory from `kubectl top` and the
# memory requests/limits percentages from `kubectl describe node`.
echo -e "\n \n----------    Nodes资源使用状态检测中    ----------"

# Reduce a kubectl column such as "(37%)" or "12%" to a plain base-10 integer.
# Prints nothing when the value holds no digits (e.g. "<unknown>" or an empty
# string), so callers can tell "no data" apart from a real 0.
# Arguments: $1 - raw column text
pctDigits() {
  local digits=${1//[^0-9]/}
  if [[ -n ${digits} ]]; then
    # 10# keeps values with a leading zero from being parsed as octal.
    printf '%d\n' "$((10#${digits}))"
  fi
}

memWarn=0
for node in $(kubectl get nodes | awk 'NR > 1 {print $1}'); do
  # One describe and one top call per node; each output feeds two values.
  memLine=$(kubectl describe node "${node}" | grep memory | grep % | head -n 1)
  topLine=$(kubectl top nodes "${node}" | awk 'NR == 2')

  # describe row: memory <requests> (<req%>) <limits> (<lim%>)
  read -r _ _ memRq _ memLim _ <<<"${memLine}"
  # top row: NAME CPU(cores) CPU% MEMORY(bytes) MEMORY%
  read -r _ _ cpuUsed _ memUsed _ <<<"${topLine}"

  memRq=$(pctDigits "${memRq}")       # memory requests (%)
  memLim=$(pctDigits "${memLim}")     # memory limits (%)
  cpuUsed=$(pctDigits "${cpuUsed}")   # CPU usage (%)
  memUsed=$(pctDigits "${memUsed}")   # memory usage (%)

  # Missing metrics used to leave empty operands inside (( )), which is a
  # syntax error; report the gap instead and count it as a warning so the
  # final "no anomalies" line is not printed for a node we could not inspect.
  if [[ -z ${cpuUsed} || -z ${memUsed} ]]; then
    (( memWarn += 1 ))
    echo -e "\033[31m${node}\tCPU/内存使用率获取失败\033[0m"
  elif (( cpuUsed > 60 || memUsed > 80 )); then
    (( memWarn += 1 ))
    echo -e "\033[31m${node}\tCPU使用率:${cpuUsed}%\t内存使用率:${memUsed}%\033[0m"
  fi

  if [[ -z ${memRq} ]]; then
    (( memWarn += 1 ))
    echo -e "\033[31m${node}\tMem_Requests获取失败\033[0m"
  elif (( memRq > 95 )); then
    (( memWarn += 1 ))
    echo -e "\033[31m${node}\tMem_Requests:${memRq}%\tMem_Limits:${memLim}%\033[0m"
  fi
done
if (( memWarn == 0 )); then
  echo -e "\033[32m无节点CPU、内存使用异常\033[0m"
fi

# Detect pods whose restart count changed since the previous run by diffing a
# fresh "NAME RESTARTS" snapshot against the baseline kept in /opt.
echo -e "\n \n----------      Pods自动重启检测中      ----------"

# Reduce `kubectl get pods --all-namespaces` output (stdin) to "NAME RESTARTS"
# pairs for rows whose RESTARTS column is greater than 0. The header row is
# kept too: "RESTARTS" is not numeric, so awk compares it as a string and it
# sorts above "0". Keeping it matches baselines written by earlier runs.
restartSnapshot() {
  awk '$5 > 0 {print $2, $5}'
}

podsOld=/opt/podsold.txt   # baseline from the previous run
podsNew=/opt/podsnew.txt   # snapshot of the current run
rebootNum=0

if ! podList=$(kubectl get pods --all-namespaces); then
  # An empty snapshot would flag every baseline pod as changed and then
  # overwrite the baseline, so leave the files alone when kubectl fails.
  echo -e "\033[31mkubectl get pods 执行失败,跳过重启检测\033[0m"
else
  printf '%s\n' "${podList}" | restartSnapshot >"${podsNew}"

  # First run: no baseline yet, so there is nothing to compare against.
  podDiff=""
  if [[ -f ${podsOld} ]]; then
    podDiff=$(diff "${podsOld}" "${podsNew}")
  fi

  # Any real difference produces at least two lines of diff output.
  rebootNum=$(printf '%s' "${podDiff}" | grep -c '')
  if (( rebootNum > 1 )); then
    echo -e "\033[31m有以下pod重启:\033[0m"
    printf '%s\n' "${podDiff}"
  else
    echo -e "\033[32m无自动重启pod\033[0m"
  fi

  # The current snapshot becomes the baseline for the next run.
  mv -f "${podsNew}" "${podsOld}"
fi
Logo

K8S/Kubernetes社区为您提供最前沿的新闻资讯和知识内容

更多推荐