运维巡检shell脚本
摘要:本脚本依次执行系统基础检查(磁盘、内存、CPU负载)、服务状态检查、Prometheus指标检查和安全基线检查,并把带时间戳的结果同时输出到终端和 /var/log/inspection_<时间>.log 报告文件。
shell脚本
系统环境:Linux 3.10.0-1160.el7.x86_64
shell脚本内容:
#!/bin/bash
# System inspection script v2.0
#
# Strict mode: stop on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# On INT/TERM, record the interruption through log() and exit with 130.
trap 'log error "脚本被中断"; exit 130' INT TERM

# Constants: directory that receives the report, and the per-run report path
# (timestamped to the minute).
readonly log_dir="/var/log"
readonly report_file="${log_dir}/inspection_$(date +%Y%m%d%H%M).log"

# Mutable state: 0 until a check reports a failure.
error_flag=0

# Pre-flight check: the report is written under log_dir, so refuse to start
# when that directory is not writable.
if [[ ! -w $log_dir ]]; then
  echo -e "\033[31m错误: 无${log_dir}写入权限\033[0m" >&2
  exit 1
fi
#######################################
# Logging helper: print one colourised, timestamped line to stdout and append
# the same line to the report file.
# Globals:   report_file (read) - file the line is appended to
# Arguments: $1 - level: error | warn | info (default: info); any other value
#                 is printed with the reset sequence, i.e. without a colour
#            $2 - message text (default: empty)
# Outputs:   "<colour>[YYYY-MM-DD HH:MM:SS][level] message<reset>" on stdout
# Returns:   status of the printf | tee pipeline
#######################################
log() {
  local level="${1:-info}"
  # Default to an empty message so a one-argument call does not trip `set -u`.
  local message="${2-}"
  # Declared separately so `local` does not mask the exit status of date.
  local timestamp
  timestamp=$(date +"%Y-%m-%d %H:%M:%S")
  local reset=$'\033[0m'
  local color_code="$reset"
  case "$level" in
    error) color_code=$'\033[31m' ;;
    warn) color_code=$'\033[33m' ;;
    info) color_code=$'\033[36m' ;;
  esac
  # printf '%s' emits the message verbatim; `echo -e` would expand backslash
  # sequences (\n, \t, \c ...) that happen to occur in logged command output.
  printf '%s[%s][%s] %s%s\n' \
    "$color_code" "$timestamp" "$level" "$message" "$reset" \
    | tee -a "$report_file"
}
#######################################
# Failure recorder: log a detected problem and count it.
#
# Callers invoke this only after they have already decided that something is
# wrong, both from `then` branches (where $? is 0) and from `else` branches
# (where $? is the failing status). The failure is therefore always recorded;
# the preceding exit status is only used to enrich the message.
#
# Globals:   error_flag (incremented by one per call)
# Arguments: $1 - description of what failed (default: 未指定操作)
# Outputs:   one error line through log()
# Returns:   0 always; a non-zero return would abort the whole script under
#            `set -e` and skip the remaining checks
#######################################
check_failure() {
  # Must stay the first statement: it captures the status of whatever ran
  # immediately before this function was called.
  local exit_code=$?
  local context="${1:-未指定操作}"

  if (( exit_code != 0 )); then
    log error "${context} (退出码: ${exit_code})"
  else
    log error "${context}"
  fi

  # Count failures so the final summary can report how many were found.
  error_flag=$(( ${error_flag:-0} + 1 ))
  return 0
}
#######################################
# Basic system checks: disk usage, memory usage and CPU load.
# Every finding is emitted through log() so that it reaches the report file.
# Globals:   none written
# Arguments: none
# Outputs:   a section header plus one warn line per finding, via log()
# Returns:   0 unless one of the probing commands aborts under `set -e`
#######################################
check_system() {
  log info "===== 系统基础检查 ====="
  local line load_avg cpu_cores

  # Disk usage: warn for every filesystem above 80 %.
  # -P keeps each filesystem on a single line even with long device names.
  # `|| true` is intentional: df exits non-zero when a single mount cannot be
  # inspected, which must not abort the inspection under `set -e`/`pipefail`.
  { df -hP || true; } \
    | awk 'NR>1 && $5+0 > 80 {print "警告: " $6 " 使用率超过80% ("$5")"}' \
    | while read -r line; do
        log warn "$line"
      done

  # Memory usage: warn above 90 % used. The C locale keeps the "Mem" row
  # label untranslated; $2 > 0 guards the division.
  LC_ALL=C free -m \
    | awk '/^Mem/ && $2 > 0 && $3/$2*100 > 90 {print "警告: 内存使用率超过90%"}' \
    | while read -r line; do
        log warn "$line"
      done

  # CPU load: warn when the 1-minute load average exceeds 1.5 x core count.
  # awk performs the floating-point comparison, so no extra tool is required.
  load_avg=$(awk '{print $1}' /proc/loadavg)
  cpu_cores=$(nproc)
  if awk -v load="$load_avg" -v cores="$cpu_cores" \
    'BEGIN { exit !(load > cores * 1.5) }'; then
    log warn "CPU负载过高: ${load_avg} (核心数: ${cpu_cores})"
  fi
}
#######################################
# Service status check: verify that each expected systemd unit is active.
# Globals:   none written directly (check_failure records failures)
# Arguments: none
# Outputs:   one info line per active service via log(); inactive services
#            are reported through check_failure
# Returns:   0
#######################################
check_services() {
  log info "===== 服务状态检查 ====="
  local -a services=("nginx" "mysql" "prometheus" "node_exporter")
  # Keep the loop variable out of the global scope.
  local svc

  for svc in "${services[@]}"; do
    if systemctl is-active "$svc" >/dev/null; then
      log info "服务 $svc 运行正常"
    else
      # `|| true` is intentional: one stopped service must not abort the loop
      # under `set -e`; the remaining services still have to be inspected.
      check_failure "服务 $svc 未运行" || true
    fi
  done
}
#######################################
# Prometheus check: confirm the local server answers its health endpoint,
# then read the TiDB abnormal-connection increase over the last hour.
# Globals:   none written directly (check_failure records failures)
# Arguments: none
# Outputs:   info lines via log(); problems are reported through
#            check_failure
# Returns:   0
#######################################
check_prometheus() {
  log info "===== Prometheus指标检查 ====="
  local prometheus_url="http://localhost:9090"
  local query_timeout=5
  # A counter needs increase() over the range before it can be summed;
  # sum() applied directly to a range vector is rejected by PromQL.
  local promql='sum(increase(tidb_server_abnormal_connections_total[1h]))'
  local number_re='^[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?$'
  local health response abnormal_conn

  # Liveness: capture the body first so the decision does not depend on the
  # exit status of a pipeline stage.
  health=$(curl -s --max-time "$query_timeout" "${prometheus_url}/-/healthy") \
    || health=""
  if [[ $health != *Prometheus* ]]; then
    check_failure "Prometheus服务不可达"
    return 0
  fi

  # Abnormal connection count. An instant query returns
  # data.result = [ { metric: {...}, value: [ <timestamp>, "<value>" ] } ],
  # so the sample is result[0].value[1]; an empty result counts as "0".
  response=$(curl -s --max-time "$query_timeout" \
    "${prometheus_url}/api/v1/query" \
    --data-urlencode "query=${promql}") || response=""
  abnormal_conn=$(jq -r '.data.result[0].value[1] // "0"' \
    <<<"$response" 2>/dev/null) || abnormal_conn=""

  # An empty or non-numeric answer means the query itself failed; report it
  # instead of silently treating it as zero.
  if [[ ! $abnormal_conn =~ $number_re ]]; then
    check_failure "无法获取TiDB异常连接数指标"
    return 0
  fi

  # Sample values are floats, so compare with awk rather than [[ -gt ]].
  if awk -v n="$abnormal_conn" 'BEGIN { exit !(n > 10) }'; then
    check_failure "TiDB异常连接数过高: ${abnormal_conn}"
  else
    log info "TiDB异常连接数正常: ${abnormal_conn}"
  fi
}
#######################################
# Security baseline check: SSH root login, firewall state and an optional
# Lynis scan.
# Globals:   none written directly (check_failure records failures)
# Arguments: none
# Outputs:   info/warn lines via log(); problems are reported through
#            check_failure
# Returns:   0
#######################################
check_security() {
  log info "===== 安全基线检查 ====="
  local sshd_config="/etc/ssh/sshd_config"
  local line

  # SSH configuration. An unreadable file is reported as "skipped" rather
  # than as "root login disabled". The match ignores keyword case and leading
  # whitespace.
  if [[ ! -r $sshd_config ]]; then
    log warn "无法读取${sshd_config},跳过SSH配置检查"
  elif grep -qiE '^[[:space:]]*PermitRootLogin[[:space:]]+yes' "$sshd_config"; then
    check_failure "SSH允许Root登录"
  else
    log info "SSH Root登录已禁用"
  fi

  # Firewall state.
  if firewall-cmd --state 2>/dev/null | grep -q running; then
    log info "防火墙运行正常"
  else
    # `|| true` is intentional: a disabled firewall is a finding, not a
    # reason to abort the remaining checks under `set -e`.
    check_failure "防火墙未启用" || true
  fi

  # Vulnerability scan (optional).
  if command -v lynis &>/dev/null; then
    log info "开始Lynis安全扫描..."
    # `|| true` is intentional: with `set -e`/`pipefail`, a non-zero status
    # from lynis, or grep finding no matching line, would otherwise terminate
    # the script before the final summary is written.
    { lynis audit system --quick 2>&1 | grep -iE 'warning|suggestion' || true; } \
      | while read -r line; do
          log warn "$line"
        done
  else
    log warn "Lynis未安装,跳过漏洞扫描"
  fi
}
# Entry point: run every check group in order, then log the overall verdict
# based on error_flag.
main() {
  local step
  for step in check_system check_services check_prometheus check_security; do
    "$step"
  done

  if [[ $error_flag -ne 0 ]]; then
    log error "巡检完成,发现${error_flag}个异常项,请检查报告!"
    # A notification hook can be added here, for example:
    # send_alert "发现${error_flag}个异常项,报告路径: ${report_file}"
  else
    log info "巡检完成,未发现严重问题"
  fi
}
main
使用脚本建议
提前安装依赖项
yum install -y jq curl bc lynis # RHEL/CentOS(jq、lynis 来自 EPEL 源,需先执行 yum install -y epel-release)
apt install -y jq curl bc lynis # Debian/Ubuntu
添加执行权限 chmod +x 【文件名】
可以设置每天自动执行
查看日志文件:
tail -f /var/log/inspection_$(date +%Y%m%d)*.log
更多推荐
所有评论(0)