一、部暑postgresql

docker run --name postgres --net=host -itd  -e POSTGRES_PASSWORD=password postgres

# 需在监控的数据库上执行一下以下命令(不使用数据库超级管理员的权限的时候,注意更改密码),
docker exec -e POSTGRES_PASSWORD=password -it postgres psql -U postgres 

CREATE USER postgres_exporter PASSWORD 'postgres_exporter2O21';
ALTER USER postgres_exporter SET SEARCH_PATH TO postgres_exporter,pg_catalog;

CREATE SCHEMA postgres_exporter AUTHORIZATION postgres_exporter;

CREATE FUNCTION postgres_exporter.f_select_pg_stat_activity()
RETURNS setof pg_catalog.pg_stat_activity
LANGUAGE sql
SECURITY DEFINER
AS $$
  SELECT * from pg_catalog.pg_stat_activity;
$$;

CREATE FUNCTION postgres_exporter.f_select_pg_stat_replication()
RETURNS setof pg_catalog.pg_stat_replication
LANGUAGE sql
SECURITY DEFINER
AS $$
  SELECT * from pg_catalog.pg_stat_replication;
$$;

CREATE VIEW postgres_exporter.pg_stat_replication
AS
  SELECT * FROM postgres_exporter.f_select_pg_stat_replication();

CREATE VIEW postgres_exporter.pg_stat_activity
AS
  SELECT * FROM postgres_exporter.f_select_pg_stat_activity();

GRANT SELECT ON postgres_exporter.pg_stat_replication TO postgres_exporter;
GRANT SELECT ON postgres_exporter.pg_stat_activity TO postgres_exporter;

二、部暑postgres_exporter

mkdir -p /data/postgres_exporter
cat  > /data/postgres_exporter/start.sh << 'EOF'
docker run -d \
--restart=always \
--name postgres_exporter \
-p 9187:9187 \
-e DATA_SOURCE_NAME="postgresql://postgres_exporter:postgres_exporter2O21@192.168.11.192:5432/postgres?sslmode=disable" \
-v /etc/localtime:/etc/localtime:ro \
quay.io/prometheuscommunity/postgres-exporter
EOF

bash /data/postgres_exporter/start.sh

三、配置prometheus

#添加自动发现脚本
cat >> /data/prometheus/conf/prometheus.yml << 'EOF'
#postgres自动发现
  - job_name: 'postgres'
    file_sd_configs:
      - files:
        - /etc/prometheus/sd_config/postgres.yaml
        refresh_interval: 5s
    relabel_configs:
      - source_labels: [__address__]
        regex: (.*)
        target_label: instance
        replacement: $1
      - source_labels: [__address__]
        regex: (.*):(.*)
        target_label: __address__
        replacement: $1:9187    
EOF
#自动发现配置
cat >> /data/prometheus/conf/sd_config/postgres.yaml << 'EOF'
#rabbitmq自动发现
- labels:
    project: 网关postgres
  targets:
  - 192.168.11.192:5432
  - 192.168.11.193:5432
EOF

四、grafana
grafanaid: 455
在这里插入图片描述

grafanaid: 9628
在这里插入图片描述
五、警报规则

vi /data/prometheus/conf/rules/postgres.rules
groups:
- name: postgresql-监控告警    
  rules:
  - alert: 警报!Postgresql宕机
    expr: pg_up == 0
    for: 0m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql down"
      description: "Postgresql instance is down\n  当前值={{ $value }}"

  - alert: 警报!Postgresql被重启
    expr: time() - pg_postmaster_start_time_seconds < 60
    for: 0m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql restarted"
      description: "Postgresql restarted\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlExporterError
    expr: pg_exporter_last_scrape_error > 0
    for: 0m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql exporter error"
      description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n  当前值={{ $value }}"

  - alert: 警报!Postgresql主从复制不同步
    expr: pg_replication_lag > 30 and ON(instance) pg_replication_is_replica == 1
    for: 0m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql replication lag"
      description: "PostgreSQL replication lag is going up (> 30s)\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlTableNotVaccumed
    expr: time() - pg_stat_user_tables_last_autovacuum > 60 * 60 * 24
    for: 0m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresql table not vaccumed"
      description: "Table has not been vaccum for 24 hours\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlTableNotAnalyzed
    expr: time() - pg_stat_user_tables_last_autoanalyze > 60 * 60 * 24
    for: 0m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresql table not analyzed"
      description: "Table has not been analyzed for 24 hours\n  当前值={{ $value }}"

  - alert: 警报!Postgresql连接数太多
    expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8
    for: 2m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresql too many connections"
      description: "PostgreSQL instance has too many connections (> 80%).\n  当前值={{ $value }}"

  - alert: 警报!Postgresql连接数太少
    expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5
    for: 2m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresql not enough connections"
      description: "PostgreSQL instance should have more connections (> 5)\n  当前值={{ $value }}"

  - alert: 警报!Postgresql死锁
    expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5
    for: 0m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresql dead locks"
      description: "PostgreSQL has dead-locks\n  当前值={{ $value }}"

  - alert: 警报!Postgresql慢查询
    expr: pg_slow_queries > 0
    for: 2m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresql slow queries"
      description: "PostgreSQL executes slow queries\n  当前值={{ $value }}"

  - alert: 警报!Postgresql回滚率高
    expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.02
    for: 0m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresql high rollback rate"
      description: "Ratio of transactions being aborted compared to committed is > 2 %\n  当前值={{ $value }}"

  - alert: 警报!Postgresql提交率低
    expr: rate(pg_stat_database_xact_commit[1m]) < 10
    for: 2m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql commit rate low"
      description: "Postgres seems to be processing very few transactions\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlLowXidConsumption
    expr: rate(pg_txid_current[1m]) < 5
    for: 2m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresql low XID consumption"
      description: "Postgresql seems to be consuming transaction IDs very slowly\n  当前值={{ $value }}"

  - alert: 警报!PostgresqllowXlogConsumption
    expr: rate(pg_xlog_position_bytes[1m]) < 100
    for: 2m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresqllow XLOG consumption"
      description: "Postgres seems to be consuming XLOG very slowly\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlWaleReplicationStopped
    expr: rate(pg_xlog_position_bytes[1m]) == 0
    for: 0m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql WALE replication stopped"
      description: "WAL-E replication seems to be stopped\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlHighRateStatementTimeout
    expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3
    for: 0m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql high rate statement timeout"
      description: "Postgres transactions showing high rate of statement timeouts\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlHighRateDeadlock
    expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1
    for: 0m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql high rate deadlock"
      description: "Postgres detected deadlocks\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlReplicationLagBytes
    expr: (pg_xlog_position_bytes and pg_replication_is_replica == 0) - on(environment) group_right(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1) > 1e+09
    for: 0m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql replication lag bytes"
      description: "Postgres Replication lag (in bytes) is high\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlUnusedReplicationSlot
    expr: pg_replication_slots_active == 0
    for: 1m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresql unused replication slot"
      description: "Unused Replication Slots\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlTooManyDeadTuples
    expr: ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1)
    for: 2m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresql too many dead tuples"
      description: "PostgreSQL dead tuples is too large\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlSplitBrain
    expr: count(pg_replication_is_replica == 0) != 1
    for: 0m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql split brain"
      description: "Split Brain, too many primary Postgresql databases in read-write mode\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlPromotedNode
    expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0
    for: 0m
    labels:
      severity: 一般告警
    annotations:
      summary: "{{$labels.instance}} Postgresql promoted node"
      description: "Postgresql standby server has been promoted as primary node\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlSslCompressionActive
    expr: sum(pg_stat_ssl_compression) > 0
    for: 0m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql SSL compression active"
      description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n  当前值={{ $value }}"

  - alert: 警报!PostgresqlTooManyLocksAcquired
    expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20
    for: 2m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} Postgresql too many locks acquired"
      description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n  当前值={{ $value }}"

Logo

权威|前沿|技术|干货|国内首个API全生命周期开发者社区

更多推荐