第十五章:运维管理

最后更新: 2024-01-01 作者: PostgreSQL Team
页面目录

第十五章:运维管理

15.1 服务管理

15.1.1 systemd 管理

# 启动服务
sudo systemctl start postgresql

# 停止服务
sudo systemctl stop postgresql

# 重启服务
sudo systemctl restart postgresql

# 重载配置(不重启)
sudo systemctl reload postgresql

# 查看状态
sudo systemctl status postgresql

# 启用开机自启
sudo systemctl enable postgresql

# 禁用开机自启
sudo systemctl disable postgresql

15.1.2 pg_ctl 命令

# 启动数据库
pg_ctl -D /var/lib/postgresql/data start

# 停止数据库(智能模式)
pg_ctl -D /var/lib/postgresql/data stop

# 停止数据库(快速模式,不等待客户端断开)
pg_ctl -D /var/lib/postgresql/data stop -m fast

# 立即停止(不推荐,可能导致数据损坏)
pg_ctl -D /var/lib/postgresql/data stop -m immediate

# 查看状态
pg_ctl -D /var/lib/postgresql/data status

# 重载配置
pg_ctl -D /var/lib/postgresql/data reload

15.2 配置管理

15.2.1 修改配置

-- 查看当前配置
SHOW all;
SHOW max_connections;
SHOW shared_buffers;

-- 查看特定配置
SELECT name, setting, unit FROM pg_settings WHERE name LIKE '%buffer%';

-- 设置参数(会话级)
SET statement_timeout = '5s';

-- 设置参数(事务级)
BEGIN;
SET LOCAL statement_timeout = '5s';
COMMIT;

-- 修改配置文件后重载
SELECT pg_reload_conf();

15.2.2 推荐配置参数

# postgresql.conf 优化配置

# 连接设置
max_connections = 200
superuser_reserved_connections = 3

# 内存设置(根据可用内存调整)
shared_buffers = '4GB'           # 建议为系统内存的 25%
effective_cache_size = '12GB'      # 建议为系统内存的 75%
work_mem = '64MB'
maintenance_work_mem = '512MB'
temp_buffers = '16MB'

# WAL 设置
wal_level = replica
wal_buffers = '64MB'
min_wal_size = '1GB'
max_wal_size = '4GB'
checkpoint_completion_target = 0.9

# 并行设置
max_worker_processes = 16
max_parallel_workers_per_gather = 4
max_parallel_workers = 8
parallel_leader_participation = on

# 查询优化
effective_io_concurrency = 200
random_page_cost = 1.1
default_statistics_target = 100

# 日志设置
logging_collector = on
log_directory = 'log'
log_filename = 'postgresql-%Y-%m-%d.log'
log_rotation_age = 1d
log_min_duration_statement = 1000

# 异步提交
synchronous_commit = on

# 统计信息
track_activities = on
track_counts = on
track_io_timing = on
track_functions = pl

# 自动清理
autovacuum = on
autovacuum_max_workers = 4
autovacuum_naptime = 1min

15.3 监控指标

15.3.1 系统视图

-- 数据库状态
SELECT * FROM pg_stat_database WHERE datname = 'mydb';

-- 表访问统计
SELECT 
    schemaname,
    relname,
    seq_scan,
    seq_tup_read,
    idx_scan,
    idx_tup_fetch,
    n_tup_ins,
    n_tup_upd,
    n_tup_del,
    n_live_tup,
    n_dead_tup,
    last_vacuum,
    last_autovacuum,
    last_analyze,
    last_autoanalyze,
    vacuum_count,
    autovacuum_count
FROM pg_stat_user_tables
ORDER BY n_live_tup DESC;

-- 索引使用统计
SELECT 
    schemaname,
    relname AS table_name,
    indexrelname AS index_name,
    idx_scan,
    idx_tup_read,
    idx_tup_fetch,
    pg_size_pretty(pg_relation_size(indexrelid)) AS index_size
FROM pg_stat_user_indexes
ORDER BY idx_scan ASC;

-- 连接统计
SELECT 
    state,
    COUNT(*) AS count,
    ARRAY_AGG(usename) AS users,
    ARRAY_AGG(client_addr) AS clients
FROM pg_stat_activity
WHERE state IS NOT NULL
GROUP BY state;

-- 复制状态(主库)
SELECT 
    pid,
    usesysid,
    usename,
    application_name,
    client_addr,
    client_hostname,
    client_port,
    backend_start,
    backend_xmin,
    state,
    sent_lsn,
    write_lsn,
    flush_lsn,
    replay_lsn,
    write_lag,
    flush_lag,
    replay_lag,
    sync_priority,
    sync_state
FROM pg_stat_replication;

15.3.2 常用监控查询

-- 当前活跃查询
SELECT 
    pid,
    now() - query_start AS duration,
    usename,
    datname,
    state,
    left(query, 100) AS query_preview
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY duration DESC;

-- 长事务
SELECT 
    pid,
    now() - xact_start AS transaction_duration,
    usename,
    xmin,
    state,
    left(query, 100) AS query_preview
FROM pg_stat_activity
WHERE xact_start IS NOT NULL
    AND state != 'idle'
    AND now() - xact_start > interval '5 minutes';

-- 大表
SELECT 
    schemaname,
    relname AS table_name,
    pg_size_pretty(pg_total_relation_size(relid)) AS total_size,
    pg_size_pretty(pg_relation_size(relid)) AS table_size,
    pg_size_pretty(pg_indexes_size(relid)) AS index_size,
    n_live_tup,
    n_dead_tup,
    last_vacuum,
    last_autovacuum
FROM pg_stat_user_tables
ORDER BY pg_total_relation_size(relid) DESC
LIMIT 20;

-- 缺失索引
SELECT 
    schemaname,
    relname AS table_name,
    seq_scan,
    seq_tup_read,
    idx_scan,
    n_live_tup,
    n_dead_tup
FROM pg_stat_user_tables
WHERE idx_scan = 0
    AND n_live_tup > 1000
ORDER BY seq_tup_read DESC;

15.4 性能监控

15.4.1 使用 pg_stat_statements

-- 安装扩展
CREATE EXTENSION pg_stat_statements;

-- 查询最慢查询
SELECT 
    query,
    calls,
    total_exec_time,
    mean_exec_time,
    max_exec_time,
    min_exec_time,
    rows,
    shared_blks_hit,
    shared_blks_read
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 20;

-- 查询调用次数最多的查询
SELECT 
    query,
    calls,
    total_exec_time,
    mean_exec_time,
    rows,
    ROUND(mean_exec_time::numeric, 2) AS avg_ms
FROM pg_stat_statements
ORDER BY calls DESC
LIMIT 10;

-- 查询消耗最多的 I/O
SELECT 
    query,
    calls,
    shared_blks_hit,
    shared_blks_read,
    temp_blks_read,
    temp_blks_written
FROM pg_stat_statements
ORDER BY shared_blks_read DESC
LIMIT 10;

-- 重置统计
SELECT pg_stat_statements_reset();

15.4.2 pgBadger 日志分析

# 配置详细日志
# postgresql.conf
log_line_prefix = '%t [%p] [%l-1] user=%u,db=%d,app=%a client=%h '
log_duration = off
log_statement = 'all'
log_min_duration_statement = 0

# 生成报告
pgbadger /var/log/postgresql/postgresql.log -o /var/www/pgbadger/index.html

# 实时分析
pgbadger --preprocess --live-scan --log-prefix '%t [%p] ' /var/log/postgresql/

15.5 维护任务

15.5.1 VACUUM 操作

-- 手动 VACUUM
VACUUM ANALYZE users;

-- VACUUM 所有表
VACUUM (VERBOSE, ANALYZE);

-- VACUUM 特定表
VACUUM (VERBOSE, ANALYZE) orders;

-- 冻结事务
VACUUM (FREEZE) users;

-- 查看 VACUUM 状态
SELECT 
    schemaname,
    relname,
    last_vacuum,
    last_autovacuum,
    vacuum_count,
    autovacuum_count,
    n_dead_tup
FROM pg_stat_user_tables
WHERE n_dead_tup > 10000;

15.5.2 ANALYZE 操作

-- 分析表
ANALYZE users;

-- 分析表和索引
ANALYZE VERBOSE users;

-- 分析所有表
ANALYZE VERBOSE;

-- 查看统计信息过期
SELECT 
    schemaname,
    relname,
    last_analyze,
    last_autoanalyze,
    n_live_tup,
    n_dead_tup,
    last_analyze::timestamp - last_autoanalyze::timestamp AS analyze_interval
FROM pg_stat_user_tables
WHERE n_live_tup + n_dead_tup > 1000;

15.5.3 REINDEX 操作

-- 重建索引
REINDEX INDEX idx_users_email;

-- 重建表的所有索引
REINDEX TABLE users;

-- 重建所有表的索引(谨慎使用)
REINDEX DATABASE mydb;

-- 并行重建索引(需要独占锁)
REINDEX INDEX CONCURRENTLY idx_users_email;

-- 查看索引膨胀
SELECT 
    schemaname || '.' || relname AS table,
    indexrelname AS index,
    pg_size_pretty(pg_relation_size(i.indexrelid)) AS index_size,
    idx_scan AS index_scans
FROM pg_stat_user_indexes ui
JOIN pg_index i ON ui.indexrelid = i.indexrelid
WHERE idx_scan = 0
ORDER BY pg_relation_size(i.indexrelid) DESC;

15.5.4 维护脚本

#!/bin/bash
# maintenance.sh - PostgreSQL 维护脚本

PGHOST=localhost
PGPORT=5432
PGUSER=postgres
PGDATABASE=mydb

# VACUUM
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE -c "VACUUM (VERBOSE, ANALYZE);"

# 重建膨胀的索引
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE << 'EOF'
SELECT 'REINDEX INDEX CONCURRENTLY ' || indexrelname || ';'
FROM pg_stat_user_indexes
WHERE idx_scan = 0
  AND pg_relation_size(indexrelid) > 10 * 1024 * 1024;
EOF

# 清理 pg_stat_statements
psql -h $PGHOST -p $PGPORT -U $PGUSER -d $PGDATABASE -c "SELECT pg_stat_statements_reset();"

15.6 容量规划

-- 表大小分析
SELECT 
    schemaname,
    relname,
    pg_size_pretty(pg_total_relation_size(relid)) AS total_size,
    pg_size_pretty(pg_relation_size(relid)) AS table_size,
    pg_size_pretty(pg_indexes_size(relid)) AS indexes_size,
    n_live_tup,
    n_dead_tup,
    ROUND(n_live_tup * 100.0 / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS live_ratio
FROM pg_stat_user_tables
ORDER BY pg_total_relation_size(relid) DESC;

-- 数据库总大小
SELECT 
    datname,
    pg_size_pretty(pg_database_size(datname)) AS size
FROM pg_database
ORDER BY pg_database_size(datname) DESC;

-- 预估表增长
SELECT 
    schemaname,
    relname,
    n_live_tup,
    n_dead_tup,
    n_tup_ins,
    n_tup_upd,
    n_tup_del,
    last_vacuum,
    last_autovacuum
FROM pg_stat_user_tables
WHERE n_live_tup > 10000;

15.7 故障排除

15.7.1 常见问题

-- 连接数过多
SELECT COUNT(*) FROM pg_stat_activity;
SELECT MAX(max_connections) FROM pg_settings;

-- 磁盘空间不足
SELECT pg_size_pretty(pg_tablespace_size('pg_default'));
SELECT * FROM pg_ls_waldir() ORDER BY modification DESC LIMIT 10;

-- 长查询
SELECT 
    pid,
    now() - query_start AS duration,
    state,
    left(query, 200) AS query
FROM pg_stat_activity
WHERE state != 'idle'
ORDER BY duration DESC;

-- 终止问题查询
SELECT pg_cancel_backend(pid);  -- 优雅终止
SELECT pg_terminate_backend(pid); -- 强制终止

-- 锁等待
SELECT 
    l.locktype,
    l.relation::regclass,
    l.mode,
    l.granted,
    l.pid,
    a.usename,
    a.query
FROM pg_locks l
JOIN pg_stat_activity a ON l.pid = a.pid
WHERE NOT l.granted;

15.7.2 健康检查脚本

#!/bin/bash
# health_check.sh - PostgreSQL 健康检查

echo "=== PostgreSQL 健康检查 ==="
echo ""

# 检查服务状态
systemctl is-active postgresql || echo "WARNING: PostgreSQL 未运行"

# 检查连接数
psql -U postgres -t -c "SELECT count(*) FROM pg_stat_activity;" 
MAX_CONN=$(psql -U postgres -t -c "SHOW max_connections;")
echo "最大连接数: $MAX_CONN"

# 检查复制延迟
psql -U postgres -c "SELECT client_addr, state, 
    pg_size_pretty(pg_wal_lsn_diff(sent_lsn, replay_lsn)) AS delay 
    FROM pg_stat_replication;"

# 检查数据库大小
psql -U postgres -c "SELECT datname, pg_size_pretty(pg_database_size(datname)) 
    FROM pg_database ORDER BY pg_database_size(datname) DESC LIMIT 5;"

# 检查长查询
psql -U postgres -c "SELECT pid, now() - query_start AS duration, query 
    FROM pg_stat_activity 
    WHERE state != 'idle' AND query_start < now() - interval '5 minutes';"

15.8 本章小结

PostgreSQL 运维管理要点:

领域 关键任务
配置 内存优化、连接管理
监控 性能指标、慢查询
维护 VACUUM、ANALYZE、REINDEX
容量 空间分析、增长预估
故障排除 连接、锁、死锁

📌 教程总结

恭喜完成 PostgreSQL 权威教程!本教程涵盖了:

  • 基础入门:安装配置、SQL 基础、数据类型
  • 核心功能:表设计、查询、索引、视图函数
  • 高级特性:事务、存储过程、触发器、分区表
  • 运维实践:备份恢复、高可用、安全、监控管理