第十四章:运维管理

深入了解 Redis 运维管理,包括监控、备份、日志和常见运维任务。

最后更新: 2024-01-15
页面目录

Redis 运维管理

本章介绍 Redis 的日常运维管理,包括监控、备份、日志分析和常见运维任务。

监控体系

┌─────────────────────────────────────────────────────────────────┐
│                      Redis 监控体系                               │
├─────────────────────────────────────────────────────────────────┤
│                                                                  │
│   ┌─────────────┐  ┌─────────────┐  ┌─────────────┐            │
│   │   指标采集   │  │   数据存储   │  │   告警通知   │            │
│   │   INFO      │  │   时序数据库  │  │   邮件/钉钉   │            │
│   │   MONITOR   │  │   Prometheus │  │   短信      │            │
│   │   SLOWLOG   │  │   InfluxDB  │  │   WebHook   │            │
│   └─────────────┘  └─────────────┘  └─────────────┘            │
│                                                                  │
│   ┌─────────────────────────────────────────────────────────┐   │
│   │                      可视化展示                           │   │
│   │                    Grafana Dashboard                     │   │
│   └─────────────────────────────────────────────────────────┘   │
│                                                                  │
└─────────────────────────────────────────────────────────────────┘

关键监控指标

1. 性能指标

指标 命令 阈值建议
instantaneous_ops_per_sec INFO stats > 50000
connected_clients INFO clients < 10000
blocked_clients INFO clients < 100
延迟 (latency) redis-cli --latency < 10ms

2. 内存指标

指标 命令 阈值建议
used_memory INFO memory < maxmemory
mem_fragmentation_ratio INFO memory < 1.5
evicted_keys INFO stats < 100/s
maxmemory_human INFO memory -

3. 持久化指标

指标 命令 阈值建议
rdb_last_save_time INFO persistence -
aof_last_write_status INFO persistence OK
aof_rewrite_in_progress INFO persistence 0
rdb_bgsave_in_progress INFO persistence 0

4. 复制指标

指标 命令 阈值建议
master_link_status INFO replication up
slave_repl_offset INFO replication -
lag INFO replication < 1s

INFO 命令详解

完整示例

redis-cli INFO

# ========== Server ==========
redis_version:7.2.4
redis_mode:standalone
os:Linux 5.4.0 x86_64
arch_bits:64

# ========== Clients ==========
connected_clients:5
client_longest_output_list:0
client_biggest_input_buf:0
blocked_clients:0

# ========== Memory ==========
used_memory:1048576
used_memory_human:1.00M
used_memory_rss:2097152
used_memory_rss_human:2.00M
mem_fragmentation_ratio:2.00

# ========== Stats ==========
total_connections_received:100
total_commands_processed:50000
instantaneous_ops_per_sec:1000
keyspace_hits:45000
keyspace_misses:5000
keyspace_hitrate:90.00%

监控脚本

Python 监控脚本

#!/usr/bin/env python3
import redis
import time
import json
from datetime import datetime

class RedisMonitor:
    """Collect statistics and run basic health checks against one Redis server.

    All values come from the ``INFO`` and ``SLOWLOG GET`` replies as parsed
    by the redis-py client created in ``__init__``.
    """

    def __init__(self, host='localhost', port=6379, password=None):
        """Create the underlying redis-py client.

        Args:
            host: Server host name or IP address.
            port: Server TCP port.
            password: Password passed to the client, or None for no auth.
        """
        self.client = redis.Redis(
            host=host,
            port=port,
            password=password,
            decode_responses=True
        )

    def get_all_stats(self):
        """Return a JSON-serialisable snapshot of selected INFO fields.

        Returns:
            dict with a local ISO-8601 ``timestamp`` plus ``server``,
            ``memory``, ``clients``, ``stats`` and ``persistence`` sections.
            A field missing from the INFO reply is reported as None.
        """
        info = self.client.info()
        return {
            'timestamp': datetime.now().isoformat(),
            'server': {
                'version': info.get('redis_version'),
                'uptime_seconds': info.get('uptime_in_seconds'),
            },
            'memory': {
                'used': info.get('used_memory'),
                'used_human': info.get('used_memory_human'),
                'peak': info.get('used_memory_peak'),
                'fragmentation': info.get('mem_fragmentation_ratio'),
            },
            'clients': {
                'connected': info.get('connected_clients'),
                'blocked': info.get('blocked_clients'),
            },
            'stats': {
                'ops_per_sec': info.get('instantaneous_ops_per_sec'),
                'total_commands': info.get('total_commands_processed'),
                'keyspace_hits': info.get('keyspace_hits'),
                'keyspace_misses': info.get('keyspace_misses'),
            },
            'persistence': {
                'rdb_last_save': info.get('rdb_last_save_time'),
                'aof_enabled': info.get('aof_enabled'),
            }
        }

    def check_health(self):
        """Evaluate a few INFO fields against fixed thresholds.

        Returns:
            list[str]: One alert message per violated threshold, in the order
            memory, fragmentation, clients, hit rate. Empty when healthy.
        """
        alerts = []

        info = self.client.info()

        # Memory: maxmemory == 0 means "no limit", so a usage percentage is
        # only meaningful when a positive limit is configured. Missing or
        # None fields are treated as 0 instead of raising TypeError.
        used_memory = info.get('used_memory') or 0
        maxmemory = info.get('maxmemory') or 0
        if maxmemory > 0 and used_memory > maxmemory * 0.8:
            used_human = info.get('used_memory_human', used_memory)
            alerts.append(f"内存使用超过80%: {used_human}")

        # Fragmentation ratio.
        fragmentation = info.get('mem_fragmentation_ratio') or 0
        if fragmentation > 1.5:
            alerts.append(f"内存碎片率过高: {fragmentation}")

        # Connected clients.
        connected = info.get('connected_clients') or 0
        if connected > 5000:
            alerts.append(f"连接数过高: {connected}")

        # Hit rate, skipped while no lookups have been recorded to avoid a
        # division by zero.
        hits = info.get('keyspace_hits') or 0
        misses = info.get('keyspace_misses') or 0
        total = hits + misses
        if total > 0 and hits / total < 0.5:
            alerts.append(f"命中率过低: {hits/total:.2%}")

        return alerts

    def get_slow_queries(self, limit=10):
        """Return up to ``limit`` slow-log entries as given by the client."""
        return self.client.slowlog_get(limit)

    def monitor_realtime(self, interval=1, count=10):
        """Sample ``get_all_stats`` ``count`` times, ``interval`` seconds apart.

        Returns:
            list of the collected snapshots, oldest first.
        """
        stats_history = []

        for index in range(count):
            stats_history.append(self.get_all_stats())
            # No pause after the final sample: nothing follows it.
            if index < count - 1:
                time.sleep(interval)

        return stats_history


if __name__ == '__main__':
    # Demo entry point: print one stats snapshot, the health-check result and
    # the most recent slow-log entries of the default local server.
    monitor = RedisMonitor()

    print("=== Redis 统计 ===")
    print(json.dumps(monitor.get_all_stats(), indent=2))

    print("\n=== 健康检查 ===")
    alerts = monitor.check_health()
    if not alerts:
        print("✅ 所有指标正常")
    for alert in alerts:
        print(f"⚠️  {alert}")

    print("\n=== 慢查询 ===")
    for sq in monitor.get_slow_queries():
        print(f"命令: {sq['command']}, 耗时: {sq['duration']}μs")

日志管理

日志配置

# 日志文件位置
logfile /var/log/redis/redis-server.log

# 日志级别
# debug (开发)
# verbose (较多信息,但比 debug 精简)
# notice (生产推荐)
# warning (仅警告)
loglevel notice

日志分析

# 查看最近日志
tail -100 /var/log/redis/redis-server.log

# 搜索错误
grep -i "error\|exception\|fail" /var/log/redis/redis-server.log

# 搜索连接问题
grep -i "accepting\|client\|connection" /var/log/redis/redis-server.log

# 搜索持久化
grep -i "rdb\|aof\|bgsave\|rewrite" /var/log/redis/redis-server.log

日志轮转

# /etc/logrotate.d/redis
/var/log/redis/*.log {
    daily
    rotate 14
    compress
    delaycompress
    missingok
    notifempty
    create 0640 redis redis
    sharedscripts
    postrotate
        /bin/kill -HUP $(cat /var/run/redis/redis-server.pid 2>/dev/null) 2>/dev/null || true
    endscript
}

备份与恢复

自动备份脚本

#!/bin/bash
# redis-backup.sh
#
# Trigger an RDB snapshot, archive it (plus the AOF file when one exists)
# into a timestamped tarball under BACKUP_DIR, then prune archives older
# than KEEP_DAYS days.

BACKUP_DIR="/backup/redis"
DATA_DIR="/var/lib/redis"
DATE=$(date +%Y%m%d_%H%M%S)
REDIS_HOST="localhost"
REDIS_PORT="6379"
KEEP_DAYS=30
MAX_WAIT=3600   # seconds to wait for the snapshot before giving up

mkdir -p "${BACKUP_DIR}"

# Remember the last successful save time BEFORE triggering the snapshot.
# Comparing two LASTSAVE calls made at the same moment is always "equal",
# so the wait loop would never end.
LAST_SAVE=$(redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" LASTSAVE)

# Trigger the snapshot with BGSAVE
echo "触发 RDB 快照..."
redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" BGSAVE

# Wait until LASTSAVE moves past the recorded value, i.e. a new snapshot
# has been written successfully.
waited=0
while [ "$(redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" LASTSAVE)" = "${LAST_SAVE}" ]; do
    if [ "${waited}" -ge "${MAX_WAIT}" ]; then
        echo "等待 BGSAVE 超时" >&2
        exit 1
    fi
    echo "等待保存完成..."
    sleep 1
    waited=$((waited + 1))
done

# Copy the RDB file
cp "${DATA_DIR}/dump.rdb" "${BACKUP_DIR}/dump_${DATE}.rdb"
FILES=("dump_${DATE}.rdb")

# Copy the AOF file and add it to the archive list only when it exists;
# naming a missing file on the tar command line makes tar fail.
# NOTE(review): Redis 7+ keeps AOF files under appendonlydir/ by default;
# confirm the path for the deployed version.
if [ -f "${DATA_DIR}/appendonly.aof" ]; then
    cp "${DATA_DIR}/appendonly.aof" "${BACKUP_DIR}/appendonly_${DATE}.aof"
    FILES+=("appendonly_${DATE}.aof")
fi

# Create the compressed archive
tar czf "${BACKUP_DIR}/redis_backup_${DATE}.tar.gz" \
    -C "${BACKUP_DIR}" \
    "${FILES[@]}"

# Remove the temporary copies
rm -f "${BACKUP_DIR}/dump_${DATE}.rdb"
rm -f "${BACKUP_DIR}/appendonly_${DATE}.aof"

# Prune old backups
find "${BACKUP_DIR}" -name "redis_backup_*.tar.gz" -mtime +"${KEEP_DAYS}" -delete

# Copy to remote storage
# aws s3 cp ${BACKUP_DIR}/redis_backup_${DATE}.tar.gz s3://my-bucket/redis/

echo "备份完成: ${BACKUP_DIR}/redis_backup_${DATE}.tar.gz"

恢复操作

# 1. Stop Redis
sudo systemctl stop redis

# 2. Keep the current data file as a fallback
sudo mv /var/lib/redis/dump.rdb /var/lib/redis/dump.rdb.bak

# 3. Extract the backup into a fresh, empty directory so the glob in step 4
#    can only match the file from this archive (shared /tmp may still hold
#    dump_*.rdb files from an earlier restore)
RESTORE_DIR=$(mktemp -d)
tar xzf /backup/redis/redis_backup_20240115_020000.tar.gz -C "${RESTORE_DIR}"

# 4. Restore the RDB file
# NOTE: when "appendonly yes" is configured, Redis loads the AOF at startup
# and ignores dump.rdb; restore the AOF instead, or disable AOF for the
# first start after the restore.
sudo mv "${RESTORE_DIR}"/dump_*.rdb /var/lib/redis/dump.rdb
sudo chown redis:redis /var/lib/redis/dump.rdb
rm -rf "${RESTORE_DIR}"

# 5. Start Redis
sudo systemctl start redis

# 6. Verify
redis-cli ping
redis-cli DBSIZE

升级 Redis

升级步骤

# 1. 备份配置和数据
redis-cli BGSAVE
sudo cp /etc/redis/redis.conf /etc/redis/redis.conf.bak
sudo cp -r /var/lib/redis /var/lib/redis.bak

# 2. 停止 Redis
sudo systemctl stop redis

# 3. 升级软件包
# Ubuntu/Debian
sudo apt update
sudo apt install redis-server

# CentOS/RHEL
sudo yum update redis

# 4. 检查配置兼容性
diff /etc/redis/redis.conf.bak /etc/redis/redis.conf

# 5. 启动 Redis
sudo systemctl start redis

# 6. 验证
redis-cli INFO server | grep redis_version
redis-cli PING

# 7. 测试功能
redis-cli --latency

集群升级

# 1. 升级从节点
# 2. 故障转移
# 3. 升级主节点
# 4. 重复直到所有节点升级

集群运维

节点管理

# 添加节点
redis-cli --cluster add-node new_node:port existing_node:port

# 删除节点
redis-cli --cluster del-node existing_node:port node_id

# 重新分配槽位
redis-cli --cluster reshard host:port

# 平衡槽位
redis-cli --cluster rebalance host:port

# 检查集群
redis-cli --cluster check host:port

槽位迁移

# 1. On the TARGET node: mark slot 0 as importing from the source node
redis-cli -h target_host -p target_port CLUSTER SETSLOT 0 IMPORTING source_node_id

# 2. On the SOURCE node: mark slot 0 as migrating to the target node
redis-cli -p 7000 CLUSTER SETSLOT 0 MIGRATING target_node_id

# 3. On the SOURCE node: list up to 100 keys stored in slot 0
redis-cli -p 7000 CLUSTER GETKEYSINSLOT 0 100

# 4. On the SOURCE node: move those keys. Argument order is
#    host port "" destination-db timeout-ms KEYS key...
#    (cluster mode only has db 0). Repeat steps 3-4 until step 3 returns
#    no keys.
redis-cli -p 7000 MIGRATE target_host target_port "" 0 5000 KEYS key1 key2

# 5. Assign the slot to the target node, on the target first and then on
#    the source, so both sides leave the importing/migrating state
redis-cli -h target_host -p target_port CLUSTER SETSLOT 0 NODE target_node_id
redis-cli -p 7000 CLUSTER SETSLOT 0 NODE target_node_id

常见运维任务

1. 清空数据

# 清空当前数据库
redis-cli FLUSHDB

# 清空所有数据库
redis-cli FLUSHALL

# 异步清空(不阻塞)
redis-cli FLUSHDB ASYNC

2. 重建缓存

def rebuild_cache():
    """Delete every ``cache:*`` key, then repopulate the cache.

    Keys are discovered with SCAN and deleted in bounded batches. KEYS
    walks the whole keyspace in a single blocking command, and one DELETE
    carrying every match can stall the server on a large cache.

    Repopulation is delegated to ``get_all_items`` and ``cache_item``, which
    are expected to be defined elsewhere.
    """
    client = redis.Redis(host='localhost', port=6379)
    batch_size = 1000

    # Collect matches incrementally and flush each full batch.
    batch = []
    for key in client.scan_iter(match='cache:*', count=batch_size):
        batch.append(key)
        if len(batch) >= batch_size:
            client.delete(*batch)
            batch = []
    if batch:
        client.delete(*batch)

    # Warm the cache again.
    for item in get_all_items():
        cache_item(item)

3. 连接管理

# List all connected clients (the idle= field shows idle seconds)
redis-cli CLIENT LIST

# Close idle connections: CLIENT KILL has no idle-time filter (TYPE only
# accepts normal|master|replica|pubsub), so let the server drop clients
# that stay idle longer than 3600 seconds instead
redis-cli CONFIG SET timeout 3600

# Kill one specific client by address
redis-cli CLIENT KILL ADDR 192.168.1.100:54321

# Set the name of the current connection
# NOTE: the name belongs to a single connection. Each redis-cli invocation
# opens a new one, so run SETNAME and GETNAME in the same interactive
# session to see the name come back.
redis-cli CLIENT SETNAME worker-1

# Show the name of the current connection
redis-cli CLIENT GETNAME

4. 配置热更新

# Change configuration at runtime
redis-cli CONFIG SET maxmemory 8gb
redis-cli CONFIG SET loglevel notice
redis-cli CONFIG SET slowlog-log-slower-than 10000

# Read configuration (quote glob patterns so the shell does not expand
# them against file names in the current directory)
redis-cli CONFIG GET maxmemory
redis-cli CONFIG GET '*timeout*'

# Persist the running configuration back to redis.conf
redis-cli CONFIG REWRITE

运维检查清单

┌─────────────────────────────────────────────────────────────────┐
│                   Redis 运维检查清单                             │
├─────────────────────────────────────────────────────────────────┤
│                                                                  │
│  日常检查:                                                       │
│  □ 监控指标正常                                                  │
│  □ 日志无异常                                                    │
│  □ 备份完成                                                      │
│  □ 连接数正常                                                    │
│                                                                  │
│  每周检查:                                                       │
│  □ 内存使用趋势                                                  │
│  □ 慢查询分析                                                    │
│  □ 客户端连接数趋势                                              │
│  □ 磁盘空间                                                      │
│                                                                  │
│  每月检查:                                                       │
│  □ 性能基线对比                                                  │
│  □ 安全配置审查                                                  │
│  □ 容量规划                                                      │
│  □ 灾难恢复演练                                                  │
│                                                                  │
└─────────────────────────────────────────────────────────────────┘

故障排查流程

def troubleshooting(client=None):
    """Run a fixed sequence of checks against a Redis server.

    Checks, in order: reachability, memory limit, AOF write status,
    replication link (replicas only) and the most recent slow-log entry.
    Every problem found is printed and also collected.

    Args:
        client: Optional client object exposing ``ping()``,
            ``info(section)`` and ``slowlog_get(n)``. When omitted, a
            redis-py client for localhost:6379 is created.

    Returns:
        list[str]: The problem messages in check order; empty when none of
        the checks fired. When the server is unreachable the list holds
        only that message, because the remaining checks cannot run.
    """
    if client is None:
        client = redis.Redis(host='localhost', port=6379, decode_responses=True)

    problems = []

    def report(message):
        print(message)
        problems.append(message)

    # 1. Service reachability. redis-py signals a failed connection by
    #    raising rather than returning False, so treat both as unreachable.
    try:
        reachable = client.ping()
    except redis.RedisError:
        reachable = False
    if not reachable:
        report("Redis 服务不可达")
        # Next: check the service state, firewall and network.
        return problems

    # 2. Memory. maxmemory == 0 means "no limit", so only compare against a
    #    positive limit.
    info = client.info('memory')
    maxmemory = info.get('maxmemory') or 0
    if maxmemory > 0 and (info.get('used_memory') or 0) > maxmemory:
        report("内存溢出")
        # Next: raise maxmemory or tune the eviction policy.

    # 3. Persistence.
    info = client.info('persistence')
    if info.get('aof_last_write_status', 'ok') != 'ok':
        report("AOF 写入失败")

    # 4. Replication, only meaningful on a replica.
    info = client.info('replication')
    if info.get('role') == 'slave' and info.get('master_link_status') != 'up':
        report("复制断开")

    # 5. Slow log: flag the latest entry when it took longer than 10000
    #    (the slow log reports durations in microseconds).
    slow = client.slowlog_get(1)
    if slow and slow[0]['duration'] > 10000:
        report("发现慢查询")

    return problems

下一步

恭喜您完成了 Redis 权威教程的全部内容!

👉 返回教程首页