第十四章:运维管理
深入了解 Redis 运维管理,包括监控、备份、日志和常见运维任务。
最后更新: 2024-01-15
页面目录
Redis 运维管理
本章介绍 Redis 的日常运维管理,包括监控、备份、日志分析和常见运维任务。
监控体系
┌─────────────────────────────────────────────────────────────────┐
│ Redis 监控体系 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ 指标采集 │ │ 数据存储 │ │ 告警通知 │ │
│ │ INFO │ │ 时序数据库 │ │ 邮件/钉钉 │ │
│ │ MONITOR │ │ Prometheus │ │ 短信 │ │
│ │ SLOWLOG │ │ InfluxDB │ │ WebHook │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ 可视化展示 │ │
│ │ Grafana Dashboard │ │
│ └─────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
关键监控指标
1. 性能指标
| 指标 | 命令 | 阈值建议 |
|---|---|---|
| instantaneous_ops_per_sec | INFO stats | > 50000 |
| connected_clients | INFO clients | < 10000 |
| blocked_clients | INFO clients | < 100 |
| latency(INFO 不提供该字段) | redis-cli --latency / LATENCY LATEST | < 10ms |
2. 内存指标
| 指标 | 命令 | 阈值建议 |
|---|---|---|
| used_memory | INFO memory | < maxmemory |
| mem_fragmentation_ratio | INFO memory | < 1.5 |
| evicted_keys | INFO stats | < 100/s |
| maxmemory_human | INFO memory | - |
3. 持久化指标
| 指标 | 命令 | 阈值建议 |
|---|---|---|
| rdb_last_save_time | INFO persistence | - |
| aof_last_write_status | INFO persistence | ok |
| aof_rewrite_in_progress | INFO persistence | 0 |
| rdb_bgsave_in_progress | INFO persistence | 0 |
4. 复制指标
| 指标 | 命令 | 阈值建议 |
|---|---|---|
| master_link_status | INFO replication | up |
| slave_repl_offset | INFO replication | - |
| lag | INFO replication | < 1s |
INFO 命令详解
完整示例
redis-cli INFO
# ========== Server ==========
redis_version:7.2.4
redis_mode:standalone
os:Linux 5.4.0 x86_64
arch_bits:64
# ========== Clients ==========
connected_clients:5
client_recent_max_output_buffer:0
client_recent_max_input_buffer:0
blocked_clients:0
# ========== Memory ==========
used_memory:1048576
used_memory_human:1.00M
used_memory_rss:2097152
used_memory_rss_human:2.00M
mem_fragmentation_ratio:2.00
# ========== Stats ==========
total_connections_received:100
total_commands_processed:50000
instantaneous_ops_per_sec:1000
keyspace_hits:45000
keyspace_misses:5000
# 命中率不是 INFO 的输出字段,需自行计算: keyspace_hits / (keyspace_hits + keyspace_misses) = 90.00%
监控脚本
Python 监控脚本
#!/usr/bin/env python3
import redis
import time
import json
from datetime import datetime
class RedisMonitor:
    """Collect statistics and run basic health checks against one Redis server.

    All data comes from the server's ``INFO`` and ``SLOWLOG GET`` replies,
    fetched through ``self.client``.
    """

    def __init__(self, host='localhost', port=6379, password=None):
        """Create the client used by every other method.

        Args:
            host: Redis server host name or address.
            port: Redis server TCP port.
            password: Optional password passed to the client.
        """
        self.client = redis.Redis(
            host=host,
            port=port,
            password=password,
            decode_responses=True
        )

    def get_all_stats(self):
        """Return a timestamped snapshot of selected ``INFO`` fields.

        Returns:
            A dict with the keys ``timestamp``, ``server``, ``memory``,
            ``clients``, ``stats`` and ``persistence``. A field missing from
            the ``INFO`` reply is reported as ``None``.
        """
        info = self.client.info()
        return {
            'timestamp': datetime.now().isoformat(),
            'server': {
                'version': info.get('redis_version'),
                'uptime_seconds': info.get('uptime_in_seconds'),
            },
            'memory': {
                'used': info.get('used_memory'),
                'used_human': info.get('used_memory_human'),
                'peak': info.get('used_memory_peak'),
                'fragmentation': info.get('mem_fragmentation_ratio'),
            },
            'clients': {
                'connected': info.get('connected_clients'),
                'blocked': info.get('blocked_clients'),
            },
            'stats': {
                'ops_per_sec': info.get('instantaneous_ops_per_sec'),
                'total_commands': info.get('total_commands_processed'),
                'keyspace_hits': info.get('keyspace_hits'),
                'keyspace_misses': info.get('keyspace_misses'),
            },
            'persistence': {
                'rdb_last_save': info.get('rdb_last_save_time'),
                'aof_enabled': info.get('aof_enabled'),
            }
        }

    def check_health(self):
        """Evaluate a few ``INFO`` fields against fixed thresholds.

        Checks: memory above 80% of ``maxmemory``, fragmentation ratio above
        1.5, more than 5000 connected clients, keyspace hit rate below 50%.
        A check whose input field is missing from ``INFO`` is skipped.

        Returns:
            A list of alert messages, empty when every check passes.
        """
        alerts = []
        info = self.client.info()

        # maxmemory == 0 means "no limit"; comparing against it would flag
        # every server that holds any data, so the check only runs when a
        # limit is actually configured.
        used_memory = info.get('used_memory') or 0
        max_memory = info.get('maxmemory') or 0
        if max_memory > 0 and used_memory > max_memory * 0.8:
            alerts.append(f"内存使用超过80%: {info.get('used_memory_human')}")

        fragmentation = info.get('mem_fragmentation_ratio')
        if fragmentation is not None and fragmentation > 1.5:
            alerts.append(f"内存碎片率过高: {fragmentation}")

        connected = info.get('connected_clients')
        if connected is not None and connected > 5000:
            alerts.append(f"连接数过高: {connected}")

        hits = info.get('keyspace_hits', 0)
        misses = info.get('keyspace_misses', 0)
        total = hits + misses
        # No lookups yet means no meaningful hit rate (and avoids dividing by 0).
        if total > 0 and hits / total < 0.5:
            alerts.append(f"命中率过低: {hits/total:.2%}")
        return alerts

    def get_slow_queries(self, limit=10):
        """Return up to ``limit`` entries from the server's slow log."""
        return self.client.slowlog_get(limit)

    def monitor_realtime(self, interval=1, count=10):
        """Take ``count`` snapshots, ``interval`` seconds apart.

        Args:
            interval: Seconds to wait between two consecutive snapshots.
            count: Number of snapshots to collect.

        Returns:
            A list of ``get_all_stats()`` results in collection order.
        """
        stats_history = []
        for index in range(count):
            stats_history.append(self.get_all_stats())
            # Sleep only between samples; waiting after the last one would
            # just delay the return.
            if index < count - 1:
                time.sleep(interval)
        return stats_history
if __name__ == '__main__':
    # Script entry point: print a statistics snapshot, the health-check
    # result and the slow log of the default local server.
    redis_monitor = RedisMonitor()

    print("=== Redis 统计 ===")
    snapshot = redis_monitor.get_all_stats()
    print(json.dumps(snapshot, indent=2))

    print("\n=== 健康检查 ===")
    findings = redis_monitor.check_health()
    if not findings:
        print("✅ 所有指标正常")
    else:
        for finding in findings:
            print(f"⚠️ {finding}")

    print("\n=== 慢查询 ===")
    for entry in redis_monitor.get_slow_queries():
        print(f"命令: {entry['command']}, 耗时: {entry['duration']}μs")
日志管理
日志配置
# 日志文件位置
logfile /var/log/redis/redis-server.log
# 日志级别
# debug (开发)
# verbose (较多信息,但比 debug 精简)
# notice (生产推荐)
# warning (仅警告)
loglevel notice
日志分析
# 查看最近日志
tail -100 /var/log/redis/redis-server.log
# 搜索错误
grep -i "error\|exception\|fail" /var/log/redis/redis-server.log
# 搜索连接问题
grep -i "accepting\|client\|connection" /var/log/redis/redis-server.log
# 搜索持久化
grep -i "rdb\|aof\|bgsave\|rewrite" /var/log/redis/redis-server.log
日志轮转
# /etc/logrotate.d/redis
/var/log/redis/*.log {
daily
rotate 14
compress
delaycompress
missingok
notifempty
create 0640 redis redis
sharedscripts
postrotate
/bin/kill -HUP $(cat /var/run/redis/redis-server.pid 2>/dev/null) 2>/dev/null || true
endscript
}
备份与恢复
自动备份脚本
#!/bin/bash
# redis-backup.sh
#
# Trigger an RDB snapshot with BGSAVE, bundle dump.rdb (plus AOF data when
# present) into a timestamped tar.gz under BACKUP_DIR, then delete archives
# older than KEEP_DAYS days.

# Abort on the first failing command, unset variable or failed pipeline stage
# so a broken step never produces a silently incomplete backup.
set -euo pipefail

BACKUP_DIR="/backup/redis"
DATE=$(date +%Y%m%d_%H%M%S)
REDIS_HOST="localhost"
REDIS_PORT="6379"
KEEP_DAYS=30
# Directory where the server writes dump.rdb and its AOF data.
REDIS_DATA_DIR="/var/lib/redis"
# Maximum number of seconds to wait for BGSAVE to finish.
SAVE_TIMEOUT=600

mkdir -p "${BACKUP_DIR}"

# Record the last successful save time BEFORE triggering the snapshot; the
# snapshot is complete once LASTSAVE reports a different value.
LAST_SAVE=$(redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" LASTSAVE)

# Trigger the snapshot
echo "触发 RDB 快照..."
redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" BGSAVE

# Wait until the snapshot has been written, giving up after SAVE_TIMEOUT
WAITED=0
while [ "$(redis-cli -h "${REDIS_HOST}" -p "${REDIS_PORT}" LASTSAVE)" = "${LAST_SAVE}" ]; do
    if [ "${WAITED}" -ge "${SAVE_TIMEOUT}" ]; then
        echo "等待 BGSAVE 超时 (${SAVE_TIMEOUT}s)" >&2
        exit 1
    fi
    echo "等待保存完成..."
    sleep 1
    WAITED=$((WAITED + 1))
done

# Copy the RDB file; FILES tracks what goes into the archive
cp "${REDIS_DATA_DIR}/dump.rdb" "${BACKUP_DIR}/dump_${DATE}.rdb"
FILES=("dump_${DATE}.rdb")

# Copy the AOF data only when it exists: a single appendonly.aof file, or the
# appendonlydir directory used by Redis 7 multi-part AOF
if [ -f "${REDIS_DATA_DIR}/appendonly.aof" ]; then
    cp "${REDIS_DATA_DIR}/appendonly.aof" "${BACKUP_DIR}/appendonly_${DATE}.aof"
    FILES+=("appendonly_${DATE}.aof")
fi
if [ -d "${REDIS_DATA_DIR}/appendonlydir" ]; then
    cp -r "${REDIS_DATA_DIR}/appendonlydir" "${BACKUP_DIR}/appendonlydir_${DATE}"
    FILES+=("appendonlydir_${DATE}")
fi

# Create the archive from exactly the files that were copied
tar czf "${BACKUP_DIR}/redis_backup_${DATE}.tar.gz" \
    -C "${BACKUP_DIR}" \
    "${FILES[@]}"

# Remove the temporary copies
rm -f "${BACKUP_DIR}/dump_${DATE}.rdb"
rm -f "${BACKUP_DIR}/appendonly_${DATE}.aof"
rm -rf "${BACKUP_DIR}/appendonlydir_${DATE}"

# Remove old backups
find "${BACKUP_DIR}" -name "redis_backup_*.tar.gz" -mtime +"${KEEP_DAYS}" -delete

# Copy to remote storage
# aws s3 cp ${BACKUP_DIR}/redis_backup_${DATE}.tar.gz s3://my-bucket/redis/
echo "备份完成: ${BACKUP_DIR}/redis_backup_${DATE}.tar.gz"
恢复操作
# 1. Stop Redis
sudo systemctl stop redis
# 2. Keep the current data file as a fallback
sudo mv /var/lib/redis/dump.rdb /var/lib/redis/dump.rdb.bak
# NOTE: with "appendonly yes" Redis loads the AOF at startup and ignores
# dump.rdb. To restore from the RDB, set "appendonly no" in redis.conf for this
# start and move the old AOF files aside; once the data is loaded, re-enable
# AOF with: redis-cli CONFIG SET appendonly yes
# 3. Extract the backup into a fresh directory, so the dump_*.rdb glob below
#    cannot match leftovers from an earlier restore in /tmp
RESTORE_DIR=$(mktemp -d)
tar xzf /backup/redis/redis_backup_20240115_020000.tar.gz -C "${RESTORE_DIR}"
# 4. Restore the RDB file
sudo mv "${RESTORE_DIR}"/dump_*.rdb /var/lib/redis/dump.rdb
sudo chown redis:redis /var/lib/redis/dump.rdb
rm -rf "${RESTORE_DIR}"
# 5. Start Redis
sudo systemctl start redis
# 6. Verify
redis-cli ping
redis-cli DBSIZE
升级 Redis
升级步骤
# 1. 备份配置和数据
redis-cli BGSAVE
sudo cp /etc/redis/redis.conf /etc/redis/redis.conf.bak
sudo cp -r /var/lib/redis /var/lib/redis.bak
# 2. 停止 Redis
sudo systemctl stop redis
# 3. 升级软件包
# Ubuntu/Debian
sudo apt update
sudo apt install redis-server
# CentOS/RHEL
sudo yum update redis
# 4. 检查配置兼容性
diff /etc/redis/redis.conf.bak /etc/redis/redis.conf
# 5. 启动 Redis
sudo systemctl start redis
# 6. 验证
redis-cli INFO server | grep redis_version
redis-cli PING
# 7. 测试功能
redis-cli --latency
集群升级
# 1. 升级从节点
# 2. 故障转移
# 3. 升级主节点
# 4. 重复直到所有节点升级
集群运维
节点管理
# 添加节点
redis-cli --cluster add-node new_node:port existing_node:port
# 删除节点
redis-cli --cluster del-node existing_node:port node_id
# 重新分配槽位
redis-cli --cluster reshard host:port
# 平衡槽位
redis-cli --cluster rebalance host:port
# 检查集群
redis-cli --cluster check host:port
槽位迁移
# Manual migration of slot 0 from the source node (port 7000) to the target
# node. source_node_id / new_node_id are the cluster node IDs of the source
# and the target node.
# 1. Put the slot into IMPORTING state on the target node
redis-cli -p target_port CLUSTER SETSLOT 0 IMPORTING source_node_id
# 2. Put the slot into MIGRATING state on the source node
redis-cli -p 7000 CLUSTER SETSLOT 0 MIGRATING new_node_id
# 3. List keys of the slot on the source node (up to 100 per call)
redis-cli -p 7000 CLUSTER GETKEYSINSLOT 0 100
# 4. Move the keys; MIGRATE must be sent to the source node. Arguments:
#    host port "" destination-db timeout-ms KEYS key [key ...]
#    Repeat steps 3-4 until GETKEYSINSLOT returns no keys.
redis-cli -p 7000 MIGRATE target_host target_port "" 0 5000 KEYS key1 key2
# 5. Assign the slot to the target node: first on the target, then on the source
redis-cli -p target_port CLUSTER SETSLOT 0 NODE new_node_id
redis-cli -p 7000 CLUSTER SETSLOT 0 NODE new_node_id
常见运维任务
1. 清空数据
# 清空当前数据库
redis-cli FLUSHDB
# 清空所有数据库
redis-cli FLUSHALL
# 异步清空(不阻塞)
redis-cli FLUSHDB ASYNC
2. 重建索引
def rebuild_cache():
    """Delete every ``cache:*`` key, then repopulate the cache.

    Keys are discovered incrementally with SCAN and deleted in batches, so
    the server is never blocked by a single KEYS call over the whole
    keyspace. Repopulation calls ``cache_item`` for each item yielded by
    ``get_all_items``; both are defined outside this function.
    """
    client = redis.Redis(host='localhost', port=6379)
    batch_size = 1000

    # Collect matching keys and delete them one batch at a time
    pending = []
    for key in client.scan_iter(match='cache:*', count=batch_size):
        pending.append(key)
        if len(pending) >= batch_size:
            client.delete(*pending)
            pending = []
    # Flush the final, partially filled batch
    if pending:
        client.delete(*pending)

    # Warm the cache again
    for item in get_all_items():
        cache_item(item)
3. 连接管理
# List all client connections (the "idle" field is the idle time in seconds)
redis-cli CLIENT LIST
# Close idle connections: CLIENT KILL has no idle-time filter (TYPE only
# accepts normal|master|replica|pubsub), so let the server close clients that
# have been idle for more than 3600 seconds
redis-cli CONFIG SET timeout 3600
# Kill one specific client by address
redis-cli CLIENT KILL ADDR 192.168.1.100:54321
# Set the name of the current connection. The name belongs to the connection,
# so with one-shot redis-cli calls it is gone as soon as the command returns;
# set it from the application's own connection instead
redis-cli CLIENT SETNAME worker-1
# Show the name of the current connection
redis-cli CLIENT GETNAME
4. 配置热更新
# Change configuration at runtime
redis-cli CONFIG SET maxmemory 8gb
redis-cli CONFIG SET loglevel notice
redis-cli CONFIG SET slowlog-log-slower-than 10000
# Read configuration; the pattern is quoted so the shell passes it to Redis
# unchanged instead of expanding it against file names
redis-cli CONFIG GET maxmemory
redis-cli CONFIG GET '*timeout*'
# Persist the running configuration to the config file
redis-cli CONFIG REWRITE
运维检查清单
┌─────────────────────────────────────────────────────────────────┐
│ Redis 运维检查清单 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 日常检查: │
│ □ 监控指标正常 │
│ □ 日志无异常 │
│ □ 备份完成 │
│ □ 连接数正常 │
│ │
│ 每周检查: │
│ □ 内存使用趋势 │
│ □ 慢查询分析 │
│ □ 客户端连接数趋势 │
│ □ 磁盘空间 │
│ │
│ 每月检查: │
│ □ 性能基线对比 │
│ □ 安全配置审查 │
│ □ 容量规划 │
│ □ 灾难恢复演练 │
│ │
└─────────────────────────────────────────────────────────────────┘
故障排查流程
def troubleshooting(client=None):
    """Run a step-by-step triage of a Redis server and report what is wrong.

    Checks, in order: reachability, memory against ``maxmemory``, AOF write
    status, replication link (replicas only) and the most recent slow-log
    entry. Each finding is printed as it is detected.

    Args:
        client: A redis-py client to inspect. When omitted, a client for
            localhost:6379 is created.

    Returns:
        The list of printed problem messages, in detection order; empty when
        no problem was found. If the server is unreachable the list holds
        only that message, because the remaining checks cannot run.
    """
    if client is None:
        client = redis.Redis(host='localhost', port=6379, decode_responses=True)

    problems = []

    def report(message):
        print(message)
        problems.append(message)

    # 1. Service status. ping() raises instead of returning False when the
    # server cannot be reached, so both outcomes are treated as "unreachable".
    try:
        reachable = client.ping()
    except (redis.exceptions.ConnectionError, redis.exceptions.TimeoutError):
        reachable = False
    if not reachable:
        report("Redis 服务不可达")
        # Next steps for the operator: service status, firewall, network
        return problems

    # 2. Memory. maxmemory == 0 means "no limit", so only compare when a
    # limit is configured.
    info = client.info('memory')
    max_memory = info.get('maxmemory') or 0
    if max_memory > 0 and info.get('used_memory', 0) > max_memory:
        report("内存溢出")
        # Remedy: raise maxmemory or tune the eviction policy

    # 3. Persistence
    info = client.info('persistence')
    if info.get('aof_last_write_status') != 'ok':
        report("AOF 写入失败")

    # 4. Replication (only meaningful on a replica)
    info = client.info('replication')
    if info.get('role') == 'slave':
        if info.get('master_link_status') != 'up':
            report("复制断开")

    # 5. Slow log: flag the latest entry when it took longer than 10000
    # (the same unit as the entry's 'duration' field)
    slow = client.slowlog_get(1)
    if slow and slow[0]['duration'] > 10000:
        report("发现慢查询")

    return problems
下一步
恭喜您完成了 Redis 权威教程的全部内容!
👉 返回教程首页