Shell脚本实战

最后更新: 2026-01-15 作者: Linux Team
目录

实战案例概述

本章将展示真实生产环境中常用的Shell脚本案例,涵盖系统监控、日志处理、数据备份、自动化部署等场景。

系统监控脚本

主机健康监控

#!/bin/bash
# health_check.sh - system health monitoring (CPU, memory, disk, services)

# NOTE(review): -e is intentionally not enabled — individual checks report
# failures via return codes and the script aggregates them in main().
set -uo pipefail

# Configuration: alert thresholds (percent) and notification targets.
ALERT_THRESHOLD_CPU=80
ALERT_THRESHOLD_MEM=90
ALERT_THRESHOLD_DISK=85
ALERT_EMAIL="admin@example.com"
SLACK_WEBHOOK=""  # optional Slack webhook URL; empty disables Slack alerts

# ANSI color codes for terminal output; NC resets the color.
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
NC='\033[0m'

# Print a message prefixed with a "[YYYY-mm-dd HH:MM:SS]" timestamp.
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

# Check total CPU utilization against ALERT_THRESHOLD_CPU.
# Fix: the original read only the "us" (user) column ($2) from top,
# ignoring system/iowait time, and depended on a fixed field position.
# We now compute usage as 100 - idle, locating the "id" field by name.
# NOTE(review): assumes an English-locale top "%Cpu(s): ... id" line —
# confirm on non-English hosts.
# Returns 0 when below threshold, 1 when an alert is raised.
check_cpu() {
    local usage
    usage=$(top -bn1 | awk -F',' '/Cpu\(s\)/ {
        for (i = 1; i <= NF; i++) {
            if ($i ~ / id/) { gsub(/[^0-9.]/, "", $i); printf "%d", 100 - $i; exit }
        }
    }')
    usage=${usage:-0}  # fall back to 0 if top output could not be parsed
    log "CPU Usage: ${usage}%"

    if [ "$usage" -gt "$ALERT_THRESHOLD_CPU" ]; then
        echo -e "${RED}ALERT: CPU usage is ${usage}% (threshold: ${ALERT_THRESHOLD_CPU}%)${NC}"
        return 1
    fi
    echo -e "${GREEN}CPU: OK${NC}"
    return 0
}

# Check used-memory percentage (used/total from `free`) against
# ALERT_THRESHOLD_MEM. Returns 0 if OK, 1 if over threshold.
check_memory() {
    local pct
    pct=$(free | awk '/Mem/ {printf "%.0f", $3/$2 * 100}')
    log "Memory Usage: ${pct}%"

    if [ "$pct" -gt "$ALERT_THRESHOLD_MEM" ]; then
        echo -e "${RED}ALERT: Memory usage is ${pct}% (threshold: ${ALERT_THRESHOLD_MEM}%)${NC}"
        return 1
    fi
    echo -e "${GREEN}Memory: OK${NC}"
    return 0
}

# Check root-filesystem usage percentage against ALERT_THRESHOLD_DISK.
# Returns 0 if OK, 1 if over threshold.
check_disk() {
    local pct
    # END picks the last df line (handles wrapped device names);
    # int() drops the trailing '%'.
    pct=$(df -h / | awk 'END {print int($5)}')
    log "Disk Usage: ${pct}%"

    if [ "$pct" -gt "$ALERT_THRESHOLD_DISK" ]; then
        echo -e "${RED}ALERT: Disk usage is ${pct}% (threshold: ${ALERT_THRESHOLD_DISK}%)${NC}"
        return 1
    fi
    echo -e "${GREEN}Disk: OK${NC}"
    return 0
}

# Log the 1-minute load average alongside the core count (informational
# only — no threshold check, no return status).
check_load() {
    local cores load
    load=$(uptime | awk -F'load average:' '{split($2, a, ","); gsub(/ /, "", a[1]); print a[1]}')
    cores=$(nproc)
    log "Load Average: $load (cores: $cores)"
}

# Verify that each expected systemd unit is active.
# Prints per-service status; returns 1 if any service is down.
check_services() {
    local -a watch=("nginx" "mysql" "sshd" "docker")
    local -a down=()
    local unit

    for unit in "${watch[@]}"; do
        if systemctl is-active --quiet "$unit" 2>/dev/null; then
            echo -e "${GREEN}[$unit] Running${NC}"
        else
            echo -e "${RED}[$unit] Not Running${NC}"
            down+=("$unit")
        fi
    done

    if [ ${#down[@]} -gt 0 ]; then
        return 1
    fi
    return 0
}

# Log the total process count; warn (but do not fail) above 500.
check_process_count() {
    local count
    count=$(ps aux | wc -l)
    log "Total processes: $count"

    if (( count > 500 )); then
        echo -e "${YELLOW}WARNING: High process count ($count)${NC}"
    fi
}

# Send an alert notification via email and/or Slack webhook.
# Globals: ALERT_EMAIL (mail recipient; empty disables),
#          SLACK_WEBHOOK (webhook URL; empty disables).
# Arguments: $1 - alert message text.
send_alert() {
    local message="$1"

    # Email notification (requires a configured `mail` command / MTA).
    if [ -n "$ALERT_EMAIL" ]; then
        echo "$message" | mail -s "[Alert] $(hostname) Health Check" "$ALERT_EMAIL"
    fi

    # Slack notification.
    # NOTE(review): $message is interpolated directly into the JSON body;
    # quotes/backslashes in the message would produce invalid JSON.
    if [ -n "$SLACK_WEBHOOK" ]; then
        curl -s -X POST -H 'Content-type: application/json' \
            --data "{\"text\":\"$message\"}" "$SLACK_WEBHOOK"
    fi
}

# Entry point: run all checks, aggregate their status, and send a single
# alert (then exit 1) if any check failed.
main() {
    log "=== Starting Health Check ==="

    local status=0

    # Each check prints its own result; failures only flip the aggregate
    # status so all checks always run.
    check_cpu || status=1
    check_memory || status=1
    check_disk || status=1
    check_load
    check_process_count
    echo
    check_services || status=1

    log "=== Health Check Complete ==="

    if [ $status -eq 1 ]; then
        log "ALERTS DETECTED!"
        send_alert "Health check failed on $(hostname)"
        exit 1
    fi

    exit 0
}

main "$@"

服务可用性监控

#!/bin/bash
# service_monitor.sh - service availability monitoring

set -uo pipefail

# Configuration: "name:url" pairs. http(s) URLs are probed with curl,
# tcp:// URLs with a raw TCP connect.
SERVICES=(
    "nginx:http://localhost:80"
    "mysql:tcp://localhost:3306"
    "redis:tcp://localhost:6379"
    "api:http://localhost:8080/api/health"
)
ALERT_EMAIL="admin@example.com"

# Probe an HTTP endpoint; 200/301/302 count as healthy.
# Outputs "OK (...)" or "FAILED (...)" and returns 0/1 accordingly.
check_http() {
    local url=$1
    local code
    code=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "$url")

    case "$code" in
        200|301|302)
            echo "OK (HTTP $code)"
            return 0
            ;;
        *)
            echo "FAILED (HTTP $code)"
            return 1
            ;;
    esac
}

# Probe a TCP endpoint given as "tcp://host:port" using bash's /dev/tcp.
# Outputs "OK"/"FAILED" and returns 0/1 accordingly.
check_tcp() {
    # Parameter expansion instead of sed|cut: strip the scheme, then
    # split on the last ':'.
    local target=${1#tcp://}
    local host=${target%%:*}
    local port=${target##*:}

    if timeout 5 bash -c "echo > /dev/tcp/$host/$port" 2>/dev/null; then
        echo "OK"
        return 0
    else
        echo "FAILED"
        return 1
    fi
}

# Entry point: probe every configured service, report failures, and email
# an alert (then exit 1) if any probe failed.
main() {
    local failed=()
    
    echo "=== Service Monitor ==="
    
    for item in "${SERVICES[@]}"; do
        # Split "name:url"; `read` keeps any further ':' in the last
        # variable, so URLs like http://host:port survive intact.
        IFS=':' read -r name url <<< "$item"
        echo -n "[$name] "
        
        if [[ "$url" == http* ]]; then
            result=$(check_http "$url")
        else
            result=$(check_tcp "$url")
        fi
        
        echo "$result"
        [[ "$result" != OK* ]] && failed+=("$name")
    done
    
    if [ ${#failed[@]} -gt 0 ]; then
        echo "FAILED SERVICES: ${failed[*]}"
        # Send alert email (requires a configured MTA).
        echo "Service ${failed[*]} failed on $(hostname)" | \
            mail -s "[ALERT] Service Down" "$ALERT_EMAIL"
        exit 1
    fi
    
    exit 0
}

main "$@"

日志处理脚本

日志轮转与清理

#!/bin/bash
# log_rotation.sh - log rotation and cleanup

set -uo pipefail

# Configuration.
LOG_DIRS=("/var/log/nginx" "/var/log/mysql" "/var/log/app")
RETENTION_DAYS=30        # delete logs older than this many days
MAX_SIZE_MB=100          # rotate a log once it exceeds this size
COMPRESS_AFTER_DAYS=7    # gzip rotated logs older than this many days

# Timestamped logger.
log() {
    local ts
    ts=$(date '+%Y-%m-%d %H:%M:%S')
    echo "[$ts] $1"
}

# Rotate a single log file if it exceeds MAX_SIZE_MB: rename it to
# <name>.<timestamp>.log and create an empty replacement.
# NOTE(review): assumes the filename ends in ".log" (callers glob *.log);
# the replacement file is created with the invoking user's ownership and
# mode 644, which may break services writing as another user — confirm.
rotate_log() {
    local logfile=$1
    local timestamp=$(date +%Y%m%d_%H%M%S)
    
    if [ ! -f "$logfile" ]; then
        return
    fi
    
    # Current size in MiB.
    local size_mb=$(du -m "$logfile" | cut -f1)
    
    if [ "$size_mb" -gt "$MAX_SIZE_MB" ]; then
        log "Rotating $logfile (size: ${size_mb}MB)"
        
        # Rename the active file out of the way.
        mv "$logfile" "${logfile%.log}.${timestamp}.log"
        
        # Ask the service to reopen its log handles; writes land in the
        # renamed file until it does.
        if [[ "$logfile" == *nginx* ]]; then
            nginx -s reopen 2>/dev/null || true
        fi
        
        # Create a fresh empty log file.
        touch "$logfile"
        chmod 644 "$logfile"
    fi
}

# Gzip *.log files older than COMPRESS_AFTER_DAYS in the given directory.
compress_old_logs() {
    local dir=$1

    find "$dir" -type f -name "*.log" ! -name "*.gz" \
        -mtime +"$COMPRESS_AFTER_DAYS" -exec gzip {} +
}

# Delete .log / .gz files older than RETENTION_DAYS in the given directory.
cleanup_old_logs() {
    local dir=$1

    find "$dir" -mtime +"$RETENTION_DAYS" -type f \
        \( -name "*.log" -o -name "*.gz" \) -delete

    log "Cleaned logs older than $RETENTION_DAYS days in $dir"
}

# Entry point: rotate, compress, and purge logs in each configured dir.
main() {
    log "=== Log Rotation Started ==="
    
    for dir in "${LOG_DIRS[@]}"; do
        if [ -d "$dir" ]; then
            log "Processing $dir"
            
            # Rotate any oversized active logs.
            for logfile in "$dir"/*.log; do
                [ -f "$logfile" ] && rotate_log "$logfile"
            done
            
            # Compress rotated logs past the compression age.
            compress_old_logs "$dir"
            
            # Purge logs past the retention window.
            cleanup_old_logs "$dir"
        else
            log "WARNING: Directory $dir not found"
        fi
    done
    
    log "=== Log Rotation Complete ==="
}

main "$@"

日志实时分析

#!/bin/bash
# log_analyzer.sh - access-log analysis

set -uo pipefail

# Configuration: the first CLI argument is the log file to analyze.
LOG_FILE=${1:-/var/log/nginx/access.log}
THRESHOLD_RPS=100  # NOTE(review): defined but not used by any function yet

# Timestamped logger.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Histogram of HTTP status codes (log field 9), most frequent first.
analyze_status_codes() {
    printf '%s\n' "=== Status Codes ==="
    awk '{ print $9 }' < "$LOG_FILE" | sort | uniq -c | sort -rn
}

# Ten most frequent client IPs (log field 1).
analyze_top_ips() {
    printf '%s\n' "=== Top 10 IPs ==="
    awk '{ print $1 }' < "$LOG_FILE" | sort | uniq -c | sort -rn | head -10
}

# Ten most requested URLs (log field 7).
analyze_top_pages() {
    printf '%s\n' "=== Top 10 Pages ==="
    awk '{ print $7 }' < "$LOG_FILE" | sort | uniq -c | sort -rn | head -10
}

# Average response time taken from the next-to-last "-quoted field.
# Fix: guard the division — the original crashed with an awk
# division-by-zero error on an empty log; we now print "Avg: N/A".
# NOTE(review): assumes the log_format puts the response time in the
# second-to-last quoted field — confirm against the nginx config.
analyze_response_time() {
    echo "=== Response Time Stats ==="
    awk -F'"' '{print $(NF-1)}' "$LOG_FILE" | \
        awk '{sum+=$1; count++} END {if (count > 0) print "Avg:", sum/count "ms"; else print "Avg: N/A"}'
}

# Count 5xx responses grouped by URL and status (fields 7 and 9).
analyze_errors() {
    printf '%s\n' "=== Error Analysis ==="
    awk '$9 >= 500 { print $7, $9 }' < "$LOG_FILE" | sort | uniq -c | sort -rn
}

# Placeholder for real-time monitoring: follows the log, but the
# per-minute request counting is not implemented yet (no-op loop body).
# NOTE(review): blocks forever on `tail -f`; the while loop runs in a
# pipeline subshell, so any counters set inside it would not persist.
realtime_monitor() {
    echo "=== Real-time Monitoring (Ctrl+C to stop) ==="
    tail -f "$LOG_FILE" | while read line; do
        # Count requests in the most recent minute (TODO: implement).
        :
    done
}

# Write a combined analysis report to report_<timestamp>.txt in the
# current working directory.
generate_report() {
    local report_file="report_$(date +%Y%m%d_%H%M%S).txt"
    
    # All section output is redirected into the report file as one group.
    {
        echo "=== Log Analysis Report ==="
        echo "Generated: $(date)"
        echo "Log file: $LOG_FILE"
        echo
        
        analyze_status_codes
        echo
        analyze_top_ips
        echo
        analyze_top_pages
        echo
        analyze_errors
    } > "$report_file"
    
    log "Report saved to $report_file"
}

# Dispatch on the requested action.
# Fix: the first CLI argument is consumed at the top of the script as
# LOG_FILE, but this dispatcher also read $1 as the action, so e.g.
# `log_analyzer.sh status` set LOG_FILE=status and never ran "status".
# The action is now the SECOND argument: $0 [logfile] [action].
main() {
    case "${2:-report}" in
        report)
            generate_report
            ;;
        status)
            analyze_status_codes
            ;;
        ips)
            analyze_top_ips
            ;;
        pages)
            analyze_top_pages
            ;;
        errors)
            analyze_errors
            ;;
        *)
            echo "Usage: $0 [logfile] {report|status|ips|pages|errors}"
            exit 1
            ;;
    esac
}

main "$@"

备份脚本

数据库备份

#!/bin/bash
# mysql_backup.sh - MySQL database backup

set -uo pipefail

# Configuration (every value can be overridden via the environment).
MYSQL_HOST=${MYSQL_HOST:-localhost}
MYSQL_PORT=${MYSQL_PORT:-3306}
MYSQL_USER=${MYSQL_USER:-backup}
MYSQL_PASSWORD=${MYSQL_PASSWORD:-}
BACKUP_DIR=${BACKUP_DIR:-/backup/mysql}
RETENTION_DAYS=${RETENTION_DAYS:-7}
DATABASES=${DATABASES:-"--all-databases"}  # space-separated DB names, or the literal --all-databases

# Timestamped logger.
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

# Dump one database to BACKUP_DIR as <db>_<timestamp>.sql.gz and verify
# the resulting archive.
# Globals: MYSQL_* connection settings, BACKUP_DIR.
# Arguments: $1 - database name.
# Returns 0 on success, 1 on dump or verification failure.
# NOTE(review): the password is passed on the command line (-p...), which
# is visible in `ps` output — consider --defaults-extra-file.
backup_database() {
    local db=$1
    local timestamp=$(date +%Y%m%d_%H%M%S)
    local backup_file="${BACKUP_DIR}/${db}_${timestamp}.sql.gz"
    
    log "Backing up database: $db"
    
    # `set -o pipefail` (top of script) makes this `if` catch failures of
    # mysqldump as well as gzip.
    if mysqldump -h "$MYSQL_HOST" -P "$MYSQL_PORT" -u "$MYSQL_USER" \
        -p"${MYSQL_PASSWORD}" --single-transaction --quick "$db" | \
        gzip > "$backup_file"; then
        
        local size=$(du -h "$backup_file" | cut -f1)
        log "Backup completed: $backup_file ($size)"
        
        # Integrity check of the compressed dump.
        if gunzip -t "$backup_file" 2>/dev/null; then
            log "Backup verified: $db"
        else
            log "ERROR: Backup verification failed for $db"
            return 1
        fi
    else
        log "ERROR: Backup failed for $db"
        return 1
    fi
}

# Remove backup archives older than RETENTION_DAYS from BACKUP_DIR.
cleanup_old_backups() {
    log "Cleaning backups older than $RETENTION_DAYS days..."
    find "$BACKUP_DIR" -mtime +"$RETENTION_DAYS" -name "*.sql.gz" -delete
    log "Cleanup completed"
}

# Restore a gzipped SQL dump into a database.
# Arguments: $1 - backup file (.sql.gz), $2 - target database name.
# NOTE(review): the target database must already exist, and the mysql
# exit status is not checked — "Restore completed" is logged regardless.
restore_database() {
    local backup_file=$1
    local db_name=$2
    
    log "Restoring $db_name from $backup_file..."
    
    gunzip -c "$backup_file" | mysql -h "$MYSQL_HOST" -P "$MYSQL_PORT" \
        -u "$MYSQL_USER" -p"${MYSQL_PASSWORD}" "$db_name"
    
    log "Restore completed"
}

main() {
    mkdir -p "$BACKUP_DIR"
    
    case "${1:-backup}" in
        backup)
            log "=== MySQL Backup Started ==="
            
            if [ "$DATABASES" = "--all-databases" ]; then
                backup_file="${BACKUP_DIR}/all_databases_$(date +%Y%m%d_%H%M%S).sql.gz"
                log "Backing up all databases..."
                
                mysqldump -h "$MYSQL_HOST" -P "$MYSQL_PORT" \
                    -u "$MYSQL_USER" -p"${MYSQL_PASSWORD}" \
                    --all-databases --single-transaction --quick | \
                    gzip > "$backup_file"
                
                log "All databases backed up to $backup_file"
            else
                for db in $DATABASES; do
                    backup_database "$db"
                done
            fi
            
            cleanup_old_backups
            log "=== MySQL Backup Complete ==="
            ;;
            
        restore)
            if [ -z "${2:-}" ] || [ -z "${3:-}" ]; then
                echo "Usage: $0 restore <backup_file> <database>"
                exit 1
            fi
            restore_database "$2" "$3"
            ;;
            
        list)
            ls -lh "$BACKUP_DIR"/*.sql.gz 2>/dev/null || echo "No backups found"
            ;;
            
        *)
            echo "Usage: $0 {backup|restore|list}"
            exit 1
            ;;
    esac
}

main "$@"

文件系统备份

#!/bin/bash
# fs_backup.sh - filesystem backup
# NOTE(review): the original header said "incremental", but each run
# creates a full tar of every source directory — confirm intent.

set -uo pipefail

# Configuration.
SOURCE_DIRS=("/home" "/etc" "/var/www")
BACKUP_DIR="/backup/fs"
REMOTE_HOST="backup-server"   # empty string disables the remote copy
REMOTE_PATH="/backup"
RETENTION_DAYS=30
EXCLUDE_FILE="/etc/backup_exclude.txt"  # optional tar exclude patterns

# Timestamped logger.
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    echo "[$stamp] $1"
}

# Create a timestamped backup of SOURCE_DIRS under BACKUP_DIR.
# Each source directory becomes <name>.tar.gz inside a staging directory,
# which is optionally rsynced to REMOTE_HOST and finally packed into a
# single backup_<timestamp>.tar.gz.
# Fixes: exclude options are now an array (the unquoted $exclude_opts
# string word-split on spaces in the path), the `cd` is error-checked,
# and the basename is computed once instead of twice.
create_backup() {
    local timestamp backup_name backup_path dir dir_name
    local -a exclude_opts=()

    timestamp=$(date +%Y%m%d_%H%M%S)
    backup_name="backup_${timestamp}"
    backup_path="${BACKUP_DIR}/${backup_name}"

    mkdir -p "$backup_path"

    # Build tar exclude options from the optional exclude file.
    if [ -f "$EXCLUDE_FILE" ]; then
        exclude_opts=(--exclude-from="$EXCLUDE_FILE")
    fi

    for dir in "${SOURCE_DIRS[@]}"; do
        if [ -d "$dir" ]; then
            dir_name=$(basename "$dir")
            log "Backing up $dir..."

            # Errors (e.g. unreadable files) are tolerated so one bad
            # directory does not abort the whole run.
            # ${arr[@]+...} keeps `set -u` happy on bash < 4.4 when empty.
            tar -czf "${backup_path}/${dir_name}.tar.gz" \
                -C "$(dirname "$dir")" \
                ${exclude_opts[@]+"${exclude_opts[@]}"} \
                "$dir_name" 2>/dev/null || true
        fi
    done

    # Record backup metadata alongside the archives.
    {
        echo "Backup Date: $(date)"
        echo "Source Dirs: ${SOURCE_DIRS[*]}"
        echo "Hostname: $(hostname)"
    } > "${backup_path}/backup_info.txt"

    log "Local backup created: $backup_path"

    # Optional off-host copy of the staging directory.
    if [ -n "$REMOTE_HOST" ]; then
        log "Copying to remote host..."
        rsync -avz --progress "$backup_path/" \
            "${REMOTE_HOST}:${REMOTE_PATH}/" || \
            log "WARNING: Remote backup failed"
    fi

    # Pack the staging directory into a single archive and remove it.
    cd "$BACKUP_DIR" || { log "ERROR: cannot cd to $BACKUP_DIR"; return 1; }
    tar -czf "${backup_name}.tar.gz" "$backup_name"
    rm -rf "$backup_name"

    log "Final backup: ${backup_path}.tar.gz"
}

# Remove packed backups older than RETENTION_DAYS (verbose removal).
cleanup_old_backups() {
    log "Cleaning backups older than $RETENTION_DAYS days..."

    find "$BACKUP_DIR" -mtime +"$RETENTION_DAYS" -name "backup_*.tar.gz" -exec rm -v {} +

    log "Cleanup completed"
}

# Check that a tar.gz archive is readable and intact.
# Returns 0 if the archive lists cleanly, 1 otherwise.
verify_backup() {
    local archive=$1

    log "Verifying backup: $archive"

    if ! tar -tzf "$archive" > /dev/null 2>&1; then
        log "ERROR: Backup verification failed: $archive"
        return 1
    fi

    log "Backup verified: $archive"
    return 0
}

# Extract an archive into restore_dir (defaults to /).
restore_backup() {
    local archive=$1
    local target=${2:-/}

    log "Restoring from $archive to $target..."
    tar -xzf "$archive" -C "$target"
    log "Restore completed"
}

# Entry point: dispatch backup / restore / verify / list.
# Fix: the restore branch previously ignored verify_backup's return value
# and extracted a possibly corrupt archive anyway; a failed verification
# now aborts the restore.
main() {
    mkdir -p "$BACKUP_DIR"
    
    case "${1:-backup}" in
        backup)
            log "=== Filesystem Backup Started ==="
            create_backup
            cleanup_old_backups
            log "=== Backup Complete ==="
            ;;
            
        restore)
            if [ -z "${2:-}" ]; then
                echo "Usage: $0 restore <backup_file> [restore_dir]"
                exit 1
            fi
            # Abort rather than restore from a corrupt archive.
            verify_backup "$2" || exit 1
            restore_backup "$2" "${3:-/}"
            ;;
            
        verify)
            if [ -z "${2:-}" ]; then
                echo "Usage: $0 verify <backup_file>"
                exit 1
            fi
            verify_backup "$2"
            ;;
            
        list)
            ls -lh "$BACKUP_DIR"/backup_*.tar.gz 2>/dev/null
            ;;
            
        *)
            echo "Usage: $0 {backup|restore|verify|list}"
            exit 1
            ;;
    esac
}

main "$@"

自动化部署脚本

应用部署脚本

#!/bin/bash
# deploy.sh - automated application deployment

set -uo pipefail

# Configuration.
APP_NAME="myapp"
APP_DIR="/opt/${APP_NAME}"
REPO_URL="git@github.com:org/${APP_NAME}.git"
REPO_BRANCH="main"
DEPLOY_USER="deploy"
ENVIRONMENT=${1:-staging}  # first CLI argument selects the environment
CONFIG_DIR="/etc/${APP_NAME}"

# ANSI colors for terminal output; NC resets the color.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Timestamped logger (escape sequences in the message are honored,
# matching the original `echo -e` behavior).
log() {
    printf '%b\n' "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Red error message on stderr.
error() {
    printf '%b\n' "${RED}[ERROR] $1${NC}" >&2
}

# Green success message.
success() {
    printf '%b\n' "${GREEN}[SUCCESS] $1${NC}"
}

# Snapshot the current deployment into /tmp before deploying; no-op when
# APP_DIR does not exist yet (first deploy).
backup_current() {
    local snapshot
    if [ -d "$APP_DIR" ]; then
        snapshot="/tmp/${APP_NAME}_backup_$(date +%Y%m%d_%H%M%S)"
        log "Backing up current deployment..."
        cp -a "$APP_DIR" "$snapshot"
        success "Backup created: $snapshot"
    fi
}

# Fetch the application source as DEPLOY_USER: pull if a clone already
# exists in APP_DIR, clone otherwise.
# NOTE(review): the `cd "$APP_DIR"` is not error-checked; if it fails,
# `git pull` would run in the caller's working directory.
clone_or_pull() {
    if [ -d "$APP_DIR/.git" ]; then
        log "Pulling latest changes..."
        cd "$APP_DIR"
        sudo -u "$DEPLOY_USER" git pull origin "$REPO_BRANCH"
    else
        log "Cloning repository..."
        sudo -u "$DEPLOY_USER" git clone -b "$REPO_BRANCH" "$REPO_URL" "$APP_DIR"
    fi
}

# Install application dependencies based on which manifest files exist
# in APP_DIR (node/python/php are all supported side by side).
install_dependencies() {
    log "Installing dependencies..."

    cd "$APP_DIR"

    if [[ -f package.json ]]; then
        npm install --production
    fi

    if [[ -f requirements.txt ]]; then
        pip install -r requirements.txt
    fi

    if [[ -f composer.json ]]; then
        composer install --no-dev
    fi
}

# Run database migrations if a known migration entry point exists in the
# current working directory (set by install_dependencies' cd to APP_DIR).
run_migrations() {
    log "Running database migrations..."

    if [[ -f manage.py ]]; then
        python manage.py migrate --noinput
    fi

    if [[ -f migrations.sh ]]; then
        ./migrations.sh
    fi
}

# Point the app at the environment-specific config via symlinks; no-op
# when CONFIG_DIR does not exist.
symlink_config() {
    log "Updating configuration..."

    if [[ -d "$CONFIG_DIR" ]]; then
        ln -sf "$CONFIG_DIR/${ENVIRONMENT}.env" "$APP_DIR/.env"
        ln -sf "$CONFIG_DIR/nginx.conf" "$APP_DIR/deploy/nginx.conf"
    fi
}

# Restart the application service and nginx.
# The app restart tolerates failure (|| true) so a missing unit does not
# trip main()'s ERR trap; the nginx restart is allowed to fail loudly.
restart_services() {
    log "Restarting services..."
    
    sudo systemctl restart "${APP_NAME}.service" || true
    sudo systemctl restart nginx
}

# Poll the app's health endpoint until it responds or attempts run out
# (30 attempts, 2 s apart — up to ~60 s). Returns 0 on success, 1 on
# timeout.
health_check() {
    log "Running health check..."

    local -r max_attempts=30
    local attempt

    for (( attempt = 0; attempt < max_attempts; attempt++ )); do
        if curl -sf "http://localhost:8080/health" > /dev/null 2>&1; then
            success "Health check passed!"
            return 0
        fi
        sleep 2
    done

    error "Health check failed after $max_attempts attempts"
    return 1
}

# Restore the most recent /tmp snapshot after a failed deployment.
# Fix: the original tested `[ -d /tmp/..._backup_* ]`; with several
# snapshots the glob expands to multiple words and `[` errors out
# ("binary operator expected"), and with none it tests the literal
# pattern. We now resolve the newest snapshot explicitly.
rollback() {
    error "Deployment failed! Rolling back..."

    local backup_dir
    backup_dir=$(ls -td "/tmp/${APP_NAME}_backup_"* 2>/dev/null | head -1)

    if [ -n "$backup_dir" ] && [ -d "$backup_dir" ]; then
        rm -rf "$APP_DIR"
        cp -a "$backup_dir" "$APP_DIR"
        restart_services
        success "Rollback completed"
    else
        error "No backup snapshot found; manual recovery required"
    fi
}

# Entry point: deploy APP_NAME to ENVIRONMENT with rollback on failure.
main() {
    log "=== Deployment Started: ${APP_NAME} to ${ENVIRONMENT} ==="
    
    # Root is required (service restarts, writes under /opt and /etc).
    if [ "$EUID" -ne 0 ]; then
        error "This script must be run as root"
        exit 1
    fi
    
    # NOTE(review): the ERR trap fires on failing simple commands run
    # directly in this function, but without `set -E` it is not inherited
    # inside the helper functions, and `set -e` is not enabled — confirm
    # the intended failure semantics.
    trap rollback ERR
    
    backup_current
    clone_or_pull
    install_dependencies
    symlink_config
    run_migrations
    restart_services
    
    if health_check; then
        success "Deployment completed successfully!"
    else
        rollback
        exit 1
    fi
    
    log "=== Deployment Finished ==="
}

main "$@"

定时任务设置

Crontab配置

# 编辑crontab
crontab -e

# 常用格式
# * * * * * command
# │ │ │ │ │
# │ │ │ │ └── 星期 (0-7, 0和7是周日)
# │ │ │ └──── 月份 (1-12)
# │ │ └────── 日 (1-31)
# │ └──────── 时 (0-23)
# └───────── 分 (0-59)

# 示例
# 每分钟执行
* * * * * /path/to/script.sh

# 每小时第5分钟
5 * * * * /path/to/script.sh

# 每天凌晨3点
0 3 * * * /path/to/backup.sh

# 每周日凌晨2点
0 2 * * 0 /path/to/weekly_task.sh

# 每月1日凌晨1点
0 1 1 * * /path/to/monthly_task.sh

# 每5分钟执行
*/5 * * * * /path/to/monitor.sh

# 特定时间范围
0 9-17 * * 1-5 /path/to/business_hours.sh

# 系统crontab
sudo vim /etc/crontab
# cron.daily/ 与 cron.hourly/ 是脚本目录(不是可编辑的单个文件),将可执行脚本放入即可
ls /etc/cron.daily/
ls /etc/cron.hourly/

cron管理

# 查看crontab
crontab -l

# 查看系统cron
ls -la /etc/cron.d/

# 删除crontab
crontab -r

# 备份crontab
crontab -l > crontab_backup.txt

# 恢复crontab
crontab crontab_backup.txt

# 查看cron日志
grep CRON /var/log/syslog
journalctl -u cron

课后练习

实践任务
  1. 根据实际环境修改并运行健康检查脚本
  2. 配置数据库备份脚本的定时任务
  3. 实现一套完整的日志分析脚本
  4. 编写自动化部署脚本并配置CI/CD
  5. 配置日志轮转的cron任务

下一篇预告:我们将学习系统安全加固,掌握Linux安全防护技能。