第七章:基础设施即代码 (IaC)

最后更新: 2024-01-01 作者: DevOps Team
页面目录

第七章:基础设施即代码 (IaC)

基础设施即代码 (Infrastructure as Code, IaC) 是 DevOps 的核心实践:通过代码来定义和管理基础设施,使基础设施可以进行版本化管理,并实现可重复、可测试的部署。


7.1 IaC 核心概念

7.1.1 IaC 工作流

┌─────────────────────────────────────────────────────────────────┐
│                        IaC 工作流程                              │
│                                                                 │
│    代码编写 ──→ 代码审查 ──→ CI 验证 ──→ 部署 ──→ 基础设施        │
│         ↑                                              │         │
│         └──────────────── 变更回滚 ◀──────────────────┘         │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

7.1.2 工具对比

工具 类型 语言 状态管理 适用场景
Terraform 声明式 HCL 有状态 多云基础设施
Ansible 过程式 YAML 无状态 配置管理、应用部署
Pulumi 声明式 多语言(TypeScript/Python 等) 有状态 开发者友好的 IaC,代码即基础设施
CloudFormation 声明式 JSON/YAML 有状态 AWS 专用

7.2 Terraform 实战

7.2.1 Terraform 基础配置

# providers.tf
# Pins the Terraform CLI and provider versions, configures remote state in S3,
# and sets up the default AWS provider.
terraform {
  # Terraform CLI 1.6.0 or newer is required.
  required_version = ">= 1.6.0"
  
  # "~> 5.0" / "~> 2.0" accept any release within the same major version.
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.0"
    }
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.0"
    }
  }
  
  # Remote state: stored encrypted in S3, with a DynamoDB table for locking.
  backend "s3" {
    bucket         = "my-terraform-state"
    key            = "prod/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-locks"
  }
}

# Default AWS provider for us-east-1.
provider "aws" {
  region = "us-east-1"
  
  # Tags declared here are applied by the provider to the resources it manages,
  # in addition to each resource's own tags.
  default_tags {
    tags = {
      Environment = "production"
      ManagedBy   = "terraform"
      Project     = "myapp"
    }
  }
}

7.2.2 VPC 和网络配置

# vpc.tf
# Main VPC: a 10.0.0.0/16 address space with DNS resolution and DNS hostnames
# enabled.
resource "aws_vpc" "main" {
  cidr_block           = "10.0.0.0/16"
  enable_dns_hostnames = true
  enable_dns_support   = true
  
  tags = {
    Name = "myapp-vpc"
  }
}

# Availability zones of the provider's region. The subnet resources index into
# this list, so it has to be declared for the configuration to validate.
data "aws_availability_zones" "available" {
  state = "available"
}

# Public subnets
# Three subnets, one per availability zone. cidrsubnet() with 4 extra prefix
# bits carves /20 blocks out of the VPC's /16; the public subnets use block
# numbers 0-2. Instances launched here receive a public IP automatically.
resource "aws_subnet" "public" {
  count                   = 3
  vpc_id                  = aws_vpc.main.id
  cidr_block              = cidrsubnet(aws_vpc.main.cidr_block, 4, count.index)
  availability_zone       = data.aws_availability_zones.available.names[count.index]
  map_public_ip_on_launch = true

  tags = {
    Name = "myapp-public-${count.index + 1}"
    Type = "public"
  }
}

# Private subnets
# Three subnets, one per availability zone, carved from the VPC CIDR with
# cidrsubnet(); the "+ 3" offset selects block numbers 3-5 so they do not
# overlap the public subnets (blocks 0-2). No public IPs are assigned.
resource "aws_subnet" "private" {
  count             = 3
  vpc_id            = aws_vpc.main.id
  cidr_block        = cidrsubnet(aws_vpc.main.cidr_block, 4, count.index + 3)
  availability_zone = data.aws_availability_zones.available.names[count.index]
  
  tags = {
    Name = "myapp-private-${count.index + 1}"
    Type = "private"
  }
}

# Internet Gateway
# Attached to the main VPC; target of the public default route below.
resource "aws_internet_gateway" "main" {
  vpc_id = aws_vpc.main.id
  
  tags = {
    Name = "myapp-igw"
  }
}

# Public route table
# Sends all non-local traffic (0.0.0.0/0) to the internet gateway.
resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id
  
  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.main.id
  }
  
  tags = {
    Name = "myapp-public-rt"
  }
}

# Route table associations
# One association per public subnet, binding it to the public route table.
resource "aws_route_table_association" "public" {
  count          = length(aws_subnet.public)
  subnet_id      = aws_subnet.public[count.index].id
  route_table_id = aws_route_table.public.id
}

# NAT Gateway
# Elastic IP used as the NAT gateway's public address.
resource "aws_eip" "nat" {
  domain = "vpc"

  # The EIP and NAT gateway only work once the internet gateway exists, and
  # Terraform cannot infer that ordering from attribute references alone.
  depends_on = [aws_internet_gateway.main]
}

# A single NAT gateway placed in the first public subnet.
# NOTE(review): one NAT gateway means one availability zone carries all private
# egress; consider one per AZ if zone-level resilience is required.
resource "aws_nat_gateway" "main" {
  allocation_id = aws_eip.nat.id
  subnet_id     = aws_subnet.public[0].id

  tags = {
    Name = "myapp-nat"
  }

  depends_on = [aws_internet_gateway.main]
}

# Private route table
# Without this default route the private subnets have no outbound path and the
# NAT gateway above is never used.
resource "aws_route_table" "private" {
  vpc_id = aws_vpc.main.id

  route {
    cidr_block     = "0.0.0.0/0"
    nat_gateway_id = aws_nat_gateway.main.id
  }

  tags = {
    Name = "myapp-private-rt"
  }
}

# One association per private subnet, binding it to the private route table.
resource "aws_route_table_association" "private" {
  count          = length(aws_subnet.private)
  subnet_id      = aws_subnet.private[count.index].id
  route_table_id = aws_route_table.private.id
}

7.2.3 EKS 集群配置

# eks.tf
# EKS cluster built with the terraform-aws-modules/eks module. All input names
# below follow the module's v19.x interface.
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 19.0"

  cluster_name    = "myapp-eks"
  cluster_version = "1.28"

  vpc_id                   = aws_vpc.main.id
  subnet_ids               = concat(aws_subnet.private[*].id, aws_subnet.public[*].id)
  control_plane_subnet_ids = aws_subnet.private[*].id

  # EKS managed node groups
  eks_managed_node_groups = {
    default = {
      name = "default-node-group"

      # The module expects a list of instance types.
      instance_types = ["t3.medium"]
      capacity_type  = "ON_DEMAND"

      min_size     = 2
      max_size     = 10
      desired_size = 3

      # Keep general-purpose nodes in the private subnets only.
      subnet_ids = aws_subnet.private[*].id

      labels = {
        role = "general"
      }

      tags = {
        Environment = "production"
      }
    }

    gpu = {
      name = "gpu-node-group"

      # GPU instances need the GPU-enabled EKS AMI.
      ami_type       = "AL2_x86_64_GPU"
      instance_types = ["g4dn.xlarge"]
      capacity_type  = "ON_DEMAND"

      # Scales from zero; nodes are only created when GPU workloads need them.
      min_size     = 0
      max_size     = 5
      desired_size = 0

      labels = {
        role = "gpu-worker"
        gpu  = "nvidia"
      }

      # Keep non-GPU pods off these nodes unless they tolerate the taint.
      taints = [
        {
          key    = "nvidia.com/gpu"
          value  = "present"
          effect = "NO_SCHEDULE"
        }
      ]
    }
  }

  # Fargate profiles
  fargate_profiles = {
    app = {
      name = "app"

      # Fargate only supports private subnets, so do not inherit the
      # module-level subnet_ids (which also contain the public subnets).
      subnet_ids = aws_subnet.private[*].id

      selectors = [
        {
          namespace = "app"
          labels = {
            tier = "frontend"
          }
        }
      ]
    }
  }

  # Additional cluster security group rules
  # Allows clients inside the VPC to reach the Kubernetes API over HTTPS.
  cluster_security_group_additional_rules = {
    ingress_vpc_https = {
      description = "Kubernetes API access from within the VPC"
      type        = "ingress"
      protocol    = "tcp"
      from_port   = 443
      to_port     = 443
      cidr_blocks = ["10.0.0.0/16"]
    }
  }

  # Enable supporting features
  # IAM Roles for Service Accounts (creates the cluster's OIDC provider).
  enable_irsa = true

  # Envelope-encrypt Kubernetes secrets with a KMS key.
  cluster_encryption_config = {
    resources = ["secrets"]
  }

  # Node security group tags
  # The second key must contain the cluster name set in cluster_name above.
  node_security_group_tags = {
    "k8s.io/cluster-autoscaler/enabled"   = "true"
    "k8s.io/cluster-autoscaler/myapp-eks" = "shared"
  }
}

7.2.4 Terraform 模块化

# modules/vpc/main.tf
# Input variables of the VPC module. None of them has a default, so callers
# must supply all three.

# CIDR block assigned to the module's VPC.
variable "vpc_cidr" {
  description = "VPC CIDR block"
  type        = string
}

# Environment name; used in the VPC's Name and Environment tags.
variable "environment" {
  description = "Environment name"
  type        = string
}

# List of availability zone names.
variable "availability_zones" {
  description = "Availability zones"
  type        = list(string)
}

# VPC for the given environment.
resource "aws_vpc" "main" {
  cidr_block = var.vpc_cidr

  tags = {
    Name        = "${var.environment}-vpc"
    Environment = var.environment
  }
}

# Public subnets: one per entry in var.availability_zones. cidrsubnet() adds
# 4 prefix bits to the VPC CIDR; public subnets take the first block numbers.
resource "aws_subnet" "public" {
  count                   = length(var.availability_zones)
  vpc_id                  = aws_vpc.main.id
  cidr_block              = cidrsubnet(var.vpc_cidr, 4, count.index)
  availability_zone       = var.availability_zones[count.index]
  map_public_ip_on_launch = true

  tags = {
    Name        = "${var.environment}-public-${count.index + 1}"
    Environment = var.environment
    Type        = "public"
  }
}

# Private subnets: one per availability zone, numbered after the public
# blocks so the two ranges never overlap.
resource "aws_subnet" "private" {
  count             = length(var.availability_zones)
  vpc_id            = aws_vpc.main.id
  cidr_block        = cidrsubnet(var.vpc_cidr, 4, count.index + length(var.availability_zones))
  availability_zone = var.availability_zones[count.index]

  tags = {
    Name        = "${var.environment}-private-${count.index + 1}"
    Environment = var.environment
    Type        = "private"
  }
}

# ID of the VPC created by this module.
output "vpc_id" {
  value = aws_vpc.main.id
}

# IDs of the private subnets, in availability-zone order.
output "private_subnets" {
  value = aws_subnet.private[*].id
}

# IDs of the public subnets, in availability-zone order.
output "public_subnets" {
  value = aws_subnet.public[*].id
}

# Using the module
# Instantiates the local VPC module for the production environment across
# three us-east-1 availability zones.
module "vpc" {
  source = "./modules/vpc"

  vpc_cidr           = "10.0.0.0/16"
  environment        = "production"
  availability_zones = ["us-east-1a", "us-east-1b", "us-east-1c"]
}

7.3 Ansible 实战

7.3.1 Ansible 配置

# ansible.cfg
[defaults]
# Default inventory location.
inventory = ./inventory
# NOTE(review): turns off SSH host key verification; confirm this is acceptable
# for production hosts.
host_key_checking = False
retry_files_enabled = False
# Fact gathering policy plus a JSON-file fact cache whose entries expire after
# 3600 seconds.
# NOTE(review): the cache lives under /tmp; confirm that location is acceptable.
gathering = smart
fact_caching = jsonfile
fact_caching_connection = /tmp/ansible_facts
fact_caching_timeout = 3600
interpreter_python = auto_silent
# Callback plugins enabled for every run.
callbacks_enabled = profile_tasks, timer

[ssh_connection]
# Reuses SSH connections (ControlPersist=60s) and forwards the local SSH agent.
# NOTE(review): ForwardAgent=yes exposes the agent to every managed host;
# confirm it is needed.
ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o ForwardAgent=yes
pipelining = True

[privilege_escalation]
# Tasks escalate to root through sudo without prompting for a password.
become = True
become_method = sudo
become_user = root
become_ask_pass = False

7.3.2 Inventory 配置

# inventory/production.yml
# Host inventory. Every production host belongs to exactly one role group
# (webservers, databases, cache), and all of them remain members of the
# parent "production" group.
all:
  children:
    # Production environment
    production:
      children:
        # Web tier. Grouped separately so plays can target the web hosts
        # without also matching the database and cache hosts.
        webservers:
          hosts:
            web01.prod.example.com:
              ansible_host: 10.0.1.10
              ansible_user: ubuntu
            web02.prod.example.com:
              ansible_host: 10.0.1.11
              ansible_user: ubuntu
            web03.prod.example.com:
              ansible_host: 10.0.1.12
              ansible_user: ubuntu

        databases:
          hosts:
            db01.prod.example.com:
              ansible_host: 10.0.2.10
              ansible_user: ubuntu
              db_port: 5432

        cache:
          hosts:
            redis01.prod.example.com:
              ansible_host: 10.0.3.10
              ansible_user: ubuntu

    # Development environment
    development:
      vars:
        env: development
        app_version: dev
      hosts:
        dev01.example.com:
          ansible_host: 10.0.10.10

7.3.3 Playbook 示例

# playbooks/webapp.yml
# Rolling deployment of the web application: prepares the host, installs the
# service unit and configuration, (re)starts the service, then health-checks it.
---
- name: Deploy Web Application
  # Target only the web hosts: the production group also contains the
  # databases and cache child groups, which must not receive the application.
  hosts: "production:!databases:!cache"
  serial: 1  # Rolling update: one host at a time
  max_fail_percentage: 25

  vars:
    app_dir: /opt/myapp
    # lookup('env', ...) returns an empty string when the variable is unset,
    # so default() needs its second argument to fall back to 'latest'.
    app_version: "{{ lookup('env', 'APP_VERSION') | default('latest', true) }}"
    app_port: 8080

  pre_tasks:
    - name: Ensure prerequisites are installed
      package:
        name:
          - curl
          - git
        state: present

    - name: Create application user
      user:
        name: appuser
        system: yes
        create_home: no
        shell: /bin/false

    - name: Get deployment facts
      setup:
        filter: ansible_distribution_major_version

  tasks:
    - name: Create application directory
      file:
        path: "{{ app_dir }}"
        state: directory
        owner: appuser
        group: appuser
        mode: '0755'

    - name: Pull application image
      docker_image:
        name: "myregistry/myapp:{{ app_version }}"
        source: pull
        force_source: yes
      # deployment_strategy is optional; treat "not set" as a rolling deploy
      # instead of failing on an undefined variable.
      when: (deployment_strategy | default('rolling')) == "blue_green"

    - name: Copy systemd service file
      template:
        src: templates/myapp.service.j2
        dest: /etc/systemd/system/myapp.service
        owner: root
        group: root
        mode: '0644'
      notify: reload systemd

    - name: Configure application
      template:
        src: templates/app.config.j2
        dest: "{{ app_dir }}/config.yaml"
        owner: appuser
        group: appuser
        mode: '0600'
      notify: restart app

    # Run notified handlers now. Otherwise the restart would only happen at
    # the end of the play, after the health check had already validated the
    # old process.
    - name: Apply pending service changes
      meta: flush_handlers

    - name: Start application
      systemd:
        name: myapp
        state: started
        enabled: yes
        daemon_reload: yes

    # Polls the health endpoint up to 30 times, 2 seconds apart.
    - name: Wait for application to be ready
      uri:
        url: "http://localhost:{{ app_port }}/health"
        status_code: 200
      register: health_check
      until: health_check.status == 200
      retries: 30
      delay: 2

    - name: Verify deployment
      assert:
        that:
          - ansible_facts['distribution'] == 'Ubuntu'
          - app_version | length > 0

  # Handlers run in the order they are defined here, so systemd must reload
  # its unit files before the application is restarted.
  handlers:
    - name: reload systemd
      systemd:
        daemon_reload: yes

    - name: restart app
      systemd:
        name: myapp
        state: restarted

7.3.4 Role 结构

# roles/webserver 结构
# roles/webserver/
# ├── defaults/
# │   └── main.yml
# ├── files/
# │   └── nginx.conf
# ├── handlers/
# │   └── main.yml
# ├── meta/
# │   └── main.yml
# ├── tasks/
# │   └── main.yml
# ├── templates/
# │   └── nginx.conf.j2
# └── vars/
#     └── main.yml
# roles/webserver/tasks/main.yml
# Installs Nginx, renders its main configuration file, and enables the default
# site. Both configuration tasks notify the "reload nginx" handler.
---
# Refreshes the apt package index before installing.
- name: Install Nginx
  apt:
    name: nginx
    state: present
    update_cache: yes

# Renders nginx.conf.j2 to /etc/nginx/nginx.conf, owned by root, mode 0644.
- name: Configure Nginx
  template:
    src: nginx.conf.j2
    dest: /etc/nginx/nginx.conf
    owner: root
    group: root
    mode: '0644'
  notify: reload nginx

# Creates the sites-enabled/default symlink pointing at sites-available/default.
- name: Enable Nginx site
  file:
    src: /etc/nginx/sites-available/default
    dest: /etc/nginx/sites-enabled/default
    state: link
  notify: reload nginx

7.4 CI/CD 集成

7.4.1 Terraform CI/CD

# .github/workflows/terraform.yml
# Validates and plans Terraform changes on pull requests and on pushes to main;
# pushes to main additionally apply the plan that was just produced.
name: Terraform CI/CD

on:
  push:
    branches: [main]
    paths: ['terraform/**']
  pull_request:
    paths: ['terraform/**']

jobs:
  terraform:
    runs-on: ubuntu-latest
    permissions:
      contents: read

    # Every run step executes inside the terraform/ directory.
    defaults:
      run:
        working-directory: terraform

    # Job-level so that plan and apply see the same variable values.
    env:
      TF_VAR_environment: production
      TF_IN_AUTOMATION: "true"

    steps:
      - uses: actions/checkout@v4

      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "1.6.6"

      # NOTE(review): long-lived access keys; consider OIDC role assumption
      # (needs the id-token: write permission and an IAM role).
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1

      - name: Init Terraform
        run: terraform init -input=false

      - name: Validate Terraform
        run: terraform validate

      # A failed plan must fail the job; otherwise the apply step below would
      # still run on main. The plan is saved so that apply executes exactly
      # what was planned.
      - name: Plan Terraform
        run: terraform plan -input=false -out=tfplan

      # Applying a saved plan file needs no interactive approval.
      - name: Apply Terraform
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        run: terraform apply -input=false tfplan

7.5 本章小结

工具 核心能力 最佳实践
Terraform 跨云基础设施编排 远程状态、模块化、CI/CD 集成
Ansible 配置管理、应用部署 幂等性 (Idempotency)、Roles、Inventory 管理

📌 下一章预告

下一章我们将学习 监控与告警体系,包括:

  • Prometheus 指标采集
  • Grafana 可视化
  • 告警规则配置
  • 事件响应流程

💡 提示:IaC 的核心价值在于"代码即文档",确保所有基础设施变更都通过代码进行,并在 CI/CD 中验证。