第七章:基础设施即代码 (IaC)
最后更新: 2024-01-01
作者: DevOps Team
页面目录
第七章:基础设施即代码 (IaC)
基础设施即代码 (Infrastructure as Code, IaC) 是 DevOps 的核心实践:通过代码来定义和管理基础设施,使基础设施的部署做到版本化、可重复、可测试。
7.1 IaC 核心概念
7.1.1 IaC 工作流
┌─────────────────────────────────────────────────────────────────┐
│ IaC 工作流程 │
│ │
│ 代码编写 ──→ 代码审查 ──→ CI 验证 ──→ 部署 ──→ 基础设施 │
│ ↑ │ │
│ └──────────────── 变更回滚 ◀──────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────┘
7.1.2 工具对比
| 工具 | 类型 | 语言 | 状态管理 | 适用场景 |
|---|---|---|---|---|
| Terraform | 声明式 | HCL | 有状态 | 多云基础设施 |
| Ansible | 过程式 | YAML | 无状态 | 配置管理、应用部署 |
| Pulumi | 声明式 | 多语言 (TypeScript/Python 等) | 有状态 | 开发者友好的 IaC,代码即基础设施 |
| CloudFormation | 声明式 | JSON/YAML | 有状态 | AWS 专用 |
7.2 Terraform 实战
7.2.1 Terraform 基础配置
# providers.tf
# Pins the Terraform CLI and provider versions and configures remote state.
terraform {
  required_version = ">= 1.6.0"

  # Remote state: kept in S3 with encryption enabled; the DynamoDB table is
  # used by the S3 backend for state locking.
  backend "s3" {
    region         = "us-east-1"
    bucket         = "my-terraform-state"
    key            = "prod/terraform.tfstate"
    dynamodb_table = "terraform-locks"
    encrypt        = true
  }

  # Providers used by this configuration, each constrained to a major line.
  required_providers {
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.0"
    }

    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.0"
    }

    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
  }
}
# Default AWS provider for us-east-1. The default_tags block attaches the
# listed tags to resources created through this provider.
provider "aws" {
  default_tags {
    tags = {
      Project     = "myapp"
      Environment = "production"
      ManagedBy   = "terraform"
    }
  }

  region = "us-east-1"
}
7.2.2 VPC 和网络配置
# vpc.tf
# The application VPC (10.0.0.0/16) with DNS support and DNS hostnames enabled.
resource "aws_vpc" "main" {
  cidr_block = "10.0.0.0/16"

  enable_dns_support   = true
  enable_dns_hostnames = true

  tags = {
    Name = "myapp-vpc"
  }
}
# Availability zones currently usable in the provider's region. The subnet
# resources index into this list, so it must be declared for
# data.aws_availability_zones.available to resolve.
data "aws_availability_zones" "available" {
  state = "available"
}

# Public subnets
# Three /20 blocks (indexes 0-2 of the VPC CIDR split by 4 extra bits), one
# per availability zone. Instances launched here get a public IP by default.
resource "aws_subnet" "public" {
  count = 3

  vpc_id                  = aws_vpc.main.id
  cidr_block              = cidrsubnet(aws_vpc.main.cidr_block, 4, count.index)
  availability_zone       = data.aws_availability_zones.available.names[count.index]
  map_public_ip_on_launch = true

  tags = {
    Name = "myapp-public-${count.index + 1}"
    Type = "public"
  }
}
# Private subnets
# Three /20 blocks taken from indexes 3-5 of the VPC CIDR (the offset of 3
# skips the blocks used by the public subnets), one per availability zone.
resource "aws_subnet" "private" {
  count = 3

  availability_zone = data.aws_availability_zones.available.names[count.index]
  cidr_block        = cidrsubnet(aws_vpc.main.cidr_block, 4, count.index + 3)
  vpc_id            = aws_vpc.main.id

  tags = {
    Type = "private"
    Name = "myapp-private-${count.index + 1}"
  }
}
# Internet Gateway
# Attached to the main VPC; referenced by the public route table's default route.
resource "aws_internet_gateway" "main" {
  tags = {
    Name = "myapp-igw"
  }

  vpc_id = aws_vpc.main.id
}
# Public route table
# Sends all non-local traffic (0.0.0.0/0) to the internet gateway.
resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id

  tags = {
    Name = "myapp-public-rt"
  }

  route {
    gateway_id = aws_internet_gateway.main.id
    cidr_block = "0.0.0.0/0"
  }
}
# Route table associations
# One association per public subnet, all pointing at the public route table.
resource "aws_route_table_association" "public" {
  count = length(aws_subnet.public)

  route_table_id = aws_route_table.public.id
  subnet_id      = aws_subnet.public[count.index].id
}
# NAT Gateway
# Elastic IP that the NAT gateway uses as its public address.
resource "aws_eip" "nat" {
  domain = "vpc"

  # The AWS provider documentation recommends an explicit dependency on the
  # VPC's internet gateway so creation and destruction are ordered correctly.
  depends_on = [aws_internet_gateway.main]
}

# A single NAT gateway placed in the first public subnet.
# NOTE(review): one NAT gateway serves all three AZs; consider one per AZ if
# cross-AZ resilience matters for this environment.
resource "aws_nat_gateway" "main" {
  allocation_id = aws_eip.nat.id
  subnet_id     = aws_subnet.public[0].id

  tags = {
    Name = "myapp-nat"
  }

  depends_on = [aws_internet_gateway.main]
}

# Private route table: without this default route the NAT gateway above is
# never used and the private subnets have no outbound internet path.
resource "aws_route_table" "private" {
  vpc_id = aws_vpc.main.id

  route {
    cidr_block     = "0.0.0.0/0"
    nat_gateway_id = aws_nat_gateway.main.id
  }

  tags = {
    Name = "myapp-private-rt"
  }
}

# Attach every private subnet to the private route table.
resource "aws_route_table_association" "private" {
  count = length(aws_subnet.private)

  subnet_id      = aws_subnet.private[count.index].id
  route_table_id = aws_route_table.private.id
}
7.2.3 EKS 集群配置
# eks.tf
# EKS cluster built with the terraform-aws-modules/eks module, using the
# input names of the 19.x release line pinned below.
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 19.0"

  cluster_name    = "myapp-eks"
  cluster_version = "1.28"

  vpc_id                   = aws_vpc.main.id
  subnet_ids               = concat(aws_subnet.private[*].id, aws_subnet.public[*].id)
  control_plane_subnet_ids = aws_subnet.private[*].id

  # EKS managed node groups
  eks_managed_node_groups = {
    default = {
      name = "default-node-group"

      # Managed node groups take a list of instance types; a singular
      # "instance_type" key is not read by the module.
      instance_types = ["t3.medium"]
      capacity_type  = "ON_DEMAND"

      min_size     = 2
      max_size     = 10
      desired_size = 3

      # Restrict this group to the private subnets (the module has no
      # "subnet_type" option; placement is controlled through subnet_ids).
      subnet_ids = aws_subnet.private[*].id

      labels = {
        role = "general"
      }

      tags = {
        Environment = "production"
      }
    }

    gpu = {
      name = "gpu-node-group"

      # GPU instances need the GPU-enabled EKS AMI to get the NVIDIA drivers.
      ami_type       = "AL2_x86_64_GPU"
      instance_types = ["g4dn.xlarge"]
      capacity_type  = "ON_DEMAND"

      # Scaled to zero until GPU workloads are scheduled.
      min_size     = 0
      max_size     = 5
      desired_size = 0

      labels = {
        role = "gpu-worker"
        gpu  = "nvidia"
      }

      # Keep non-GPU pods off these nodes unless they tolerate the taint.
      taints = [
        {
          key    = "nvidia.com/gpu"
          value  = "present"
          effect = "NO_SCHEDULE"
        }
      ]
    }
  }

  # Fargate profiles
  fargate_profiles = {
    app = {
      name = "app"
      selectors = [
        {
          namespace = "app"
          labels = {
            tier = "frontend"
          }
        }
      ]
    }
  }

  # Extra cluster security group rule: allow HTTPS to the API server from
  # inside the VPC. The v19 input is cluster_security_group_additional_rules
  # and each rule must state type, protocol and port range.
  # NOTE(review): the original rule gave only a CIDR; 443/tcp is assumed here.
  cluster_security_group_additional_rules = {
    ingress_vpc_https = {
      description = "HTTPS to the cluster API from within the VPC"
      type        = "ingress"
      protocol    = "tcp"
      from_port   = 443
      to_port     = 443
      cidr_blocks = ["10.0.0.0/16"]
    }
  }

  # IAM Roles for Service Accounts: creates the cluster's OIDC provider.
  enable_irsa = true

  # Envelope-encrypt Kubernetes secrets.
  cluster_encryption_config = {
    resources = ["secrets"]
  }

  # Tags placed on the node security group. Tag values are strings, and the
  # cluster name is written literally because no local.cluster_name is
  # declared alongside this module call.
  # NOTE(review): Cluster Autoscaler auto-discovery normally reads these tags
  # from the node groups' Auto Scaling groups - confirm the intended target.
  node_security_group_tags = {
    "k8s.io/cluster-autoscaler/enabled"   = "true"
    "k8s.io/cluster-autoscaler/myapp-eks" = "shared"
  }
}
7.2.4 Terraform 模块化
# modules/vpc/main.tf
# Reusable VPC module: one VPC plus a public and a private subnet per
# availability zone passed in by the caller.
# NOTE(review): gateways and route tables are not part of this module; the
# "public" subnets only become internet-routable once those are added.

variable "vpc_cidr" {
  description = "VPC CIDR block"
  type        = string
}

variable "environment" {
  description = "Environment name"
  type        = string
}

variable "availability_zones" {
  description = "Availability zones"
  type        = list(string)
}

resource "aws_vpc" "main" {
  cidr_block = var.vpc_cidr

  tags = {
    Name        = "${var.environment}-vpc"
    Environment = var.environment
  }
}

# Public subnets: one per availability zone, using the first N blocks of the
# VPC CIDR split by 4 extra bits. These back the "public_subnets" output.
resource "aws_subnet" "public" {
  count = length(var.availability_zones)

  vpc_id                  = aws_vpc.main.id
  cidr_block              = cidrsubnet(var.vpc_cidr, 4, count.index)
  availability_zone       = var.availability_zones[count.index]
  map_public_ip_on_launch = true

  tags = {
    Name        = "${var.environment}-public-${count.index + 1}"
    Environment = var.environment
    Type        = "public"
  }
}

# Private subnets: one per availability zone, using the blocks that follow
# the public ones. These back the "private_subnets" output.
resource "aws_subnet" "private" {
  count = length(var.availability_zones)

  vpc_id            = aws_vpc.main.id
  cidr_block        = cidrsubnet(var.vpc_cidr, 4, count.index + length(var.availability_zones))
  availability_zone = var.availability_zones[count.index]

  tags = {
    Name        = "${var.environment}-private-${count.index + 1}"
    Environment = var.environment
    Type        = "private"
  }
}

output "vpc_id" {
  value = aws_vpc.main.id
}

output "private_subnets" {
  value = aws_subnet.private[*].id
}

output "public_subnets" {
  value = aws_subnet.public[*].id
}
# Using the module
# Instantiates the local VPC module for production across three us-east-1 zones.
module "vpc" {
  source = "./modules/vpc"

  environment        = "production"
  availability_zones = ["us-east-1a", "us-east-1b", "us-east-1c"]
  vpc_cidr           = "10.0.0.0/16"
}
7.3 Ansible 实战
7.3.1 Ansible 配置
# ansible.cfg
# Project-level Ansible settings: inventory location, fact caching,
# SSH connection tuning and privilege escalation defaults.
[defaults]
inventory = ./inventory
# Verify SSH host keys. Disabling this check lets a machine-in-the-middle
# impersonate the statically addressed production hosts; pre-populate
# known_hosts instead of turning verification off.
host_key_checking = True
retry_files_enabled = False
# Gather facts only for hosts that have no cached facts yet.
gathering = smart
fact_caching = jsonfile
# NOTE(review): /tmp is world-writable; consider a directory owned by the
# user running Ansible for the fact cache.
fact_caching_connection = /tmp/ansible_facts
# Cached facts expire after one hour.
fact_caching_timeout = 3600
interpreter_python = auto_silent
callbacks_enabled = profile_tasks, timer

[ssh_connection]
# Reuse SSH connections for 60 seconds and forward the local SSH agent.
ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o ForwardAgent=yes
pipelining = True

[privilege_escalation]
# Run tasks as root via sudo without prompting for a password.
become = True
become_method = sudo
become_user = root
become_ask_pass = False
7.3.2 Inventory 配置
# inventory/production.yml
# Static inventory with two top-level groups: production (three web hosts
# plus nested "databases" and "cache" groups) and development.
all:
  children:
    # Production environment
    production:
      hosts:
        web01.prod.example.com: { ansible_host: 10.0.1.10, ansible_user: ubuntu }
        web02.prod.example.com: { ansible_host: 10.0.1.11, ansible_user: ubuntu }
        web03.prod.example.com: { ansible_host: 10.0.1.12, ansible_user: ubuntu }
      children:
        databases:
          hosts:
            db01.prod.example.com: { ansible_host: 10.0.2.10, ansible_user: ubuntu, db_port: 5432 }
        cache:
          hosts:
            redis01.prod.example.com: { ansible_host: 10.0.3.10, ansible_user: ubuntu }

    # Development environment
    development:
      vars:
        env: development
        app_version: dev
      hosts:
        dev01.example.com: { ansible_host: 10.0.10.10 }
7.3.3 Playbook 示例
# playbooks/webapp.yml
---
# Rolling deployment of the web application: prepares the host, installs the
# systemd unit and configuration, starts the service and verifies its health.
# NOTE(review): the play targets the whole "production" group; if that group
# also nests database/cache hosts in your inventory, confirm they should
# receive the web application too.
- name: Deploy Web Application
  hosts: production
  serial: 1  # rolling update: one host per batch
  max_fail_percentage: 25

  vars:
    app_dir: /opt/myapp
    # lookup('env', ...) returns an empty string (not an undefined value) when
    # the variable is unset, so default() needs its second argument set to
    # true to fall back to 'latest' for empty values as well.
    app_version: "{{ lookup('env', 'APP_VERSION') | default('latest', true) }}"
    app_port: 8080

  pre_tasks:
    - name: Ensure prerequisites are installed
      package:
        name:
          - curl
          - git
        state: present

    - name: Create application user
      user:
        name: appuser
        system: yes
        create_home: no
        shell: /bin/false

    - name: Get deployment facts
      setup:
        filter: ansible_distribution_major_version

  tasks:
    - name: Create application directory
      file:
        path: "{{ app_dir }}"
        state: directory
        owner: appuser
        group: appuser
        mode: '0755'

    # deployment_strategy is not defined in this play; default() keeps the
    # condition from failing with an undefined-variable error when no
    # inventory or extra var supplies it.
    - name: Pull application image
      docker_image:
        name: "myregistry/myapp:{{ app_version }}"
        source: pull
        force_source: yes
      when: deployment_strategy | default('') == "blue_green"

    - name: Copy systemd service file
      template:
        src: templates/myapp.service.j2
        dest: /etc/systemd/system/myapp.service
        owner: root
        group: root
        mode: '0644'
      notify: reload systemd

    - name: Configure application
      template:
        src: templates/app.config.j2
        dest: "{{ app_dir }}/config.yaml"
        owner: appuser
        group: appuser
        mode: '0600'
      notify: restart app

    - name: Start application
      systemd:
        name: myapp
        state: started
        enabled: yes
        daemon_reload: yes

    # Run the notified handlers now rather than at the end of the play, so the
    # health check below probes the service after it picked up the new unit
    # file and configuration.
    - name: Apply pending service changes before verification
      meta: flush_handlers

    - name: Wait for application to be ready
      uri:
        url: "http://localhost:{{ app_port }}/health"
        status_code: 200
      register: health_check
      until: health_check.status == 200
      retries: 30
      delay: 2

    - name: Verify deployment
      assert:
        that:
          - ansible_facts['distribution'] == 'Ubuntu'
          - app_version | length > 0

  handlers:
    # Handlers run in the order they are defined here, not in notification
    # order, so the daemon reload is listed before the restart.
    - name: reload systemd
      systemd:
        daemon_reload: yes

    - name: restart app
      systemd:
        name: myapp
        state: restarted
7.3.4 Role 结构
# roles/webserver 结构
# roles/webserver/
# ├── defaults/
# │ └── main.yml
# ├── files/
# │ └── nginx.conf
# ├── handlers/
# │ └── main.yml
# ├── meta/
# │ └── main.yml
# ├── tasks/
# │ └── main.yml
# ├── templates/
# │ └── nginx.conf.j2
# └── vars/
# └── main.yml
# roles/webserver/tasks/main.yml
---
# Installs Nginx, renders its main configuration from the role template and
# links the default site into sites-enabled. Configuration changes notify the
# "reload nginx" handler.
- name: Install Nginx
  ansible.builtin.apt:
    update_cache: true
    name: nginx
    state: present

- name: Configure Nginx
  ansible.builtin.template:
    src: nginx.conf.j2
    dest: /etc/nginx/nginx.conf
    mode: '0644'
    owner: root
    group: root
  notify: reload nginx

- name: Enable Nginx site
  ansible.builtin.file:
    state: link
    src: /etc/nginx/sites-available/default
    dest: /etc/nginx/sites-enabled/default
  notify: reload nginx
7.4 CI/CD 集成
7.4.1 Terraform CI/CD
# .github/workflows/terraform.yml
# Validates and plans Terraform changes on pull requests and on pushes to
# main; applies the saved plan only for pushes to main.
name: Terraform CI/CD

on:
  push:
    branches: [main]
    paths: ['terraform/**']
  pull_request:
    paths: ['terraform/**']

jobs:
  terraform:
    runs-on: ubuntu-latest
    permissions:
      contents: read

    # Set once at job level so init, plan and apply all see the same value;
    # a plan without it would fail under -input=false if the variable has no
    # default.
    env:
      TF_VAR_environment: production

    # Every "run" step executes inside the terraform/ directory.
    defaults:
      run:
        working-directory: terraform

    steps:
      - uses: actions/checkout@v4

      - uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: 1.6.6

      # NOTE(review): long-lived access keys are used here; assuming an IAM
      # role through GitHub OIDC avoids storing static credentials.
      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: us-east-1

      - name: Init Terraform
        run: terraform init -input=false

      - name: Validate Terraform
        run: terraform validate

      # A failing plan must fail the job: with continue-on-error the apply
      # step would still run after a broken plan. The plan is saved so that
      # apply executes exactly what was planned.
      - name: Plan Terraform
        run: terraform plan -input=false -out=tfplan

      # Applying a saved plan file needs no -auto-approve.
      - name: Apply Terraform
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        run: terraform apply -input=false tfplan
7.5 本章小结
| 工具 | 核心能力 | 最佳实践 |
|---|---|---|
| Terraform | 跨云基础设施编排 | 远程状态、模块化、CI/CD 集成 |
| Ansible | 配置管理、应用部署 | 幂等性 (Idempotency)、Roles、Inventory 管理 |
📌 下一章预告
下一章我们将学习 监控与告警体系,包括:
- Prometheus 指标采集
- Grafana 可视化
- 告警规则配置
- 事件响应流程
💡 提示:IaC 的核心价值在于"代码即文档",确保所有基础设施变更都通过代码进行,并在 CI/CD 中验证。