第七章:聚合分析

学习 Elasticsearch 的聚合分析功能,包括指标聚合、桶聚合和管道聚合。

最后更新: 2024-01-15
页面目录

第七章:聚合分析

7.1 聚合概述

Elasticsearch 聚合功能强大,支持对数据进行统计分析、分类汇总等操作。

7.1.1 聚合类型

Aggregations
├── 指标聚合 (Metrics Aggregations)
│   ├── avg, sum, min, max, value_count
│   ├── stats, extended_stats
│   ├── cardinality, percentiles, percentile_ranks
│   └── geo_bounds, geo_centroid
├── 桶聚合 (Bucket Aggregations)
│   ├── terms, significant_terms
│   ├── range, date_range, histogram, date_histogram
│   ├── filter, filters, global, missing
│   └── adjacency_matrix, auto_date_histogram
└── 管道聚合 (Pipeline Aggregations)
    ├── cumulative_sum, cumulative_cardinality
    ├── derivative, moving_avg, moving_fn
    └── bucket_sort, bucket_selector, bucket_script

7.2 指标聚合

7.2.1 基本统计

# 单值聚合
GET /products/_search
{
  "size": 0,
  "aggs": {
    "avg_price": { "avg": { "field": "price" } },
    "max_price": { "max": { "field": "price" } },
    "min_price": { "min": { "field": "price" } },
    "sum_price": { "sum": { "field": "price" } }
  }
}

# 完整统计
GET /products/_search
{
  "size": 0,
  "aggs": {
    "price_stats": {
      "stats": { "field": "price" }
    }
  }
}

# 响应
{
  "aggregations": {
    "price_stats": {
      "count": 100,
      "min": 99.0,
      "max": 9999.0,
      "avg": 2500.5,
      "sum": 250050.0
    }
  }
}

7.2.2 扩展统计

# sigma 为标准差倍数,用于计算 std_deviation_bounds(均值 ± sigma × 标准差),默认值为 2
GET /products/_search
{
  "size": 0,
  "aggs": {
    "price_extended_stats": {
      "extended_stats": {
        "field": "price",
        "sigma": 2
      }
    }
  }
}

7.2.3 去重计数

# 唯一值数量
GET /products/_search
{
  "size": 0,
  "aggs": {
    "unique_brands": {
      "cardinality": { "field": "brand" }
    }
  }
}

# 高精度去重(precision_threshold 默认 3000,最大 40000;值越大越精确,但内存消耗也越大)
GET /products/_search
{
  "size": 0,
  "aggs": {
    "unique_users": {
      "cardinality": {
        "field": "user_id",
        "precision_threshold": 40000
      }
    }
  }
}

7.2.4 百分位数

# 百分位数
GET /products/_search
{
  "size": 0,
  "aggs": {
    "price_percentiles": {
      "percentiles": {
        "field": "price",
        "percents": [25, 50, 75, 90, 95, 99]
      }
    }
  }
}

# 百分位排名
GET /products/_search
{
  "size": 0,
  "aggs": {
    "price_rank": {
      "percentile_ranks": {
        "field": "price",
        "values": [1000, 5000, 8000]
      }
    }
  }
}

7.3 桶聚合

7.3.1 terms 聚合(词项聚合)

# 按品牌分组统计
GET /products/_search
{
  "size": 0,
  "aggs": {
    "by_brand": {
      "terms": {
        "field": "brand",
        "size": 10,
        "order": { "_count": "desc" }
      }
    }
  }
}

# 响应
{
  "aggregations": {
    "by_brand": {
      "doc_count_error_upper_bound": 5,
      "sum_other_doc_count": 100,
      "buckets": [
        { "key": "Apple", "doc_count": 50 },
        { "key": "Samsung", "doc_count": 30 },
        { "key": "Huawei", "doc_count": 20 }
      ]
    }
  }
}

7.3.2 分层聚合

# 按分类再按品牌分组
GET /products/_search
{
  "size": 0,
  "aggs": {
    "by_category": {
      "terms": {
        "field": "category",
        "size": 10
      },
      "aggs": {
        "by_brand": {
          "terms": {
            "field": "brand",
            "size": 5
          },
          "aggs": {
            "avg_price": {
              "avg": { "field": "price" }
            }
          }
        }
      }
    }
  }
}

7.3.3 范围聚合

# 价格区间统计
GET /products/_search
{
  "size": 0,
  "aggs": {
    "price_ranges": {
      "range": {
        "field": "price",
        "ranges": [
          { "key": "cheap", "to": 1000 },
          { "key": "medium", "from": 1000, "to": 5000 },
          { "key": "expensive", "from": 5000 }
        ]
      }
    }
  }
}

# 日期范围(from 包含,to 不包含)
GET /logs-*/_search
{
  "size": 0,
  "aggs": {
    "date_ranges": {
      "date_range": {
        "field": "@timestamp",
        "ranges": [
          { "from": "now-7d/d", "to": "now/d", "key": "last_7_days" },
          { "from": "now-30d/d", "to": "now-7d/d", "key": "7_to_30_days_ago" }
        ]
      }
    }
  }
}

7.3.4 直方图聚合

# 价格直方图(每1000一个桶;extended_bounds 仅在 min_doc_count 为 0 时生效)
GET /products/_search
{
  "size": 0,
  "aggs": {
    "price_histogram": {
      "histogram": {
        "field": "price",
        "interval": 1000,
        "min_doc_count": 0,
        "extended_bounds": { "min": 0, "max": 10000 }
      }
    }
  }
}

# 日期直方图
GET /logs-*/_search
{
  "size": 0,
  "aggs": {
    "logs_over_time": {
      "date_histogram": {
        "field": "@timestamp",
        "calendar_interval": "day",
        "format": "yyyy-MM-dd",
        "min_doc_count": 0,
        "extended_bounds": {
          "min": "2024-01-01",
          "max": "2024-01-31"
        }
      }
    }
  }
}

7.3.5 过滤器聚合

# 按状态分组
GET /orders/_search
{
  "size": 0,
  "aggs": {
    "by_status": {
      "filters": {
        "filters": {
          "pending": { "term": { "status": "pending" } },
          "completed": { "term": { "status": "completed" } },
          "cancelled": { "term": { "status": "cancelled" } }
        }
      }
    }
  }
}

7.3.6 全局聚合

# 全局统计(不受查询影响)
GET /products/_search
{
  "query": {
    "term": { "category": "electronics" }
  },
  "aggs": {
    "electronics_avg_price": {
      "avg": { "field": "price" }
    },
    "all_products": {
      "global": {},
      "aggs": {
        "avg_price": {
          "avg": { "field": "price" }
        }
      }
    }
  }
}

7.4 管道聚合

7.4.1 累计求和

# 累计销售额
GET /orders/_search
{
  "size": 0,
  "aggs": {
    "sales_over_time": {
      "date_histogram": {
        "field": "order_date",
        "calendar_interval": "month"
      },
      "aggs": {
        "monthly_sales": {
          "sum": { "field": "amount" }
        },
        "cumulative_sales": {
          "cumulative_sum": {
            "buckets_path": "monthly_sales"
          }
        }
      }
    }
  }
}

7.4.2 移动平均

# moving_avg 自 6.4 起废弃,8.0 已移除;请使用 moving_fn 实现移动平均
GET /orders/_search
{
  "size": 0,
  "aggs": {
    "sales_over_time": {
      "date_histogram": {
        "field": "order_date",
        "calendar_interval": "day"
      },
      "aggs": {
        "daily_sales": {
          "sum": { "field": "amount" }
        },
        "moving_avg_sales": {
          "moving_fn": {
            "buckets_path": "daily_sales",
            "window": 7,
            "script": "MovingFunctions.unweightedAvg(values)"
          }
        }
      }
    }
  }
}

7.4.3 桶排序

# 排序并限制返回数量
GET /products/_search
{
  "size": 0,
  "aggs": {
    "by_category": {
      "terms": {
        "field": "category",
        "size": 10
      },
      "aggs": {
        "avg_price": {
          "avg": { "field": "price" }
        },
        "sort_by_avg_price": {
          "bucket_sort": {
            "sort": [
              { "avg_price": { "order": "desc" } }
            ],
            "size": 5
          }
        }
      }
    }
  }
}

7.4.4 桶脚本

# 自定义计算(count 统计含 price 值的文档数;避免对 _id 字段做聚合)
GET /products/_search
{
  "size": 0,
  "aggs": {
    "by_category": {
      "terms": {
        "field": "category",
        "size": 10
      },
      "aggs": {
        "total_price": { "sum": { "field": "price" } },
        "count": { "value_count": { "field": "price" } },
        "avg_price": { "avg": { "field": "price" } },
        "custom_metric": {
          "bucket_script": {
            "buckets_path": {
              "totalPrice": "total_price",
              "count": "count"
            },
            "script": "params.totalPrice / params.count"
          }
        }
      }
    }
  }
}

7.5 聚合缓存

7.5.1 分片级别缓存

  • 请求缓存(Request Cache):默认只缓存 size 为 0 的请求结果(如聚合结果)
  • 该缓存在分片级别生效(Shard Request Cache):分片刷新且数据发生变化后自动失效
# 启用请求缓存
GET /products/_search?request_cache=true
{
  "size": 0,
  "aggs": {
    "avg_price": { "avg": { "field": "price" } }
  }
}

7.5.2 缓存失效

# 手动清除缓存
POST /products/_cache/clear
POST /_cache/clear

7.6 性能优化

7.6.1 优化建议

  1. 使用 filter 替代 query:filter 不计算分数,更快
  2. 减少聚合桶数量:设置合理的 size 参数
  3. 使用 shard_size:控制从每个分片获取的桶数量
  4. 将 size 设为 0:不返回命中文档,只返回聚合结果
  5. 谨慎使用 global 聚合:它不受查询条件影响,会对全部文档进行计算,开销更大,仅在需要全局对比时使用
GET /products/_search
{
  "size": 0,
  "query": {
    "bool": {
      "filter": [
        { "term": { "category": "electronics" } }
      ]
    }
  },
  "aggs": {
    "by_brand": {
      "terms": {
        "field": "brand",
        "size": 10,
        "shard_size": 20
      }
    }
  }
}

7.7 聚合与查询结合

# 搜索 + 聚合
GET /products/_search
{
  "query": {
    "match": { "name": "iPhone" }
  },
  "aggs": {
    "max_price": { "max": { "field": "price" } },
    "avg_price": { "avg": { "field": "price" } },
    "by_brand": {
      "terms": {
        "field": "brand",
        "size": 5
      }
    },
    "by_category": {
      "terms": {
        "field": "category",
        "size": 5
      }
    }
  }
}

7.8 总结

本章介绍了 Elasticsearch 的聚合分析功能,包括指标聚合、桶聚合和管道聚合。熟练掌握这些聚合功能,可以实现丰富的统计分析需求。下一章将学习中文分词器配置。