第七章:聚合分析
学习 Elasticsearch 的聚合分析功能,包括指标聚合、桶聚合和管道聚合。
最后更新: 2024-01-15
页面目录
第七章:聚合分析
7.1 聚合概述
Elasticsearch 聚合功能强大,支持对数据进行统计分析、分类汇总等操作。
7.1.1 聚合类型
Aggregations
├── 指标聚合 (Metrics Aggregations)
│ ├── avg, sum, min, max, value_count
│ ├── stats, extended_stats
│ ├── cardinality, percentiles, percentile_ranks
│ └── geo_bounds, geo_centroid
├── 桶聚合 (Bucket Aggregations)
│ ├── terms, significant_terms
│ ├── range, date_range, histogram, date_histogram
│ ├── filter, filters, global, missing
│ └── adjacency_matrix, auto_date_histogram
└── 管道聚合 (Pipeline Aggregations)
├── cumulative_sum, cumulative_cardinality
├── derivative, moving_fn (moving_avg 自 6.4 起弃用,8.0 已移除)
└── bucket_sort, bucket_selector, bucket_script
7.2 指标聚合
7.2.1 基本统计
# 单值聚合
GET /products/_search
{
"size": 0,
"aggs": {
"avg_price": { "avg": { "field": "price" } },
"max_price": { "max": { "field": "price" } },
"min_price": { "min": { "field": "price" } },
"sum_price": { "sum": { "field": "price" } }
}
}
# 完整统计
GET /products/_search
{
"size": 0,
"aggs": {
"price_stats": {
"stats": { "field": "price" }
}
}
}
# 响应
{
"aggregations": {
"price_stats": {
"count": 100,
"min": 99.0,
"max": 9999.0,
"avg": 2500.5,
"sum": 250050.0
}
}
}
7.2.2 扩展统计
GET /products/_search
{
"size": 0,
"aggs": {
"price_extended_stats": {
"extended_stats": {
"field": "price",
"sigma": 2 # 标准差倍数
}
}
}
}
7.2.3 去重计数
# 唯一值数量
GET /products/_search
{
"size": 0,
"aggs": {
"unique_brands": {
"cardinality": { "field": "brand" }
}
}
}
# 调整去重精度(precision_threshold 默认 3000,最大 40000;值越大越精确,但内存消耗越高)
GET /products/_search
{
"size": 0,
"aggs": {
"unique_users": {
"cardinality": {
"field": "user_id",
"precision_threshold": 100
}
}
}
}
7.2.4 百分位数
# 百分位数
GET /products/_search
{
"size": 0,
"aggs": {
"price_percentiles": {
"percentiles": {
"field": "price",
"percents": [25, 50, 75, 90, 95, 99]
}
}
}
}
# 百分位排名
GET /products/_search
{
"size": 0,
"aggs": {
"price_rank": {
"percentile_ranks": {
"field": "price",
"values": [1000, 5000, 8000]
}
}
}
}
7.3 桶聚合
7.3.1 terms 聚合(词项聚合)
# 按品牌分组统计
GET /products/_search
{
"size": 0,
"aggs": {
"by_brand": {
"terms": {
"field": "brand",
"size": 10,
"order": { "_count": "desc" }
}
}
}
}
# 响应
{
"aggregations": {
"by_brand": {
"doc_count_error_upper_bound": 5,
"sum_other_doc_count": 100,
"buckets": [
{ "key": "Apple", "doc_count": 50 },
{ "key": "Samsung", "doc_count": 30 },
{ "key": "Huawei", "doc_count": 20 }
]
}
}
}
7.3.2 分层聚合
# 按分类再按品牌分组
GET /products/_search
{
"size": 0,
"aggs": {
"by_category": {
"terms": {
"field": "category",
"size": 10
},
"aggs": {
"by_brand": {
"terms": {
"field": "brand",
"size": 5
},
"aggs": {
"avg_price": {
"avg": { "field": "price" }
}
}
}
}
}
}
}
7.3.3 范围聚合
# 价格区间统计
GET /products/_search
{
"size": 0,
"aggs": {
"price_ranges": {
"range": {
"field": "price",
"ranges": [
{ "key": "cheap", "to": 1000 },
{ "key": "medium", "from": 1000, "to": 5000 },
{ "key": "expensive", "from": 5000 }
]
}
}
}
}
# 日期范围
GET /logs-*/_search
{
"size": 0,
"aggs": {
"date_ranges": {
"date_range": {
"field": "@timestamp",
"ranges": [
{ "from": "now-7d/d", "to": "now/d", "key": "last_7_days" },
{ "from": "now-30d/d", "to": "now-7d/d", "key": "last_30_days" }
]
}
}
}
}
7.3.4 直方图聚合
# 价格直方图(每1000一个桶)
GET /products/_search
{
"size": 0,
"aggs": {
"price_histogram": {
"histogram": {
"field": "price",
"interval": 1000,
"min_doc_count": 1,
"extended_bounds": { "min": 0, "max": 10000 }
}
}
}
}
# 日期直方图
GET /logs-*/_search
{
"size": 0,
"aggs": {
"logs_over_time": {
"date_histogram": {
"field": "@timestamp",
"calendar_interval": "day",
"format": "yyyy-MM-dd",
"min_doc_count": 0,
"extended_bounds": {
"min": "2024-01-01",
"max": "2024-01-31"
}
}
}
}
}
7.3.5 过滤器聚合
# 按状态分组
GET /orders/_search
{
"size": 0,
"aggs": {
"by_status": {
"filters": {
"filters": {
"pending": { "term": { "status": "pending" } },
"completed": { "term": { "status": "completed" } },
"cancelled": { "term": { "status": "cancelled" } }
}
}
}
}
}
7.3.6 全局聚合
# 全局统计(不受查询影响)
GET /products/_search
{
"query": {
"term": { "category": "electronics" }
},
"aggs": {
"electronics_avg_price": {
"avg": { "field": "price" }
},
"all_products": {
"global": {},
"aggs": {
"avg_price": {
"avg": { "field": "price" }
}
}
}
}
}
7.4 管道聚合
7.4.1 累计求和
# 累计销售额
GET /orders/_search
{
"size": 0,
"aggs": {
"sales_over_time": {
"date_histogram": {
"field": "order_date",
"calendar_interval": "month"
},
"aggs": {
"monthly_sales": {
"sum": { "field": "amount" }
},
"cumulative_sales": {
"cumulative_sum": {
"buckets_path": "monthly_sales"
}
}
}
}
}
}
7.4.2 移动平均
GET /orders/_search
{
"size": 0,
"aggs": {
"sales_over_time": {
"date_histogram": {
"field": "order_date",
"calendar_interval": "day"
},
"aggs": {
"daily_sales": {
"sum": { "field": "amount" }
},
"moving_avg_sales": {
"moving_fn": {
"buckets_path": "daily_sales",
"window": 7,
"script": "MovingFunctions.unweightedAvg(values)"
}
}
}
}
}
}
7.4.3 桶排序
# 排序并限制返回数量
GET /products/_search
{
"size": 0,
"aggs": {
"by_category": {
"terms": {
"field": "category",
"size": 10
},
"aggs": {
"avg_price": {
"avg": { "field": "price" }
},
"sorted_by_avg_price": {
"bucket_sort": {
"sort": [
{ "avg_price": "desc" }
],
"size": 5
}
}
}
}
}
}
7.4.4 桶脚本
# 自定义计算
GET /products/_search
{
"size": 0,
"aggs": {
"by_category": {
"terms": {
"field": "category",
"size": 10
},
"aggs": {
"total_price": { "sum": { "field": "price" } },
"count": { "value_count": { "field": "price" } },
"avg_price": { "avg": { "field": "price" } },
"custom_metric": {
"bucket_script": {
"buckets_path": {
"totalPrice": "total_price",
"count": "count"
},
"script": "params.totalPrice / params.count"
}
}
}
}
}
}
7.5 聚合缓存
7.5.1 分片级别缓存
- 查询结果缓存(Request Cache)
- 分片级别聚合缓存(Shard Request Cache)
# 启用请求缓存
GET /products/_search?request_cache=true
{
"size": 0,
"aggs": {
"avg_price": { "avg": { "field": "price" } }
}
}
7.5.2 缓存失效
# 手动清除缓存
POST /products/_cache/clear
POST /_cache/clear
7.6 性能优化
7.6.1 优化建议
- 使用 filter 替代 query:filter 不计算分数,更快
- 减少聚合桶数量:设置合理的 size 参数
- 使用 shard_size:控制从每个分片获取的桶数量
- 关闭 size:将顶层 size 设为 0,只返回聚合结果
- 使用 global 聚合:不受查询影响可缓存
GET /products/_search
{
"size": 0,
"query": {
"bool": {
"filter": [
{ "term": { "category": "electronics" } }
]
}
},
"aggs": {
"by_brand": {
"terms": {
"field": "brand",
"size": 10,
"shard_size": 20
}
}
}
}
7.7 聚合与查询结合
# 搜索 + 聚合
GET /products/_search
{
"query": {
"match": { "name": "iPhone" }
},
"aggs": {
"max_price": { "max": { "field": "price" } },
"avg_price": { "avg": { "field": "price" } },
"by_brand": {
"terms": {
"field": "brand",
"size": 5
}
},
"by_category": {
"terms": {
"field": "category",
"size": 5
}
}
}
}
7.8 总结
本章介绍了 Elasticsearch 的聚合分析功能,包括指标聚合、桶聚合和管道聚合。熟练掌握这些聚合功能,可以实现丰富的统计分析需求。下一章将学习中文分词器配置。