MongoDB高级查询:聚合管道与数据聚合分析
引言
MongoDB是流行的NoSQL文档数据库,其灵活的文档模型和强大的聚合框架使其成为处理大规模数据的理想选择。本文将深入探讨MongoDB的聚合管道,展示如何高效地进行复杂的数据分析和转换。
一、聚合框架基础
1.1 聚合管道概念
MongoDB聚合管道是一个多阶段数据处理流程,每个阶段对输入文档进行转换,输出传递给下一个阶段:
[{$match: {status: "active"}}] → [{$group: {_id: "$category", count: {$sum: 1}}}] → [{$sort: {count: -1}}]
过滤阶段 分组阶段 排序阶段
1.2 Go语言MongoDB连接
package mongo
import (
"context"
"fmt"
"time"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)
// MongoClient pairs a connected driver client with the database handle that
// NewMongoClient selects from it.
type MongoClient struct {
// client is the underlying driver connection; Close disconnects it.
client *mongo.Client
// database is the handle used by Collection to resolve collections.
database *mongo.Database
}
// NewMongoClient connects to the deployment at uri, verifies it with a ping,
// and returns a MongoClient bound to the database dbName.
//
// Connecting and pinging share a single 10-second deadline. If the ping
// fails, the already-created client is disconnected before returning so its
// resources are not leaked.
func NewMongoClient(uri, dbName string) (*MongoClient, error) {
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	client, err := mongo.Connect(ctx, options.Client().ApplyURI(uri))
	if err != nil {
		return nil, fmt.Errorf("failed to connect to MongoDB: %w", err)
	}

	if err := client.Ping(ctx, nil); err != nil {
		// ctx may already be expired (a timeout is the usual reason Ping
		// fails), so tear down with a fresh, bounded context.
		closeCtx, closeCancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer closeCancel()
		// Best-effort cleanup: the ping error is the one worth reporting.
		_ = client.Disconnect(closeCtx)
		return nil, fmt.Errorf("failed to ping MongoDB: %w", err)
	}

	return &MongoClient{
		client:   client,
		database: client.Database(dbName),
	}, nil
}
// Collection returns the handle for the named collection in the database
// this client was created with.
func (mc *MongoClient) Collection(name string) *mongo.Collection {
	coll := mc.database.Collection(name)
	return coll
}
// Close disconnects the underlying driver client, using ctx to bound the
// shutdown, and reports any error from the disconnect.
func (mc *MongoClient) Close(ctx context.Context) error {
	if err := mc.client.Disconnect(ctx); err != nil {
		return err
	}
	return nil
}
二、聚合管道阶段
2.1 $match - 过滤文档
package mongo
import (
"context"
"fmt"
"time"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
)
// AggregationOps runs the $match example pipelines against one collection.
type AggregationOps struct {
// collection is the target of every Aggregate call made by this type.
collection *mongo.Collection
}
// NewAggregationOps returns an AggregationOps that aggregates over collection.
func NewAggregationOps(collection *mongo.Collection) *AggregationOps {
	ops := new(AggregationOps)
	ops.collection = collection
	return ops
}
// MatchExamples returns every document whose status is "active", whose age
// is at least 18, and whose category is one of "A", "B" or "C".
//
// The filter is applied as a single $match stage; results are decoded into
// generic bson.M documents.
func (a *AggregationOps) MatchExamples(ctx context.Context) ([]bson.M, error) {
	filter := bson.M{
		"status":   "active",
		"age":      bson.M{"$gte": 18},
		"category": bson.M{"$in": []string{"A", "B", "C"}},
	}
	stages := mongo.Pipeline{
		bson.D{{Key: "$match", Value: filter}},
	}

	cur, err := a.collection.Aggregate(ctx, stages)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cur.Close(ctx)

	var docs []bson.M
	err = cur.All(ctx, &docs)
	if err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return docs, nil
}
// MatchWithDateRange returns the documents with status "completed" whose
// created_at falls in the half-open interval [startDate, endDate).
//
// Errors are wrapped with the same messages MatchExamples uses so callers
// can tell an aggregation failure from a decoding failure.
func (a *AggregationOps) MatchWithDateRange(ctx context.Context, startDate, endDate time.Time) ([]bson.M, error) {
	pipeline := mongo.Pipeline{
		{{Key: "$match", Value: bson.M{
			"created_at": bson.M{
				"$gte": startDate, // inclusive lower bound
				"$lt":  endDate,   // exclusive upper bound
			},
			"status": "completed",
		}}},
	}

	cursor, err := a.collection.Aggregate(ctx, pipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cursor.Close(ctx)

	var results []bson.M
	if err := cursor.All(ctx, &results); err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return results, nil
}
2.2 $group - 分组聚合
package mongo
import (
"context"
"fmt"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
)
// GroupExample runs the $group example pipelines against one collection.
type GroupExample struct {
// collection is the target of every Aggregate call made by this type.
collection *mongo.Collection
}
// NewGroupExample returns a GroupExample that aggregates over collection.
func NewGroupExample(collection *mongo.Collection) *GroupExample {
	example := new(GroupExample)
	example.collection = collection
	return example
}
// Order describes the field names the $group examples refer to
// (customer, category, amount, status, created_at).
type Order struct {
// ID maps to the document's _id field.
ID string `bson:"_id"`
// Customer maps to the customer field.
Customer string `bson:"customer"`
// Category maps to the category field.
Category string `bson:"category"`
// Amount maps to the amount field.
Amount float64 `bson:"amount"`
// Status maps to the status field.
Status string `bson:"status"`
// CreatedAt maps to created_at.
// NOTE(review): GroupByCustomerAndMonth passes "$created_at" to
// $dateToString, which suggests the stored value is a date rather than a
// string — confirm whether this field should be a time.Time.
CreatedAt string `bson:"created_at"`
}
// GroupByCategory groups documents by category and, for each group, reports
// the sum, count, average, minimum and maximum of amount. Groups are
// returned ordered by total, largest first.
func (g *GroupExample) GroupByCategory(ctx context.Context) ([]bson.M, error) {
	groupSpec := bson.M{
		"_id":   "$category",
		"total": bson.M{"$sum": "$amount"},
		"count": bson.M{"$sum": 1},
		"avg":   bson.M{"$avg": "$amount"},
		"min":   bson.M{"$min": "$amount"},
		"max":   bson.M{"$max": "$amount"},
	}
	stages := mongo.Pipeline{
		bson.D{{Key: "$group", Value: groupSpec}},
		bson.D{{Key: "$sort", Value: bson.M{"total": -1}}},
	}

	cur, err := g.collection.Aggregate(ctx, stages)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cur.Close(ctx)

	var docs []bson.M
	err = cur.All(ctx, &docs)
	if err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return docs, nil
}
// GroupByCustomerAndMonth groups documents by (customer, year-month of
// created_at) and, for each group, reports the summed amount, the number of
// documents, and the list of their _id values. Results are ordered by
// customer and then by month, both ascending.
//
// NOTE(review): $dateToString needs created_at to be stored as a date —
// confirm against the collection's schema.
func (g *GroupExample) GroupByCustomerAndMonth(ctx context.Context) ([]bson.M, error) {
	pipeline := mongo.Pipeline{
		{{Key: "$group", Value: bson.M{
			"_id": bson.M{
				"customer": "$customer",
				"month": bson.M{
					"$dateToString": bson.M{
						"format": "%Y-%m",
						"date":   "$created_at",
					},
				},
			},
			"total":  bson.M{"$sum": "$amount"},
			"count":  bson.M{"$sum": 1},
			"orders": bson.M{"$push": "$_id"},
		}}},
		// A multi-key sort must use the ordered bson.D: bson.M is a Go map,
		// so its keys are encoded in random order and the sort precedence
		// would change from run to run.
		{{Key: "$sort", Value: bson.D{
			{Key: "_id.customer", Value: 1},
			{Key: "_id.month", Value: 1},
		}}},
	}

	cursor, err := g.collection.Aggregate(ctx, pipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cursor.Close(ctx)

	var results []bson.M
	if err := cursor.All(ctx, &results); err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return results, nil
}
// TopCustomers returns up to limit customers ranked by their summed amount,
// largest first. Each result has the fields customer, total, orderCount and
// avgOrder (the average amount rounded to two decimals); _id is suppressed.
//
// limit is passed to $limit unchanged.
// NOTE(review): the server is expected to reject a non-positive $limit —
// confirm whether callers need an explicit guard.
//
// Errors are wrapped with the same messages GroupByCategory uses.
func (g *GroupExample) TopCustomers(ctx context.Context, limit int) ([]bson.M, error) {
	pipeline := mongo.Pipeline{
		{{Key: "$group", Value: bson.M{
			"_id":        "$customer",
			"total":      bson.M{"$sum": "$amount"},
			"orderCount": bson.M{"$sum": 1},
			"avgOrder":   bson.M{"$avg": "$amount"},
		}}},
		{{Key: "$sort", Value: bson.M{"total": -1}}},
		{{Key: "$limit", Value: limit}},
		{{Key: "$project", Value: bson.M{
			"_id":        0,
			"customer":   "$_id",
			"total":      1,
			"orderCount": 1,
			"avgOrder":   bson.M{"$round": []interface{}{"$avgOrder", 2}},
		}}},
	}

	cursor, err := g.collection.Aggregate(ctx, pipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cursor.Close(ctx)

	var results []bson.M
	if err := cursor.All(ctx, &results); err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return results, nil
}
2.3 $project - 字段投影
package mongo
import (
"context"
"fmt"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
)
// ProjectExample runs the $project example pipelines against one collection.
type ProjectExample struct {
// collection is the target of every Aggregate call made by this type.
collection *mongo.Collection
}
// NewProjectExample returns a ProjectExample that aggregates over collection.
func NewProjectExample(collection *mongo.Collection) *ProjectExample {
	example := new(ProjectExample)
	example.collection = collection
	return example
}
// BasicProjection reshapes every document into name, email and three
// computed fields:
//   - fullAddress: address.street, address.city and address.state joined
//     with ", "
//   - orderYear: the year of created_at
//   - discountedPrice: price multiplied by 0.9
//
// _id is suppressed. Errors are wrapped so callers can tell an aggregation
// failure from a decoding failure; this also puts the snippet's "fmt" import
// to use.
func (p *ProjectExample) BasicProjection(ctx context.Context) ([]bson.M, error) {
	pipeline := mongo.Pipeline{
		{{Key: "$project", Value: bson.M{
			"_id":   0,
			"name":  1,
			"email": 1,
			"fullAddress": bson.M{
				"$concat": []string{
					"$address.street",
					", ",
					"$address.city",
					", ",
					"$address.state",
				},
			},
			"orderYear": bson.M{
				"$year": "$created_at",
			},
			"discountedPrice": bson.M{
				"$multiply": []interface{}{"$price", 0.9},
			},
		}}},
	}

	cursor, err := p.collection.Aggregate(ctx, pipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cursor.Close(ctx)

	var results []bson.M
	if err := cursor.All(ctx, &results); err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return results, nil
}
// ConditionalProjection projects name, price and inStock (inventory.count)
// together with two derived labels:
//   - status: "high" when inventory.count >= 100, "medium" when >= 50,
//     "low" when >= 1, otherwise "out_of_stock"
//   - categoryTier: "tier_1" when isPremium is true, otherwise "tier_2"
//
// Errors are wrapped so callers can tell an aggregation failure from a
// decoding failure; this also puts the snippet's "fmt" import to use.
func (p *ProjectExample) ConditionalProjection(ctx context.Context) ([]bson.M, error) {
	pipeline := mongo.Pipeline{
		{{Key: "$project", Value: bson.M{
			"name":    1,
			"price":   1,
			"inStock": "$inventory.count",
			"status": bson.M{
				"$switch": bson.M{
					// Branches are tested in order, so thresholds go from
					// highest to lowest.
					"branches": []bson.M{
						{"case": bson.M{"$gte": []interface{}{"$inventory.count", 100}}, "then": "high"},
						{"case": bson.M{"$gte": []interface{}{"$inventory.count", 50}}, "then": "medium"},
						{"case": bson.M{"$gte": []interface{}{"$inventory.count", 1}}, "then": "low"},
					},
					"default": "out_of_stock",
				},
			},
			"categoryTier": bson.M{
				"$cond": bson.M{
					"if":   bson.M{"$eq": []interface{}{"$isPremium", true}},
					"then": "tier_1",
					"else": "tier_2",
				},
			},
		}}},
	}

	cursor, err := p.collection.Aggregate(ctx, pipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cursor.Close(ctx)

	var results []bson.M
	if err := cursor.All(ctx, &results); err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return results, nil
}
三、数组操作
3.1 $unwind - 展开数组
package mongo
import (
"context"
"fmt"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
)
// UnwindExample runs the $unwind example pipelines against one collection.
type UnwindExample struct {
// collection is the target of every Aggregate call made by this type.
collection *mongo.Collection
}
// NewUnwindExample returns an UnwindExample that aggregates over collection.
func NewUnwindExample(collection *mongo.Collection) *UnwindExample {
	example := new(UnwindExample)
	example.collection = collection
	return example
}
// UnwindTags counts how often each tag occurs. Documents whose tags field
// is absent or equal to an empty array are skipped, the remaining tags
// arrays are unwound, and each distinct tag is returned with its occurrence
// count and the set of document _id values carrying it, most frequent first.
func (u *UnwindExample) UnwindTags(ctx context.Context) ([]bson.M, error) {
	hasTags := bson.M{"tags": bson.M{"$exists": true, "$ne": []string{}}}
	perTag := bson.M{
		"_id":   "$tags",
		"count": bson.M{"$sum": 1},
		"posts": bson.M{"$addToSet": "$_id"},
	}
	stages := mongo.Pipeline{
		bson.D{{Key: "$match", Value: hasTags}},
		bson.D{{Key: "$unwind", Value: "$tags"}},
		bson.D{{Key: "$group", Value: perTag}},
		bson.D{{Key: "$sort", Value: bson.M{"count": -1}}},
	}

	cur, err := u.collection.Aggregate(ctx, stages)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cur.Close(ctx)

	var docs []bson.M
	err = cur.All(ctx, &docs)
	if err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return docs, nil
}
// UnwindWithPreserveNull unwinds the items array of every document while
// keeping documents whose items is missing, null or empty, then regroups by
// _id to report orderId, the summed items.price, and the number of items.
//
// Because preserved documents still pass through $group once, itemCount
// only counts rows where items is actually present; a plain {$sum: 1} would
// report 1 for an order that has no items at all.
func (u *UnwindExample) UnwindWithPreserveNull(ctx context.Context) ([]bson.M, error) {
	// Evaluates to 1 for a real unwound item and to 0 for the placeholder row
	// of a preserved document (where items is missing or null).
	countIfPresent := bson.M{
		"$cond": []interface{}{
			bson.M{"$ifNull": []interface{}{"$items", false}},
			1,
			0,
		},
	}

	pipeline := mongo.Pipeline{
		{{Key: "$unwind", Value: bson.M{
			"path":                       "$items",
			"preserveNullAndEmptyArrays": true,
		}}},
		{{Key: "$group", Value: bson.M{
			"_id":       "$_id",
			"orderId":   bson.M{"$first": "$orderId"},
			"total":     bson.M{"$sum": "$items.price"},
			"itemCount": bson.M{"$sum": countIfPresent},
		}}},
	}

	cursor, err := u.collection.Aggregate(ctx, pipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cursor.Close(ctx)

	var results []bson.M
	if err := cursor.All(ctx, &results); err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return results, nil
}
3.2 数组表达式操作符($filter、$size 等)
package mongo
import (
"context"
"fmt"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
)
// ArrayOpsExample runs the array-operator example pipelines against one
// collection.
type ArrayOpsExample struct {
// collection is the target of every Aggregate call made by this type.
collection *mongo.Collection
}
// NewArrayOpsExample returns an ArrayOpsExample that aggregates over
// collection.
func NewArrayOpsExample(collection *mongo.Collection) *ArrayOpsExample {
	example := new(ArrayOpsExample)
	example.collection = collection
	return example
}
// FilterAndMap projects, for every document, its name plus:
//   - scores: only the entries of scores that are >= 60
//   - passedCount: how many entries of scores are >= 60
//   - totalScore: the sum of all entries of scores
//
// passedCount treats a missing or null scores field as an empty array,
// because $size fails the whole aggregation when its argument is not an
// array. Errors are wrapped so callers can tell an aggregation failure from
// a decoding failure; this also puts the snippet's "fmt" import to use.
func (a *ArrayOpsExample) FilterAndMap(ctx context.Context) ([]bson.M, error) {
	passing := bson.M{"$gte": []interface{}{"$$score", 60}}

	pipeline := mongo.Pipeline{
		{{Key: "$project", Value: bson.M{
			"name": 1,
			"scores": bson.M{
				"$filter": bson.M{
					"input": "$scores",
					"as":    "score",
					"cond":  passing,
				},
			},
			"passedCount": bson.M{
				"$size": bson.M{
					"$filter": bson.M{
						// Fall back to [] so $size always receives an array.
						"input": bson.M{"$ifNull": []interface{}{"$scores", []interface{}{}}},
						"as":    "score",
						"cond":  passing,
					},
				},
			},
			"totalScore": bson.M{
				"$sum": "$scores",
			},
		}}},
	}

	cursor, err := a.collection.Aggregate(ctx, pipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cursor.Close(ctx)

	var results []bson.M
	if err := cursor.All(ctx, &results); err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return results, nil
}
// ArrayAggregation projects name and values together with per-document
// statistics over the values array: sum, avg, min, max, first, last, size,
// and isEmpty (true when values equals an empty array).
//
// size treats a missing or null values field as an empty array, because
// $size fails the whole aggregation when its argument is not an array.
// Errors are wrapped so callers can tell an aggregation failure from a
// decoding failure; this also puts the snippet's "fmt" import to use.
func (a *ArrayOpsExample) ArrayAggregation(ctx context.Context) ([]bson.M, error) {
	pipeline := mongo.Pipeline{
		{{Key: "$project", Value: bson.M{
			"name":   1,
			"values": 1,
			"sum":    bson.M{"$sum": "$values"},
			"avg":    bson.M{"$avg": "$values"},
			"min":    bson.M{"$min": "$values"},
			"max":    bson.M{"$max": "$values"},
			"first":  bson.M{"$first": "$values"},
			"last":   bson.M{"$last": "$values"},
			// Fall back to [] so $size always receives an array.
			"size": bson.M{"$size": bson.M{
				"$ifNull": []interface{}{"$values", []interface{}{}},
			}},
			"isEmpty": bson.M{"$eq": []interface{}{"$values", []interface{}{}}},
		}}},
	}

	cursor, err := a.collection.Aggregate(ctx, pipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cursor.Close(ctx)

	var results []bson.M
	if err := cursor.All(ctx, &results); err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return results, nil
}
四、复杂聚合实战
4.1 用户行为分析
package mongo
import (
"context"
"fmt"
"time"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
)
// UserAnalytics runs the user-behaviour example pipelines against one
// collection.
type UserAnalytics struct {
// collection is the target of every Aggregate call made by this type.
collection *mongo.Collection
}
// NewUserAnalytics returns a UserAnalytics that aggregates over collection.
func NewUserAnalytics(collection *mongo.Collection) *UserAnalytics {
	analytics := new(UserAnalytics)
	analytics.collection = collection
	return analytics
}
// DailyStats is the per-day row that GetDailyStats decodes its results into.
type DailyStats struct {
// Date is the group key, formatted by GetDailyStats as "%Y-%m-%d".
Date string `bson:"_id"`
// ActiveUsers is the number of distinct userId values seen that day.
ActiveUsers int `bson:"activeUsers"`
// NewUsers is the number of distinct userId values flagged isNewUser.
NewUsers int `bson:"newUsers"`
// TotalActions is the number of documents recorded that day.
TotalActions int `bson:"totalActions"`
// AvgSession maps to avgSessionDuration.
// NOTE(review): the GetDailyStats pipeline does not emit
// avgSessionDuration, so this field presumably stays at its zero value
// there — confirm whether a stage is missing.
AvgSession float64 `bson:"avgSessionDuration"`
}
// GetDailyStats summarises activity per calendar day for documents whose
// timestamp lies in [startDate, endDate).
//
// For each day it collects the distinct userId values, the distinct userId
// values whose isNewUser is true, and the number of documents; the two sets
// are then reduced to their sizes. Rows are returned in ascending date
// order.
func (ua *UserAnalytics) GetDailyStats(ctx context.Context, startDate, endDate time.Time) ([]DailyStats, error) {
	inRange := bson.M{
		"timestamp": bson.M{
			"$gte": startDate,
			"$lt":  endDate,
		},
	}

	// Yields userId for new users and removes the value otherwise, so only
	// new users are added to the set.
	newUserID := bson.M{
		"$cond": []interface{}{
			bson.M{"$eq": []interface{}{"$isNewUser", true}},
			"$userId",
			"$$REMOVE",
		},
	}
	perDay := bson.M{
		"_id": bson.M{
			"$dateToString": bson.M{
				"format": "%Y-%m-%d",
				"date":   "$timestamp",
			},
		},
		"activeUsers":  bson.M{"$addToSet": "$userId"},
		"newUsers":     bson.M{"$addToSet": newUserID},
		"totalActions": bson.M{"$sum": 1},
	}

	// Replace the collected sets with their cardinalities.
	nonNullNewUsers := bson.M{
		"$filter": bson.M{
			"input": "$newUsers",
			"as":    "u",
			"cond":  bson.M{"$ne": []interface{}{"$$u", nil}},
		},
	}
	counts := bson.M{
		"_id":          1,
		"activeUsers":  bson.M{"$size": "$activeUsers"},
		"newUsers":     bson.M{"$size": nonNullNewUsers},
		"totalActions": 1,
	}

	stages := mongo.Pipeline{
		bson.D{{Key: "$match", Value: inRange}},
		bson.D{{Key: "$group", Value: perDay}},
		bson.D{{Key: "$project", Value: counts}},
		bson.D{{Key: "$sort", Value: bson.M{"_id": 1}}},
	}

	cur, err := ua.collection.Aggregate(ctx, stages)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cur.Close(ctx)

	var stats []DailyStats
	err = cur.All(ctx, &stats)
	if err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return stats, nil
}
// GetCohortAnalysis buckets documents created within the last six months
// (relative to time.Now) into monthly cohorts keyed by the "%Y-%m" of
// createdAt.
//
// Documents are first grouped per (cohort, userId), keeping the minimum
// lastActivity as firstActivity, then regrouped per cohort into a users
// array of {userId, firstActivity}. Cohorts are returned in ascending order.
//
// Errors are wrapped with the same messages GetDailyStats uses.
func (ua *UserAnalytics) GetCohortAnalysis(ctx context.Context) ([]bson.M, error) {
	pipeline := mongo.Pipeline{
		{{Key: "$match", Value: bson.M{
			"createdAt": bson.M{
				"$gte": time.Now().AddDate(0, -6, 0),
			},
		}}},
		{{Key: "$group", Value: bson.M{
			"_id": bson.M{
				"cohort": bson.M{
					"$dateToString": bson.M{
						"format": "%Y-%m",
						"date":   "$createdAt",
					},
				},
				"userId": "$userId",
			},
			"firstActivity": bson.M{"$min": "$lastActivity"},
		}}},
		{{Key: "$group", Value: bson.M{
			"_id": "$_id.cohort",
			"users": bson.M{"$push": bson.M{
				"userId":        "$_id.userId",
				"firstActivity": "$firstActivity",
			}},
		}}},
		{{Key: "$sort", Value: bson.M{"_id": 1}}},
	}

	cursor, err := ua.collection.Aggregate(ctx, pipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to run aggregation: %w", err)
	}
	defer cursor.Close(ctx)

	var results []bson.M
	if err := cursor.All(ctx, &results); err != nil {
		return nil, fmt.Errorf("failed to decode results: %w", err)
	}
	return results, nil
}
4.2 实时仪表盘统计
package mongo
import (
"context"
"fmt"
"time"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)
// DashboardStats is the snapshot returned by GetRealtimeStats.
type DashboardStats struct {
// TotalOrders is the document count reported by CountDocuments.
TotalOrders int64 `bson:"totalOrders"`
// TotalRevenue is the sum of the orders' total field.
TotalRevenue float64 `bson:"totalRevenue"`
// AverageOrder is the average of the orders' total field.
AverageOrder float64 `bson:"averageOrder"`
// TopCategories holds the per-category count/total groups.
TopCategories []bson.M `bson:"topCategories"`
// RecentOrders holds the most recent orders joined with their product.
RecentOrders []bson.M `bson:"recentOrders"`
// OrdersByHour is declared but GetRealtimeStats never assigns it.
// NOTE(review): confirm whether an hourly breakdown is still planned.
OrdersByHour []bson.M `bson:"ordersByHour"`
}
// DashboardAggregator computes dashboard statistics from an orders
// collection and a products collection.
type DashboardAggregator struct {
// orders is the collection every dashboard query runs against.
orders *mongo.Collection
// products is stored by NewDashboardAggregator alongside orders.
products *mongo.Collection
}
// NewDashboardAggregator returns a DashboardAggregator holding the given
// orders and products collections.
func NewDashboardAggregator(orders, products *mongo.Collection) *DashboardAggregator {
	agg := new(DashboardAggregator)
	agg.orders = orders
	agg.products = products
	return agg
}
// GetRealtimeStats builds a dashboard snapshot over the orders created in
// the last 24 hours (relative to time.Now):
//   - TotalOrders: number of matching orders
//   - TotalRevenue / AverageOrder: sum and average of the total field;
//     both stay 0 when no order matches
//   - TopCategories: the five categories with the most orders, each with
//     its count and summed total
//   - RecentOrders: the ten newest orders joined with their product name;
//     orders without a matching product are dropped by the $unwind
//
// OrdersByHour is not populated by this function.
//
// Any query or decoding failure, including a cursor error while reading the
// revenue document, is returned wrapped; no partial snapshot is returned.
func (da *DashboardAggregator) GetRealtimeStats(ctx context.Context) (*DashboardStats, error) {
	sessionStart := time.Now().Add(-24 * time.Hour)

	// recentWindow builds a fresh $match stage for each pipeline so no stage
	// value is shared between pipelines.
	recentWindow := func() bson.D {
		return bson.D{{Key: "$match", Value: bson.M{
			"createdAt": bson.M{"$gte": sessionStart},
		}}}
	}

	totalOrders, err := da.orders.CountDocuments(ctx, bson.M{
		"createdAt": bson.M{"$gte": sessionStart},
	})
	if err != nil {
		return nil, fmt.Errorf("failed to count orders: %w", err)
	}

	var revenue struct {
		Total   float64 `bson:"total"`
		Average float64 `bson:"average"`
	}
	revenuePipeline := mongo.Pipeline{
		recentWindow(),
		{{Key: "$group", Value: bson.M{
			"_id":     nil,
			"total":   bson.M{"$sum": "$total"},
			"average": bson.M{"$avg": "$total"},
		}}},
	}
	revenueCursor, err := da.orders.Aggregate(ctx, revenuePipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to calculate revenue: %w", err)
	}
	defer revenueCursor.Close(ctx)
	// The single-group pipeline yields at most one document.
	if revenueCursor.Next(ctx) {
		if err := revenueCursor.Decode(&revenue); err != nil {
			return nil, fmt.Errorf("failed to decode revenue: %w", err)
		}
	}
	// Next returning false can mean "no documents" or "cursor failed".
	if err := revenueCursor.Err(); err != nil {
		return nil, fmt.Errorf("failed to read revenue: %w", err)
	}

	topCategoriesPipeline := mongo.Pipeline{
		recentWindow(),
		{{Key: "$group", Value: bson.M{
			"_id":   "$category",
			"count": bson.M{"$sum": 1},
			"total": bson.M{"$sum": "$total"},
		}}},
		{{Key: "$sort", Value: bson.M{"count": -1}}},
		{{Key: "$limit", Value: 5}},
	}
	topCategoriesCursor, err := da.orders.Aggregate(ctx, topCategoriesPipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to get top categories: %w", err)
	}
	defer topCategoriesCursor.Close(ctx)
	var topCategories []bson.M
	if err := topCategoriesCursor.All(ctx, &topCategories); err != nil {
		return nil, fmt.Errorf("failed to decode top categories: %w", err)
	}

	// Join against the configured products collection; keep the historical
	// "products" name when none was supplied.
	productsName := "products"
	if da.products != nil {
		productsName = da.products.Name()
	}
	recentPipeline := mongo.Pipeline{
		recentWindow(),
		{{Key: "$sort", Value: bson.M{"createdAt": -1}}},
		{{Key: "$limit", Value: 10}},
		{{Key: "$lookup", Value: bson.M{
			"from":         productsName,
			"localField":   "productId",
			"foreignField": "_id",
			"as":           "product",
		}}},
		{{Key: "$unwind", Value: "$product"}},
		{{Key: "$project", Value: bson.M{
			"orderId":   "$_id",
			"product":   "$product.name",
			"total":     1,
			"createdAt": 1,
		}}},
	}
	recentCursor, err := da.orders.Aggregate(ctx, recentPipeline)
	if err != nil {
		return nil, fmt.Errorf("failed to get recent orders: %w", err)
	}
	defer recentCursor.Close(ctx)
	var recentOrders []bson.M
	if err := recentCursor.All(ctx, &recentOrders); err != nil {
		return nil, fmt.Errorf("failed to decode recent orders: %w", err)
	}

	return &DashboardStats{
		TotalOrders:   totalOrders,
		TotalRevenue:  revenue.Total,
		AverageOrder:  revenue.Average,
		TopCategories: topCategories,
		RecentOrders:  recentOrders,
	}, nil
}
五、性能优化
5.1 聚合管道优化
package mongo
import (
"context"
"fmt"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"
"go.mongodb.org/mongo-driver/mongo/options"
)
// AggregationOptimizer groups the explain and pipeline-rewrite helpers for
// one collection.
type AggregationOptimizer struct {
// collection is the collection ExplainAggregation runs against.
collection *mongo.Collection
}
// NewAggregationOptimizer returns an AggregationOptimizer bound to
// collection.
func NewAggregationOptimizer(collection *mongo.Collection) *AggregationOptimizer {
	optimizer := new(AggregationOptimizer)
	optimizer.collection = collection
	return optimizer
}
// ExplainAggregation runs pipeline through Aggregate with verbose explain
// options and decodes the outcome into an explain result. Failures of either
// step are returned wrapped.
//
// NOTE(review): options.Explain, AggregateOptions.SetExplainResult and
// mongo.ExplainResult do not appear to be part of the official
// go.mongodb.org/mongo-driver API — confirm this compiles. The usual route
// is presumably Database.RunCommand with an {explain: {aggregate: ...}}
// command document.
func (ao *AggregationOptimizer) ExplainAggregation(ctx context.Context, pipeline mongo.Pipeline) (*mongo.ExplainResult, error) {
explainOpts := options.Explain().SetVerbose(true)
result, err := ao.collection.Aggregate(ctx, pipeline, options.Aggregate().SetExplainResult(explainOpts))
if err != nil {
return nil, fmt.Errorf("failed to explain aggregation: %w", err)
}
var explainResult mongo.ExplainResult
// NOTE(review): if result is a *mongo.Cursor, Decode presumably needs a
// prior successful Next call — verify.
if err := result.Decode(&explainResult); err != nil {
return nil, fmt.Errorf("failed to decode explain result: %w", err)
}
return &explainResult, nil
}
// OptimizePipeline returns a copy of pipeline with every stage preserved in
// its original order.
//
// When a $lookup or $graphLookup stage directly follows a $match stage, the
// pushMatchBeforeLookup hook is consulted. The hook is currently a no-op, so
// the output always equals the input. The stage is appended regardless of
// the hook's result, so a join stage is never silently dropped. Empty
// stages are passed through untouched.
func (ao *AggregationOptimizer) OptimizePipeline(pipeline mongo.Pipeline) mongo.Pipeline {
	optimized := make(mongo.Pipeline, 0, len(pipeline))
	for _, stage := range pipeline {
		switch ao.stageName(stage) {
		case "$lookup", "$graphLookup":
			if n := len(optimized); n > 0 && ao.stageName(optimized[n-1]) == "$match" {
				// The hook is advisory and rewrites nothing today, so its
				// error cannot affect the result; the stage is kept either
				// way.
				_ = ao.pushMatchBeforeLookup(optimized, optimized[n-1], stage)
			}
		}
		optimized = append(optimized, stage)
	}
	return optimized
}

// stageName returns the operator name of an aggregation stage (the key of
// its first element), or "" when the stage document is empty.
func (ao *AggregationOptimizer) stageName(stage bson.D) string {
	if len(stage) == 0 {
		return ""
	}
	return stage[0].Key
}

// pushMatchBeforeLookup is the extension point for reordering a $match
// relative to a following join stage. It currently performs no rewrite and
// always returns nil.
func (ao *AggregationOptimizer) pushMatchBeforeLookup(pipeline mongo.Pipeline, matchStage bson.D, lookupStage bson.D) error {
	return nil
}
六、总结
MongoDB聚合管道是处理复杂数据分析的强大工具:
- $match优先:尽早过滤数据,减少后续阶段处理的数据量
- $project精简:只输出需要的字段,减少数据传输
- $group合理使用:注意内存限制,避免使用过大的$group
- 数组操作:$unwind、$filter、$map等让数组处理变得简单
- $lookup联表:实现类似SQL JOIN的功能,但要注意性能
- $facet并行:在单个阶段中执行多个聚合计算
掌握聚合管道的各个阶段和操作符,能够帮助您高效地处理各种数据分析需求。

826

被折叠的 条评论
为什么被折叠?



