GridSearchCV简介

原创于 2026-04-04 07:04:52 发布 · 249 阅读

3 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

标签

#python #机器学习 #人工智能

GridSearchCV 是 scikit-learn 中用于超参数调优的工具，通过穷举搜索指定的参数组合，结合交叉验证来找到模型的最优参数。

什么是超参数？

超参数是模型训练前需要手动设置的参数，模型不会自动学习它们。

# Example: hyperparameters of a KNN classifier. They are chosen by the user
# before training; fitting the model never changes them.
#   n_neighbors - the K in K-nearest-neighbours
#   weights     - how neighbours are weighted when voting
#   p           - power of the Minkowski distance (2 = Euclidean)
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', p=2)

GridSearchCV 的核心思想

穷举搜索 + 交叉验证：

你提供一个参数网格（可能的参数值组合）
GridSearchCV 尝试每一种组合
对每种组合进行交叉验证，计算平均性能
返回性能最好的参数组合

基本用法

1. 最简单的例子

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the iris data and hold out 20% of it as the final test set.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Step 1: the model whose hyperparameters are tuned.
knn = KNeighborsClassifier()

# Step 2: the parameter grid -- 4 values of K times 2 weighting schemes,
# i.e. 4 x 2 = 8 candidate combinations.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
)

# Step 3: the search object (5-fold CV, accuracy as the metric,
# verbose=1 prints progress).
grid_search = GridSearchCV(
    knn,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
)

# Step 4: run the search on the training split only.
grid_search.fit(X_train, y_train)

# Step 5: report the winner and check it on the held-out test set.
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证得分: {grid_search.best_score_:.4f}")
print(f"测试集得分: {grid_search.score(X_test, y_test):.4f}")

参数详解

参数	说明	示例
`estimator`	要调优的模型	`KNeighborsClassifier()`
`param_grid`	参数网格（字典或列表）	`{'n_neighbors': [3,5,7]}`
`cv`	交叉验证折数	`5`（5折）、`3`（3折）
`scoring`	评估指标	`'accuracy'`、`'f1'`、`'roc_auc'`
`n_jobs`	并行计算核心数	`-1`（使用所有核心）
`verbose`	详细程度（0,1,2）	`1`（显示进度）
`refit`	用最佳参数重新训练	`True`（默认）

复杂参数网格

1. 字典形式（笛卡尔积）

# A single dict means the full Cartesian product of its value lists:
# 3 (n_neighbors) x 2 (weights) x 2 (p) = 12 candidate combinations.
param_grid = dict(
    n_neighbors=[3, 5, 7],
    weights=['uniform', 'distance'],
    p=[1, 2],
)

2. 列表形式（不同参数集）

# A list of dicts describes several independent sub-grids: the search takes
# the Cartesian product inside each dict and then the union across dicts
# (2 + 2 + 4 = 8 candidates here).
#
# Every key must be a valid parameter of the estimator handed to
# GridSearchCV, so SVM parameters ('kernel', 'C') cannot sit in a grid for a
# bare KNeighborsClassifier -- fitting would fail with "Invalid parameter".
# To search across different model types, make the model itself a pipeline
# step ('clf') and swap it per sub-grid.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

model = Pipeline([('clf', KNeighborsClassifier())])

param_grid = [
    # KNN with uniform weights
    {'clf': [KNeighborsClassifier()],
     'clf__n_neighbors': [3, 5], 'clf__weights': ['uniform']},
    # KNN with distance weights
    {'clf': [KNeighborsClassifier()],
     'clf__n_neighbors': [7, 9], 'clf__weights': ['distance']},
    # SVM with its own parameters
    {'clf': [SVC()],
     'clf__kernel': ['linear', 'rbf'], 'clf__C': [0.1, 1]},
]
# Usage: GridSearchCV(model, param_grid, cv=5).fit(X_train, y_train)

完整实战示例

示例1：KNN 参数调优

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Step 1 -- data: iris, with 20% held out as the final test set.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Step 2 -- pipeline: standardize the features, then classify with KNN.
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Step 3 -- parameter grid. Inside a pipeline each key is
# '<step name>__<parameter name>'; make_pipeline names a step after its
# lower-cased class name.
param_grid = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7, 9, 11],
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__p': [1, 2],
}

# Step 4 -- search object: 5-fold CV, accuracy metric, all CPU cores,
# progress output enabled.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Step 5 -- run the search on the training split.
grid_search.fit(X_train, y_train)

# Step 6 -- summary of the best candidate.
print("="*50)
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳交叉验证得分: {grid_search.best_score_:.4f}")
print(f"测试集得分: {grid_search.score(X_test, y_test):.4f}")
print("="*50)

# 7. Inspect every candidate as a table.
# pandas is not among the imports at the top of this example, so it is
# imported here; without it `pd.DataFrame` raises NameError.
import pandas as pd

results = pd.DataFrame(grid_search.cv_results_)
print(results[['param_kneighborsclassifier__n_neighbors',
               'param_kneighborsclassifier__weights',
               'mean_test_score', 'std_test_score']].head())

示例2：SVM 参数调优

from sklearn.svm import SVC

# SVM has more hyperparameters worth tuning.
# `gamma` is only used by the RBF kernel; the linear kernel ignores it.
# A single dict would cross gamma with kernel='linear' and fit 12 redundant
# candidates, so each kernel gets its own sub-grid:
# 4 x 4 (rbf) + 4 (linear) = 20 candidates instead of 32.
param_grid = [
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10, 100],          # inverse regularization strength (larger C = weaker regularization)
        'gamma': [0.001, 0.01, 0.1, 1],  # RBF kernel coefficient
    },
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],
    },
]

svm = SVC()
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳得分: {grid_search.best_score_:.4f}")

示例3：随机森林参数调优

from sklearn.ensemble import RandomForestClassifier

# Random forest grid: 3 x 4 x 3 x 3 = 108 candidates.
#   n_estimators      - number of trees
#   max_depth         - maximum depth of each tree (None = no explicit limit given)
#   min_samples_split - minimum samples needed to split an internal node
#   min_samples_leaf  - minimum samples required at a leaf
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf = RandomForestClassifier(random_state=42)

# 3-fold CV on all CPU cores, with progress output.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

查看结果的属性

# After fit(), a GridSearchCV object exposes the attributes shown below.

# 1. Best parameter combination
print(grid_search.best_params_)
# e.g. {'n_neighbors': 5, 'weights': 'uniform'}  (keys depend on the grid that was searched)

# 2. Cross-validated score of the best combination
print(grid_search.best_score_)
# e.g. 0.9583333333333334

# 3. Best model, already refitted with the best parameters (refit defaults to True)
best_model = grid_search.best_estimator_

# 4. Detailed results for every parameter combination
results = grid_search.cv_results_
print(f"平均测试得分: {results['mean_test_score']}")
print(f"得分标准差: {results['std_test_score']}")
print(f"参数组合: {results['params']}")

# 5. Rank of each parameter combination
print(f"排名: {results['rank_test_score']}")

# 6. Score of the best model on the held-out test set
test_score = grid_search.score(X_test, y_test)

可视化结果

import matplotlib.pyplot as plt
import pandas as pd  # required for pd.DataFrame below
import seaborn as sns

# NOTE: this plot expects `grid_search` to be the fitted KNN *pipeline*
# search (Example 1); the 'param_kneighborsclassifier__*' columns only exist
# for that grid. Re-run that example first if `grid_search` has since been
# reassigned to another search.

# Convert the results to a DataFrame
results_df = pd.DataFrame(grid_search.cv_results_)

# Plot the score against K, one line per weighting scheme. Candidates that
# differ only in `p` share the same x/hue value, so seaborn draws their mean
# with an error band.
plt.figure(figsize=(10, 6))
sns.lineplot(
    data=results_df,
    x='param_kneighborsclassifier__n_neighbors',
    y='mean_test_score',
    hue='param_kneighborsclassifier__weights',
    marker='o'
)
plt.title('Grid Search Results')
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Cross-Validation Accuracy')
plt.legend(title='Weights')
plt.grid(True)
plt.show()

不同评分指标

# Compare the best cross-validated score under several classification metrics.
# `estimator` and `param_grid` are defined here so the snippet runs on its
# own: `estimator` was otherwise undefined, and a `param_grid` left over from
# another model would contain parameters this estimator rejects.
estimator = KNeighborsClassifier()
param_grid = {'n_neighbors': [3, 5, 7, 9]}

# Display name -> scikit-learn scorer string. The macro / one-vs-rest
# variants are used because the target is multiclass.
scoring_options = {
    'accuracy': 'accuracy',           # accuracy
    'f1_macro': 'f1_macro',           # F1 score (macro average)
    'roc_auc': 'roc_auc_ovr',         # ROC-AUC, one-vs-rest (needs predict_proba)
    'precision': 'precision_macro',   # precision (macro average)
    'recall': 'recall_macro'          # recall (macro average)
}

# One full search per metric; each iteration overwrites `grid_search`.
for name, metric in scoring_options.items():
    grid_search = GridSearchCV(
        estimator, param_grid,
        scoring=metric, cv=5
    )
    grid_search.fit(X_train, y_train)
    print(f"{name}: {grid_search.best_score_:.4f}")

注意事项

1. 计算成本高

# 假设：
# - 参数组合数: 20
# - 交叉验证折数: 5
# - 总训练次数: 20 × 5 = 100 次（refit=True 时，还会用最佳参数在全部训练数据上再训练 1 次）
# 
# 如果每次训练需要1秒，总耗时100秒

# 解决方案：
# - 减少参数组合
# - 降低 cv 值（如 cv=3）
# - 使用 n_jobs=-1 并行计算
# - 先用 RandomizedSearchCV 粗调，再用 GridSearchCV 细调

2. 数据泄露风险

# ❌ Wrong: preprocessing the whole dataset before the search.
# (X / y stand for the full, unsplit data.)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # leak: the scaler has seen every fold's validation rows
grid_search.fit(X_scaled, y)

# ✅ Right: put the scaler inside a Pipeline and search over the pipeline, so
# the scaler is re-fitted on the training part of each CV split only.
# The pipeline is passed to the GridSearchCV constructor -- fit() takes only
# the data (X, y), never the estimator.
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
param_grid = {'kneighborsclassifier__n_neighbors': [3, 5, 7]}
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

3. 参数命名规则

# Inside a Pipeline every parameter name needs a step prefix.
pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Form: '<step name>__<parameter name>'. make_pipeline names each step after
# its lower-cased class name.
param_grid = {
    'kneighborsclassifier__n_neighbors': [3, 5, 7]
}

# Equivalent: read the step name from the pipeline instead of typing it.
# The classifier is the last step, and each entry of `steps` is a
# (name, estimator) pair.
knn_step_name = pipeline.steps[-1][0]
param_grid = {
    knn_step_name + '__n_neighbors': [3, 5, 7]
}

GridSearchCV vs RandomizedSearchCV

特性	GridSearchCV	RandomizedSearchCV
搜索方式	穷举所有组合	随机采样部分组合
参数空间	离散值	连续或离散
计算成本	高	低
适用场景	参数空间小	参数空间大
能否找到最优	一定能（在给定网格内）	不一定

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# Randomized search: sample a fixed number of candidates from the
# distributions / lists below instead of trying every combination.
param_dist = {
    # scipy's randint(low, high) excludes `high`, so 21 is needed to draw
    # integers from 3 through 20 inclusive.
    'n_neighbors': randint(3, 21),
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

random_search = RandomizedSearchCV(
    knn, param_dist,
    n_iter=20,          # number of candidates to sample
    cv=5,
    random_state=42     # makes the sampling reproducible
)

总结

要点	说明
作用	自动寻找模型的最优超参数
核心	穷举搜索 + 交叉验证
优点	能找到给定范围内的最优参数
缺点	计算成本高，参数空间大时慢
最佳实践	结合 Pipeline 使用，避免数据泄露
适用场景	超参数数量少（通常 2~4 个）、每个参数取值少，使组合总数（各参数取值个数的乘积）保持在可接受范围内