Day15 Python Study

最新推荐文章于 2026-06-24 20:29:46 发布

原创最新推荐文章于 2026-06-24 20:29:46 发布 · 291 阅读

5 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

标签

#python #机器学习

python打卡日记专栏收录该内容

60 篇文章

订阅专栏

@浙大疏锦行

今日作业：对心脏病数据集利用今天学到的方法，并且结合交叉验证+超参数调优，来达到学习效果更好的目的。

from sklearn.ensemble import RandomForestClassifier #随机森林分类器
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标
from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵
import warnings #用于忽略警告信息
# Suppress every warning for the remainder of the script.
warnings.filterwarnings("ignore")

# --- 1. Random forest with default hyper-parameters ---
# Baseline model: trained on the training split and scored directly on the
# test split, so no separate validation split is involved here.
print("--- 1. 默认参数随机森林 (训练集 -> 测试集) ---")
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Print each test-set metric under its own heading.
for _heading, _metric in (
    ("\n默认随机森林 在测试集上的分类报告：", classification_report),
    ("默认随机森林 在测试集上的混淆矩阵：", confusion_matrix),
):
    print(_heading)
    print(_metric(y_test, rf_pred))

## --- 2. Random oversampling ---
print("\n--- 2. 随机过采样 (Random Oversampling) ---")
from imblearn.over_sampling import RandomOverSampler
import time

# sampling_strategy='minority' targets only the minority class; the fixed
# random_state keeps the resampling reproducible.
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)

# Resample the training split only -- the test split is never touched.
start_time = time.time()
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train, y_train)
end_time = time.time()
_elapsed = end_time - start_time
print(f"随机过采样耗时: {_elapsed:.4f} 秒")

# Class distribution of the resampled training labels.
print("\n随机过采样后训练集类别分布:")
_class_counts = pd.Series(y_resampled_ros).value_counts()
print(_class_counts)

# Fit a fresh forest on the oversampled data, then predict on the original
# test split.
rf_model_ros = RandomForestClassifier(random_state=42).fit(
    X_resampled_ros, y_resampled_ros
)
rf_pred_ros = rf_model_ros.predict(X_test)

# Print each test-set metric under its own heading.
for _heading, _metric in (
    ("\n随机过采样后模型 在测试集上的分类报告：", classification_report),
    ("随机过采样后模型 在测试集上的混淆矩阵：", confusion_matrix),
):
    print(_heading)
    print(_metric(y_test, rf_pred_ros))

## --- 6. Under-sampling: random under-sampling ---
print("\n--- 6. 欠采样：随机欠采样 (Random Under-Sampling) ---")
from imblearn.under_sampling import RandomUnderSampler

# sampling_strategy='majority' targets only the majority class; the fixed
# random_state keeps the resampling reproducible.
rus = RandomUnderSampler(sampling_strategy='majority', random_state=42)

# Under-sample the training split only; the test split is left untouched.
start_time = time.time()
X_resampled_rus, y_resampled_rus = rus.fit_resample(X_train, y_train)
end_time = time.time()
print(f"随机欠采样耗时: {end_time - start_time:.4f} 秒")

# Class distribution after resampling. The majority class is expected to be
# cut down to the minority-class count -- check the printed counts.
print("\n随机欠采样后训练集类别分布:")
print(pd.Series(y_resampled_rus).value_counts())

# Train a fresh forest on the under-sampled data and predict on the original
# test split.
rf_model_rus = RandomForestClassifier(random_state=42)
rf_model_rus.fit(X_resampled_rus, y_resampled_rus)
rf_pred_rus = rf_model_rus.predict(X_test)

print("\n随机欠采样后模型 在测试集上的分类报告：")
print(classification_report(y_test, rf_pred_rus))
# Heading added so the matrix is labelled like in the other sections.
print("随机欠采样后模型 在测试集上的混淆矩阵：")
print(confusion_matrix(y_test, rf_pred_rus))

## --- 7. Under-sampling: Edited Nearest Neighbours (ENN) ---
print("\n--- 7. 欠采样：Edited Nearest Neighbors (ENN) ---")
from imblearn.under_sampling import EditedNearestNeighbours

# ENN is a cleaning method: it removes samples that disagree with their
# neighbourhood (3 neighbours, kind_sel='all').
# NOTE(review): sampling_strategy='all' applies the rule to every class, so
# minority samples can be removed as well; imbalanced-learn's default 'auto'
# leaves the minority class alone -- confirm which behaviour is intended.
enn = EditedNearestNeighbours(sampling_strategy='all', n_neighbors=3, kind_sel='all')

# Clean the training split only; the test split is left untouched.
start_time = time.time()
X_resampled_enn, y_resampled_enn = enn.fit_resample(X_train, y_train)
end_time = time.time()
print(f"ENN 欠采样耗时: {end_time - start_time:.4f} 秒")

# Class distribution after cleaning. ENN does not aim for exact balance, so
# the class counts will generally still differ.
print("\nENN 欠采样后训练集类别分布:")
print(pd.Series(y_resampled_enn).value_counts())

# Train a fresh forest on the cleaned data and predict on the original
# test split.
rf_model_enn = RandomForestClassifier(random_state=42)
rf_model_enn.fit(X_resampled_enn, y_resampled_enn)
rf_pred_enn = rf_model_enn.predict(X_test)

print("\nENN 欠采样后模型 在测试集上的分类报告：")
print(classification_report(y_test, rf_pred_enn))
# Heading added so the matrix is labelled like in the other sections.
print("ENN 欠采样后模型 在测试集上的混淆矩阵：")
print(confusion_matrix(y_test, rf_pred_enn))

## --- 8. Combined sampling: SMOTE + ENN (SMOTEENN) ---
print("\n--- 8. 混合采样：SMOTE + ENN (SMOTENN) ---")
from imblearn.combine import SMOTEENN

# SMOTEENN chains SMOTE over-sampling with ENN cleaning; the fixed
# random_state keeps the result reproducible.
smote_enn = SMOTEENN(random_state=42)

# Resample the training split only -- the test split is never touched.
start_time = time.time()
X_resampled_smotenn, y_resampled_smotenn = smote_enn.fit_resample(X_train, y_train)
end_time = time.time()
_elapsed = end_time - start_time
print(f"SMOTE + ENN 混合采样耗时: {_elapsed:.4f} 秒")

# Class distribution of the resampled training labels; after SMOTE plus ENN
# cleaning the classes are usually close to balanced but not exactly equal.
print("\nSMOTE + ENN 混合采样后训练集类别分布:")
_class_counts = pd.Series(y_resampled_smotenn).value_counts()
print(_class_counts)

# Fit a fresh forest on the resampled data, then predict on the original
# test split.
rf_model_smotenn = RandomForestClassifier(random_state=42).fit(
    X_resampled_smotenn, y_resampled_smotenn
)
rf_pred_smotenn = rf_model_smotenn.predict(X_test)

# Print each test-set metric under its own heading.
for _heading, _metric in (
    ("\nSMOTE + ENN 混合采样后模型 在测试集上的分类报告：", classification_report),
    ("SMOTE + ENN 混合采样后模型 在测试集上的混淆矩阵：", confusion_matrix),
):
    print(_heading)
    print(_metric(y_test, rf_pred_smotenn))

print("\n--- 代价敏感学习：设置 class_weight='balanced' ---")
# Cost-sensitive learning on the original (un-resampled) X_train / y_train.

# Class distribution of the original training labels
# (presumably 0 = majority, 1 = minority -- confirm against the data).
print("原始训练集类别分布:")
_train_counts = y_train.value_counts()
print(_train_counts)

# The timed span covers model construction, fitting and test-set prediction.
start_time = time.time()
# class_weight='balanced' is the key setting: class weights are derived
# from the class frequencies instead of being uniform.
rf_model_weighted = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_pred_weighted = rf_model_weighted.fit(X_train, y_train).predict(X_test)
end_time = time.time()

_elapsed = end_time - start_time
print(f"权重调整模型训练与预测耗时: {_elapsed:.4f} 秒")

# Print each test-set metric under its own heading.
for _heading, _metric in (
    ("\n权重调整随机森林 在测试集上的分类报告：", classification_report),
    ("权重调整随机森林 在测试集上的混淆矩阵：", confusion_matrix),
):
    print(_heading)
    print(_metric(y_test, rf_pred_weighted))

print("\n--- 阈值调整：计算概率并寻找 F1-Score 最佳阈值 ---")
from sklearn.metrics import precision_recall_curve, f1_score, confusion_matrix
import numpy as np

# NOTE(review): the threshold is selected on the test split and then scored
# on that same split, so the figures printed below are optimistic. Choosing
# the threshold on a validation split (or via cross-validation) would give an
# unbiased estimate.

# 1. Probability of class 1 from the baseline model (rf_model) on the test split.
y_proba = rf_model.predict_proba(X_test)[:, 1]

# 2. Precision / recall at every candidate threshold.
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

# 3. F1 at every point of the curve.
# precision and recall can both be 0 (e.g. when the top-scoring samples are
# all negatives). A plain division would then give 0/0 = NaN, and np.argmax
# would pick that NaN entry as the "best" one. Define F1 as 0 there instead.
# fscores has one more element than thresholds (the final precision=1,
# recall=0 point has no threshold).
pr_sum = precisions + recalls
fscores = np.divide(
    2 * precisions * recalls,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# 4. Index of the best F1; [:-1] aligns fscores with thresholds.
ix = np.argmax(fscores[:-1])
best_threshold = thresholds[ix]
best_fscore = fscores[ix]

# 5. Report the selected threshold and the metrics at that point.
print(f"最佳 F1-Score 对应的阈值: {best_threshold:.4f}")
print(f"该阈值下的最大 F1-Score (类别 1): {best_fscore:.4f}")
print(f"该阈值下的精确率 (类别 1): {precisions[ix]:.4f}")
print(f"该阈值下的召回率 (类别 1): {recalls[ix]:.4f}")

# 6. Hard predictions at the selected threshold.
y_pred_best_threshold = (y_proba >= best_threshold).astype(int)

# 7. Final evaluation at the selected threshold.
print(f"\n基准模型，使用最佳 F1 阈值 ({best_threshold:.4f}) 时的分类报告：")
print(classification_report(y_test, y_pred_best_threshold))
print("该阈值下的混淆矩阵：")
print(confusion_matrix(y_test, y_pred_best_threshold))

print("\n--- 可视化不同阈值下的 F1-Score 变化 ---")

# 1. Draw F1, precision and recall against the decision threshold.
# Each metric array has one more point than thresholds, hence the [:-1].
plt.figure(figsize=(10, 6))
for _values, _label, _line_style in (
    (fscores, 'F1-Score', {'linewidth': 2}),
    (precisions, '精确率 (Precision)', {'linestyle': '--'}),
    (recalls, '召回率 (Recall)', {'linestyle': ':'}),
):
    plt.plot(thresholds, _values[:-1], label=_label, **_line_style)

# 2. Highlight the best-F1 point (best_threshold / best_fscore come from the
# threshold search earlier in the script).
_best_point = (best_threshold, best_fscore)
plt.scatter(
    *_best_point,
    marker='o',
    color='red',
    s=100,
    label=f'最佳 F1 阈值 ({best_threshold:.4f})',
)
plt.annotate(
    f'Max F1: {best_fscore:.4f}',
    xy=_best_point,
    xytext=(best_threshold + 0.05, best_fscore - 0.1),
    arrowprops={'facecolor': 'black', 'shrink': 0.05},
    fontsize=12,
)

# 3. Titles, axis labels, grid and legend.
plt.title('不同分类阈值下的性能指标变化', fontsize=16)
plt.xlabel('分类阈值 (Threshold)', fontsize=14)
plt.ylabel('性能指标值', fontsize=14)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend(fontsize=12)
plt.show()

# No manual threshold step is used here: precision_recall_curve already
# supplies the thresholds derived from the data points themselves.
# To study F1 over a particular range, plot only that slice of the curve.