Day17 多分类问题-CSDN博客

作业：尝试对sklearn自带的红酒数据集完成机器学习多分类流程

#Day17 多分类问题
#尝试对sklearn自带的红酒数据集完成机器学习多分类流程
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc

# Plot styling. The figures in this file use Chinese labels, so a font with CJK
# glyphs is needed. SimHei stays first to preserve the original look where it
# is installed; the remaining entries are fallbacks for systems without it
# (matplotlib picks the first family in the list that it can find).
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'Heiti TC',
    'Arial Unicode MS',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'DejaVu Sans',
]
# Draw the minus sign as ASCII '-' so it is not lost with CJK fonts.
plt.rcParams['axes.unicode_minus'] = False

# Load the wine dataset: feature matrix, integer targets, class names.
wine = datasets.load_wine()
X = wine.data
y = wine.target
n_classes = len(np.unique(y))
class_names = wine.target_names

# Load the iris dataset the same way (used by the iris ROC example).
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target
n_classes_iris = len(np.unique(y_iris))
class_names_iris = iris.target_names

# Preview the wine data as a DataFrame with the target appended as a column.
df = pd.DataFrame(X, columns=wine.feature_names)
df['label (标签)'] = y
# Shown only when this is the last expression of a notebook cell; as a plain
# script statement the result is discarded.
df.head()

效果图可在 Day17 参考代码文件中查看，这里不再截图。

鸢尾花多分类ROC曲线

# Iris multi-class ROC curves (logistic regression, one-vs-rest).
# Binarize the labels: one indicator column per class, so each column can be
# treated as its own binary problem.
y_bin = label_binarize(y_iris, classes=[0, 1, 2])
X_train, X_test, y_train, y_test = train_test_split(
    X_iris, y_bin, test_size=0.5, random_state=0
)

# Fit the one-vs-rest model, then take decision scores on the held-out half.
classifier = OneVsRestClassifier(
    LogisticRegression(random_state=0, solver='liblinear')
)
classifier.fit(X_train, y_train)
y_score = classifier.decision_function(X_test)

# ROC curve and AUC for every class, stored in dicts keyed by class index.
fpr, tpr, roc_auc = {}, {}, {}
for k in range(n_classes_iris):
    fpr[k], tpr[k], _ = roc_curve(y_test[:, k], y_score[:, k])
    roc_auc[k] = auc(fpr[k], tpr[k])

# Draw one ROC curve per class.
plt.figure(figsize=(10, 8))
colors = ['aqua', 'darkorange', 'cornflowerblue']
for k, line_color in zip(range(n_classes_iris), colors):
    legend_text = '类别 {0} (AUC = {1:0.2f})'.format(class_names_iris[k], roc_auc[k])
    plt.plot(fpr[k], tpr[k], color=line_color, lw=2, label=legend_text)

# Diagonal reference line, labelled as random guessing.
plt.plot([0, 1], [0, 1], 'k--', lw=2, label='随机猜测')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('FPR (假正率)', fontsize=12)
plt.ylabel('TPR (真正率)', fontsize=12)
plt.title('鸢尾花多分类 ROC 曲线 (One-vs-Rest)', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

效果图可在参考代码文件中查看，这里不再演示。

红酒多分类ROC曲线 (随机森林)

# Wine multi-class ROC curves (random forest).
# The binarized labels are split together with the features so that the test
# half keeps a one-column-per-class indicator matrix for roc_curve.
y_bin = label_binarize(y, classes=[0, 1, 2])
X_train, X_test, y_train_bin, y_test_bin = train_test_split(
    X, y_bin, test_size=0.5, random_state=0
)

# The forest is trained on integer labels, recovered here as the position of
# the 1 in each indicator row.
y_train_int = y_train_bin.argmax(axis=1)
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train_int)

# Class-probability estimates for the held-out half.
y_score = clf.predict_proba(X_test)

# ROC curve and AUC for every class, stored in dicts keyed by class index.
fpr_rf, tpr_rf, roc_auc_rf = {}, {}, {}
for k in range(n_classes):
    fpr_rf[k], tpr_rf[k], _ = roc_curve(y_test_bin[:, k], y_score[:, k])
    roc_auc_rf[k] = auc(fpr_rf[k], tpr_rf[k])

# Draw one ROC curve per class.
plt.figure(figsize=(10, 8))
colors = ['aqua', 'darkorange', 'cornflowerblue']
for k, line_color in zip(range(n_classes), colors):
    legend_text = '类别 {0} (AUC = {1:0.2f})'.format(class_names[k], roc_auc_rf[k])
    plt.plot(fpr_rf[k], tpr_rf[k], color=line_color, lw=2, label=legend_text)

# Diagonal reference line, labelled as random guessing.
plt.plot([0, 1], [0, 1], 'k--', lw=2, label='随机猜测')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('FPR (假正率)', fontsize=12)
plt.ylabel('TPR (真正率)', fontsize=12)
plt.title('红酒多分类 ROC 曲线 (随机森林)', fontsize=16)
plt.legend(loc="lower right", fontsize=12)
plt.grid(True, alpha=0.3)
plt.show()

多分类综合评估

# ==========================================
# 多分类综合评估：分类指标 + AUC平均 + ROC曲线
# ==========================================
from sklearn.metrics import classification_report, roc_auc_score

# Predicted and true class indices for the wine test split. Both y_score and
# y_test_bin are expected to come from the wine random-forest cell.
y_pred_class = y_score.argmax(axis=1)
y_test_integer = y_test_bin.argmax(axis=1)

# 1. Per-class precision / recall / F1 report (includes macro and weighted
#    averages).
print("="*20 + " 多分类详细指标 " + "="*20)
report_text = classification_report(y_test_integer, y_pred_class, target_names=class_names)
print(report_text)

# 2. Three ways of averaging AUC.
# Micro-average: flatten every (sample, class) pair into one binary problem.
flat_truth = y_test_bin.ravel()
flat_score = y_score.ravel()
fpr_rf["micro"], tpr_rf["micro"], _ = roc_curve(flat_truth, flat_score)
roc_auc_rf["micro"] = auc(fpr_rf["micro"], tpr_rf["micro"])

# Macro-average: interpolate each class's TPR onto the union of all FPR
# points, then average the interpolated curves.
all_fpr = np.unique(np.concatenate([fpr_rf[k] for k in range(n_classes)]))
interp_tprs = [np.interp(all_fpr, fpr_rf[k], tpr_rf[k]) for k in range(n_classes)]
mean_tpr = np.mean(interp_tprs, axis=0)
fpr_rf["macro"], tpr_rf["macro"] = all_fpr, mean_tpr
roc_auc_rf["macro"] = auc(fpr_rf["macro"], tpr_rf["macro"])

# Weighted-average: delegated to roc_auc_score with average='weighted'.
weighted_auc = roc_auc_score(y_test_bin, y_score, average='weighted', multi_class='ovr')

print(f"\n{'='*20} AUC 汇总 {'='*20}")
print(f"  Micro-average AUC : {roc_auc_rf['micro']:.4f}")
print(f"  Macro-average AUC : {roc_auc_rf['macro']:.4f}")
print(f"  Weighted-average AUC : {weighted_auc:.4f}")

# 3. Combined ROC figure.
plt.figure(figsize=(10, 8))

# Averaged curves (thick dotted lines).
plt.plot(
    fpr_rf["micro"], tpr_rf["micro"], 'deeppink', linestyle=':', linewidth=4,
    label=f'微平均 (AUC={roc_auc_rf["micro"]:.2f})',
)
plt.plot(
    fpr_rf["macro"], tpr_rf["macro"], 'navy', linestyle=':', linewidth=4,
    label=f'宏平均 (AUC={roc_auc_rf["macro"]:.2f})',
)

# Per-class curves (thin solid lines).
colors = ['aqua', 'darkorange', 'cornflowerblue']
for k, line_color in zip(range(n_classes), colors):
    plt.plot(
        fpr_rf[k], tpr_rf[k], color=line_color, lw=2,
        label=f'{class_names[k]} (AUC={roc_auc_rf[k]:.2f})',
    )

# Random-guess baseline.
plt.plot([0, 1], [0, 1], 'k--', lw=2, label='随机猜测')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('FPR (假正率)', fontsize=12)
plt.ylabel('TPR (真正率)', fontsize=12)
plt.title('红酒多分类 ROC 曲线 (含 Micro & Macro 平均)', fontsize=16)
plt.legend(loc="lower right", fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

效果图：

KS曲线

# ==========================================
# 二分类进阶指标：MCC & KS 统计量
# ==========================================
from sklearn.metrics import matthews_corrcoef
from scipy.stats import ks_2samp

# 1. Simulate binary-classification scores: n_samples negatives followed by
#    n_samples positives, drawn from two normal distributions and clipped to
#    the [0, 1] range.
np.random.seed(42)
n_samples = 10000
y_true = np.array([0]*n_samples + [1]*n_samples)
y_scores = np.concatenate([
    np.random.normal(0.3, 0.15, n_samples),  # scores of the negative samples
    np.random.normal(0.6, 0.15, n_samples)   # scores of the positive samples
])
y_scores = np.clip(y_scores, 0, 1)

# 2. MCC (Matthews correlation coefficient) at a fixed 0.5 threshold.
y_pred = (y_scores > 0.5).astype(int)
mcc = matthews_corrcoef(y_true, y_pred)
print(f"【MCC (马修斯相关系数)】: {mcc:.4f}  (范围 -1~1，越接近1越好)")

# 3. KS statistic, computed two ways.
# Method A: two-sample KS test between positive and negative score samples.
ks_scipy, _ = ks_2samp(y_scores[y_true==1], y_scores[y_true==0])

# Method B: derive it from the ROC curve as max(TPR - FPR); this also yields
# the threshold at which the separation is largest, which the plot needs.
fpr, tpr, thresholds = roc_curve(y_true, y_scores)
ks_diff = tpr - fpr
max_idx = np.argmax(ks_diff)
max_ks = ks_diff[max_idx]
best_thresh = thresholds[max_idx]

print(f"【KS 统计量】: {max_ks:.4f}  (最佳阈值: {best_thresh:.4f})")
print(f"  - scipy 计算: {ks_scipy:.4f}")

# 4. Plot the KS curve.
plt.figure(figsize=(10, 6))

# roc_curve prepends an artificial threshold that lies above every score
# (np.inf in recent scikit-learn releases, max score + 1 in older ones). It is
# not a real score, so keep it out of the plotted curves. The scores were
# clipped to [0, 1] above, hence every genuine threshold satisfies <= 1.0.
plot_mask = thresholds <= 1.0

# TPR and FPR as functions of the decision threshold.
plt.plot(thresholds[plot_mask], tpr[plot_mask], 'r-', label='TPR (正样本捕获率)', lw=2)
plt.plot(thresholds[plot_mask], fpr[plot_mask], 'b-', label='FPR (负样本误伤率)', lw=2)

# Mark the largest vertical gap between the two curves (the KS statistic).
plt.plot([best_thresh, best_thresh], [fpr[max_idx], tpr[max_idx]],
         'g--', lw=3, label=f'KS={max_ks:.3f}')
plt.scatter([best_thresh]*2, [fpr[max_idx], tpr[max_idx]],
            c='green', s=100, zorder=5)

# Figure decoration.
plt.title('KS 曲线 (Kolmogorov-Smirnov) - 金融风控常用指标', fontsize=14)
plt.xlabel('预测阈值 (Threshold)', fontsize=12)
plt.ylabel('累积比例', fontsize=12)
plt.xlim([0, 1])
plt.ylim([0, 1.05])
plt.legend(loc='best', fontsize=11)
plt.grid(True, alpha=0.3)
plt.text(best_thresh+0.03, (fpr[max_idx]+tpr[max_idx])/2,
         f'← 最大分离点\n  KS={max_ks:.3f}',
         color='green', fontweight='bold', fontsize=10)

plt.show()

效果图：

多分类 MCC 阈值优化 (基于概率预测)

# ==========================================
# 多分类 MCC 阈值优化 (基于概率预测)
# ==========================================
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, matthews_corrcoef

# --- 1. Plot styling ---
# The figure uses Chinese labels, so a font with CJK glyphs is needed. SimHei
# stays first to preserve the original look where it is installed; the other
# entries are fallbacks for systems without it (matplotlib picks the first
# family in the list that it can find).
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'Heiti TC',
    'Arial Unicode MS',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'DejaVu Sans',
]
# Draw the minus sign as ASCII '-' so it is not lost with CJK fonts.
plt.rcParams['axes.unicode_minus'] = False

# --- 2. Load and preprocess the data ---
wine = datasets.load_wine()
X = wine.data
y = wine.target
class_names = wine.target_names  # ['class_0', 'class_1', 'class_2']

# Noise is added purely for demonstration: per the author, the wine dataset is
# easy enough that MCC would otherwise reach 1.0 almost everywhere. Appending
# 200 * n_features random columns makes the problem harder.
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape
X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]

# Binarize the labels: one indicator column per class.
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]

# Split into training and test halves.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

# --- 3. Train the model (probability output) ---
# Logistic regression wrapped in a one-vs-rest strategy.
classifier = OneVsRestClassifier(LogisticRegression(random_state=0, solver='liblinear'))

# predict_proba is used (rather than decision_function) so that the scores
# are probabilities in [0, 1]. y_score has shape (n_samples, n_classes);
# column i holds the positive-class probability for class i.
y_score = classifier.fit(X_train, y_train).predict_proba(X_test)

# --- 4. Sweep thresholds, compute MCC, and find the best point per class ---
plt.figure(figsize=(10, 8))
colors = ['aqua', 'darkorange', 'cornflowerblue']

print("="*20 + " 各类别最佳 MCC 分析 (基于概率) " + "="*20)

for i in range(n_classes):
    # True 0/1 labels and predicted probabilities for the current class.
    y_true_cls = y_test[:, i]
    y_prob_cls = y_score[:, i]

    # Use the thresholds returned by roc_curve as candidates: they are cut
    # points that actually occur in the data, in decreasing order.
    current_fpr, current_tpr, current_thresholds = roc_curve(y_true_cls, y_prob_cls)

    # roc_curve prepends an artificial threshold that lies above every score
    # (np.inf in recent scikit-learn releases, max score + 1 in older ones).
    # It is not a valid probability, so drop everything above 1.0.
    valid_thresholds = current_thresholds[current_thresholds <= 1.0]

    # MCC at each candidate threshold; a sample is predicted positive when
    # its probability is >= the threshold.
    mcc_list = [
        matthews_corrcoef(y_true_cls, (y_prob_cls >= t).astype(int))
        for t in valid_thresholds
    ]

    # Locate the threshold with the highest MCC.
    best_idx = np.argmax(mcc_list)
    best_mcc = mcc_list[best_idx]
    best_thresh = valid_thresholds[best_idx]

    print(f"【类别 {class_names[i]}】")
    print(f"  最佳 MCC: {best_mcc:.4f}")
    print(f"  最佳阈值: {best_thresh:.4f} (概率值)")
    print("-" * 30)

    # MCC as a function of the threshold for this class.
    plt.plot(valid_thresholds, mcc_list, color=colors[i], lw=2,
             label=f'{class_names[i]} (Max MCC={best_mcc:.2f})')

    # Highlight the maximum.
    plt.scatter(best_thresh, best_mcc, s=100, color=colors[i], edgecolors='k', zorder=10)

# --- 5. Finish the figure ---
plt.xlabel('概率阈值 (Probability Threshold)', fontsize=12)
plt.ylabel('MCC 值', fontsize=12)
plt.title('红酒数据集各类别 MCC 随概率阈值变化曲线', fontsize=16)
plt.legend(loc="lower center")  # legend at the bottom so it does not cover the curves
plt.grid(True, alpha=0.3)

# The x axis is limited to [0, 1] because the thresholds are probabilities.
plt.xlim([0.0, 1.0])
plt.ylim([-0.2, 1.05])

plt.show()

效果图：