import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the heart-disease dataset. The path is relative to the current working
# directory; forward slashes are used so it resolves on Windows and POSIX
# alike (backslash separators only work on Windows).
data = pd.read_csv('./python60-days-challenge-master/heart.csv')
# Automatically split the columns into discrete and continuous features by
# dtype: object columns are treated as discrete, int64/float64 as continuous.
discrete_features = data.select_dtypes(include=['object']).columns.tolist()
continuous_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Fill missing values in discrete features with the column mode.
for feature in discrete_features:
    modes = data[feature].mode()
    if modes.empty:
        # The column has no non-missing values, so there is no mode to
        # impute with; leave it untouched instead of raising on modes[0].
        continue
    # Assign the filled column back rather than calling
    # data[feature].fillna(..., inplace=True): the in-place form operates on
    # an intermediate object, triggers a pandas chained-assignment warning,
    # and does not update `data` when copy-on-write is enabled.
    data[feature] = data[feature].fillna(modes.iloc[0])
# Fill missing values in continuous features with the column median.
for feature in continuous_features:
    print(f"Column {feature} data type: {data[feature].dtype}")
    median_value = data[feature].median()
    print(f"Median_value for {feature}: {median_value}")
    if pd.isnull(median_value):
        # The median is NaN when the column has no valid values; fall back
        # to 0 so the fill below still removes the missing entries.
        median_value = 0
    data.loc[:, feature] = data[feature].fillna(median_value)
# One-hot encode the discrete features (pandas makes this convenient).
# drop_first=True omits the first level of each feature; dtype=int emits
# 0/1 integer indicator columns.
data = pd.get_dummies(
    data,
    columns=discrete_features,
    drop_first=True,
    dtype=int,
)
# Data visualisation.
# Font setup for the Chinese plot titles: select the SimHei font and turn off
# Unicode minus signs.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# One 15x10 figure holding a 2x2 grid of panels.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_age, ax_pressure), (ax_chol, ax_target) = axes

# Top-left: histogram of age with a KDE overlay.
sns.histplot(data['age'], kde=True, ax=ax_age)
ax_age.set_title('年龄分布图')

# Top-right: scatter of 'trestbps' against 'thalach'.
sns.scatterplot(x='trestbps', y='thalach', data=data, ax=ax_pressure)
ax_pressure.set_title('静息血压与最大心率关系')

# Bottom-left: box plot of 'chol'.
sns.boxplot(y='chol', data=data, ax=ax_chol)
ax_chol.set_title('胆固醇分布图')

# Bottom-right: violin plot of age grouped by 'target'.
sns.violinplot(x='target', y='age', data=data, ax=ax_target)
ax_target.set_title('年龄与心脏病关系图')

plt.tight_layout()
plt.show()

# @浙大疏锦行