字段说明
instant:记录号
dteday:日期
season:季节
1=春天
2=夏天
3=秋天
4=冬天
yr:年份,(0: 2011, 1:2012)
mnth:月份( 1 to 12)
hr:小时 (0 to 23) (只在 hour.csv 有,作业忽略此字段)
holiday:是否是节假日
weekday:星期中的哪天,取值为 0~6
workingday:是否工作日
1=工作日 (非周末和节假日)
0=周末或节假日
weathersit:天气
1:晴天,多云
2:雾天,阴天
3:小雪,小雨
4:大雨,大雪,大雾
temp:气温摄氏度
atemp:体感温度
hum:湿度
windspeed:风速
y值
casual:非注册用户个数
registered:注册用户个数
cnt:给定日期(天)时间(每小时)总租车人数,响应变量 y
注意:后三个特征均为要预测的 y,作业里只需对 cnt 进行预测
黑色标记的特征为输入特征 x
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as seb
# Directory that holds the input CSV and receives every generated file.
dpath = "F:/"

# Load the daily bike-sharing records.
# For a quick look, call data.head(), data.info() or data.describe() here.
data = pd.read_csv(dpath + "day.csv")

# Categorical columns; each one is reported and then one-hot encoded.
cate_features = ["season", "weathersit", "weekday"]
for col in cate_features:
    print("%s属性的不同取值和次数:" % col)
    print(data[col].value_counts())
    # Cast to object so get_dummies treats the integer codes as categories
    # instead of passing them through as numbers.
    data[col] = data[col].astype('object')

# One-hot encode the three categorical columns.
x_train_cat = pd.get_dummies(data[cate_features])

# Keep a copy of the encoded columns on disk for inspection.
x_train_cat.to_csv(dpath + "meng.csv")
print(x_train_cat.head())
# Numeric features.
# A MinMaxScaler-based normalisation of the same three columns was tried
# before; the polynomial expansion below is what the pipeline uses now.
########################
from sklearn.preprocessing import PolynomialFeatures

numerical_features = ["temp", "hum", "windspeed"]

# Expand the numeric columns into all polynomial terms up to degree 4,
# without the constant (bias) column.
poly = PolynomialFeatures(degree=4, include_bias=False, interaction_only=False)
X_ploly = poly.fit_transform(data[numerical_features])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method on older releases.
# Passing the source column names gives readable headers in both cases.
if hasattr(poly, "get_feature_names_out"):
    poly_columns = list(poly.get_feature_names_out(numerical_features))
else:
    poly_columns = list(poly.get_feature_names(numerical_features))

# Reuse data's index so the later column-wise concat lines up row by row.
X_ploly_df = pd.DataFrame(X_ploly, columns=poly_columns, index=data.index)
print(X_ploly_df)
########################
# Join the one-hot columns, the polynomial terms and the two binary flags
# (holiday, workingday) side by side into the model input table.
feature_parts = [x_train_cat, X_ploly_df, data['holiday'], data['workingday']]
x_train = pd.concat(feature_parts, axis=1, ignore_index=False)
df = pd.DataFrame(x_train)
df.to_csv("F:/meng2.csv")

# Put the record id in front and the year flag plus the target behind,
# then save the full table without the index column.
final_parts = [data['instant'], x_train, data['yr'], data['cnt']]
final_train = pd.concat(final_parts, axis=1, ignore_index=False)
df = pd.DataFrame(final_train)
df.to_csv(dpath + "final.csv", index=False)
final_train.head()  # value is discarded when this runs as a plain script
# Reload the engineered feature table from disk.
tz_data = pd.read_csv(dpath + "final.csv")

# Training set: rows with yr == 0. The record id and the year flag are
# bookkeeping columns, not model inputs, so they are dropped.
is_train_row = tz_data['yr'] == 0
train = tz_data[is_train_row].drop(columns=['instant', 'yr'])
print("train(训练):" + str(train.shape))

# Test set: rows with yr == 1.
is_test_row = tz_data['yr'] == 1
test = tz_data[is_test_row]
# Keep the record ids and the raw counts before the columns are dropped;
# they are used again when the results are plotted and saved.
testID = test['instant']
testCNT = test['cnt']
test = test.drop(columns=['instant', 'yr'])
print("test(测试):" + str(test.shape))
print(test.head())

# Separate target and inputs for the training rows.
y_train = train['cnt']
X_train = train.drop(columns=['cnt'])

# Same for the test rows; y_test_real keeps the unscaled counts.
y_test_real = test['cnt']
y_test = test['cnt']
X_test = test.drop(columns=['cnt'])

print(X_train.shape)
print(X_test.shape)
# Standardise the features and the target.
from sklearn import preprocessing

# Fit the scaler on the training rows only and reuse its mean/std for the
# test rows, so both sets live on the same scale. Scaling the test set with
# its own statistics would feed the model differently scaled inputs than
# the ones it was trained on.
feature_scaler = preprocessing.StandardScaler()
X_train = feature_scaler.fit_transform(X_train)
X_test = feature_scaler.transform(X_test)

# The target is standardised with the training mean and standard deviation.
mean_y = y_train.mean()
print("train_mean_y = " ,mean_y)
std_y = y_train.std()
print("train_std_y = ",std_y)
y_train = (y_train - mean_y) / std_y
y_test = (y_test - mean_y) / std_y
print(y_train.head())
print(y_test.head())

# NOTE(review): mean_diff is the mean of the standardised *test* target and
# is added to the test predictions further down. It is computed from test
# labels, so the reported test scores are optimistic; kept here because the
# later sections read this name.
mean_test_y = y_test.mean()
mean_diff = mean_test_y
print("标准化后的均值为",mean_diff)
# Ridge regression with the penalty strength chosen by cross-validation.
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Candidate penalties must be strictly positive: a negative or zero alpha is
# not a valid ridge penalty and current scikit-learn rejects it. The author's
# earlier searches found the optimum near 0.14, which this grid covers
# without storing errors for 200,000 candidates.
alphas = np.arange(0.01, 10.0, 0.01)

# store_cv_values was renamed to store_cv_results in scikit-learn 1.5;
# try the new keyword first and fall back on older releases.
try:
    ridge = RidgeCV(alphas=alphas, store_cv_results=True)
except TypeError:
    ridge = RidgeCV(alphas=alphas, store_cv_values=True)

# Fit the model and report the selected penalty.
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("最佳的alpha = ",alpha)

# Cross-validation error: average the stored errors over the samples
# (axis 0), leaving one value per candidate alpha.
if hasattr(ridge, "cv_results_"):
    cv_errors = ridge.cv_results_
else:
    cv_errors = ridge.cv_values_
mse_cv = np.mean(cv_errors, axis=0)
rmse = np.sqrt(mse_cv)
print("cv of rmse",min(rmse))

# Training error.
y_train_pred = ridge.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train,y_train_pred))
print("训练集rmse",rmse_train)

# Test error.
# NOTE(review): mean_diff comes from the test labels, so adding it to the
# predictions leaks label information into the test metrics. Left unchanged
# here; confirm whether this offset is intended.
y_test_pred = ridge.predict(X_test)
y_test_pred = y_test_pred + mean_diff
rmse_test = np.sqrt(mean_squared_error(y_test,y_test_pred))
print("测试集rmse",rmse_test)

r2_score_train = r2_score(y_train,y_train_pred)
r2_score_test = r2_score(y_test,y_test_pred)
print("score of r2 train is ",r2_score_train)
print("score of r2 test is ", r2_score_test)

# Plot the cross-validation MSE against the candidate penalties.
fig = plt.figure(figsize=(10,5))
mse_mean = mse_cv
plt.plot(alphas,mse_mean)
plt.xlabel("alphas")
plt.ylabel("mse")
plt.show()
# Lasso regression; LassoCV selects the penalty by cross-validation over
# its own alpha path.
from sklearn.linear_model import LassoCV

lasso = LassoCV()
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print("最佳alpha_: ",alpha)

# Average the error path along axis 1 so there is one MSE value per entry
# of lasso.alphas_, then plot it against log10(alpha).
mses = lasso.mse_path_.mean(axis=1)
fig = plt.figure(figsize=(10, 5))
log_alphas = np.log10(lasso.alphas_)
plt.plot(log_alphas, mses)
plt.xlabel("log(alpha)")
plt.ylabel("mse")
plt.show()

# Training error.
y_train_pred = lasso.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(train_mse)
print("训练集rmse:",rmse_train)

# Test error. The predictions are shifted by mean_diff before scoring.
# NOTE(review): mean_diff is derived from the test target — confirm that
# this offset is intended.
y_test_pred = lasso.predict(X_test) + mean_diff
test_mse = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(test_mse)
print("测试集rmse:",rmse_test)

# R^2 on both sets.
r2_score_train = r2_score(y_train, y_train_pred)
r2_score_test = r2_score(y_test, y_test_pred)
print("score of r2 train " ,r2_score_train)
print("score of r2 test " ,r2_score_test)
# Show and save the Lasso predictions on the original count scale.
# Predictions are made in standardised units, shifted by mean_diff, and
# then mapped back with the training standard deviation and mean.
standardised_pred = lasso.predict(X_test) + mean_diff
y_test_pred = standardised_pred * std_y + mean_y

# Predicted vs. real counts, plotted against the record id.
fig = plt.figure()
curves = (
    (y_test_pred, "red", "pred"),
    (y_test_real, "blue", "real value"),
)
for series, colour, tag in curves:
    plt.plot(testID, series, c=colour, label=tag)
plt.xlabel("instant")
plt.ylabel("count")
plt.legend(loc="best")
plt.show()

# Save id, real count and predicted count, then print the frame summary.
result_columns = {"instant": testID, "cnt": y_test_real, 'pre_cnt': y_test_pred}
df = pd.DataFrame(result_columns)
df.to_csv(dpath + 'result.csv')
df.info()
