任务
1、基于chip_test.csv 数据,建立逻辑回归模型(二阶边界),评估模型表现;
2、以函数的方式求解边界曲线;
3、描绘出完整的决策边界曲线;
视频资料
数据准备
数据集 chip_test.csv
链接: https://pan.baidu.com/s/1i0IxtE6rBKHIb-2kbX1NkA 提取码: 8497
#load the data
import pandas as pd
import numpy as np
# Read the chip test results; later cells use the 'test1', 'test2' and 'pass' columns.
data = pd.read_csv('chip_test.csv')
data.head()

# Boolean mask that is True for rows labelled pass == 1; ~mask selects the other rows.
mask = data['pass'] == 1
#visualize the data
%matplotlib inline
from matplotlib import pyplot as plt
# Scatter the raw measurements, one series per label value.
fig1 = plt.figure()
passed = plt.scatter(data.loc[mask, 'test1'], data.loc[mask, 'test2'])
failed = plt.scatter(data.loc[~mask, 'test1'], data.loc[~mask, 'test2'])
plt.xlabel('test1')
plt.ylabel('test2')
plt.title('test1-test2')
plt.legend([passed, failed], ['passed', 'failed'])
plt.show()

建立逻辑回归模型(二阶边界),评估模型表现
# Separate the label column from the raw feature columns.
y = data['pass']
X = data.drop(columns=['pass'])
X1 = data['test1']
X2 = data['test2']
X1.head()

# Second-order terms used to give the model a quadratic decision boundary.
X1_2 = X1 * X1
X2_2 = X2 * X2
X1_X2 = X1 * X2

# The column order here is the order in which the fitted coefficients are
# read back later (coef_[0][0] .. coef_[0][4]).
X_new = pd.DataFrame(
    {
        'X1': X1,
        'X2': X2,
        'X1_2': X1_2,
        'X2_2': X2_2,
        'X1_X2': X1_X2,
    }
)
X_new.head()

# Fit a logistic regression on the second-order features and report its
# accuracy on the same data it was trained on.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

LR2 = LogisticRegression()
LR2.fit(X_new, y)

y2_predict = LR2.predict(X_new)
accuracy2 = accuracy_score(y, y2_predict)
print(accuracy2)  # value recorded by the original author: 0.7966101694915254
# Decision boundary of the fitted model:
#   theta0 + theta1*X1 + theta2*X2 + theta3*X1^2 + theta4*X2^2 + theta5*X1*X2 = 0
# Sorting X1 lets the boundary be drawn as a line from left to right.
X1_sorted = X1.sort_values()
theta0 = LR2.intercept_
theta1, theta2, theta3, theta4, theta5 = LR2.coef_[0]

# Rearranged as a quadratic in X2:  a*X2^2 + b*X2 + c = 0
a = theta4
b = theta5 * X1_sorted + theta2
c = theta0 + theta1 * X1_sorted + theta3 * X1_sorted * X1_sorted

# Only the '+' root of the quadratic formula is taken here.
X2_new_boundary = (-b + np.sqrt(b * b - 4 * a * c)) / (2 * a)
X2_new_boundary.head()
# Draw the original samples and the boundary curve on the same figure.
fig2 = plt.figure()
passed = plt.scatter(data.loc[mask, 'test1'], data.loc[mask, 'test2'])  # rows with pass == 1
failed = plt.scatter(data.loc[~mask, 'test1'], data.loc[~mask, 'test2'])  # remaining rows
plt.plot(X1_sorted, X2_new_boundary)  # boundary curve ('+' root only)
plt.xlabel('test1')
plt.ylabel('test2')
plt.title('test1-test2')
# The legend maps each scatter handle to its label so the two series can be told apart.
plt.legend([passed, failed], ['passed', 'failed'])
# Only one of the two quadratic roots is plotted, so just one branch of the
# boundary appears (the original author reports it as the lower half).
plt.show()

以函数的方式求解边界曲线
# Boundary solver wrapped as a reusable function.
def f(x, coefs=None):
    """Return both X2 roots of the second-order decision boundary at ``x``.

    The boundary is
    ``t0 + t1*X1 + t2*X2 + t3*X1**2 + t4*X2**2 + t5*X1*X2 = 0``,
    solved for X2 with the quadratic formula using ``a = t4``,
    ``b = t5*x + t2`` and ``c = t0 + t1*x + t3*x*x``.

    Args:
        x: X1 value(s). A scalar or anything supporting element-wise
            arithmetic (NumPy array, pandas Series).
        coefs: Optional ``(t0, t1, t2, t3, t4, t5)``. When omitted, the
            module-level ``theta0`` .. ``theta5`` are used, which is the
            original behaviour.

    Returns:
        A tuple ``(plus_root, minus_root)``. Where the discriminant is
        negative the boundary has no real X2 for that ``x`` and both roots
        are NaN. ``t4`` must be non-zero, since the result is divided by
        ``2 * t4``.
    """
    if coefs is None:
        coefs = (theta0, theta1, theta2, theta3, theta4, theta5)
    t0, t1, t2, t3, t4, t5 = coefs

    a = t4
    b = t5 * x + t2
    c = t0 + t1 * x + t3 * x * x

    # Compute the square root once. A negative discriminant still yields NaN,
    # but without emitting a RuntimeWarning for every such x.
    with np.errstate(invalid='ignore'):
        root = np.sqrt(b * b - 4 * a * c)

    X2_new_boundary1 = (-b + root) / (2 * a)  # root 1 ('+')
    X2_new_boundary2 = (-b - root) / (2 * a)  # root 2 ('-')
    return X2_new_boundary1, X2_new_boundary2
# Evaluate both boundary branches at every sorted X1 sample.
X2_new_boundary1 = []
X2_new_boundary2 = []
for x in X1_sorted:
    # f returns both roots at once, so one call per sample is enough.
    b1, b2 = f(x)
    X2_new_boundary1.append(b1)
    X2_new_boundary2.append(b2)
print(X2_new_boundary1, X2_new_boundary2)

# Redraw: original samples plus both branches of the boundary on one figure.
fig3 = plt.figure()
passed = plt.scatter(data.loc[mask, 'test1'], data.loc[mask, 'test2'])  # rows with pass == 1
failed = plt.scatter(data.loc[~mask, 'test1'], data.loc[~mask, 'test2'])  # remaining rows
plt.plot(X1_sorted, X2_new_boundary1)  # boundary curve, '+' root
plt.plot(X1_sorted, X2_new_boundary2)  # boundary curve, '-' root
plt.xlabel('test1')
plt.ylabel('test2')
plt.title('test1-test2')
# The legend maps each scatter handle to its label so the two series can be told apart.
plt.legend([passed, failed], ['passed', 'failed'])
# Per the original author, this curve shows gaps because the X1 samples are
# not dense enough; a denser X1 grid is used afterwards to close them.
plt.show()

解决缺口问题
# Use a dense X1 grid instead of the raw samples: 19000 points starting at
# -0.9 with a step of 1/10000.
X1_range = -0.9 + np.arange(19000) / 10000

# f uses only element-wise arithmetic and np.sqrt, so the whole grid is
# evaluated in a single call rather than 19000 Python-level calls. Each result
# is an array aligned with X1_range (NaN where the boundary has no real X2).
X2_new_boundary1, X2_new_boundary2 = f(X1_range)
# Redraw: original samples plus both boundary branches evaluated on the dense grid.
fig4 = plt.figure()
passed = plt.scatter(data.loc[mask, 'test1'], data.loc[mask, 'test2'])  # rows with pass == 1
failed = plt.scatter(data.loc[~mask, 'test1'], data.loc[~mask, 'test2'])  # remaining rows
plt.plot(X1_range, X2_new_boundary1)  # boundary curve, '+' root
plt.plot(X1_range, X2_new_boundary2)  # boundary curve, '-' root
plt.xlabel('test1')
plt.ylabel('test2')
plt.title('test1-test2')
# The legend maps each scatter handle to its label so the two series can be told apart.
plt.legend([passed, failed], ['passed', 'failed'])
plt.show()

1640

被折叠的 条评论
为什么被折叠?



