sklearn

sklearn 机器学习（重点）

调用train_test_split()——切分训练集、验证集、测试集
模型初始化（针对模型各种参数的调整都是在这一步）
调用fit()——训练样本
调用score()——计算平均准确率。
结果展示

高考数据——逻辑回归实例（完整）

读入数据

# Load the libraries used throughout the notebook
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Display the value of every expression statement in a cell, not only the last one;
## the setting applies to the whole notebook from this point on
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"    

## Switch the working directory to the data folder and read the raw gaokao CSV
## (the path is specific to the author's machine)
os.chdir("/Users/mac/Desktop/快乐研一/python/data")
gaokao = pd.read_csv("高考待预处理.csv")
gaokao.head()

	大学名称	类型	重点学科	博士点	分数线
0	北京大学	综合	81.0	201.0	694.0
1	北京大学	综合	81.0	201.0	694.0
2	北京大学	综合	81.0	201.0	694.0
3	复旦大学	综合	30.0	178.0	687.0
4	清华大学	综合	37.0	253.0	686.0

处理缺失重复

# Inspect the raw data: missing-value count per column, then the rows that are
# exact duplicates of an earlier row
gaokao.isnull().sum(axis=0)
gaokao[gaokao.duplicated()]

# Clean up.
# 1) Keep only the first row for each university name.
gaokao = gaokao.drop_duplicates(subset=['大学名称'])
# 2) Drop every row that contains at least one missing value.
#    Only `how` is passed: pandas >= 2.0 raises TypeError when `how` and `thresh`
#    are both supplied, even if `thresh` is None. Rebinding the name (instead of
#    inplace=True) gives the same result without mutating a derived frame.
gaokao = gaokao.dropna(axis=0, how='any')

# Re-check: both outputs should now be empty / all zeros
gaokao.isnull().sum(axis=0)
gaokao[gaokao.duplicated()]

大学名称    2
类型      2
重点学科    4
博士点     4
分数线     4
dtype: int64

	大学名称	类型	重点学科	博士点	分数线
1	北京大学	综合	81.0	201.0	694.0
2	北京大学	综合	81.0	201.0	694.0
6	上海交通大学	综合	9.0	78.0	664.0
24	NaN	NaN	NaN	NaN	NaN

大学名称    0
类型      0
重点学科    0
博士点     0
分数线     0
dtype: int64

	大学名称	类型	重点学科	博士点	分数线

处理str变量

# Bin the two numeric columns. With an integer bin count, pd.cut splits the
# observed value range into equal-width intervals, so the score-line labels
# name the lower / upper half of the range.
gaokao['重点学科（分类）'] = pd.cut(gaokao['重点学科'], bins=3, labels=["LOW", "MED", "TOP"])
gaokao['分数线（高低）'] = pd.cut(gaokao['分数线'], bins=2, labels=["低于平均", "高于平均"])

# One-hot encode the university type (columns prefixed "type_") and the binned
# score line (columns prefixed "分数线_"), then append both to the frame.
leixing = pd.get_dummies(gaokao['类型'], prefix="type")
fenshu = pd.get_dummies(gaokao['分数线（高低）'], prefix="分数线")
gaokao = gaokao.join(leixing).join(fenshu)
gaokao.head()

# Show the dtype of every column
gaokao.dtypes

	大学名称	类型	重点学科	博士点	分数线	重点学科（分类）	分数线（高低）	type_综合	分数线_高于平均
0	北京大学	综合	81.0	201.0	694.0	TOP	高于平均	1	1
3	复旦大学	综合	30.0	178.0	687.0	MED	高于平均	1	1
4	清华大学	综合	37.0	253.0	686.0	MED	高于平均	1	1
5	上海交通大学	综合	9.0	78.0	664.0	LOW	高于平均	1	1
7	南京大学	综合	21.0	44.0	662.0	LOW	高于平均	1	1

大学名称          object
类型            object
重点学科         float64
博士点          float64
分数线          float64
重点学科（分类）    category
分数线（高低）     category
type_农业        uint8
type_医药        uint8
type_工科        uint8
type_师范        uint8
type_林业        uint8
type_民族        uint8
type_综合        uint8
type_财经        uint8
分数线_低于平均       uint8
分数线_高于平均       uint8
dtype: object

划分训练集和测试集，给出X和Y

# Import the helper that splits data into a training and a test subset
from sklearn.model_selection import train_test_split

# Predictors are selected by name rather than by position, so that adding or
# reordering columns upstream cannot silently change the feature set
# (a missing column now fails loudly with KeyError).
feature_cols = ['重点学科', '博士点', 'type_农业', 'type_医药', 'type_工科', 'type_师范']

# Target: the 0/1 dummy marking the upper score-line bin. 33% of the rows are
# held out for testing; random_state fixes the shuffle for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    gaokao[feature_cols], gaokao['分数线_高于平均'], test_size=0.33, random_state=0)

# Report the number of samples in each subset
print('训练集维度: {}, 测试集维度：{} \n'.format(y_train.shape, y_test.shape))

训练集维度: (85,), 测试集维度：(43,)

标准化（这里暂时不进行）

建立logistic model

multi_class:分类方式选择参数，有"ovr(默认)"和"multinomial"两个值可选择，在二元逻辑回归中无区别
cv:几折交叉验证
solver:优化算法选择参数，当penalty为"l1"时，参数只能是"liblinear(坐标轴下降法)"
"lbfgs"和"newton-cg"都是关于目标函数的二阶泰勒展开
当penalty为"l2"时，参数可以是"lbfgs(拟牛顿法)","newton-cg(牛顿法变种)","sag(mini-batch随机平均梯度下降)"
维度<10000时，选择"lbfgs"法，维度>10000时，选择"newton-cg"法比较好，显卡计算的时候，"lbfgs"和"newton-cg"都比"sag"快
penalty:正则化选择参数，用于解决过拟合，可选"l1","l2"
tol:当目标函数下降到该值时就停止，叫：容忍度，防止计算过多

# Build and train the model
from sklearn.linear_model import LogisticRegression  # import the estimator

# The solver is pinned explicitly: the output recorded in this notebook was
# produced with solver='liblinear', but scikit-learn >= 0.22 defaults to
# 'lbfgs', which can give different coefficients (and convergence warnings)
# on these unscaled features.
logreg = LogisticRegression(penalty="l2", solver="liblinear")  # model initialisation
logreg.fit(x_train, y_train)   # fit on the training samples

# Show the feature names next to the fitted coefficients (rounded to 2 decimals)
x_train.columns.values
np.around(logreg.coef_, 2)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)






array(['重点学科', '博士点', 'type_农业', 'type_医药', 'type_工科', 'type_师范'],
      dtype=object)






array([[-0.08,  0.34,  0.07, -0.59, -0.25, -0.47]])

预测结果

# Predicted class labels for the held-out samples
y_pred = logreg.predict(x_test)

# Mean accuracy, printed for the training split first and the test split second
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(logreg.score(features, labels))

# Confusion matrix of true vs. predicted labels on the test split
from sklearn.metrics import confusion_matrix
c_m = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(c_m)

0.9529411764705882
0.9534883720930233
[[15  2]
 [ 0 26]]

# ROC curve
import matplotlib.pyplot as plt   # plotting
from sklearn.metrics import roc_auc_score, roc_curve  # ROC / AUC helpers

# Predicted class probabilities; keep the second column as the score for ROC
y_pred2 = logreg.predict_proba(x_test)
y_0 = y_pred2[:, 1].tolist()

# False/true positive rates with their decision thresholds, then the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_0)
auc = roc_auc_score(y_test, y_0)
print('auc值为：', auc)

# Draw the curve
plt.figure()
plt.plot(fpr, tpr)
plt.title('$ROC—curve$')
plt.show()

auc值为： 0.9683257918552036





<Figure size 432x288 with 0 Axes>






[<matplotlib.lines.Line2D at 0x1a244dd470>]






Text(0.5,1,'$ROC—curve$')

高考数据——线性回归

# Linear regression on the raw score line.
# Split into training and test sets; the target is the continuous '分数线' column.
import os  # local import: used below to check that the font file exists

gaokao.columns
# Predictors are selected by name rather than by position, so that adding or
# reordering columns upstream cannot silently change the feature set.
feature_cols = ['重点学科', '博士点', 'type_农业', 'type_医药', 'type_工科', 'type_师范']
x_train, x_test, y_train, y_test = train_test_split(
    gaokao[feature_cols], gaokao['分数线'], test_size=0.33, random_state=0)

# Model
from sklearn import linear_model
linear_reg = linear_model.LinearRegression()
# Fit on the training samples
linear_reg.fit(x_train, y_train)

# Show the feature names next to the fitted coefficients (rounded to 2 decimals)
x_train.columns.values
np.around(linear_reg.coef_, 2)

# R squared on the test split; the predictions are computed once and reused for the plot
from sklearn.metrics import mean_squared_error , r2_score
y_hat = linear_reg.predict(x_test)
print('R2: %.3f' % r2_score(y_test, y_hat))

# Visualisation
# Font for the Chinese axis labels. The path is specific to the author's machine,
# so fall back to matplotlib's default font when the file is absent instead of
# failing when the figure is drawn.
import matplotlib.font_manager as mfm
font_path = r"/Users/mac/Library/Fonts/字体管家方萌简（非商业使用）v1.1.ttf"
prop = mfm.FontProperties(fname=font_path) if os.path.exists(font_path) else mfm.FontProperties()
# Canvas
fig = plt.figure(figsize=(8,6))   # 8 x 6 inch canvas
ax1 = fig.add_subplot(111)        # single subplot
# Scatter of true values (x axis) against predicted values (y axis)
ax1.set_xlabel('真实值', fontproperties=prop, fontsize=25)
ax1.set_ylabel('预测值', fontproperties=prop, fontsize=25)
ax1.scatter(y_test, y_hat, color='blue')
# show
plt.show()
plt.close()  # the figure stays in memory after plt.show(); close it explicitly in Jupyter

Index(['大学名称', '类型', '重点学科', '博士点', '分数线', '重点学科（分类）', '分数线（高低）', 'type_农业',
       'type_医药', 'type_工科', 'type_师范', 'type_林业', 'type_民族', 'type_综合',
       'type_财经', '分数线_低于平均', '分数线_高于平均'],
      dtype='object')






LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)






array(['重点学科', '博士点', 'type_农业', 'type_医药', 'type_工科', 'type_师范'],
      dtype=object)






array([  2.06,   0.47,  23.7 , -23.  ,  13.  ,  28.82])



R2: 0.447





Text(0.5,0,'真实值')






Text(0,0.5,'预测值')






<matplotlib.collections.PathCollection at 0x1a257a4fd0>

数据分析实例——高考数据

sklearn

高考数据——逻辑回归实例（完整）

读入数据

处理缺失重复

处理str变量

划分训练集和测试集，给出X和Y

标准化（这里暂时不进行）

建立logistic model

预测结果

高考数据——线性回归

感谢您的支持，我会继续努力的!