在sklearn中,管道(Pipeline)可以把数据预处理、特征变换和模型训练等步骤像流水线一样依次串联起来,完成机器学习任务。
1 数据说明
我们以OpenML中的工人工资数据集(data_id=534)为例来演示管道的使用。
2 导入所需模块
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor
from sklearn.decomposition import PCA
import scipy as sp
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
3 获取数据并划分为训练样本和测试样本
# Fetch the survey dataset (OpenML data_id=534) as pandas objects and
# separate the feature columns from the target.
survey = fetch_openml(data_id=534, as_frame=True)
X = survey.data[survey.feature_names]
y = survey.target.values.ravel()

# Hold out 30% of the rows as the test sample; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Optional: keep a local CSV copy of the full data set, with the target
# stored as the first column ('WAGE') and no index column written.
data = X.copy()
data.insert(loc=0, column='WAGE', value=y)
data.to_csv('./survey.csv', index=False)
4 描述性分析
# Descriptive analysis: pairwise plots of the training sample, with the
# target (WAGE) placed in the first column. Off-diagonal panels show
# regression fits (kind="reg"); the diagonal shows kernel density
# estimates (diag_kind="kde").
train_dataset = X_train.copy()
train_dataset.insert(loc=0, column="WAGE", value=y_train)
_ = sns.pairplot(data=train_dataset, kind="reg", diag_kind="kde")
plt.show()
5 使用管道
# Columns that are one-hot encoded. With drop="if_binary", a column that
# has exactly two categories is reduced to a single indicator column.
categorical_columns = ["RACE", "OCCUPATION", "SECTOR", "MARR", "UNION", "SEX", "SOUTH"]

# Column-wise preprocessing: encode the categorical columns, standardise
# EDUCATION, and pass every remaining column through unchanged.
colTrans = make_column_transformer(
    (OneHotEncoder(drop="if_binary"), categorical_columns),
    (StandardScaler(), ['EDUCATION']),
    remainder="passthrough",
    verbose_feature_names_out=False,  # do not prefix feature names with the transformer name
)

# Ridge regression fitted on log10(target); predictions are mapped back
# to the original scale with 10**x.
log_target_ridge = TransformedTargetRegressor(
    regressor=Ridge(alpha=1e-10),
    func=np.log10,
    inverse_func=sp.special.exp10,
)

# Pipeline: preprocessing -> PCA -> regressor. make_pipeline derives the
# step names from the lowercased class names.
model = make_pipeline(
    colTrans,
    PCA(n_components='mle', svd_solver='full'),
    log_target_ridge,
)

# Fit every step of the pipeline on the training sample.
model.fit(X_train, y_train)

# Show the pipeline itself and the feature names after the first step.
print('管道的图形:', '\n', model)
print('管道第一步中的特征变量:', '\n', model[0].get_feature_names_out())
管道第一步中的特征变量:
['RACE_Hispanic' 'RACE_Other' 'RACE_White' 'OCCUPATION_Clerical'
'OCCUPATION_Management' 'OCCUPATION_Other' 'OCCUPATION_Professional'
'OCCUPATION_Sales' 'OCCUPATION_Service' 'SECTOR_Construction'
'SECTOR_Manufacturing' 'SECTOR_Other' 'MARR_Unmarried' 'UNION_not_member'
'SEX_male' 'SOUTH_yes' 'EDUCATION' 'EXPERIENCE' 'AGE']
# Feature names produced by the second pipeline step (PCA).
pca_feature_names = model[1].get_feature_names_out()
print('管道第二步中的特征变量:', '\n', pca_feature_names)
管道第二步中的特征变量:
['pca0' 'pca1' 'pca2' 'pca3' 'pca4' 'pca5' 'pca6' 'pca7' 'pca8' 'pca9'
'pca10' 'pca11' 'pca12' 'pca13' 'pca14' 'pca15']
# Regression coefficients of the fitted inner regressor, labelled with
# the feature names emitted by all steps before the final one.
feature_names = model[:-1].get_feature_names_out()
fitted_regressor = model[-1].regressor_
coefs = pd.DataFrame(
    data=fitted_regressor.coef_,
    index=feature_names,
    columns=["Coefficients"],
)
print(coefs)
# Model evaluation: predictions for both samples, then the pipeline's
# score on each (reported below as the coefficient of determination).
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("训练样本集可决系数: %.2f" % train_score)
print("测试样本集可决系数: %.2f" % test_score)
训练样本集可决系数: 0.27
测试样本集可决系数: 0.30
6 网格搜索法中的管道
# Model selection with grid search over the whole pipeline.
#
# Shuffle the rows before cross-validation. Per the original note, without
# shuffling the categories seen by the encoder in a training fold may not
# match those in the corresponding validation fold. The shuffle is seeded
# so the search is reproducible, matching the seeded train/test split.
data = shuffle(data, random_state=42)

# Select target and features by column name rather than by position, so
# the code does not depend on the exact number of feature columns.
y1 = data["WAGE"]
X1 = data.drop(columns="WAGE")

# Nested parameters are addressed as "<step name>__<parameter name>".
param_grid = {
    "pca__n_components": [5, 10, 15],
    "transformedtargetregressor__regressor__alpha": [0.1, 1, 10, 100],
}
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5).fit(X1, y1)
print('gridsearch选择的参数为:', grid_search.best_params_)
# NOTE(review): best_score_ is the mean cross-validated score of the best
# parameter combination, not a training-set R^2 as the printed label says.
print("训练样本集可决系数: %.2f" % grid_search.best_score_)
gridsearch选择的参数为: {'pca__n_components': 15, 'transformedtargetregressor__regressor__alpha': 10}
训练样本集可决系数: 0.25