使用管道

Reads: 679 Edit

在sklearn中,管道(Pipeline)可以把多个数据处理步骤和最终的模型串联起来,让数据像在流水线上一样依次经过各个步骤,从而完成机器学习的任务。

1 数据说明

我们以OpenML中的工人工资数据集(data_id=534)为例来演示管道的使用。

2 导入所需模块

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.compose import TransformedTargetRegressor
from sklearn.decomposition import PCA
import scipy as sp
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

3 获取数据并划分为训练样本和测试样本

# Fetch the wages dataset (OpenML data_id=534) as pandas objects.
survey = fetch_openml(data_id=534, as_frame=True)
X = survey.data[survey.feature_names]
y = survey.target.values.ravel()

# Optionally persist the data to a local CSV file: the target is stored as
# the first column ("WAGE"), followed by the feature columns.
data = X.copy()
data.insert(0, 'WAGE', y)
# `index` is a boolean flag; False means the row index is not written.
data.to_csv('./survey.csv', index=False)

pyt-151

# Hold out 30% of the rows as the test sample; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

4 描述性分析

# Descriptive analysis: pairwise plots of the target against the features
# (regression fits off the diagonal, KDE curves on the diagonal).
train_dataset = X_train.copy()
train_dataset.insert(loc=0, column="WAGE", value=y_train)  # target goes first
_ = sns.pairplot(data=train_dataset, kind="reg", diag_kind="kde")
plt.show()

pyt-152

5 使用管道

# Column-wise preprocessing:
#   - the categorical columns are one-hot encoded (drop="if_binary"),
#   - EDUCATION is standardised,
#   - every remaining column is passed through unchanged.
categorical_columns = ["RACE", "OCCUPATION", "SECTOR", "MARR", "UNION", "SEX", "SOUTH"]
scaled_columns = ['EDUCATION']
colTrans = make_column_transformer(
    (OneHotEncoder(drop="if_binary"), categorical_columns),
    (StandardScaler(), scaled_columns),
    remainder="passthrough",
    verbose_feature_names_out=False,  # do not prefix names with the transformer name
)

# Assemble the pipeline: column preprocessing -> PCA -> ridge regression
# fitted on the log10-transformed target.
pca_step = PCA(n_components='mle', svd_solver='full')
log_target_ridge = TransformedTargetRegressor(
    regressor=Ridge(alpha=1e-10),
    func=np.log10,                  # applied to the target before fitting
    inverse_func=sp.special.exp10,  # maps predictions back to the original scale
)
# make_pipeline derives step names from the class names; the grid-search
# parameter keys used later in this script depend on those names.
model = make_pipeline(colTrans, pca_step, log_target_ridge)

# Fit every step of the pipeline on the training sample.
model.fit(X=X_train, y=y_train)

# Show the structure of the fitted pipeline.
print('管道的图形:', '\n', model)

pyt-153

# Feature names produced by the first pipeline step (the column transformer).
first_step_features = model[0].get_feature_names_out()
print('管道第一步中的特征变量:', '\n', first_step_features)

管道第一步中的特征变量:
['RACE_Hispanic' 'RACE_Other' 'RACE_White' 'OCCUPATION_Clerical' 'OCCUPATION_Management' 'OCCUPATION_Other' 'OCCUPATION_Professional' 'OCCUPATION_Sales' 'OCCUPATION_Service' 'SECTOR_Construction' 'SECTOR_Manufacturing' 'SECTOR_Other' 'MARR_Unmarried' 'UNION_not_member' 'SEX_male' 'SOUTH_yes' 'EDUCATION' 'EXPERIENCE' 'AGE']

# Feature names produced by the second pipeline step (PCA).
second_step_features = model[1].get_feature_names_out()
print('管道第二步中的特征变量:', '\n', second_step_features)

管道第二步中的特征变量:
['pca0' 'pca1' 'pca2' 'pca3' 'pca4' 'pca5' 'pca6' 'pca7' 'pca8' 'pca9' 'pca10' 'pca11' 'pca12' 'pca13' 'pca14' 'pca15']

# Regression coefficients of the fitted regressor, indexed by the feature
# names that the preceding pipeline steps output.
feature_names = model[:-1].get_feature_names_out()
fitted_regressor = model[-1].regressor_
coefs = pd.DataFrame(
    {"Coefficients": fitted_regressor.coef_},
    index=feature_names,
)
print(coefs)

pyt-154

# Model evaluation: predictions and scores on both samples.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print("训练样本集可决系数: %.2f" % train_score)
print("测试样本集可决系数: %.2f" % test_score)

训练样本集可决系数: 0.27
测试样本集可决系数: 0.30

6 网格搜索法中的管道

# Model selection via grid search.
# Shuffle the rows first: per the original author's note, this is needed for
# the categorical-variable encoding, otherwise the training and test samples
# can end up inconsistent.
# A fixed seed (same value as the train/test split) makes the grid-search
# result reproducible from run to run.
data = shuffle(data, random_state=42)
y1 = data.iloc[:, 0]   # first column is the target ("WAGE")
X1 = data.iloc[:, 1:]  # every remaining column is a feature

# Parameters of pipeline steps are addressed as "<step name>__<parameter name>".
param_grid = {
    "pca__n_components": [5, 10, 15],
    "transformedtargetregressor__regressor__alpha": [0.1, 1, 10, 100],
}
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5)
grid_search.fit(X1, y1)

print('gridsearch选择的参数为:', grid_search.best_params_)
# NOTE(review): best_score_ is the mean cross-validated score of the best
# parameter combination, not a score on the training sample as the label says.
print("训练样本集可决系数: %.2f" % grid_search.best_score_)

gridsearch选择的参数为: {'pca__n_components': 15, 'transformedtargetregressor__regressor__alpha': 10}
训练样本集可决系数: 0.25



获取案例数据和源代码,请关注微信公众号并回复:Python_dt28


Comments

Make a comment