Lasso回归

Reads: 2351 Edit

1 数据说明

我们仍以sklearn自带的数据集boston为例来预测波士顿房价的变动。Lasso回归和岭回归的用法几乎一致。

2 导入所需模块

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV

3 读取数据并划分为训练样本和测试样本

# Load the Boston housing data once and take the feature matrix and the
# target vector from the returned Bunch (avoids a second load_boston call).
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this script needs an older scikit-learn release.
boston = datasets.load_boston()
X, y = boston.data, boston.target
# Split into training and test samples with a fixed seed for repeatability.
# NOTE(review): test_size=0.7 holds out 70% of the rows for testing, leaving
# only 30% for training - confirm this is intended rather than 0.3.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.7)

4 模型训练

# Build the Lasso estimator with a fixed penalty strength, then fit it on
# the training sample (fit() returns the estimator itself).
lr_lasso = Lasso(alpha=0.5)
lr_lasso.fit(X_train, y_train)

5 模型评价

# Tabulate the fitted coefficients, one row per feature name.
feature_names = boston.feature_names
coefs = pd.DataFrame(data=lr_lasso.coef_, index=feature_names, columns=["系数"])
# Header line followed by the coefficient table.
print("模型的系数:", coefs, sep="\n")

pyt-123

# Feature-importance chart: each coefficient is multiplied by the standard
# deviation of its feature in the training sample so the bars are comparable.
# Font settings are applied first so every text artist created below
# (including the legend label) is built under them.
plt.rcParams['font.sans-serif'] = ['SimHei']        # font that can render the Chinese labels
plt.rc('axes', unicode_minus=False)                 # avoid garbled minus signs on the axis
coefs_std = pd.DataFrame(
    lr_lasso.coef_ * X_train.std(axis=0),
    columns=["变量重要性"],
    index=feature_names,
)
coefs_std.plot(kind="barh", figsize=(9, 7))
plt.xlabel("标准化系数")
# The fitted model is a Lasso regression; the previous title said
# "ordinary linear regression", a leftover from another example.
plt.title("Lasso回归")
plt.axvline(x=0, color=".5")                        # vertical reference line at zero
plt.subplots_adjust(left=0.3)                       # leave room for the feature names
plt.show()

pyt-124

# In-sample predictions, then the training mean squared error and R^2.
y_train_pred = lr_lasso.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = lr_lasso.score(X_train, y_train)
print("训练样本集均方误差: %.2f" % train_mse)
print("训练样本集可决系数: %.2f" % train_r2)

训练样本集均方误差: 25.25
训练样本集可决系数: 0.75

# Out-of-sample predictions, then the test mean squared error and R^2.
y_test_pred = lr_lasso.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = lr_lasso.score(X_test, y_test)
print("测试样本集均方误差: %.2f" % test_mse)
print("测试样本集可决系数: %.2f" % test_r2)

测试样本集均方误差: 25.17
测试样本集可决系数: 0.68

# Scatter the predicted values against the true test targets.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_test, y_test_pred)
# Dashed diagonal drawn in axes coordinates; it coincides with y = x
# because both axes are given the same limits below.
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red")
# The fitted model is a Lasso regression; the previous title said
# "ordinary linear regression", a leftover from another example.
ax.set_title("Lasso回归")
ax.set_ylabel("预测值")
ax.set_xlabel("真实值")
# NOTE(review): the hard-coded upper limit of 27 hides any point whose true
# or predicted value exceeds it - confirm this range suits the target.
ax.set_xlim([0, 27])
ax.set_ylim([0, 27])
plt.show()

pyt-125

6 模型选择-alpha参数选择

# Cross-validated Lasso: search 50 log-spaced penalty values in [0.1, 10]
# and report the one selected by cross-validation.
alpha_grid = np.logspace(-1, 1, 50)
lr_lassoCV = LassoCV(alphas=alpha_grid)
lr_lassoCV.fit(X_train, y_train)

print("alphas=", lr_lassoCV.alpha_)

alphas= 0.1

# Predictions of the cross-validated model on both samples.
y_train_pred_CV = lr_lassoCV.predict(X_train)
y_test_pred_CV = lr_lassoCV.predict(X_test)

# Training-sample mean squared error and R^2.
train_mse_cv = mean_squared_error(y_train, y_train_pred_CV)
train_r2_cv = lr_lassoCV.score(X_train, y_train)
print("训练样本集均方误差: %.2f" % train_mse_cv)
print("训练样本集可决系数: %.2f" % train_r2_cv)

训练样本集均方误差: 21.60
训练样本集可决系数: 0.78

# Test-sample mean squared error and R^2 for the cross-validated model.
test_mse_cv = mean_squared_error(y_test, y_test_pred_CV)
test_r2_cv = lr_lassoCV.score(X_test, y_test)
print("测试样本集均方误差: %.2f" % test_mse_cv)
print("测试样本集可决系数: %.2f" % test_r2_cv)

测试样本集均方误差: 26.05
测试样本集可决系数: 0.67



获取案例数据和源代码,请关注微信公众号并回复:Python_dt17


Comments

Make a comment