岭回归

Reads: 756 Edit

1 数据说明

我们仍以 sklearn 自带的 boston 数据集为例,预测波士顿地区的房价(注意:load_boston 已在 scikit-learn 1.2 中移除,运行本文代码需使用更早的版本)。

2 导入所需模块

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV

3 读取数据并划分为训练样本和测试样本

# Load the Boston housing data once and reuse the returned Bunch for the
# feature matrix, the target vector and (later) the feature names.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this script requires scikit-learn < 1.2.
boston = datasets.load_boston()
X, y = boston.data, boston.target
# Split the data into a training sample and a test sample.
# NOTE(review): test_size=0.7 holds out 70% of the rows for testing and trains
# on only 30% -- confirm this is intended rather than test_size=0.3.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.7)

4 模型训练

# Train the model: ridge regression with regularization strength alpha=5,
# fitted on the training sample.
lr_ridge = Ridge(alpha=5)
lr_ridge.fit(X_train, y_train)

5 模型评价

# Collect the fitted coefficients into a one-column table indexed by
# feature name, then print it.
feature_names = boston.feature_names
coefs = pd.DataFrame({"系数": lr_ridge.coef_}, index=feature_names)
print("模型的系数:")
print(coefs)

pyt-120

# Feature-importance plot: each coefficient is multiplied by the standard
# deviation of its feature in the training sample, so the bars are on a
# comparable scale across features.
# Configure fonts before any plotting call.
plt.rcParams['font.sans-serif'] = ['SimHei']        # font with CJK glyphs, so Chinese labels are not garbled
plt.rc('axes', unicode_minus=False)                 # avoid garbled minus signs on the axis ticks
coefs_std = pd.DataFrame(
    lr_ridge.coef_ * X_train.std(axis=0),
    columns=["变量重要性"],
    index=feature_names,
)
coefs_std.plot(kind="barh", figsize=(9, 7))
plt.xlabel("标准化系数")
# The fitted model is Ridge, so the title says "ridge regression"
# (it previously read "ordinary linear regression").
plt.title("岭回归")
plt.axvline(x=0, color=".5")                        # grey reference line at zero
plt.subplots_adjust(left=0.3)                       # leave room for the feature-name labels
plt.show()

pyt-121

# Predict on the training sample, then report its mean squared error and
# the coefficient of determination returned by the model's score().
y_train_pred = lr_ridge.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = lr_ridge.score(X_train, y_train)
print("训练样本集均方误差: %.2f" % train_mse)
print("训练样本集可决系数: %.2f" % train_r2)

训练样本集均方误差: 21.72
训练样本集可决系数: 0.78

# Same two metrics, this time on the held-out test sample.
y_test_pred = lr_ridge.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = lr_ridge.score(X_test, y_test)
print("测试样本集均方误差: %.2f" % test_mse)
print("测试样本集可决系数: %.2f" % test_r2)

测试样本集均方误差: 25.60
测试样本集可决系数: 0.67

# Scatter plot of predicted values against actual values on the test sample.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_test, y_test_pred)
# Derive one common range from the data instead of a hard-coded [0, 27],
# which hid every point whose actual or predicted value exceeded 27.
axis_min = min(0.0, float(np.min(y_test)), float(np.min(y_test_pred)))
axis_max = 1.05 * max(float(np.max(y_test)), float(np.max(y_test_pred)))
# Draw the y = x reference line in data coordinates so it stays correct
# regardless of the axis limits.
ax.plot([axis_min, axis_max], [axis_min, axis_max], ls="--", c="red")
# The fitted model is Ridge, so the title says "ridge regression"
# (it previously read "ordinary linear regression").
ax.set_title("岭回归")
ax.set_ylabel("预测值")
ax.set_xlabel("真实值")
ax.set_xlim([axis_min, axis_max])
ax.set_ylim([axis_min, axis_max])
plt.show()

pyt-122

6 模型选择-alpha参数选择

# RidgeCV model: choose alpha by cross-validation over 10 log-spaced
# candidates between 10**1 and 10**2, fitted on the training sample.
# NOTE(review): this grid does not include the alpha=5 used for the
# fixed-alpha model; if the selected alpha_ is the smallest candidate,
# consider extending the grid downward.
alpha_grid = np.logspace(1, 2, 10)
lr_ridgeCV = RidgeCV(alphas=alpha_grid)
lr_ridgeCV.fit(X_train, y_train)

print("alphas=", lr_ridgeCV.alpha_)

alphas= 10.0

# Predictions of the cross-validated model on both samples.
y_train_pred_CV = lr_ridgeCV.predict(X_train)
y_test_pred_CV = lr_ridgeCV.predict(X_test)

# Training-sample mean squared error and coefficient of determination.
cv_train_mse = mean_squared_error(y_train, y_train_pred_CV)
cv_train_r2 = lr_ridgeCV.score(X_train, y_train)
print("训练样本集均方误差: %.2f" % cv_train_mse)
print("训练样本集可决系数: %.2f" % cv_train_r2)

训练样本集均方误差: 22.29
训练样本集可决系数: 0.78

# Test-sample mean squared error and coefficient of determination for the
# cross-validated model.
cv_test_mse = mean_squared_error(y_test, y_test_pred_CV)
cv_test_r2 = lr_ridgeCV.score(X_test, y_test)
print("测试样本集均方误差: %.2f" % cv_test_mse)
print("测试样本集可决系数: %.2f" % cv_test_r2)

测试样本集均方误差: 25.28
测试样本集可决系数: 0.68



获取案例数据和源代码,请关注微信公众号并回复:Python_dt16


Comments

Make a comment