1 数据说明
我们仍以sklearn自带的boston数据集为例,来预测波士顿的房价。
2 导入所需模块
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
3 读取数据并划分为训练样本和测试样本
# Load the Boston housing data once and separate it into the feature matrix (X)
# and the target vector (y). The Bunch object is kept because its
# `feature_names` are needed later to label the coefficients.
# NOTE: `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# this snippet requires an older scikit-learn release.
boston = datasets.load_boston()
X, y = boston.data, boston.target
# Split into training and test samples. test_size=0.7 holds out 70% of the rows
# for testing; random_state=0 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.7)
4 模型训练
# Build a ridge regressor with regularisation strength alpha=5, then fit it
# on the training sample.
lr_ridge = Ridge(alpha=5)
lr_ridge.fit(X_train, y_train)
5 模型评价
# Tabulate the fitted coefficients, one row per feature, and print the table.
feature_names = boston.feature_names
coefs = pd.DataFrame({"系数": lr_ridge.coef_}, index=feature_names)
print("模型的系数:")
print(coefs)
# Variable-importance chart: each coefficient is multiplied by the standard
# deviation of its feature in the training sample, so that the bars are
# comparable across features measured on different scales.

# Configure fonts before anything is drawn so that the Chinese labels and the
# minus sign on the axis render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rc('axes', unicode_minus=False)

coefs_std = pd.DataFrame(
    lr_ridge.coef_ * X_train.std(axis=0),
    columns=["变量重要性"],
    index=feature_names,
)
coefs_std.plot(kind="barh", figsize=(9, 7))
plt.xlabel("标准化系数")
# The plotted model is the Ridge estimator, so the chart is titled accordingly
# (it was previously labelled as ordinary linear regression).
plt.title("岭回归")
# Vertical reference line at zero separates positive from negative effects.
plt.axvline(x=0, color=".5")
# Leave room on the left for the feature-name tick labels.
plt.subplots_adjust(left=0.3)
plt.show()
# Predict on the training sample, then report its mean squared error and the
# coefficient of determination (R^2) returned by the model's `score` method.
y_train_pred = lr_ridge.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = lr_ridge.score(X_train, y_train)
print("训练样本集均方误差: %.2f" % train_mse)
print("训练样本集可决系数: %.2f" % train_r2)
训练样本集均方误差: 21.72
训练样本集可决系数: 0.78
# Same evaluation on the held-out test sample: mean squared error and R^2.
y_test_pred = lr_ridge.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = lr_ridge.score(X_test, y_test)
print("测试样本集均方误差: %.2f" % test_mse)
print("测试样本集可决系数: %.2f" % test_r2)
测试样本集均方误差: 25.60
测试样本集可决系数: 0.67
# Scatter plot of predicted against actual values on the test sample. The
# dashed red diagonal marks perfect predictions (predicted == actual).
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(y_test, y_test_pred)

# Derive the axis range from the data instead of a hard-coded upper bound of
# 27, which would hide every point whose actual or predicted value exceeds it.
lower = min(0.0, float(np.min(y_test)), float(np.min(y_test_pred)))
upper = max(float(np.max(y_test)), float(np.max(y_test_pred)))
margin = 0.05 * (upper - lower)  # small headroom so extreme points are not on the frame
axis_limits = (lower, upper + margin)

# The diagonal is drawn in axes coordinates; it coincides with y = x because
# both axes are given identical limits below.
ax.plot([0, 1], [0, 1], transform=ax.transAxes, ls="--", c="red")
# The plotted model is the Ridge estimator, so the chart is titled accordingly
# (it was previously labelled as ordinary linear regression).
ax.set_title("岭回归")
ax.set_ylabel("预测值")
ax.set_xlabel("真实值")
ax.set_xlim(axis_limits)
ax.set_ylim(axis_limits)
plt.show()
6 模型选择-alpha参数选择
# Choose the regularisation strength by cross-validation: RidgeCV tries ten
# candidate alphas spaced logarithmically between 10**1 and 10**2 and keeps
# the best one in `alpha_`.
# NOTE(review): the recorded run selected alpha = 10.0, the smallest candidate,
# and the fixed model above used alpha=5, which lies outside this grid.
# Consider extending the grid towards smaller values.
candidate_alphas = np.logspace(1, 2, 10)
lr_ridgeCV = RidgeCV(alphas=candidate_alphas)
lr_ridgeCV.fit(X_train, y_train)
print("alphas=", lr_ridgeCV.alpha_)
alphas= 10.0
# Predictions of the cross-validated model on both samples, followed by the
# training-sample mean squared error and R^2.
y_train_pred_CV = lr_ridgeCV.predict(X_train)
y_test_pred_CV = lr_ridgeCV.predict(X_test)
train_mse_cv = mean_squared_error(y_train, y_train_pred_CV)
train_r2_cv = lr_ridgeCV.score(X_train, y_train)
print("训练样本集均方误差: %.2f" % train_mse_cv)
print("训练样本集可决系数: %.2f" % train_r2_cv)
训练样本集均方误差: 22.29
训练样本集可决系数: 0.78
# Test-sample mean squared error and R^2 for the cross-validated model.
test_mse_cv = mean_squared_error(y_test, y_test_pred_CV)
test_r2_cv = lr_ridgeCV.score(X_test, y_test)
print("测试样本集均方误差: %.2f" % test_mse_cv)
print("测试样本集可决系数: %.2f" % test_r2_cv)
测试样本集均方误差: 25.28
测试样本集可决系数: 0.68