Adaboost

Reads: 606 Edit

1 数据说明

我们以sklearn自带的鸢尾花数据集(Iris)为例来预测花的物种。

2 导入所需模块

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import zero_one_loss
import seaborn as sns

3 读取数据并划分为训练样本和测试样本

# Load the Iris dataset once and take the feature matrix and the target
# vector from the returned bunch.
iris = datasets.load_iris()
X, y = iris.data, iris.target
# Hold out half of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

4 模型训练

# Train the model.
# Weak learner for AdaBoost: a decision tree limited to depth 1.
Dtree = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)

# Maximum number of boosting rounds.
n_estimators = 50
# The weak learner is passed positionally on purpose: the keyword was
# `base_estimator` up to scikit-learn 1.1 and is `estimator` from 1.2 on
# (`base_estimator` was removed in 1.4), while the first positional
# parameter is the weak learner in every release.
# NOTE(review): recent scikit-learn releases deprecate the `algorithm`
# argument; it is kept so older releases do not fall back to "SAMME.R".
AdaBst = AdaBoostClassifier(
    Dtree,
    learning_rate=0.8,
    n_estimators=n_estimators,
    algorithm="SAMME",
)
AdaBst.fit(X_train, y_train)

5 模型评价

# Predict the class of every test sample.
y_test_pred = AdaBst.predict(X_test)
# Score the predictions: overall accuracy, plus one precision value per
# class (average=None disables averaging across classes).
test_accuracy = accuracy_score(y_test, y_test_pred)
per_class_precision = precision_score(y_test, y_test_pred, average=None)
print('准确率为:', test_accuracy)
print('精确率为:', per_class_precision)

准确率为: 0.96
精确率为: [1. 0.93548387 0.95652174]

# Text summary of the per-class classification metrics on the test set.
report_text = classification_report(y_test, y_test_pred)
print('使用AdaBoost预测iris数据的分类报告为:', '\n', report_text)
# Confusion matrix of true labels versus predicted labels.
conf_mat = confusion_matrix(y_test, y_test_pred)
print('使用AdaBoost预测iris数据的混淆矩阵为:', '\n', conf_mat)

pyt-143

6 弱分类器的数量与误差的关系图

# Error rate as a function of the number of weak learners.
# The arrays are built from what staged_predict actually yields rather than
# being pre-sized to n_estimators: boosting can stop before the requested
# number of rounds, and a pre-sized array would then show the missing
# stages as zero error.
ada_err = np.array(
    [
        zero_one_loss(y_test, y_pred)
        for y_pred in AdaBst.staged_predict(X_test)
    ]
)
ada_err_train = np.array(
    [
        zero_one_loss(y_train, y_pred)
        for y_pred in AdaBst.staged_predict(X_train)
    ]
)
# 1-based count of weak learners used at each stage (x axis).
stage_counts = np.arange(1, len(ada_err) + 1)

colors = sns.color_palette("colorblind")

# Configure text rendering before any artist is created: keep the minus
# sign renderable on tick labels, and select a font that has glyphs for the
# Chinese labels used below.
plt.rc('axes', unicode_minus=False)
plt.rcParams['font.sans-serif'] = ['SimHei']

plt.plot(stage_counts, ada_err, label="测试集误差", color=colors[0])
plt.plot(stage_counts, ada_err_train, label="训练集误差", color=colors[1])
plt.xlabel("弱分类器数量")
plt.ylabel("误差率")
# Without a legend the two curve labels above are never displayed.
plt.legend()
plt.show()

pyt-144

注意:AdaBoost的训练效果与弱分类器的数量及其参数设置有较大关系。如果将弱分类器(决策树)的深度增加,即Dtree = DecisionTreeClassifier(max_depth=2, min_samples_leaf=1),则只需要较少的弱分类器就可以达到原有的分类精度!

pyt-145

7 AdaBoost的图形展示

鸢尾花数据集(Iris)有4个特征变量,因而无法直接绘制出AdaBoost模型在4维空间的分类图形。这里仅取前2个特征变量,演示AdaBoost模型在2维空间的分类图示。

# Draw the AdaBoost decision regions in two dimensions.
# Only the first two feature columns are kept so the model can be shown on
# a plane.
X, y = datasets.load_iris(return_X_y=True)
X = X[:, 0:2]
# Split into training and test samples; 70% of the samples are held out
# for testing, so the model is fitted on the remaining 30%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.7
)

# Train the model.
# The weak learner is passed positionally on purpose: the keyword was
# `base_estimator` up to scikit-learn 1.1 and is `estimator` from 1.2 on
# (`base_estimator` was removed in 1.4), while the first positional
# parameter is the weak learner in every release.
Dtree2 = DecisionTreeClassifier(max_depth=5, min_samples_leaf=1)
AdaBst2 = AdaBoostClassifier(
    Dtree2,
    learning_rate=0.8,
    n_estimators=400,
    algorithm="SAMME",
)
AdaBst2.fit(X_train, y_train)

# Colours: `cm` shades the decision regions, `cm_bright` colours the
# sample points of the three classes.
cm = plt.cm.RdBu
cm_bright = ListedColormap(["#FF0000", "#0000FF", "chartreuse"])

# Axis limits: the full data range padded by 0.5 on every side.
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5

# Draw the decision regions first; the scatter calls below draw on top of
# them in the same axes.
DecisionBoundaryDisplay.from_estimator(
    AdaBst2, X_train, cmap=cm, alpha=0.8, eps=0.5
)

# Training samples: opaque, default marker.
plt.scatter(
    X_train[:, 0],
    X_train[:, 1],
    c=y_train,
    cmap=cm_bright,
    edgecolors="k",
)
# Test samples: semi-transparent triangles so they can be told apart from
# the training samples.
plt.scatter(
    X_test[:, 0],
    X_test[:, 1],
    c=y_test,
    cmap=cm_bright,
    edgecolors="k",
    alpha=0.6,
    marker='^',
)

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)

plt.show()

pyt-146



获取案例数据和源代码,请关注微信公众号并回复:Python_dt25


Comments

Make a comment