Using a Pipeline in Grid Search
```python
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Load and split the data
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# Pipeline: scale first, then run the model
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])

# 5-fold grid search
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))
```
```
Best cross-validation accuracy: 0.98
Test set score: 0.97
Best parameters: {'svm__C': 1, 'svm__gamma': 1}
```
The pipeline combines two steps (one preprocessing step and one classification model): it first runs MinMaxScaler's fit_transform on the training data, then calls the SVC model's fit on the scaled data to train it.
We use a 5-fold grid search here. When searching over a pipeline, each search parameter must be prefixed with the name of its step (joined by a double underscore, e.g. svm__C) so that it is routed to the right step of the pipeline.
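If you are not sure which prefixed names are available, the pipeline itself can list them via the standard get_params method; a quick sketch:

```python
# Every key returned here is a valid entry for param_grid,
# e.g. 'svm__C' and 'svm__gamma' for the pipeline above.
for name in sorted(pipe.get_params().keys()):
    print(name)
```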
The General Pipeline Interface
Every step in a pipeline must have a fit method, and every step except the last must also have a transform method so that it can transform the data it passes on to the next step.
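To make that contract concrete, here is a simplified sketch of what a pipeline's fit and predict do internally (an illustration of the idea only, not scikit-learn's actual implementation):

```python
def pipeline_fit(steps, X, y):
    """steps is a list of (name, estimator) pairs, as passed to Pipeline."""
    X_transformed = X
    # every step except the last needs fit + transform
    for name, estimator in steps[:-1]:
        X_transformed = estimator.fit_transform(X_transformed, y)
    # the last step only needs fit (e.g. a classifier)
    steps[-1][1].fit(X_transformed, y)


def pipeline_predict(steps, X):
    X_transformed = X
    # reuse the already-fitted transformers on new data
    for name, estimator in steps[:-1]:
        X_transformed = estimator.transform(X_transformed)
    return steps[-1][1].predict(X_transformed)
```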
The K-fold grid search finds the best-performing pipeline, which is stored in the grid search's best_estimator_ attribute; from it we can retrieve the fitted result of every individual step.
```python
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Load and split the data
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# Pipeline: scale first, then run the model
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])

# 5-fold grid search
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# The fitted pipeline with the best generalization
print(grid.best_estimator_)
# The svm step of that best pipeline
print(grid.best_estimator_.named_steps['svm'])
```
```
Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
            ('svm', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
                        decision_function_shape='ovr', degree=3, gamma=1,
                        kernel='rbf', max_iter=-1, probability=False,
                        random_state=None, shrinking=True, tol=0.001,
                        verbose=False))])
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
```
As you can see, the pipeline with the best generalization keeps the fitted results of both of its steps, and we can pull out the fitted result of any single step.
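For example, the fitted MinMaxScaler and SVC inside the best pipeline expose the attributes they learned during training (a small sketch, assuming the grid search above has already been fitted):

```python
# Per-feature minima/maxima learned by the scaler on the training data
scaler = grid.best_estimator_.named_steps['scaler']
print(scaler.data_min_[:5])
print(scaler.data_max_[:5])

# Support vectors learned by the SVC (in the scaled feature space)
svm = grid.best_estimator_.named_steps['svm']
print(svm.support_vectors_.shape)
```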
Grid Search + Preprocessing Steps
This is another pipeline example, with a polynomial/interaction feature step added in the middle.
```python
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge

# Load and split the data
boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, random_state=0)

# Scale the data + generate polynomial features + ridge regression
pipe = Pipeline([("scaler", StandardScaler()),
                 ("ploy", PolynomialFeatures()),
                 ('ridge', Ridge())])

# Step-name prefixes route each search parameter to the right step
param_grid = {'ploy__degree': [1, 2, 3],
              'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Grid search
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Test set score
print(grid.score(X_test, y_test))
# Best-generalizing parameter combination
print(grid.best_params_)
```
```
0.7673580350306177
{'ploy__degree': 2, 'ridge__alpha': 10}
```
We can see that degree-2 polynomial features give the best performance.
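One way to see this in more detail (a sketch, assuming the fitted grid object from above) is to inspect grid.cv_results_, which records the mean cross-validation score for every parameter combination:

```python
import pandas as pd

# Mean cross-validation score for each (degree, alpha) combination
results = pd.DataFrame(grid.cv_results_)
print(results.pivot_table(index='param_ploy__degree',
                          columns='param_ridge__alpha',
                          values='mean_test_score'))
```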
Searching over preprocessing parameters and model parameters at the same time through a pipeline is very powerful, but the more parameters the grid search covers, the higher the computational cost.
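For the grid above the cost is easy to count: 3 values of ploy__degree times 6 values of ridge__alpha gives 18 combinations, each fitted 5 times during cross-validation, i.e. 90 pipeline fits plus one final refit on the whole training set. A quick sketch of the counting:

```python
from itertools import product

param_grid = {'ploy__degree': [1, 2, 3],
              'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
cv = 5

n_combinations = len(list(product(*param_grid.values())))  # 3 * 6 = 18
n_fits = n_combinations * cv + 1                            # + final refit
print(n_combinations, n_fits)                               # 18 91
```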
Grid Search + Model Selection
So far we have only searched over a model's parameters. Sometimes we want to compare which model is better, or whether using preprocessing helps at all; this too can be done with a pipeline plus grid search.
```python
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier

# Load and split the data
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# Define the two steps of the pipeline
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC())])

# For SVC: search over gamma and C, and compare with/without preprocessing
# For RandomForestClassifier: search over max_features, without preprocessing
param_grid = [
    {'classifier': [SVC()], 'preprocessing': [StandardScaler(), None],
     'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    {'classifier': [RandomForestClassifier(n_estimators=100)],
     'preprocessing': [None],
     'classifier__max_features': [1, 2, 3]}]

# Grid search
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

# Test set score
print(grid.score(X_test, y_test))
# Best-generalizing parameter combination
print(grid.best_params_)
```
```
0.9790209790209791
{'classifier': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False),
 'classifier__C': 10, 'classifier__gamma': 0.01,
 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}
```
By referring to a step's name directly in the grid, we can swap out entire models and check which one generalizes better, while tuning each model's parameters at the same time.
The search above shows that StandardScaler scaling plus an SVC classifier works better here than the random forest classifier.
Summary and Outlook
- Real-world machine learning rarely involves the isolated application of a single model.
- A pipeline wraps multiple steps into a single object that still provides the fit/transform/predict interface (see the short sketch after this list).
- Combining cross-validation/grid search with pipelines is something of an art, and it makes finding the best combination very convenient.
- We have now covered all of the general-purpose tools and algorithms that scikit-learn provides.
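A minimal sketch of that unified interface, reusing the scaler + SVC pipeline from the first example (the parameter values here are chosen only for illustration):

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# One object: fit scales then trains; predict/score scale then classify
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC(C=1, gamma=1))])
pipe.fit(X_train, y_train)
print(pipe.predict(X_test)[:10])
print(pipe.score(X_test, y_test))
```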
