Using a Pipeline in Grid Search
```python
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Load and split the data
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# Pipeline: scale first, then run the model
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])

# 5-fold grid search
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))
print("Best parameters: {}".format(grid.best_params_))
```
```
Best cross-validation accuracy: 0.98
Test set score: 0.97
Best parameters: {'svm__C': 1, 'svm__gamma': 1}
```
The pipeline combines two steps (one preprocessing step and one classification model): it first runs MinMaxScaler's fit_transform on the training data, then calls the SVC model's fit on the scaled data to train it.
We use a 5-fold grid search here. When searching over a pipeline, each search parameter must be prefixed with the name of its step (joined by a double underscore, e.g. svm__C) so that it is routed to the right step of the pipeline.
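If you are not sure which prefixed names are available, the pipeline itself can list them via the standard get_params method; a quick sketch:

```python
# Every key returned here is a valid entry for param_grid,
# e.g. 'svm__C' and 'svm__gamma' for the pipeline above.
for name in sorted(pipe.get_params().keys()):
    print(name)
```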
The General Pipeline Interface
Every step in a pipeline must have a fit method, and every step except the last must also have a transform method so that it can transform the data it passes on to the next step.
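To make that contract concrete, here is a simplified sketch of what a pipeline's fit and predict do internally (an illustration of the idea only, not scikit-learn's actual implementation):

```python
def pipeline_fit(steps, X, y):
    """steps is a list of (name, estimator) pairs, as passed to Pipeline."""
    X_transformed = X
    # every step except the last needs fit + transform
    for name, estimator in steps[:-1]:
        X_transformed = estimator.fit_transform(X_transformed, y)
    # the last step only needs fit (e.g. a classifier)
    steps[-1][1].fit(X_transformed, y)


def pipeline_predict(steps, X):
    X_transformed = X
    # reuse the already-fitted transformers on new data
    for name, estimator in steps[:-1]:
        X_transformed = estimator.transform(X_transformed)
    return steps[-1][1].predict(X_transformed)
```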
The K-fold grid search finds the best-performing pipeline, which is stored in the grid search's best_estimator_ attribute; from it we can retrieve the fitted result of every individual step.
```python
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Load and split the data
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# Pipeline: scale first, then run the model
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])

# 5-fold grid search
param_grid = {'svm__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'svm__gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# The fitted pipeline with the best generalization
print(grid.best_estimator_)
# The svm step of that best pipeline
print(grid.best_estimator_.named_steps['svm'])
```
```
Pipeline(memory=None,
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))),
            ('svm', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
                        decision_function_shape='ovr', degree=3, gamma=1,
                        kernel='rbf', max_iter=-1, probability=False,
                        random_state=None, shrinking=True, tol=0.001,
                        verbose=False))])
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
```
As you can see, the pipeline with the best generalization keeps the fitted results of both of its steps, and we can pull out the fitted result of any single step.
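For example, the fitted MinMaxScaler and SVC inside the best pipeline expose the attributes they learned during training (a small sketch, assuming the grid search above has already been fitted):

```python
# Per-feature minima/maxima learned by the scaler on the training data
scaler = grid.best_estimator_.named_steps['scaler']
print(scaler.data_min_[:5])
print(scaler.data_max_[:5])

# Support vectors learned by the SVC (in the scaled feature space)
svm = grid.best_estimator_.named_steps['svm']
print(svm.support_vectors_.shape)
```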
Grid Search + Preprocessing Steps
This is another pipeline example, with a polynomial/interaction feature step added in the middle.
```python
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge

# Load and split the data
boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(
    boston.data, boston.target, random_state=0)

# Scale the data + generate polynomial features + ridge regression
pipe = Pipeline([("scaler", StandardScaler()),
                 ("ploy", PolynomialFeatures()),
                 ('ridge', Ridge())])

# Step-name prefixes route each search parameter to the right step
param_grid = {'ploy__degree': [1, 2, 3],
              'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Grid search
grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Test set score
print(grid.score(X_test, y_test))
# Best-generalizing parameter combination
print(grid.best_params_)
```
```
0.7673580350306177
{'ploy__degree': 2, 'ridge__alpha': 10}
```
We can see that degree-2 polynomial features give the best performance.
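One way to see this in more detail (a sketch, assuming the fitted grid object from above) is to inspect grid.cv_results_, which records the mean cross-validation score for every parameter combination:

```python
import pandas as pd

# Mean cross-validation score for each (degree, alpha) combination
results = pd.DataFrame(grid.cv_results_)
print(results.pivot_table(index='param_ploy__degree',
                          columns='param_ridge__alpha',
                          values='mean_test_score'))
```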
Searching over preprocessing parameters and model parameters at the same time through a pipeline is very powerful, but the more parameters the grid search covers, the higher the computational cost.
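For the grid above the cost is easy to count: 3 values of ploy__degree times 6 values of ridge__alpha gives 18 combinations, each fitted 5 times during cross-validation, i.e. 90 pipeline fits plus one final refit on the whole training set. A quick sketch of the counting:

```python
from itertools import product

param_grid = {'ploy__degree': [1, 2, 3],
              'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}
cv = 5

n_combinations = len(list(product(*param_grid.values())))  # 3 * 6 = 18
n_fits = n_combinations * cv + 1                            # + final refit
print(n_combinations, n_fits)                               # 18 91
```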
Grid Search + Model Selection
So far we have only searched over a model's parameters. Sometimes we want to compare which model is better, or whether using preprocessing helps at all; this too can be done with a pipeline plus grid search.
```python
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier

# Load and split the data
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# Define the two steps of the pipeline
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', SVC())])

# For SVC: search over gamma and C, and compare with/without preprocessing
# For RandomForestClassifier: search over max_features, without preprocessing
param_grid = [
    {'classifier': [SVC()], 'preprocessing': [StandardScaler(), None],
     'classifier__gamma': [0.001, 0.01, 0.1, 1, 10, 100],
     'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]},
    {'classifier': [RandomForestClassifier(n_estimators=100)],
     'preprocessing': [None],
     'classifier__max_features': [1, 2, 3]}]

# Grid search
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

# Test set score
print(grid.score(X_test, y_test))
# Best-generalizing parameter combination
print(grid.best_params_)
```
```
0.9790209790209791
{'classifier': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False),
 'classifier__C': 10, 'classifier__gamma': 0.01,
 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}
```
By referring to a step's name directly in the grid, we can swap out entire models and check which one generalizes better, while tuning each model's parameters at the same time.
The search above shows that StandardScaler scaling plus an SVC classifier works better here than the random forest classifier.
Summary and Outlook
- Real-world machine learning rarely involves the isolated application of a single model.
- A pipeline wraps multiple steps into a single object that still provides the fit/transform/predict interface (see the short sketch after this list).
- Combining cross-validation/grid search with pipelines is something of an art, and it makes finding the best combination very convenient.
- We have now covered all of the general-purpose tools and algorithms that scikit-learn provides.
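A minimal sketch of that unified interface, reusing the scaler + SVC pipeline from the first example (the parameter values here are chosen only for illustration):

```python
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# One object: fit scales then trains; predict/score scale then classify
pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC(C=1, gamma=1))])
pipe.fit(X_train, y_train)
print(pipe.predict(X_test)[:10])
print(pipe.score(X_test, y_test))
```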
