属性错误:管道(Pipeline)对象没有属性 'best_estimator_'

发布于 2025-02-08 19:11:26 字数 3090 浏览 2 评论 0原文

如何修改此代码(这是一个可重复的示例):

from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest 
#from xgboost import XGBClassifier 
from sklearn.feature_selection import mutual_info_classif 
from sklearn.feature_selection import SelectKBest, RFECV 
from sklearn.pipeline import Pipeline 
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score 
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import make_scorer 
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score 
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean 
from sklearn.model_selection import train_test_split
from numpy import std 
from sklearn.utils import shuffle 
import numpy as np 
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt 
import pandas as pd



# Synthetic classification data: 500 samples, 20 features (10 informative, 10 redundant), fixed seed.
full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)

def run_model_with_grid_search(param_grid={},output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'): 
      """Run an outer 5-fold CV loop that fits RFECV + GridSearchCV on each training split.

      For every outer fold, the training split is passed through a Pipeline of
      RFECV (feature selection) followed by a GridSearchCV over ``param_grid``
      (inner 3-fold CV, ROC-AUC scoring); the code then tries to predict on the
      held-out split and print a one-line summary.

      Args:
          param_grid: grid forwarded to GridSearchCV.
          model_name: estimator instance handed to both RFECV and GridSearchCV.
          X_train, y_train: data, indexed positionally with NumPy-style indexing.
          output_plt_file, model_id, n_splits, output_file: accepted but not
              used in the body (the outer split count is hard-coded to 5).

      Returns:
          None.

      NOTE(review): as posted, this raises AttributeError at
      ``result.best_estimator_`` because ``result`` is the fitted Pipeline,
      not the GridSearchCV step. ``accuracy`` in the final print is also never
      assigned, so that line would raise NameError once the first error is fixed.
      """
  

      # Outer CV: each fold's held-out split is meant for the final evaluation.
      cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)

      for train_ix,test_ix in cv_outer.split(X_train):
              split_x_train, split_x_test = X_train[train_ix,:],X_train[test_ix,:] #add in .iloc               
              split_y_train, split_y_test = y_train[train_ix],y_train[test_ix]  #add in .iloc

              # Inner CV: used by GridSearchCV for hyperparameter selection.
              cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
              model = model_name
              rfecv = RFECV(estimator=model, step=1,cv=5, scoring='roc_auc')
              search = GridSearchCV(model,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True)
              # The grid search is the last step of the pipeline (not the other way round).
              pipeline = Pipeline([('feature_sele',rfecv),('clf_cv',search)])
              result = pipeline.fit(split_x_train,split_y_train)
              #result = search.fit(split_x_train,split_y_train)
              # NOTE(review): `result` is the Pipeline here, which has no best_estimator_ attribute.
              best_model = result.best_estimator_
              yhat = best_model.predict(split_x_test)

              # NOTE(review): `accuracy` is never assigned in this function.
              print('>acc=%.3f,est=%.3f,cfg=%s'%(accuracy,result.best_score_,result.best_params_))

      return


# Hyperparameter grid forwarded to GridSearchCV inside run_model_with_grid_search.
param_grid = [{
               'min_samples_leaf':[1,3,5],
              }]

# Run with the default model and data; only the grid is overridden.
run_model_with_grid_search(param_grid=param_grid)

生成:

AttributeError: 'Pipeline' object has no attribute 'best_estimator_'

最终目的是在此函数中执行嵌套交叉验证、超参数优化和特征选择,我试图遵循此示例(https://intellipaat.com/community/3280/how-to-perform-feature-selection-with-gridsearchcv-in-sklearn-in-python)。

如何修改此函数才能正确地实现这一点?

How to amend this code (which is a reproducible example):

from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest 
#from xgboost import XGBClassifier 
from sklearn.feature_selection import mutual_info_classif 
from sklearn.feature_selection import SelectKBest, RFECV 
from sklearn.pipeline import Pipeline 
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score 
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import make_scorer 
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score 
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean 
from sklearn.model_selection import train_test_split
from numpy import std 
from sklearn.utils import shuffle 
import numpy as np 
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt 
import pandas as pd



# Synthetic classification data: 500 samples, 20 features (10 informative, 10 redundant), fixed seed.
full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)

def run_model_with_grid_search(param_grid={},output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'): 
      """Run an outer 5-fold CV loop that fits RFECV + GridSearchCV on each training split.

      For every outer fold, the training split is passed through a Pipeline of
      RFECV (feature selection) followed by a GridSearchCV over ``param_grid``
      (inner 3-fold CV, ROC-AUC scoring); the code then tries to predict on the
      held-out split and print a one-line summary.

      Args:
          param_grid: grid forwarded to GridSearchCV.
          model_name: estimator instance handed to both RFECV and GridSearchCV.
          X_train, y_train: data, indexed positionally with NumPy-style indexing.
          output_plt_file, model_id, n_splits, output_file: accepted but not
              used in the body (the outer split count is hard-coded to 5).

      Returns:
          None.

      NOTE(review): as posted, this raises AttributeError at
      ``result.best_estimator_`` because ``result`` is the fitted Pipeline,
      not the GridSearchCV step. ``accuracy`` in the final print is also never
      assigned, so that line would raise NameError once the first error is fixed.
      """
  

      # Outer CV: each fold's held-out split is meant for the final evaluation.
      cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)

      for train_ix,test_ix in cv_outer.split(X_train):
              split_x_train, split_x_test = X_train[train_ix,:],X_train[test_ix,:] #add in .iloc               
              split_y_train, split_y_test = y_train[train_ix],y_train[test_ix]  #add in .iloc

              # Inner CV: used by GridSearchCV for hyperparameter selection.
              cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
              model = model_name
              rfecv = RFECV(estimator=model, step=1,cv=5, scoring='roc_auc')
              search = GridSearchCV(model,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True)
              # The grid search is the last step of the pipeline (not the other way round).
              pipeline = Pipeline([('feature_sele',rfecv),('clf_cv',search)])
              result = pipeline.fit(split_x_train,split_y_train)
              #result = search.fit(split_x_train,split_y_train)
              # NOTE(review): `result` is the Pipeline here, which has no best_estimator_ attribute.
              best_model = result.best_estimator_
              yhat = best_model.predict(split_x_test)

              # NOTE(review): `accuracy` is never assigned in this function.
              print('>acc=%.3f,est=%.3f,cfg=%s'%(accuracy,result.best_score_,result.best_params_))

      return


# Hyperparameter grid forwarded to GridSearchCV inside run_model_with_grid_search.
param_grid = [{
               'min_samples_leaf':[1,3,5],
              }]

# Run with the default model and data; only the grid is overridden.
run_model_with_grid_search(param_grid=param_grid)

Generates:

AttributeError: 'Pipeline' object has no attribute 'best_estimator_'

The ultimate aim is to perform nested cross validation, hyper parameter optimization and feature selection in this function, and I was trying to follow this example

How to edit this function to perform that correctly?

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1)

逐鹿 2025-02-15 19:11:26

通常应该对管道运行网格搜索,而不是把网格搜索放进管道里。您有什么特别的理由要反过来做吗?

# Put feature selection and the classifier in one pipeline, then grid-search the whole pipeline.
# With the classifier under the 'clf' step, param_grid keys take the form 'clf__<param>'.
pipeline = Pipeline([('feature_sele',rfecv), ('clf',model)])
search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=cv_inner, refit=True)
result = search.fit(split_x_train, split_y_train)
# `result` is now the fitted GridSearchCV, so best_estimator_ (the refit pipeline) exists.
best_model = result.best_estimator_

(当然,param_grid 中的超参数名称需要加上 clf__ 前缀,即步骤名加双下划线,例如 clf__min_samples_leaf。)

另外顺便一提,变量 accuracy 似乎没有定义。

Normally, you'd run grid search on the pipeline, not the pipeline on grid search. Is there a certain reason you'd want it the other way round?

# Put feature selection and the classifier in one pipeline, then grid-search the whole pipeline.
# With the classifier under the 'clf' step, param_grid keys take the form 'clf__<param>'.
pipeline = Pipeline([('feature_sele',rfecv), ('clf',model)])
search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc', cv=cv_inner, refit=True)
result = search.fit(split_x_train, split_y_train)
# `result` is now the fitted GridSearchCV, so best_estimator_ (the refit pipeline) exists.
best_model = result.best_estimator_

(Of course, the hyperparameter names in param_grid will then need the clf__ prefix, i.e. the step name plus a double underscore, e.g. clf__min_samples_leaf.)

On an unrelated note, accuracy seems to be undefined.

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文