X has 14 features, but RandomForestClassifier is expecting 20 features as input

Could I please ask, I have this code (a reproducible example):

from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, train_test_split
from sklearn.feature_selection import SelectKBest, RFECV, mutual_info_classif
#from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.utils import shuffle
from numpy import mean, std
import numpy as np
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
import shap

full_X_train, full_y_train = make_classification(n_samples=500, n_features=20, random_state=1, n_informative=10, n_redundant=10)


def run_model_with_grid_search(param_grid={}, output_plt_file='plt.png', model_name=RandomForestClassifier(),
                               X_train=full_X_train, y_train=full_y_train,
                               model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5,
                               output_file='random_forest_with_hpo_no_fs_geno_class.txt'):


      cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)
      acc_list = list()
      f1_list = list()
      precision_list = list()
      recall_list = list()
      auc_list = list()


      #for ROC curve
      tprs = []
      base_fpr = np.linspace(0, 1, 101)
      plt.figure(figsize=(5, 5))
      plt.axes().set_aspect('equal', 'datalim')
      count = 0

      list_shap_values = list()
      list_test_sets = list()

      for train_ix,test_ix in cv_outer.split(X_train):
              split_x_train, split_x_test = X_train[train_ix,:],X_train[test_ix,:] #add in .iloc               
              split_y_train, split_y_test = y_train[train_ix],y_train[test_ix]  #add in .iloc

              cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
              model = model_name
              rfecv = RFECV(estimator=model, step=1,scoring='accuracy',cv=cv_inner) #change scoring to scoring='roc_auc'
              pipeline = Pipeline([('feature_sele',rfecv),('clf_cv',model)])
              search = GridSearchCV(pipeline,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True).fit(split_x_train,split_y_train)
              best_model = search.best_estimator_
              best_model_shap = search.best_estimator_['clf_cv'].fit(split_x_train,split_y_train)
              print(search.best_params_)
              
              
              yhat = best_model.predict(split_x_test)  #changed from best_model and split_x_test

              accuracy = accuracy_score(split_y_test,yhat)
              acc_list.append(accuracy)

              f1_sc = f1_score(split_y_test,yhat)
              f1_list.append(f1_sc)

              precision_sc = precision_score(split_y_test,yhat)
              precision_list.append(precision_sc)

              recall_sc = recall_score(split_y_test,yhat)
              recall_list.append(recall_sc)
              
              fpr, tpr, _ = roc_curve(split_y_test, yhat)
              auc = metrics.auc(fpr,tpr)
              auc_list.append(auc)

              plt.plot(fpr, tpr, 'b', alpha=0.15)
              tpr = np.interp(base_fpr, fpr, tpr)
              tpr[0] = 0.0
              tprs.append(tpr)
              count +=1

              print('>acc=%.3f,est=%.3f,cfg=%s'%(accuracy,search.best_score_,search.best_params_))

              explainer = shap.TreeExplainer(best_model_shap)
              shap_values = explainer.shap_values(split_x_test)
              list_shap_values.append(shap_values)
              list_test_sets.append(test_ix) 

      test_set = list_test_sets[0]
      shap_values = np.array(list_shap_values[0])

      for i in range(1,len(list_test_sets)):
          test_set = np.concatenate((test_set,list_test_sets[i]),axis=0)
          shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)


      X_test_df = pd.DataFrame(full_X_train[test_set])
      cols = X_test_df.columns
      shap_sum = np.abs(shap_values[1,:,:]).mean(0)
      

      importance_df = pd.DataFrame({
           'column_name':cols,
           'shap_values':shap_sum
      }) 
           
      importance_df.sort_values('shap_values',ascending=False)

      print('Accuracy: %.3f (%.3f)' % (mean(acc_list),std(acc_list)))
      print('F1: %.3f (%.3f)' % (mean(f1_list),std(f1_list)))
      print('Precision: %.3f (%.3f)' % (mean(precision_list),std(precision_list)))
      print('Recall: %.3f (%.3f)' % (mean(recall_list),std(recall_list)))
      print('AUC: %.3f (%.3f)' % (mean(auc_list),std(auc_list)))


      tprs = np.array(tprs)
      mean_tprs = tprs.mean(axis=0)
      tpr_std = tprs.std(axis=0)

      tprs_upper = np.minimum(mean_tprs + tpr_std, 1)
      tprs_lower = mean_tprs - tpr_std


      plt.plot(base_fpr, mean_tprs, 'b')
      plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)
      plt.plot([0, 1], [0, 1],'r--')
      plt.xlim([-0.01, 1.01])
      plt.ylim([-0.01, 1.01])
      plt.ylabel('True Positive Rate')
      plt.xlabel('False Positive Rate')
      plt.title('ROC for stratified 5-fold CV (blue line = mean)')
      plt.savefig(output_plt_file)

      print(importance_df)

      return


param_grid = [{
               'clf_cv__min_samples_leaf':[1,3,5],
               'clf_cv__n_estimators':[200,500,700,1000,1500,2000]
              }]

run_model_with_grid_search(param_grid=param_grid)
        

I get the error:

X has 14 features, but RandomForestClassifier is expecting 20 features as input

I can see other people have had this issue, and I think I understand the problem - it's because I do feature selection before I build the model, but then I don't transform my X test data down to the same number of features.
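For what it's worth, my current reading is that the fitted pipeline should already apply the RFECV reduction inside predict(), so the raw 20-column test fold can go straight in; the mismatch seems to appear once the 'clf_cv' step gets refit on the full training matrix outside the pipeline. A rough, untested sketch of what I mean (just the relevant lines from the loop):

              # Sketch only: predict through the fitted pipeline, which calls
              # RFECV.transform internally, so split_x_test keeps all 20 columns here.
              best_model = search.best_estimator_   # Pipeline(feature_sele=RFECV, clf_cv=RandomForestClassifier)
              yhat = best_model.predict(split_x_test)

              # I suspect the separate refit below is what breaks it: it refits the same
              # 'clf_cv' object on all 20 features, so the pipeline's classifier no longer
              # matches the RFECV-reduced input it receives during predict().
              # best_model_shap = search.best_estimator_['clf_cv'].fit(split_x_train, split_y_train)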

I was trying to implement solutions based on other people's work, e.g.

              search = GridSearchCV(pipeline,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True).fit(split_x_train,split_y_train)
              best_model = search.best_estimator_
              best_model_shap = search.best_estimator_['clf_cv'].fit_transform(split_x_train,split_y_train)
              print(search.best_params_)
              
              transformed_x_test = best_model.transform(split_x_test)
              yhat = best_model.predict(transformed_x_test)  #changed from best_model and split_x_test

But that is leading to other errors (e.g. in this case, 'GridSearchCV object has no attribute fit_transform') - so I'm just not clear how to apply others' solutions to my issue.
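In case it clarifies what I was trying to do: my guess is that the reduced test fold has to come from the fitted RFECV step rather than from the classifier or the search object (neither of which has transform/fit_transform). A rough, untested sketch of that idea, reusing the 'feature_sele' and 'clf_cv' step names from my pipeline:

              # Sketch only: pull the fitted selector and classifier out of the pipeline,
              # reduce the test fold with the selector, then explain the classifier with SHAP.
              fitted_rfecv = best_model.named_steps['feature_sele']   # fitted RFECV
              fitted_clf = best_model.named_steps['clf_cv']           # fitted on the reduced features
              split_x_test_reduced = fitted_rfecv.transform(split_x_test)

              explainer = shap.TreeExplainer(fitted_clf)
              shap_values = explainer.shap_values(split_x_test_reduced)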

Could someone please show me how to get this piece of code working (it should all run apart from this error)? Also, if someone could show me how to print which features are selected by the feature-selection step, that would be great, but I think I can figure that out once this is working.
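On the second part (printing which features get selected), my understanding is that the fitted RFECV step keeps a mask of the retained columns, so something along these lines might work (again an untested sketch, reusing the 'feature_sele' step name):

              # Sketch only: inspect which columns the fitted RFECV step kept.
              fitted_rfecv = search.best_estimator_.named_steps['feature_sele']
              print('number of features selected:', fitted_rfecv.n_features_)
              print('selected column indices:', fitted_rfecv.get_support(indices=True))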
