X具有14个功能,但是RandomForestClassifier期望20个功能作为输入
我能问一下,我有这个代码(一个可重复的示例):
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
import shap
full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)
def run_model_with_grid_search(param_grid={}, output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'):
cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)
acc_list = list()
f1_list = list()
precision_list = list()
recall_list = list()
auc_list = list()
#for ROC curve
tprs = []
base_fpr = np.linspace(0, 1, 101)
plt.figure(figsize=(5, 5))
plt.axes().set_aspect('equal', 'datalim')
count = 0
list_shap_values = list()
list_test_sets = list()
for train_ix,test_ix in cv_outer.split(X_train):
split_x_train, split_x_test = X_train[train_ix,:],X_train[test_ix,:] #add in .iloc
split_y_train, split_y_test = y_train[train_ix],y_train[test_ix] #add in .iloc
cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
model = model_name
rfecv = RFECV(estimator=model, step=1,scoring='accuracy',cv=cv_inner) #change scoring to scoring='roc_auc'
pipeline = Pipeline([('feature_sele',rfecv),('clf_cv',model)])
search = GridSearchCV(pipeline,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True).fit(split_x_train,split_y_train)
best_model = search.best_estimator_
best_model_shap = search.best_estimator_['clf_cv'].fit(split_x_train,split_y_train)
print(search.best_params_)
yhat = best_model.predict(split_x_test) #changed from best_model and split_x_test
accuracy = accuracy_score(split_y_test,yhat)
acc_list.append(accuracy)
f1_sc = f1_score(split_y_test,yhat)
f1_list.append(f1_sc)
precision_sc = precision_score(split_y_test,yhat)
precision_list.append(precision_sc)
recall_sc = recall_score(split_y_test,yhat)
recall_list.append(recall_sc)
fpr, tpr, _ = roc_curve(split_y_test, yhat)
auc = metrics.auc(fpr,tpr)
auc_list.append(auc)
plt.plot(fpr, tpr, 'b', alpha=0.15)
tpr = np.interp(base_fpr, fpr, tpr)
tpr[0] = 0.0
tprs.append(tpr)
count +=1
print('>acc=%.3f,est=%.3f,cfg=%s'%(accuracy,search.best_score_,search.best_params_))
explainer = shap.TreeExplainer(best_model_shap)
shap_values = explainer.shap_values(split_x_test)
list_shap_values.append(shap_values)
list_test_sets.append(test_ix)
test_set = list_test_sets[0]
shap_values = np.array(list_shap_values[0])
for i in range(1,len(list_test_sets)):
test_set = np.concatenate((test_set,list_test_sets[i]),axis=0)
shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
X_test_df = pd.DataFrame(full_X_train[test_set])
cols = X_test_df.columns
shap_sum = np.abs(shap_values[1,:,:]).mean(0)
importance_df = pd.DataFrame({
'column_name':cols,
'shap_values':shap_sum
})
importance_df.sort_values('shap_values',ascending=False)
print('Accuracy: %.3f (%.3f)' % (mean(acc_list),std(acc_list)))
print('F1: %.3f (%.3f)' % (mean(f1_list),std(f1_list)))
print('Precision: %.3f (%.3f)' % (mean(precision_list),std(precision_list)))
print('Recall: %.3f (%.3f)' % (mean(recall_list),std(recall_list)))
print('AUC: %.3f (%.3f)' % (mean(auc_list),std(auc_list)))
tprs = np.array(tprs)
mean_tprs = tprs.mean(axis=0)
tpr_std = tprs.std(axis=0)
tprs_upper = np.minimum(mean_tprs + tpr_std, 1)
tprs_lower = mean_tprs - tpr_std
plt.plot(base_fpr, mean_tprs, 'b')
plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC for stratified 5-fold CV (blue line = mean)')
plt.savefig(output_plt_file)
print(importance_df)
return
param_grid = [{
'clf_cv__min_samples_leaf':[1,3,5],
[200,500,700,1000,1500,2000]
}]
run_model_with_grid_search(param_grid=param_grid)
我得到错误:
X has 14 features, but RandomForestClassifier is expecting 20 features as input
我可以看到其他人遇到了这个问题(我认为) - 这是因为我确实在构建模型之前确实具有选择,但是然后,我不会将X测试数据转换为相同的维度。
我试图基于查看其他人的工作来实现解决方案,例如
search = GridSearchCV(pipeline,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True).fit(split_x_train,split_y_train)
best_model = search.best_estimator_
best_model_shap = search.best_estimator_['clf_cv'].fit_transform(split_x_train,split_y_train)
print(search.best_params_)
transformed_x_test = best_model.transform(split_x_test)
yhat = best_model.predict(transformed_x_test) #changed from best_model and split_x_test
,这导致其他错误(例如,在这种情况下,“ gridserchcv对象没有属性fit_transform”) - 因此,我不清楚如何将其他人的解决方案实现到我的我的解决方案。问题。
有人可以向我展示如何使此代码正常工作(无论如何,除了此错误外,都应该工作)。另外,如果有人可以向我展示如何通过功能选择过程选择哪些功能,那将是很棒的,但我想我可以在开始工作后弄清楚这一点。
Could I please ask, I have this code (a reproducible example):
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
import shap
full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)
def run_model_with_grid_search(param_grid={}, output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'):
cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)
acc_list = list()
f1_list = list()
precision_list = list()
recall_list = list()
auc_list = list()
#for ROC curve
tprs = []
base_fpr = np.linspace(0, 1, 101)
plt.figure(figsize=(5, 5))
plt.axes().set_aspect('equal', 'datalim')
count = 0
list_shap_values = list()
list_test_sets = list()
for train_ix,test_ix in cv_outer.split(X_train):
split_x_train, split_x_test = X_train[train_ix,:],X_train[test_ix,:] #add in .iloc
split_y_train, split_y_test = y_train[train_ix],y_train[test_ix] #add in .iloc
cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
model = model_name
rfecv = RFECV(estimator=model, step=1,scoring='accuracy',cv=cv_inner) #change scoring to scoring='roc_auc'
pipeline = Pipeline([('feature_sele',rfecv),('clf_cv',model)])
search = GridSearchCV(pipeline,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True).fit(split_x_train,split_y_train)
best_model = search.best_estimator_
best_model_shap = search.best_estimator_['clf_cv'].fit(split_x_train,split_y_train)
print(search.best_params_)
yhat = best_model.predict(split_x_test) #changed from best_model and split_x_test
accuracy = accuracy_score(split_y_test,yhat)
acc_list.append(accuracy)
f1_sc = f1_score(split_y_test,yhat)
f1_list.append(f1_sc)
precision_sc = precision_score(split_y_test,yhat)
precision_list.append(precision_sc)
recall_sc = recall_score(split_y_test,yhat)
recall_list.append(recall_sc)
fpr, tpr, _ = roc_curve(split_y_test, yhat)
auc = metrics.auc(fpr,tpr)
auc_list.append(auc)
plt.plot(fpr, tpr, 'b', alpha=0.15)
tpr = np.interp(base_fpr, fpr, tpr)
tpr[0] = 0.0
tprs.append(tpr)
count +=1
print('>acc=%.3f,est=%.3f,cfg=%s'%(accuracy,search.best_score_,search.best_params_))
explainer = shap.TreeExplainer(best_model_shap)
shap_values = explainer.shap_values(split_x_test)
list_shap_values.append(shap_values)
list_test_sets.append(test_ix)
test_set = list_test_sets[0]
shap_values = np.array(list_shap_values[0])
for i in range(1,len(list_test_sets)):
test_set = np.concatenate((test_set,list_test_sets[i]),axis=0)
shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
X_test_df = pd.DataFrame(full_X_train[test_set])
cols = X_test_df.columns
shap_sum = np.abs(shap_values[1,:,:]).mean(0)
importance_df = pd.DataFrame({
'column_name':cols,
'shap_values':shap_sum
})
importance_df.sort_values('shap_values',ascending=False)
print('Accuracy: %.3f (%.3f)' % (mean(acc_list),std(acc_list)))
print('F1: %.3f (%.3f)' % (mean(f1_list),std(f1_list)))
print('Precision: %.3f (%.3f)' % (mean(precision_list),std(precision_list)))
print('Recall: %.3f (%.3f)' % (mean(recall_list),std(recall_list)))
print('AUC: %.3f (%.3f)' % (mean(auc_list),std(auc_list)))
tprs = np.array(tprs)
mean_tprs = tprs.mean(axis=0)
tpr_std = tprs.std(axis=0)
tprs_upper = np.minimum(mean_tprs + tpr_std, 1)
tprs_lower = mean_tprs - tpr_std
plt.plot(base_fpr, mean_tprs, 'b')
plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC for stratified 5-fold CV (blue line = mean)')
plt.savefig(output_plt_file)
print(importance_df)
return
param_grid = [{
'clf_cv__min_samples_leaf':[1,3,5],
[200,500,700,1000,1500,2000]
}]
run_model_with_grid_search(param_grid=param_grid)
I get the error:
X has 14 features, but RandomForestClassifier is expecting 20 features as input
I can see other people have had this issue understand the problem (I think) - it's because I do feature selection before I build a model, but then I don't transform my X test data to be the same dimensions.
I was trying to implement solutions based on seeing other people's work e.g.
search = GridSearchCV(pipeline,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True).fit(split_x_train,split_y_train)
best_model = search.best_estimator_
best_model_shap = search.best_estimator_['clf_cv'].fit_transform(split_x_train,split_y_train)
print(search.best_params_)
transformed_x_test = best_model.transform(split_x_test)
yhat = best_model.predict(transformed_x_test) #changed from best_model and split_x_test
But that is leading to other errors (e.g. in this case, 'GridSerchCV object has no attribute fit_transform') - so I'm just not clear how to implement other's solutions to my issue.
Could someone please demonstrate to me how to get this piece of code working (it should all work except for this error anyway). Also, if someone could show me how to print which features are being selected by the feature selection process that would be great but I think I can figure that out after this starts working.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论