shap/numpy:串联轴的所有输入阵列尺寸必须完全匹配
有人可以解释一下此代码时如何修复(可再现的例子):
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
import shap
full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)
def run_model_with_grid_search(param_grid={}, output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'):
cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)
acc_list = list()
f1_list = list()
precision_list = list()
recall_list = list()
auc_list = list()
#for ROC curve
tprs = []
base_fpr = np.linspace(0, 1, 101)
plt.figure(figsize=(5, 5))
plt.axes().set_aspect('equal', 'datalim')
count = 0
list_shap_values = list()
list_test_sets = list()
for train_ix,test_ix in cv_outer.split(X_train):
split_x_train, split_x_test = X_train[train_ix,:],X_train[test_ix,:] #add in .iloc
split_y_train, split_y_test = y_train[train_ix],y_train[test_ix] #add in .iloc
cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
model = model_name
rfecv = RFECV(estimator=model, step=1,scoring='accuracy',cv=cv_inner) #change scoring to scoring='roc_auc'
pipeline = Pipeline([('feature_sele',rfecv),('clf_cv',model)])
search = GridSearchCV(pipeline,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True).fit(split_x_train,split_y_train)
best_model = search.best_estimator_[0]
selected_features = best_model.support_
split_x_test_selected_features = split_x_test[:,selected_features]
print(split_x_test_selected_features.shape)
best_model_shap = search.best_estimator_[1]
print(best_model_shap)
print(search.best_params_)
print(search.best_score_)
print(search.best_estimator_)
#transformed_x_test = best_model.transform(split_x_test)
yhat = search.predict(split_x_test) #changed from best_model and split_x_test
accuracy = accuracy_score(split_y_test,yhat)
acc_list.append(accuracy)
f1_sc = f1_score(split_y_test,yhat)
f1_list.append(f1_sc)
precision_sc = precision_score(split_y_test,yhat)
precision_list.append(precision_sc)
recall_sc = recall_score(split_y_test,yhat)
recall_list.append(recall_sc)
fpr, tpr, _ = roc_curve(split_y_test, yhat)
auc = metrics.auc(fpr,tpr)
auc_list.append(auc)
plt.plot(fpr, tpr, 'b', alpha=0.15)
tpr = np.interp(base_fpr, fpr, tpr)
tpr[0] = 0.0
tprs.append(tpr)
count +=1
print('>acc=%.3f,est=%.3f,cfg=%s'%(accuracy,search.best_score_,search.best_params_))
explainer = shap.TreeExplainer(best_model_shap)
shap_values = explainer.shap_values(split_x_test_selected_features)
list_shap_values.append(shap_values)
list_test_sets.append(test_ix)
test_set = list_test_sets[0]
shap_values = np.array(list_shap_values[0])
for i in range(1,len(list_test_sets)):
test_set = np.concatenate((test_set,list_test_sets[i]),axis=0)
shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
X_test_df = pd.DataFrame(full_X_train[test_set])
cols = X_test_df.columns
shap_sum = np.abs(shap_values[1,:,:]).mean(0)
importance_df = pd.DataFrame({
'column_name':cols,
'shap_values':shap_sum
})
importance_df.sort_values('shap_values',ascending=False)
print('Accuracy: %.3f (%.3f)' % (mean(acc_list),std(acc_list)))
print('F1: %.3f (%.3f)' % (mean(f1_list),std(f1_list)))
print('Precision: %.3f (%.3f)' % (mean(precision_list),std(precision_list)))
print('Recall: %.3f (%.3f)' % (mean(recall_list),std(recall_list)))
print('AUC: %.3f (%.3f)' % (mean(auc_list),std(auc_list)))
tprs = np.array(tprs)
mean_tprs = tprs.mean(axis=0)
tpr_std = tprs.std(axis=0)
tprs_upper = np.minimum(mean_tprs + tpr_std, 1)
tprs_lower = mean_tprs - tpr_std
plt.plot(base_fpr, mean_tprs, 'b')
plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC for stratified 5-fold CV (blue line = mean)')
plt.savefig(output_plt_file)
print(importance_df)
return
param_grid = [{
'clf_cv__min_samples_leaf':[1,3,5],
}]
run_model_with_grid_search(param_grid=param_grid)
产生错误:
File "/home/data/ml_models.py", line 180, in <module>
run_model_with_grid_search(param_grid=param_grid)
File "/home/data/ml_models.py", line 127, in run_model_with_grid_search
shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
File "<__array_function__ internals>", line 5, in concatenate
ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 2, the array at index 0 has size 20 and the array at index 1 has size 16
我通常可以在So上看到其他类似的问题,而不是我可以理解在这里申请的特定解决方案,因此,如果有人可以显示我如何更改此代码,我会感谢它。
Could someone please explain how to fix when this code (a reproducible example):
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
import shap
full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)
def run_model_with_grid_search(param_grid={}, output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'):
cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)
acc_list = list()
f1_list = list()
precision_list = list()
recall_list = list()
auc_list = list()
#for ROC curve
tprs = []
base_fpr = np.linspace(0, 1, 101)
plt.figure(figsize=(5, 5))
plt.axes().set_aspect('equal', 'datalim')
count = 0
list_shap_values = list()
list_test_sets = list()
for train_ix,test_ix in cv_outer.split(X_train):
split_x_train, split_x_test = X_train[train_ix,:],X_train[test_ix,:] #add in .iloc
split_y_train, split_y_test = y_train[train_ix],y_train[test_ix] #add in .iloc
cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
model = model_name
rfecv = RFECV(estimator=model, step=1,scoring='accuracy',cv=cv_inner) #change scoring to scoring='roc_auc'
pipeline = Pipeline([('feature_sele',rfecv),('clf_cv',model)])
search = GridSearchCV(pipeline,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True).fit(split_x_train,split_y_train)
best_model = search.best_estimator_[0]
selected_features = best_model.support_
split_x_test_selected_features = split_x_test[:,selected_features]
print(split_x_test_selected_features.shape)
best_model_shap = search.best_estimator_[1]
print(best_model_shap)
print(search.best_params_)
print(search.best_score_)
print(search.best_estimator_)
#transformed_x_test = best_model.transform(split_x_test)
yhat = search.predict(split_x_test) #changed from best_model and split_x_test
accuracy = accuracy_score(split_y_test,yhat)
acc_list.append(accuracy)
f1_sc = f1_score(split_y_test,yhat)
f1_list.append(f1_sc)
precision_sc = precision_score(split_y_test,yhat)
precision_list.append(precision_sc)
recall_sc = recall_score(split_y_test,yhat)
recall_list.append(recall_sc)
fpr, tpr, _ = roc_curve(split_y_test, yhat)
auc = metrics.auc(fpr,tpr)
auc_list.append(auc)
plt.plot(fpr, tpr, 'b', alpha=0.15)
tpr = np.interp(base_fpr, fpr, tpr)
tpr[0] = 0.0
tprs.append(tpr)
count +=1
print('>acc=%.3f,est=%.3f,cfg=%s'%(accuracy,search.best_score_,search.best_params_))
explainer = shap.TreeExplainer(best_model_shap)
shap_values = explainer.shap_values(split_x_test_selected_features)
list_shap_values.append(shap_values)
list_test_sets.append(test_ix)
test_set = list_test_sets[0]
shap_values = np.array(list_shap_values[0])
for i in range(1,len(list_test_sets)):
test_set = np.concatenate((test_set,list_test_sets[i]),axis=0)
shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
X_test_df = pd.DataFrame(full_X_train[test_set])
cols = X_test_df.columns
shap_sum = np.abs(shap_values[1,:,:]).mean(0)
importance_df = pd.DataFrame({
'column_name':cols,
'shap_values':shap_sum
})
importance_df.sort_values('shap_values',ascending=False)
print('Accuracy: %.3f (%.3f)' % (mean(acc_list),std(acc_list)))
print('F1: %.3f (%.3f)' % (mean(f1_list),std(f1_list)))
print('Precision: %.3f (%.3f)' % (mean(precision_list),std(precision_list)))
print('Recall: %.3f (%.3f)' % (mean(recall_list),std(recall_list)))
print('AUC: %.3f (%.3f)' % (mean(auc_list),std(auc_list)))
tprs = np.array(tprs)
mean_tprs = tprs.mean(axis=0)
tpr_std = tprs.std(axis=0)
tprs_upper = np.minimum(mean_tprs + tpr_std, 1)
tprs_lower = mean_tprs - tpr_std
plt.plot(base_fpr, mean_tprs, 'b')
plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.title('ROC for stratified 5-fold CV (blue line = mean)')
plt.savefig(output_plt_file)
print(importance_df)
return
param_grid = [{
'clf_cv__min_samples_leaf':[1,3,5],
}]
run_model_with_grid_search(param_grid=param_grid)
Produces the error:
File "/home/data/ml_models.py", line 180, in <module>
run_model_with_grid_search(param_grid=param_grid)
File "/home/data/ml_models.py", line 127, in run_model_with_grid_search
shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
File "<__array_function__ internals>", line 5, in concatenate
ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 2, the array at index 0 has size 20 and the array at index 1 has size 16
I can generally see other questions like this on SO, but not a specific solution that I can understand how to apply here, so if someone could show me how to change this code I'd appreciate it.
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
data:image/s3,"s3://crabby-images/d5906/d59060df4059a6cc364216c4d63ceec29ef7fe66" alt="扫码二维码加入Web技术交流群"
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
在解决问题之前,您必须了解它。当涉及
Shape
错误时,您必须知道所涉及的所有变量的形状。有时可以推断出来,但是通常我必须添加一些打印语句才能肯定。shap_values
以list_shap_values
的第一个元素开始。我不能确切地说出什么,除了它是通过迭代中的
shap_values
创建的:然后您可以在该列表中出现迭代,可以尝试
concatenate
更多元素从该列表中。对于
condenate
工作,您需要知道shap_values.shape.shape
以及np.array(list_shap_values [i i code> np.aray> [i) ])。形状
。我强调知道,不允许猜测。从错误消息中
因此,在此迭代中(我不知道它是第一个),
shap_values
必须为(n,m1,20)形状和np.array(。 ..)
必须为(n,m2,16)。您是在轴1上加入的,但是其他维度必须匹配。清楚吗?
我想知道为什么您要重复
condenate
。您已经显示您知道如何使用列表附加。那要快得多。我打算建议
np.concatenate(list_shap_values,axis = 1)
,也就是说,在列表的所有元素上进行连接 - 一次,而不是零碎。但是此错误表明list_shap_values
中的数组不兼容。有些在此轴上有20个= 2,另一些人16,谁知道还有什么。无论如何,这就是我可以从代码和错误中推断出来的。他们不足以付给我进一步的钱:)
Before you can fix a problem, you have to understand it. When it comes to
shape
errors, you have to know the shape of all variables involved. Sometimes that can be deduced, but often I have to add some print statements to be sure.shap_values
starts as the first element oflist_shap_values
.I can't say exactly what that is, except it was created by appending
shap_values
in an iteration:Then you appear to iterate on that list, can try to
concatenate
more elements from that list.For the
concatenate
work, you need to know what theshap_values.shape
is, as well asnp.array(list_shap_values[i]).shape
. I stress KNOW, no guessing allowed.From the error message
So in this iteration (I don't know if it's the first or not),
shap_values
must be (n,m1,20) shape, andnp.array(...)
must be (n,m2,16).You are concatenating on axis 1, but the other dimensions must match. Is that clear?
I wonder why you are doing this repeated
concatenate
. You already showed you know how to use list append. That is much faster.I was going to suggest
np.concatenate(list_shap_values, axis=1)
, that is, doing a concatenate on all elements of the list - once, rather than piecemeal. But this error suggests that the arrays inlist_shap_values
are not compatible. Some have 20 on this axis=2, others 16, and who knows what else.Anyways, that's what I can deduce from the code and the error. They don't pay me enough to take it any further :)