shap/numpy: all the input array dimensions for the concatenation axis must match exactly

Posted 2025-02-10 12:23:40


Could someone please explain how to fix this code (a reproducible example):

from sklearn.model_selection import GridSearchCV, StratifiedKFold, KFold, train_test_split
from sklearn.feature_selection import SelectKBest, RFECV, mutual_info_classif
#from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (make_scorer, accuracy_score, precision_score,
                             recall_score, f1_score, roc_auc_score, roc_curve)
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.utils import shuffle
from numpy import mean, std
import numpy as np
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
import shap


full_X_train, full_y_train = make_classification(n_samples=500, n_features=20, random_state=1,
                                                 n_informative=10, n_redundant=10)


def run_model_with_grid_search(param_grid={}, output_plt_file='plt.png',
                               model_name=RandomForestClassifier(),
                               X_train=full_X_train, y_train=full_y_train,
                               model_id='random_forest_with_hpo_no_fs_geno_class',
                               n_splits=5,
                               output_file='random_forest_with_hpo_no_fs_geno_class.txt'):

    cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)
    acc_list = list()
    f1_list = list()
    precision_list = list()
    recall_list = list()
    auc_list = list()

    # for ROC curve
    tprs = []
    base_fpr = np.linspace(0, 1, 101)
    plt.figure(figsize=(5, 5))
    plt.axes().set_aspect('equal', 'datalim')
    count = 0

    list_shap_values = list()
    list_test_sets = list()

    for train_ix, test_ix in cv_outer.split(X_train):
        split_x_train, split_x_test = X_train[train_ix, :], X_train[test_ix, :]  # add in .iloc
        split_y_train, split_y_test = y_train[train_ix], y_train[test_ix]  # add in .iloc

        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        model = model_name
        rfecv = RFECV(estimator=model, step=1, scoring='accuracy', cv=cv_inner)  # change scoring to scoring='roc_auc'
        pipeline = Pipeline([('feature_sele', rfecv), ('clf_cv', model)])
        search = GridSearchCV(pipeline, param_grid=param_grid, scoring='roc_auc',
                              cv=cv_inner, refit=True).fit(split_x_train, split_y_train)
        best_model = search.best_estimator_[0]
        selected_features = best_model.support_

        split_x_test_selected_features = split_x_test[:, selected_features]
        print(split_x_test_selected_features.shape)
        best_model_shap = search.best_estimator_[1]
        print(best_model_shap)
        print(search.best_params_)
        print(search.best_score_)
        print(search.best_estimator_)

        #transformed_x_test = best_model.transform(split_x_test)
        yhat = search.predict(split_x_test)  # changed from best_model and split_x_test

        accuracy = accuracy_score(split_y_test, yhat)
        acc_list.append(accuracy)

        f1_sc = f1_score(split_y_test, yhat)
        f1_list.append(f1_sc)

        precision_sc = precision_score(split_y_test, yhat)
        precision_list.append(precision_sc)

        recall_sc = recall_score(split_y_test, yhat)
        recall_list.append(recall_sc)

        fpr, tpr, _ = roc_curve(split_y_test, yhat)
        auc = metrics.auc(fpr, tpr)
        auc_list.append(auc)

        plt.plot(fpr, tpr, 'b', alpha=0.15)
        tpr = np.interp(base_fpr, fpr, tpr)
        tpr[0] = 0.0
        tprs.append(tpr)
        count += 1

        print('>acc=%.3f,est=%.3f,cfg=%s' % (accuracy, search.best_score_, search.best_params_))

        explainer = shap.TreeExplainer(best_model_shap)
        shap_values = explainer.shap_values(split_x_test_selected_features)
        list_shap_values.append(shap_values)
        list_test_sets.append(test_ix)

    test_set = list_test_sets[0]
    shap_values = np.array(list_shap_values[0])

    for i in range(1, len(list_test_sets)):
        test_set = np.concatenate((test_set, list_test_sets[i]), axis=0)
        shap_values = np.concatenate((shap_values, np.array(list_shap_values[i])), axis=1)

    X_test_df = pd.DataFrame(full_X_train[test_set])
    cols = X_test_df.columns
    shap_sum = np.abs(shap_values[1, :, :]).mean(0)

    importance_df = pd.DataFrame({
        'column_name': cols,
        'shap_values': shap_sum
    })

    importance_df.sort_values('shap_values', ascending=False)

    print('Accuracy: %.3f (%.3f)' % (mean(acc_list), std(acc_list)))
    print('F1: %.3f (%.3f)' % (mean(f1_list), std(f1_list)))
    print('Precision: %.3f (%.3f)' % (mean(precision_list), std(precision_list)))
    print('Recall: %.3f (%.3f)' % (mean(recall_list), std(recall_list)))
    print('AUC: %.3f (%.3f)' % (mean(auc_list), std(auc_list)))

    tprs = np.array(tprs)
    mean_tprs = tprs.mean(axis=0)
    tpr_std = tprs.std(axis=0)

    tprs_upper = np.minimum(mean_tprs + tpr_std, 1)
    tprs_lower = mean_tprs - tpr_std

    plt.plot(base_fpr, mean_tprs, 'b')
    plt.fill_between(base_fpr, tprs_lower, tprs_upper, color='grey', alpha=0.3)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.title('ROC for stratified 5-fold CV (blue line = mean)')
    plt.savefig(output_plt_file)

    print(importance_df)

    return


param_grid = [{
    'clf_cv__min_samples_leaf': [1, 3, 5],
}]

run_model_with_grid_search(param_grid=param_grid)

Produces the error:

  File "/home/data/ml_models.py", line 180, in <module>
    run_model_with_grid_search(param_grid=param_grid)
  File "/home/data/ml_models.py", line 127, in run_model_with_grid_search
    shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
  File "<__array_function__ internals>", line 5, in concatenate
ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 2, the array at index 0 has size 20 and the array at index 1 has size 16

I can see other questions like this on SO, but not a specific solution that I understand how to apply here, so if someone could show me how to change this code I'd appreciate it.


Comments (1)

偏闹i 2025-02-17 12:23:40


Before you can fix a problem, you have to understand it. When it comes to shape errors, you have to know the shape of all variables involved. Sometimes that can be deduced, but often I have to add some print statements to be sure.

shap_values = np.array(list_shap_values[0])
for i in range(1,len(list_test_sets)):
    test_set = np.concatenate((test_set,list_test_sets[i]),axis=0)
    shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
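
To make that concrete, here is a minimal sketch of the kind of print statements I mean, assuming it is placed right after the loop in the question that fills list_shap_values and list_test_sets:

import numpy as np  # already imported in the question's script

# Print every shape involved before concatenating - no guessing.
for i, sv in enumerate(list_shap_values):
    print(i, np.array(sv).shape)   # expect something like (2, 100, 20), (2, 100, 16), ...
for i, ts in enumerate(list_test_sets):
    print(i, np.array(ts).shape)   # expect something like (100,)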
 

shap_values starts as the first element of list_shap_values.
I can't say exactly what that is, except it was created by appending shap_values in an iteration:

 list_shap_values.append(shap_values)

Then you iterate over that list and try to concatenate more elements from it, one at a time.

For the concatenate to work, you need to know what shap_values.shape is, as well as np.array(list_shap_values[i]).shape. I stress KNOW, no guessing allowed.

From the error message

ValueError: all the input array dimensions for the concatenation axis must match exactly, 
but along dimension 2, the array at index 0 has size 20 
and the array at index 1 has size 16

So in this iteration (I don't know whether it's the first or not), shap_values must have shape (n, m1, 20), and np.array(...) must have shape (n, m2, 16).

You are concatenating on axis 1, but the other dimensions must match. Is that clear?
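
To see the rule in isolation, here is a tiny self-contained numpy demo; the shapes are made up to mirror the error message:

import numpy as np

a = np.zeros((2, 100, 20))    # like a fold where RFECV kept all 20 features
ok = np.zeros((2, 80, 20))    # matches a on every axis except axis 1
bad = np.zeros((2, 100, 16))  # like a fold where RFECV kept only 16 features

print(np.concatenate((a, ok), axis=1).shape)   # (2, 180, 20) - works
try:
    np.concatenate((a, bad), axis=1)           # size 20 vs 16 on dimension 2
except ValueError as e:
    print(e)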

I wonder why you are doing this repeated concatenate. You already showed you know how to use list append. That is much faster.

I was going to suggest np.concatenate(list_shap_values, axis=1), that is, doing a concatenate on all elements of the list - once, rather than piecemeal. But this error suggests that the arrays in list_shap_values are not compatible. Some have 20 on this axis=2, others 16, and who knows what else.
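
For completeness, one way to make the per-fold arrays compatible - a sketch of my own, not something the question's code already does - is to scatter each fold's SHAP values back into the full 20-feature space using the boolean mask RFECV produced (best_model.support_ in the question, collected per fold into a hypothetical list_support_masks):

import numpy as np

def expand_to_full(shap_fold, support_mask, n_features=20):
    # Place a fold's SHAP values (..., n_kept) into (..., n_features),
    # leaving zeros in the columns RFECV dropped.
    shap_fold = np.asarray(shap_fold)
    full = np.zeros(shap_fold.shape[:-1] + (n_features,))
    full[..., support_mask] = shap_fold
    return full

# With one mask per outer fold, every expanded array shares its last
# dimension, so a single concatenate over the whole list works:
# shap_values = np.concatenate(
#     [expand_to_full(sv, m) for sv, m in zip(list_shap_values, list_support_masks)],
#     axis=1)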

Anyways, that's what I can deduce from the code and the error. They don't pay me enough to take it any further :)
