SHAP: Additivity check failed in TreeExplainer; adding check_additivity=False leads to a KeyError about keys not found in the columns axis?


I have a data set like this: 343 columns of binary data, sparsely encoded (i.e. there are many more 0s than 1s):

                      column1          ...          column343
0                        0  ...                          0
1                        0  ...                          0
2                        0  ...                          0
3                        0  ...                          0
4                        0  ...                          0
..                     ...  ...                        ...
214                      0  ...                          0
215                      0  ...                          0
216                      0  ...                          0
217                      0  ...                          0
218                      0  ...                          0

[219 rows x 343 columns]
(219, 343)

Could someone please explain to me how to fix the issue where this script:

from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest 
#from xgboost import XGBClassifier 
from sklearn.feature_selection import mutual_info_classif 
from sklearn.feature_selection import SelectKBest, RFECV 
from sklearn.pipeline import Pipeline 
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score 
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import make_scorer 
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score 
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean 
from sklearn.model_selection import train_test_split
from numpy import std 
from sklearn.utils import shuffle 
import numpy as np 
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt 
import pickle
#import neptune.new as neptune
import pandas as pd
import shap


df = pd.read_csv('train.txt',sep='\t') #hard-coded
full_y_train = df['Event']
df = df.drop(['Event'],axis=1)
full_X_train = df


def run_model_with_grid_search(param_grid={}, output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'): 
      list_shap_values = list()
      list_test_sets = list()

      cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)
      for train_ix,test_ix in cv_outer.split(X_train):
              split_x_train, split_x_test = X_train.iloc[train_ix,:],X_train.iloc[test_ix,:]             
              split_y_train, split_y_test = y_train.iloc[train_ix],y_train.iloc[test_ix]  
              model = model_name
              cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
              search = GridSearchCV(model,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True)
              result = search.fit(split_x_train,split_y_train)
              best_model = result.best_estimator_
              yhat = best_model.predict(split_x_test)

              explainer = shap.TreeExplainer(result.best_estimator_)
              shap_values = explainer.shap_values(split_x_test,check_additivity=False)
              list_shap_values.append(shap_values)
              list_test_sets.append(test_ix) 

      test_set = list_test_sets[0]
      shap_values = np.array(list_shap_values[0])

      for i in range(1,len(list_test_sets)):
          test_set = np.concatenate((test_set,list_test_sets[i]),axis=0)
          shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)


      X_test_df = pd.DataFrame(full_X_train[test_set])
      cols = X_test_df.columns
      shap_sum = np.abs(shap_values[1,:,:]).mean(0)
      

      importance_df = pd.DataFrame({
           'column_name':cols,
           'shap_values':shap_sum
      }) 
           
      print(importance_df)

      return


param_grid = [{
               'min_samples_leaf':[1,3,5],
              }]

run_model_with_grid_search(param_grid=param_grid)
    

Generates the error:

Traceback (most recent call last):
  File "/home/data/ml_models_genotypic_only_fortest.py", line 103, in <module>
    run_model_with_grid_search(param_grid=param_grid)
  File "/home/data/ml_models_genotypic_only_fortest.py", line 80, in run_model_with_grid_search
    X_test_df = pd.DataFrame(full_X_train[test_set])
  File "/home/apps/easybuild/software/SciPy-bundle/2021.10-foss-2021b/lib/python3.9/site-packages/pandas/core/frame.py", line 3464, in __getitem__
    indexer = self.loc._get_listlike_indexer(key, axis=1)[1]
  File "/home/apps/easybuild/software/SciPy-bundle/2021.10-foss-2021b/lib/python3.9/site-packages/pandas/core/indexing.py", line 1314, in _get_listlike_indexer
    self._validate_read_indexer(keyarr, indexer, axis)
  File "/home/apps/easybuild/software/SciPy-bundle/2021.10-foss-2021b/lib/python3.9/site-packages/pandas/core/indexing.py", line 1374, in _validate_read_indexer
    raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Int64Index([  0,   4,  11,  16,  18,  19,  28,  29,  31,  33,\n            ...\n            156, 157, 175, 178, 192, 203, 204, 207, 211, 215],\n           dtype='int64', length=219)] are in the [columns]"

I do not get this KeyError if I remove check_additivity=False from the script; however, without that parameter I instead get this error:

shap.utils._exceptions.ExplainerError: Additivity check failed in TreeExplainer! Please ensure the data matrix you pass to the explainer is the same data shape that the model was trained on. If your data shape is correct, then please report this on GitHub.

Consider retrying with the feature perturbation=interventional option. This check failed because for one of the samples, the sum of the SHAP values is 0.908553, while the model output was 0.940000. If this difference is acceptable, you can set check_additivity=False to disable this check.
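For reference, the "feature perturbation=interventional" option mentioned in that message corresponds to TreeExplainer's feature_perturbation argument, which requires a background dataset; a minimal sketch of how it would be used (reusing the variable names from the script above, with split_x_train assumed as the background data) is:

# Sketch only: the "interventional" mode suggested by the error message.
# It needs background data; split_x_train is assumed to be the training
# fold that best_model was fitted on inside the loop above.
explainer = shap.TreeExplainer(
    best_model,
    data=split_x_train,                       # background data for interventional mode
    feature_perturbation="interventional",
)
shap_values = explainer.shap_values(split_x_test)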

If I replace my data set with a fake data set:

full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)

then I do not get either error.

So for my real data, whether I leave check_additivity=False in the script or take it out, I end up with one of two different errors, and I'm not sure how to get around this.


Comments (1)

没有心的人 (2025-02-16 16:37:24):


It's hard to debug your code as it's not reproducible, but you may follow the code snippet below, which "just runs":

from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import load_breast_cancer
#from xgboost import XGBClassifier 
from sklearn.feature_selection import mutual_info_classif 
from sklearn.feature_selection import SelectKBest, RFECV 
from sklearn.pipeline import Pipeline 
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score 
from sklearn.model_selection import StratifiedKFold 
from sklearn.metrics import make_scorer 
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score 
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean 
from sklearn.model_selection import train_test_split
from numpy import std 
from sklearn.utils import shuffle 
import numpy as np 
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt 
import pickle
#import neptune.new as neptune
import pandas as pd
import shap


full_X_train, full_y_train = load_breast_cancer(return_X_y=True, as_frame=True)

def run_model_with_grid_search(
    param_grid={},
    output_plt_file="plt.png",
    model_name=RandomForestClassifier(),
    X_train=full_X_train,
    y_train=full_y_train,
    model_id="random_forest_with_hpo_no_fs_geno_class",
    n_splits=5,
    output_file="random_forest_with_hpo_no_fs_geno_class.txt",
):
    list_shap_values = list()
    list_test_sets = list()

    cv_outer = KFold(n_splits=5, shuffle=True, random_state=1)
    for train_ix, test_ix in cv_outer.split(X_train):
        split_x_train, split_x_test = (
            X_train.iloc[train_ix, :],
            X_train.iloc[test_ix, :],
        )
        split_y_train, split_y_test = y_train[train_ix], y_train[test_ix]
        model = model_name
        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        search = GridSearchCV(
            model, param_grid=param_grid, scoring="roc_auc", cv=cv_inner, refit=True
        )
        result = search.fit(split_x_train, split_y_train)
        best_model = result.best_estimator_
        yhat = best_model.predict(split_x_test)

        explainer = shap.TreeExplainer(result.best_estimator_)
        shap_values = explainer.shap_values(split_x_test, check_additivity=False)
        list_shap_values.append(shap_values)

    shap_values = np.vstack([sv[1] for sv in list_shap_values])
    sv = np.abs(shap_values.mean(0))
    cols = X_train.columns

    importance_df = pd.DataFrame({"column_name": cols, "shap_values": sv})

    return importance_df


param_grid = [{"min_samples_leaf": [1, 3, 5],}]

importance_df = run_model_with_grid_search(param_grid=param_grid)

print(importance_df)

                column_name  shap_values
0               mean radius     0.000202
1              mean texture     0.000585
2            mean perimeter     0.000728
3                 mean area     0.000541
4           mean smoothness     0.000867
5          mean compactness     0.000098
6            mean concavity     0.000759
7       mean concave points     0.003325
8             mean symmetry     0.000033
9    mean fractal dimension     0.000349
...

Note that the above code runs on my machine with both True and False for the check_additivity param.
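A side note on the KeyError in your original script: full_X_train[test_set] asks pandas for columns labelled 0, 4, 11, … because indexing a DataFrame with an array of integers selects columns by label, not rows. If you keep that part of your code, selecting the rows positionally should avoid that error; a minimal sketch (an assumption about your intent, using the names from your script) is:

# test_set holds the row positions collected from KFold, so select rows
# positionally instead of treating the integers as column labels
X_test_df = full_X_train.iloc[test_set]
cols = X_test_df.columns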
