模型效果很好，但网格搜索会导致错误

发布于 2025-01-20 09:47:09 字数 10184 浏览 0 评论 0原文

在处理一个项目时，我遇到了一个奇怪的错误，我的模型拟合得很好，但是当我应用gridsearch时，它给了我一个错误。

该代码将创建的所有必要对象放入管道中并使用它们。

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import time
from numpy.fft import fft

class DataPreprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        self.X_m = X.merge(y, on= ['year', 'weekofyear'])
        
        return self
    
    def transform(self, X):
        dt = pd.to_datetime(self.X_m["week_start_date"], format="%Y-%m-%d")
        unix = []
        for i in dt:
            unix.append(time.mktime(i.timetuple()))
        X_t = (self.X_m).reset_index().assign(date = unix).set_index(['date'])
        
        return X_t 



class FourierComponents(BaseEstimator, TransformerMixin):
    """creat features based on sin(2*pi*f*t) and cos(2*pi*f*t)"""
    def __init__(self, n=10):
        self.n = n
        
    def fit(self, X, y= None):
        self.labels = X['total_cases']
        
        self.Y_t = fft(self.labels - (self.labels).mean())
        self.Y_t = self.Y_t[:len(self.labels)//2]

        
        self.ind_max = np.abs(self.Y_t).argsort()
        
        self.t_span = len(self.labels)

        self.f = np.linspace(0, len(self.Y_t), len(self.Y_t)) / self.t_span
        
        self.f_ind = self.f[self.ind_max]
        
        self.ind = pd.RangeIndex(start = 1, stop=(len(X.index.get_level_values('date')) +1)).values.reshape(-1, 1)
    
        return self
    
    def transform(self, X):
        
        Xt = np.zeros((X.shape[0], 2*len(self.f_ind[-self.n:])))
        
        for i, f in enumerate(self.f_ind[-self.n:]):
            Xt[:, 2*i]     = np.cos(2*np.pi*f*self.ind).reshape(-1)
            Xt[:, 2*i + 1] = np.sin(2*np.pi*f*self.ind).reshape(-1)
        
        return Xt

Unixdata = DataPreprocess()
fourier = FourierComponents()

model = Pipeline([
    ('indices', Unixdata),
    ('fourier', fourier),
    ('scalar', StandardScaler()),
    ('regressor', Ridge())
])

param_grid = {'fourier__n' : list(range(3,5)),
              'regressor__alpha' : np.logspace(1, 4, 20)}

grid_search = GridSearchCV(model, param_grid, cv = 5, verbose = 1, scoring='neg_mean_absolute_error')

grid_search.fit(sj_train_features, sj_train_labels)

在这里安装grid_search给了我这个错误：

    Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-167-cfce20172a59> in <module>
----> 1 grid_search.fit(sj_train_features, sj_train_labels)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     71                           FutureWarning)
     72         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73         return f(**kwargs)
     74     return inner_f
     75 

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    734                 return results
    735 
--> 736             self._run_search(evaluate_candidates)
    737 
    738         # For multi-metric evaluation, store the best_index_, best_params_ and

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
   1186     def _run_search(self, evaluate_candidates):
   1187         """Search all candidates in param_grid"""
-> 1188         evaluate_candidates(ParameterGrid(self.param_grid))
   1189 
   1190 

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
    706                               n_splits, n_candidates, n_candidates * n_splits))
    707 
--> 708                 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
    709                                                        X, y,
    710                                                        train=train, test=test,

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1027             # remaining jobs.
   1028             self._iterating = False
-> 1029             if self.dispatch_one_batch(iterator):
   1030                 self._iterating = self._original_iterator is not None
   1031 

~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    845                 return False
    846             else:
--> 847                 self._dispatch(tasks)
    848                 return True
    849 

~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    763         with self._lock:
    764             job_idx = len(self._jobs)
--> 765             job = self._backend.apply_async(batch, callback=cb)
    766             # A job can complete so quickly than its callback is
    767             # called before we get here, causing self._jobs to

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
    250         # change the default number of processes to -1
    251         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252             return [func(*args, **kwargs)
    253                     for func, args, kwargs in self.items]
    254 

~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    250         # change the default number of processes to -1
    251         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252             return [func(*args, **kwargs)
    253                     for func, args, kwargs in self.items]
    254 

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    558     else:
    559         fit_time = time.time() - start_time
--> 560         test_scores = _score(estimator, X_test, y_test, scorer)
    561         score_time = time.time() - start_time - fit_time
    562         if return_train_score:

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
    605         scores = scorer(estimator, X_test)
    606     else:
--> 607         scores = scorer(estimator, X_test, y_test)
    608 
    609     error_msg = ("scoring must return a number, got %s (%s) "

~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
     85         for name, scorer in self._scorers.items():
     86             if isinstance(scorer, _BaseScorer):
---> 87                 score = scorer._score(cached_call, estimator,
     88                                       *args, **kwargs)
     89             else:

~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _score(self, method_caller, estimator, X, y_true, sample_weight)
    210                                                  **self._kwargs)
    211         else:
--> 212             return self._sign * self._score_func(y_true, y_pred,
    213                                                  **self._kwargs)
    214 

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     71                           FutureWarning)
     72         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73         return f(**kwargs)
     74     return inner_f
     75 

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
    176     0.85...
    177     """
--> 178     y_type, y_true, y_pred, multioutput = _check_reg_targets(
    179         y_true, y_pred, multioutput)
    180     check_consistent_length(y_true, y_pred, sample_weight)

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
     82 
     83     """
---> 84     check_consistent_length(y_true, y_pred)
     85     y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
     86     y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
    254     uniques = np.unique(lengths)
    255     if len(uniques) > 1:
--> 256         raise ValueError("Found input variables with inconsistent numbers of"
    257                          " samples: %r" % [int(l) for l in lengths])
    258 

ValueError: Found input variables with inconsistent numbers of samples: [188, 748]

但

model.fit(sj_train_features, sj_train_labels)

非常适合。

现在我想知道为什么以及代码中的错误在哪里？有人能指出我正确的方向吗？

一个小例子（希望有代表性）：

sj_train_features = pd.DataFrame({
    'year': [1990] * 10,
    'weekofyear': np.arange(18, 28),
    'week_start_date': pd.date_range('1990-04-30', periods=10, freq='w'),
    'ndvi_ne': np.random.random(10),
    'station_precip': np.random.random(10)*10,
}).set_index(['year', 'weekofyear'])

sj_train_labels = pd.Series(np.random.random(10)*20, index=sj_train_features.index, name='total_cases')

原文

While working on a project I have come across a weird error, where fitting my model works perfectly but when I apply gridsearch it gives me an error.

The code puts all the necessary objects created and uses them in the pipeline.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import time
from numpy.fft import fft

class DataPreprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        self.X_m = X.merge(y, on= ['year', 'weekofyear'])
        
        return self
    
    def transform(self, X):
        dt = pd.to_datetime(self.X_m["week_start_date"], format="%Y-%m-%d")
        unix = []
        for i in dt:
            unix.append(time.mktime(i.timetuple()))
        X_t = (self.X_m).reset_index().assign(date = unix).set_index(['date'])
        
        return X_t 



class FourierComponents(BaseEstimator, TransformerMixin):
    """creat features based on sin(2*pi*f*t) and cos(2*pi*f*t)"""
    def __init__(self, n=10):
        self.n = n
        
    def fit(self, X, y= None):
        self.labels = X['total_cases']
        
        self.Y_t = fft(self.labels - (self.labels).mean())
        self.Y_t = self.Y_t[:len(self.labels)//2]

        
        self.ind_max = np.abs(self.Y_t).argsort()
        
        self.t_span = len(self.labels)

        self.f = np.linspace(0, len(self.Y_t), len(self.Y_t)) / self.t_span
        
        self.f_ind = self.f[self.ind_max]
        
        self.ind = pd.RangeIndex(start = 1, stop=(len(X.index.get_level_values('date')) +1)).values.reshape(-1, 1)
    
        return self
    
    def transform(self, X):
        
        Xt = np.zeros((X.shape[0], 2*len(self.f_ind[-self.n:])))
        
        for i, f in enumerate(self.f_ind[-self.n:]):
            Xt[:, 2*i]     = np.cos(2*np.pi*f*self.ind).reshape(-1)
            Xt[:, 2*i + 1] = np.sin(2*np.pi*f*self.ind).reshape(-1)
        
        return Xt

Unixdata = DataPreprocess()
fourier = FourierComponents()

model = Pipeline([
    ('indices', Unixdata),
    ('fourier', fourier),
    ('scalar', StandardScaler()),
    ('regressor', Ridge())
])

param_grid = {'fourier__n' : list(range(3,5)),
              'regressor__alpha' : np.logspace(1, 4, 20)}

grid_search = GridSearchCV(model, param_grid, cv = 5, verbose = 1, scoring='neg_mean_absolute_error')

grid_search.fit(sj_train_features, sj_train_labels)

fitting the grid_search here gives me this error:

    Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-167-cfce20172a59> in <module>
----> 1 grid_search.fit(sj_train_features, sj_train_labels)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     71                           FutureWarning)
     72         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73         return f(**kwargs)
     74     return inner_f
     75 

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    734                 return results
    735 
--> 736             self._run_search(evaluate_candidates)
    737 
    738         # For multi-metric evaluation, store the best_index_, best_params_ and

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
   1186     def _run_search(self, evaluate_candidates):
   1187         """Search all candidates in param_grid"""
-> 1188         evaluate_candidates(ParameterGrid(self.param_grid))
   1189 
   1190 

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
    706                               n_splits, n_candidates, n_candidates * n_splits))
    707 
--> 708                 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
    709                                                        X, y,
    710                                                        train=train, test=test,

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1027             # remaining jobs.
   1028             self._iterating = False
-> 1029             if self.dispatch_one_batch(iterator):
   1030                 self._iterating = self._original_iterator is not None
   1031 

~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    845                 return False
    846             else:
--> 847                 self._dispatch(tasks)
    848                 return True
    849 

~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    763         with self._lock:
    764             job_idx = len(self._jobs)
--> 765             job = self._backend.apply_async(batch, callback=cb)
    766             # A job can complete so quickly than its callback is
    767             # called before we get here, causing self._jobs to

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
    250         # change the default number of processes to -1
    251         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252             return [func(*args, **kwargs)
    253                     for func, args, kwargs in self.items]
    254 

~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    250         # change the default number of processes to -1
    251         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252             return [func(*args, **kwargs)
    253                     for func, args, kwargs in self.items]
    254 

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    558     else:
    559         fit_time = time.time() - start_time
--> 560         test_scores = _score(estimator, X_test, y_test, scorer)
    561         score_time = time.time() - start_time - fit_time
    562         if return_train_score:

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
    605         scores = scorer(estimator, X_test)
    606     else:
--> 607         scores = scorer(estimator, X_test, y_test)
    608 
    609     error_msg = ("scoring must return a number, got %s (%s) "

~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
     85         for name, scorer in self._scorers.items():
     86             if isinstance(scorer, _BaseScorer):
---> 87                 score = scorer._score(cached_call, estimator,
     88                                       *args, **kwargs)
     89             else:

~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _score(self, method_caller, estimator, X, y_true, sample_weight)
    210                                                  **self._kwargs)
    211         else:
--> 212             return self._sign * self._score_func(y_true, y_pred,
    213                                                  **self._kwargs)
    214 

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     71                           FutureWarning)
     72         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73         return f(**kwargs)
     74     return inner_f
     75 

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
    176     0.85...
    177     """
--> 178     y_type, y_true, y_pred, multioutput = _check_reg_targets(
    179         y_true, y_pred, multioutput)
    180     check_consistent_length(y_true, y_pred, sample_weight)

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
     82 
     83     """
---> 84     check_consistent_length(y_true, y_pred)
     85     y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
     86     y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
    254     uniques = np.unique(lengths)
    255     if len(uniques) > 1:
--> 256         raise ValueError("Found input variables with inconsistent numbers of"
    257                          " samples: %r" % [int(l) for l in lengths])
    258 

ValueError: Found input variables with inconsistent numbers of samples: [188, 748]

but

model.fit(sj_train_features, sj_train_labels)

fits perfectly.

Now I am wondering why and where is the mistake in the code?
Can anyone point me in the right direction?

A small example (hopefully representative):

sj_train_features = pd.DataFrame({
    'year': [1990] * 10,
    'weekofyear': np.arange(18, 28),
    'week_start_date': pd.date_range('1990-04-30', periods=10, freq='w'),
    'ndvi_ne': np.random.random(10),
    'station_precip': np.random.random(10)*10,
}).set_index(['year', 'weekofyear'])

sj_train_labels = pd.Series(np.random.random(10)*20, index=sj_train_features.index, name='total_cases')

分享到QQ

分享到微博