模型效果很好,但网格搜索会导致错误

发布于 2025-01-20 09:47:09 字数 10184 浏览 0 评论 0原文

在处理一个项目时,我遇到了一个奇怪的错误,我的模型拟合得很好,但是当我应用gridsearch时,它给了我一个错误。

该代码将创建的所有必要对象放入管道中并使用它们。

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import time
from numpy.fft import fft

class DataPreprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        self.X_m = X.merge(y, on= ['year', 'weekofyear'])
        
        return self
    
    def transform(self, X):
        dt = pd.to_datetime(self.X_m["week_start_date"], format="%Y-%m-%d")
        unix = []
        for i in dt:
            unix.append(time.mktime(i.timetuple()))
        X_t = (self.X_m).reset_index().assign(date = unix).set_index(['date'])
        
        return X_t 



class FourierComponents(BaseEstimator, TransformerMixin):
    """creat features based on sin(2*pi*f*t) and cos(2*pi*f*t)"""
    def __init__(self, n=10):
        self.n = n
        
    def fit(self, X, y= None):
        self.labels = X['total_cases']
        
        self.Y_t = fft(self.labels - (self.labels).mean())
        self.Y_t = self.Y_t[:len(self.labels)//2]

        
        self.ind_max = np.abs(self.Y_t).argsort()
        
        self.t_span = len(self.labels)

        self.f = np.linspace(0, len(self.Y_t), len(self.Y_t)) / self.t_span
        
        self.f_ind = self.f[self.ind_max]
        
        self.ind = pd.RangeIndex(start = 1, stop=(len(X.index.get_level_values('date')) +1)).values.reshape(-1, 1)
    
        return self
    
    def transform(self, X):
        
        Xt = np.zeros((X.shape[0], 2*len(self.f_ind[-self.n:])))
        
        for i, f in enumerate(self.f_ind[-self.n:]):
            Xt[:, 2*i]     = np.cos(2*np.pi*f*self.ind).reshape(-1)
            Xt[:, 2*i + 1] = np.sin(2*np.pi*f*self.ind).reshape(-1)
        
        return Xt

Unixdata = DataPreprocess()
fourier = FourierComponents()

model = Pipeline([
    ('indices', Unixdata),
    ('fourier', fourier),
    ('scalar', StandardScaler()),
    ('regressor', Ridge())
])

param_grid = {'fourier__n' : list(range(3,5)),
              'regressor__alpha' : np.logspace(1, 4, 20)}

grid_search = GridSearchCV(model, param_grid, cv = 5, verbose = 1, scoring='neg_mean_absolute_error')

grid_search.fit(sj_train_features, sj_train_labels)

在这里安装grid_search给了我这个错误:

    Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-167-cfce20172a59> in <module>
----> 1 grid_search.fit(sj_train_features, sj_train_labels)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     71                           FutureWarning)
     72         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73         return f(**kwargs)
     74     return inner_f
     75 

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    734                 return results
    735 
--> 736             self._run_search(evaluate_candidates)
    737 
    738         # For multi-metric evaluation, store the best_index_, best_params_ and

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
   1186     def _run_search(self, evaluate_candidates):
   1187         """Search all candidates in param_grid"""
-> 1188         evaluate_candidates(ParameterGrid(self.param_grid))
   1189 
   1190 

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
    706                               n_splits, n_candidates, n_candidates * n_splits))
    707 
--> 708                 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
    709                                                        X, y,
    710                                                        train=train, test=test,

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1027             # remaining jobs.
   1028             self._iterating = False
-> 1029             if self.dispatch_one_batch(iterator):
   1030                 self._iterating = self._original_iterator is not None
   1031 

~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    845                 return False
    846             else:
--> 847                 self._dispatch(tasks)
    848                 return True
    849 

~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    763         with self._lock:
    764             job_idx = len(self._jobs)
--> 765             job = self._backend.apply_async(batch, callback=cb)
    766             # A job can complete so quickly than its callback is
    767             # called before we get here, causing self._jobs to

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
    250         # change the default number of processes to -1
    251         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252             return [func(*args, **kwargs)
    253                     for func, args, kwargs in self.items]
    254 

~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    250         # change the default number of processes to -1
    251         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252             return [func(*args, **kwargs)
    253                     for func, args, kwargs in self.items]
    254 

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    558     else:
    559         fit_time = time.time() - start_time
--> 560         test_scores = _score(estimator, X_test, y_test, scorer)
    561         score_time = time.time() - start_time - fit_time
    562         if return_train_score:

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
    605         scores = scorer(estimator, X_test)
    606     else:
--> 607         scores = scorer(estimator, X_test, y_test)
    608 
    609     error_msg = ("scoring must return a number, got %s (%s) "

~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
     85         for name, scorer in self._scorers.items():
     86             if isinstance(scorer, _BaseScorer):
---> 87                 score = scorer._score(cached_call, estimator,
     88                                       *args, **kwargs)
     89             else:

~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _score(self, method_caller, estimator, X, y_true, sample_weight)
    210                                                  **self._kwargs)
    211         else:
--> 212             return self._sign * self._score_func(y_true, y_pred,
    213                                                  **self._kwargs)
    214 

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     71                           FutureWarning)
     72         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73         return f(**kwargs)
     74     return inner_f
     75 

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
    176     0.85...
    177     """
--> 178     y_type, y_true, y_pred, multioutput = _check_reg_targets(
    179         y_true, y_pred, multioutput)
    180     check_consistent_length(y_true, y_pred, sample_weight)

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
     82 
     83     """
---> 84     check_consistent_length(y_true, y_pred)
     85     y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
     86     y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
    254     uniques = np.unique(lengths)
    255     if len(uniques) > 1:
--> 256         raise ValueError("Found input variables with inconsistent numbers of"
    257                          " samples: %r" % [int(l) for l in lengths])
    258 

ValueError: Found input variables with inconsistent numbers of samples: [188, 748]

model.fit(sj_train_features, sj_train_labels)

非常适合。

现在我想知道为什么以及代码中的错误在哪里? 有人能指出我正确的方向吗?

一个小例子(希望有代表性):

sj_train_features = pd.DataFrame({
    'year': [1990] * 10,
    'weekofyear': np.arange(18, 28),
    'week_start_date': pd.date_range('1990-04-30', periods=10, freq='w'),
    'ndvi_ne': np.random.random(10),
    'station_precip': np.random.random(10)*10,
}).set_index(['year', 'weekofyear'])

sj_train_labels = pd.Series(np.random.random(10)*20, index=sj_train_features.index, name='total_cases')

While working on a project I have come across a weird error, where fitting my model works perfectly but when I apply gridsearch it gives me an error.

The code puts all the necessary objects created and uses them in the pipeline.

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import time
from numpy.fft import fft

class DataPreprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        self.X_m = X.merge(y, on= ['year', 'weekofyear'])
        
        return self
    
    def transform(self, X):
        dt = pd.to_datetime(self.X_m["week_start_date"], format="%Y-%m-%d")
        unix = []
        for i in dt:
            unix.append(time.mktime(i.timetuple()))
        X_t = (self.X_m).reset_index().assign(date = unix).set_index(['date'])
        
        return X_t 



class FourierComponents(BaseEstimator, TransformerMixin):
    """creat features based on sin(2*pi*f*t) and cos(2*pi*f*t)"""
    def __init__(self, n=10):
        self.n = n
        
    def fit(self, X, y= None):
        self.labels = X['total_cases']
        
        self.Y_t = fft(self.labels - (self.labels).mean())
        self.Y_t = self.Y_t[:len(self.labels)//2]

        
        self.ind_max = np.abs(self.Y_t).argsort()
        
        self.t_span = len(self.labels)

        self.f = np.linspace(0, len(self.Y_t), len(self.Y_t)) / self.t_span
        
        self.f_ind = self.f[self.ind_max]
        
        self.ind = pd.RangeIndex(start = 1, stop=(len(X.index.get_level_values('date')) +1)).values.reshape(-1, 1)
    
        return self
    
    def transform(self, X):
        
        Xt = np.zeros((X.shape[0], 2*len(self.f_ind[-self.n:])))
        
        for i, f in enumerate(self.f_ind[-self.n:]):
            Xt[:, 2*i]     = np.cos(2*np.pi*f*self.ind).reshape(-1)
            Xt[:, 2*i + 1] = np.sin(2*np.pi*f*self.ind).reshape(-1)
        
        return Xt

Unixdata = DataPreprocess()
fourier = FourierComponents()

model = Pipeline([
    ('indices', Unixdata),
    ('fourier', fourier),
    ('scalar', StandardScaler()),
    ('regressor', Ridge())
])

param_grid = {'fourier__n' : list(range(3,5)),
              'regressor__alpha' : np.logspace(1, 4, 20)}

grid_search = GridSearchCV(model, param_grid, cv = 5, verbose = 1, scoring='neg_mean_absolute_error')

grid_search.fit(sj_train_features, sj_train_labels)

fitting the grid_search here gives me this error:

    Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-167-cfce20172a59> in <module>
----> 1 grid_search.fit(sj_train_features, sj_train_labels)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     71                           FutureWarning)
     72         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73         return f(**kwargs)
     74     return inner_f
     75 

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
    734                 return results
    735 
--> 736             self._run_search(evaluate_candidates)
    737 
    738         # For multi-metric evaluation, store the best_index_, best_params_ and

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
   1186     def _run_search(self, evaluate_candidates):
   1187         """Search all candidates in param_grid"""
-> 1188         evaluate_candidates(ParameterGrid(self.param_grid))
   1189 
   1190 

~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
    706                               n_splits, n_candidates, n_candidates * n_splits))
    707 
--> 708                 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
    709                                                        X, y,
    710                                                        train=train, test=test,

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
   1027             # remaining jobs.
   1028             self._iterating = False
-> 1029             if self.dispatch_one_batch(iterator):
   1030                 self._iterating = self._original_iterator is not None
   1031 

~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
    845                 return False
    846             else:
--> 847                 self._dispatch(tasks)
    848                 return True
    849 

~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
    763         with self._lock:
    764             job_idx = len(self._jobs)
--> 765             job = self._backend.apply_async(batch, callback=cb)
    766             # A job can complete so quickly than its callback is
    767             # called before we get here, causing self._jobs to

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
    250         # change the default number of processes to -1
    251         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252             return [func(*args, **kwargs)
    253                     for func, args, kwargs in self.items]
    254 

~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
    250         # change the default number of processes to -1
    251         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252             return [func(*args, **kwargs)
    253                     for func, args, kwargs in self.items]
    254 

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
    558     else:
    559         fit_time = time.time() - start_time
--> 560         test_scores = _score(estimator, X_test, y_test, scorer)
    561         score_time = time.time() - start_time - fit_time
    562         if return_train_score:

~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
    605         scores = scorer(estimator, X_test)
    606     else:
--> 607         scores = scorer(estimator, X_test, y_test)
    608 
    609     error_msg = ("scoring must return a number, got %s (%s) "

~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
     85         for name, scorer in self._scorers.items():
     86             if isinstance(scorer, _BaseScorer):
---> 87                 score = scorer._score(cached_call, estimator,
     88                                       *args, **kwargs)
     89             else:

~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _score(self, method_caller, estimator, X, y_true, sample_weight)
    210                                                  **self._kwargs)
    211         else:
--> 212             return self._sign * self._score_func(y_true, y_pred,
    213                                                  **self._kwargs)
    214 

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     71                           FutureWarning)
     72         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73         return f(**kwargs)
     74     return inner_f
     75 

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
    176     0.85...
    177     """
--> 178     y_type, y_true, y_pred, multioutput = _check_reg_targets(
    179         y_true, y_pred, multioutput)
    180     check_consistent_length(y_true, y_pred, sample_weight)

~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
     82 
     83     """
---> 84     check_consistent_length(y_true, y_pred)
     85     y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
     86     y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)

~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
    254     uniques = np.unique(lengths)
    255     if len(uniques) > 1:
--> 256         raise ValueError("Found input variables with inconsistent numbers of"
    257                          " samples: %r" % [int(l) for l in lengths])
    258 

ValueError: Found input variables with inconsistent numbers of samples: [188, 748]

but

model.fit(sj_train_features, sj_train_labels)

fits perfectly.

Now I am wondering why and where is the mistake in the code?
Can anyone point me in the right direction?

A small example (hopefully representative):

sj_train_features = pd.DataFrame({
    'year': [1990] * 10,
    'weekofyear': np.arange(18, 28),
    'week_start_date': pd.date_range('1990-04-30', periods=10, freq='w'),
    'ndvi_ne': np.random.random(10),
    'station_precip': np.random.random(10)*10,
}).set_index(['year', 'weekofyear'])

sj_train_labels = pd.Series(np.random.random(10)*20, index=sj_train_features.index, name='total_cases')

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1

谎言 2025-01-27 09:47:09

在追溯之后的一半以上,您会从_FIT_AND_SCORE中看到一个摘要,这表明拟合已成功,但是得分是失败的。实际上,当我调用model.predict时,i 始终 获得与训练集相同的长度阵列。当将其与真实标签进行比较时,得分手正确地抱怨预测数与标签数不匹配。

我不完全了解您的fourierComponents应该做什么,但我认为它的变换方法需要注意时间索引。

A little more than halfway through your traceback, you see a snippet from _fit_and_score, which indicates that the fitting has succeeded, but that scoring is what fails. Indeed, when I call model.predict, I always get out the same length array as the training set. When comparing that against the true labels then, the scorer correctly complains that the number of predictions does not match the number of labels.

I don't entirely understand what your FourierComponents is supposed to do, but I think its transform method needs to pay attention to the time index.

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文