模型效果很好,但网格搜索会导致错误
在处理一个项目时,我遇到了一个奇怪的错误,我的模型拟合得很好,但是当我应用gridsearch
时,它给了我一个错误。
该代码将创建的所有必要对象放入管道中并使用它们。
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import time
from numpy.fft import fft
class DataPreprocess(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self, X, y=None):
self.X_m = X.merge(y, on= ['year', 'weekofyear'])
return self
def transform(self, X):
dt = pd.to_datetime(self.X_m["week_start_date"], format="%Y-%m-%d")
unix = []
for i in dt:
unix.append(time.mktime(i.timetuple()))
X_t = (self.X_m).reset_index().assign(date = unix).set_index(['date'])
return X_t
class FourierComponents(BaseEstimator, TransformerMixin):
"""creat features based on sin(2*pi*f*t) and cos(2*pi*f*t)"""
def __init__(self, n=10):
self.n = n
def fit(self, X, y= None):
self.labels = X['total_cases']
self.Y_t = fft(self.labels - (self.labels).mean())
self.Y_t = self.Y_t[:len(self.labels)//2]
self.ind_max = np.abs(self.Y_t).argsort()
self.t_span = len(self.labels)
self.f = np.linspace(0, len(self.Y_t), len(self.Y_t)) / self.t_span
self.f_ind = self.f[self.ind_max]
self.ind = pd.RangeIndex(start = 1, stop=(len(X.index.get_level_values('date')) +1)).values.reshape(-1, 1)
return self
def transform(self, X):
Xt = np.zeros((X.shape[0], 2*len(self.f_ind[-self.n:])))
for i, f in enumerate(self.f_ind[-self.n:]):
Xt[:, 2*i] = np.cos(2*np.pi*f*self.ind).reshape(-1)
Xt[:, 2*i + 1] = np.sin(2*np.pi*f*self.ind).reshape(-1)
return Xt
Unixdata = DataPreprocess()
fourier = FourierComponents()
model = Pipeline([
('indices', Unixdata),
('fourier', fourier),
('scalar', StandardScaler()),
('regressor', Ridge())
])
param_grid = {'fourier__n' : list(range(3,5)),
'regressor__alpha' : np.logspace(1, 4, 20)}
grid_search = GridSearchCV(model, param_grid, cv = 5, verbose = 1, scoring='neg_mean_absolute_error')
grid_search.fit(sj_train_features, sj_train_labels)
在这里安装grid_search
给了我这个错误:
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-167-cfce20172a59> in <module>
----> 1 grid_search.fit(sj_train_features, sj_train_labels)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
558 else:
559 fit_time = time.time() - start_time
--> 560 test_scores = _score(estimator, X_test, y_test, scorer)
561 score_time = time.time() - start_time - fit_time
562 if return_train_score:
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
605 scores = scorer(estimator, X_test)
606 else:
--> 607 scores = scorer(estimator, X_test, y_test)
608
609 error_msg = ("scoring must return a number, got %s (%s) "
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
85 for name, scorer in self._scorers.items():
86 if isinstance(scorer, _BaseScorer):
---> 87 score = scorer._score(cached_call, estimator,
88 *args, **kwargs)
89 else:
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _score(self, method_caller, estimator, X, y_true, sample_weight)
210 **self._kwargs)
211 else:
--> 212 return self._sign * self._score_func(y_true, y_pred,
213 **self._kwargs)
214
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
176 0.85...
177 """
--> 178 y_type, y_true, y_pred, multioutput = _check_reg_targets(
179 y_true, y_pred, multioutput)
180 check_consistent_length(y_true, y_pred, sample_weight)
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
82
83 """
---> 84 check_consistent_length(y_true, y_pred)
85 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
86 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
254 uniques = np.unique(lengths)
255 if len(uniques) > 1:
--> 256 raise ValueError("Found input variables with inconsistent numbers of"
257 " samples: %r" % [int(l) for l in lengths])
258
ValueError: Found input variables with inconsistent numbers of samples: [188, 748]
但
model.fit(sj_train_features, sj_train_labels)
非常适合。
现在我想知道为什么以及代码中的错误在哪里? 有人能指出我正确的方向吗?
一个小例子(希望有代表性):
sj_train_features = pd.DataFrame({
'year': [1990] * 10,
'weekofyear': np.arange(18, 28),
'week_start_date': pd.date_range('1990-04-30', periods=10, freq='w'),
'ndvi_ne': np.random.random(10),
'station_precip': np.random.random(10)*10,
}).set_index(['year', 'weekofyear'])
sj_train_labels = pd.Series(np.random.random(10)*20, index=sj_train_features.index, name='total_cases')
While working on a project I have come across a weird error, where fitting my model works perfectly but when I apply gridsearch
it gives me an error.
The code puts all the necessary objects created and uses them in the pipeline.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import time
from numpy.fft import fft
class DataPreprocess(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self, X, y=None):
self.X_m = X.merge(y, on= ['year', 'weekofyear'])
return self
def transform(self, X):
dt = pd.to_datetime(self.X_m["week_start_date"], format="%Y-%m-%d")
unix = []
for i in dt:
unix.append(time.mktime(i.timetuple()))
X_t = (self.X_m).reset_index().assign(date = unix).set_index(['date'])
return X_t
class FourierComponents(BaseEstimator, TransformerMixin):
"""creat features based on sin(2*pi*f*t) and cos(2*pi*f*t)"""
def __init__(self, n=10):
self.n = n
def fit(self, X, y= None):
self.labels = X['total_cases']
self.Y_t = fft(self.labels - (self.labels).mean())
self.Y_t = self.Y_t[:len(self.labels)//2]
self.ind_max = np.abs(self.Y_t).argsort()
self.t_span = len(self.labels)
self.f = np.linspace(0, len(self.Y_t), len(self.Y_t)) / self.t_span
self.f_ind = self.f[self.ind_max]
self.ind = pd.RangeIndex(start = 1, stop=(len(X.index.get_level_values('date')) +1)).values.reshape(-1, 1)
return self
def transform(self, X):
Xt = np.zeros((X.shape[0], 2*len(self.f_ind[-self.n:])))
for i, f in enumerate(self.f_ind[-self.n:]):
Xt[:, 2*i] = np.cos(2*np.pi*f*self.ind).reshape(-1)
Xt[:, 2*i + 1] = np.sin(2*np.pi*f*self.ind).reshape(-1)
return Xt
Unixdata = DataPreprocess()
fourier = FourierComponents()
model = Pipeline([
('indices', Unixdata),
('fourier', fourier),
('scalar', StandardScaler()),
('regressor', Ridge())
])
param_grid = {'fourier__n' : list(range(3,5)),
'regressor__alpha' : np.logspace(1, 4, 20)}
grid_search = GridSearchCV(model, param_grid, cv = 5, verbose = 1, scoring='neg_mean_absolute_error')
grid_search.fit(sj_train_features, sj_train_labels)
fitting the grid_search
here gives me this error:
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-167-cfce20172a59> in <module>
----> 1 grid_search.fit(sj_train_features, sj_train_labels)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
558 else:
559 fit_time = time.time() - start_time
--> 560 test_scores = _score(estimator, X_test, y_test, scorer)
561 score_time = time.time() - start_time - fit_time
562 if return_train_score:
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
605 scores = scorer(estimator, X_test)
606 else:
--> 607 scores = scorer(estimator, X_test, y_test)
608
609 error_msg = ("scoring must return a number, got %s (%s) "
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
85 for name, scorer in self._scorers.items():
86 if isinstance(scorer, _BaseScorer):
---> 87 score = scorer._score(cached_call, estimator,
88 *args, **kwargs)
89 else:
~\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _score(self, method_caller, estimator, X, y_true, sample_weight)
210 **self._kwargs)
211 else:
--> 212 return self._sign * self._score_func(y_true, y_pred,
213 **self._kwargs)
214
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_absolute_error(y_true, y_pred, sample_weight, multioutput)
176 0.85...
177 """
--> 178 y_type, y_true, y_pred, multioutput = _check_reg_targets(
179 y_true, y_pred, multioutput)
180 check_consistent_length(y_true, y_pred, sample_weight)
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
82
83 """
---> 84 check_consistent_length(y_true, y_pred)
85 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
86 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
254 uniques = np.unique(lengths)
255 if len(uniques) > 1:
--> 256 raise ValueError("Found input variables with inconsistent numbers of"
257 " samples: %r" % [int(l) for l in lengths])
258
ValueError: Found input variables with inconsistent numbers of samples: [188, 748]
but
model.fit(sj_train_features, sj_train_labels)
fits perfectly.
Now I am wondering why and where is the mistake in the code?
Can anyone point me in the right direction?
A small example (hopefully representative):
sj_train_features = pd.DataFrame({
'year': [1990] * 10,
'weekofyear': np.arange(18, 28),
'week_start_date': pd.date_range('1990-04-30', periods=10, freq='w'),
'ndvi_ne': np.random.random(10),
'station_precip': np.random.random(10)*10,
}).set_index(['year', 'weekofyear'])
sj_train_labels = pd.Series(np.random.random(10)*20, index=sj_train_features.index, name='total_cases')
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
在追溯之后的一半以上,您会从
_FIT_AND_SCORE
中看到一个摘要,这表明拟合已成功,但是得分是失败的。实际上,当我调用model.predict
时,i 始终 获得与训练集相同的长度阵列。当将其与真实标签进行比较时,得分手正确地抱怨预测数与标签数不匹配。我不完全了解您的
fourierComponents
应该做什么,但我认为它的变换
方法需要注意时间索引。A little more than halfway through your traceback, you see a snippet from
_fit_and_score
, which indicates that the fitting has succeeded, but that scoring is what fails. Indeed, when I callmodel.predict
, I always get out the same length array as the training set. When comparing that against the true labels then, the scorer correctly complains that the number of predictions does not match the number of labels.I don't entirely understand what your
FourierComponents
is supposed to do, but I think itstransform
method needs to pay attention to the time index.