from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
import time
from numpy.fft import fft

class DataPreprocess(BaseEstimator, TransformerMixin):
    def __init__(self):
    def fit(self, X, y=None):
        self.X_m = X.merge(y, on= ['year', 'weekofyear'])
        return self
    def transform(self, X):
        dt = pd.to_datetime(self.X_m["week_start_date"], format="%Y-%m-%d")
        unix = []
        for i in dt:
        X_t = (self.X_m).reset_index().assign(date = unix).set_index(['date'])
        return X_t 

class FourierComponents(BaseEstimator, TransformerMixin):
    """creat features based on sin(2*pi*f*t) and cos(2*pi*f*t)"""
    def __init__(self, n=10):
        self.n = n
    def fit(self, X, y= None):
        self.labels = X['total_cases']
        self.Y_t = fft(self.labels - (self.labels).mean())
        self.Y_t = self.Y_t[:len(self.labels)//2]

        self.ind_max = np.abs(self.Y_t).argsort()
        self.t_span = len(self.labels)

        self.f = np.linspace(0, len(self.Y_t), len(self.Y_t)) / self.t_span
        self.f_ind = self.f[self.ind_max]
        self.ind = pd.RangeIndex(start = 1, stop=(len(X.index.get_level_values('date')) +1)).values.reshape(-1, 1)
        return self
    def transform(self, X):
        Xt = np.zeros((X.shape[0], 2*len(self.f_ind[-self.n:])))
        for i, f in enumerate(self.f_ind[-self.n:]):
            Xt[:, 2*i]     = np.cos(2*np.pi*f*self.ind).reshape(-1)
            Xt[:, 2*i + 1] = np.sin(2*np.pi*f*self.ind).reshape(-1)
        return Xt

Unixdata = DataPreprocess()
fourier = FourierComponents()

model = Pipeline([
    ('indices', Unixdata),
    ('fourier', fourier),
    ('scalar', StandardScaler()),
    ('regressor', Ridge())

param_grid = {'fourier__n' : list(range(3,5)),
              'regressor__alpha' : np.logspace(1, 4, 20)}

grid_search = GridSearchCV(model, param_grid, cv = 5, verbose = 1, scoring='neg_mean_absolute_error')

grid_search.fit(sj_train_features, sj_train_labels)


model.fit(sj_train_features, sj_train_labels)


现在我想知道为什么以及代码中的错误在哪里? 有人能指出我正确的方向吗?


sj_train_features = pd.DataFrame({
    'year': [1990] * 10,
    'weekofyear': np.arange(18, 28),
    'week_start_date': pd.date_range('1990-04-30', periods=10, freq='w'),
    'ndvi_ne': np.random.random(10),
    'station_precip': np.random.random(10)*10,
}).set_index(['year', 'weekofyear'])

sj_train_labels = pd.Series(np.random.random(10)*20, index=sj_train_features.index, name='total_cases')

While working on a project I have come across a weird error, where fitting my model works perfectly but when I apply gridsearch it gives me an error.

The code puts all the necessary objects created and uses them in the pipeline.

fitting the grid_search here gives me this error:

model.fit(sj_train_features, sj_train_labels)

fits perfectly.

Now I am wondering why and where is the mistake in the code?
Can anyone point me in the right direction?

A small example (hopefully representative):

sj_train_features = pd.DataFrame({
    'year': [1990] * 10,
    'weekofyear': np.arange(18, 28),
    'week_start_date': pd.date_range('1990-04-30', periods=10, freq='w'),
    'ndvi_ne': np.random.random(10),
    'station_precip': np.random.random(10)*10,
}).set_index(['year', 'weekofyear'])

sj_train_labels = pd.Series(np.random.random(10)*20, index=sj_train_features.index, name='total_cases')

谎言 2025-01-27 09:47:09

在追溯之后的一半以上,您会从_FIT_AND_SCORE中看到一个摘要,这表明拟合已成功,但是得分是失败的。实际上,当我调用model.predict时,i 始终 获得与训练集相同的长度阵列。当将其与真实标签进行比较时,得分手正确地抱怨预测数与标签数不匹配。


A little more than halfway through your traceback, you see a snippet from _fit_and_score, which indicates that the fitting has succeeded, but that scoring is what fails. Indeed, when I call model.predict, I always get out the same length array as the training set. When comparing that against the true labels then, the scorer correctly complains that the number of predictions does not match the number of labels.

I don't entirely understand what your FourierComponents is supposed to do, but I think its transform method needs to pay attention to the time index.

