Passing information between pipeline steps in sklearn



I am working on a simple text generation problem with LSTMs. To make the preprocessing more compact and reproducible, I decided to implement everything in sklearn fashion, using custom sklearn transformers and the KerasClassifier from scikeras to wrap the neural network definition in an sklearn-style estimator.

It almost works, but I can't figure out how to pass information from within one of the custom transformers on to the KerasClassifier estimator. More precisely, the function that creates the neural network needs the number of outputs as an argument; but this depends on the number of words in the fitted vocabulary - information that is currently encapsulated in the ModelEncoder class.

(Note that in order to get the current logic to work, I had to slightly modify the default sklearn Pipeline class, as it wouldn't allow modifying and returning both X and y. In other words, the default sklearn Pipeline only allows feature transformations but not target transformations. The modified Pipeline class is explained in this StackOverflow post: https://stackoverflow.com/a/70191787/5123111)

Example data:

train_data = ['o by no means honest ventidius i gave it freely ever and theres none can truly say he gives if our betters play at that game we must not dare to imitate them faults that are rich are fair',
 'but was not this nigh shore',
 'impairing henry strengthening misproud york the common people swarm like summer flies and whither fly the gnats but to the sun',
 'what while you were there',
 'chill pick your teeth zir come no matter vor your foins',
 'thanks dear isabel', 'come prick me bullcalf till he roar again',
 'go some of you knock at the abbeygate and bid the lady abbess come to me',
 'an twere not as good deed as drink to break the pate on thee i am a very villain',
 'beaufort it is thy sovereign speaks to thee',
 'but say lucetta now we are alone wouldst thou then counsel me to fall in love',
 'for being a bawd for being a bawd',
 'all blest secrets all you unpublishd virtues of the earth spring with my tears',
 'what likelihood', 'o find him']

Full code:

# Modify the sklearn Pipeline class to allow it to return tuples and hence enable both X and y modifications. (Current default implementation in sklearn only allows
# feature transformations, i.e. transformations on X, but not on y.)
class Pipeline(pipeline.Pipeline):

    def _fit(self, X, y=None, **fit_params_steps):
        self.steps = list(self.steps)
        self._validate_steps()
        memory = check_memory(self.memory)

        fit_transform_one_cached = memory.cache(pipeline._fit_transform_one)

        for (step_idx, name, transformer) in self._iter(
            with_final=False, filter_passthrough=False
        ):
                        
            if transformer is None or transformer == "passthrough":
                with _print_elapsed_time("Pipeline", self._log_message(step_idx)):
                    continue

            try:
                # joblib >= 0.12
                mem = memory.location
            except AttributeError:
                mem = memory.cachedir
            finally:
                cloned_transformer = clone(transformer) if mem else transformer

            X, fitted_transformer = fit_transform_one_cached(
                cloned_transformer,
                X,
                y,
                None,
                message_clsname="Pipeline",
                message=self._log_message(step_idx),
                **fit_params_steps[name],
            )
            
            if isinstance(X, tuple):    # unpack (X, y) when the transformer returned a tuple
                X, y = X
            
            self.steps[step_idx] = (name, fitted_transformer)
        
        return X, y
    
    def fit(self, X, y=None, **fit_params):
        fit_params_steps = self._check_fit_params(**fit_params)
        Xt = self._fit(X, y, **fit_params_steps)
        
        if isinstance(Xt, tuple):    # unpack (Xt, y) when the last transformer returned a tuple
            Xt, y = Xt
        
        with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
            if self._final_estimator != "passthrough":
                fit_params_last_step = fit_params_steps[self.steps[-1][0]]
                self._final_estimator.fit(Xt, y, **fit_params_last_step)

        return self

class ModelTokenizer(TransformerMixin, BaseEstimator):
    def __init__(self, max_len=100):
        super().__init__()
        self.max_len = max_len
    def fit(self, X=None, y=None):
        return self
    def transform(self, X, y=None):
        # Flatten the corpus into one token stream, then slide a window of
        # max_len + 1 words over it (max_len input words plus the word that
        # will later become the target).
        X_flattened = " ".join(X).split()
        sequences = list()
        for i in range(self.max_len + 1, len(X_flattened) + 1):  # + 1 so the final window is kept
            seq = X_flattened[i - self.max_len - 1:i]
            sequences.append(seq)
        return sequences
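
# For illustration (toy input, not from the question): with max_len=3 the
# transformer yields overlapping 4-word windows, e.g.
#   ModelTokenizer(max_len=3).transform(['a b c d e f'])
#   -> [['a', 'b', 'c', 'd'], ['b', 'c', 'd', 'e'], ['c', 'd', 'e', 'f']]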

class ModelEncoder(TransformerMixin, BaseEstimator):
    def __init__(self):
        super().__init__()
        self.tokenizer = Tokenizer()
    def fit(self, X=None, y=None):
        self.tokenizer.fit_on_texts(X)
        return self
    def transform(self, X, y=None):
        # Map each window of words to integer ids, then split it into
        # features (all words but the last) and target (the last word).
        encoded_sequences = np.array(self.tokenizer.texts_to_sequences(X))
        return (encoded_sequences[:, :-1], encoded_sequences[:, -1])
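
# Where the vocab_size used below could come from: the fitted Tokenizer inside
# ModelEncoder. Computing it this way requires running the preprocessing
# outside the pipeline first - exactly the duplication the question hopes to
# avoid. (The + 1 accounts for Keras's reserved index 0.)
sequences = ModelTokenizer().transform(train_data)
vocab_size = len(ModelEncoder().fit(sequences).tokenizer.word_index) + 1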

def create_nn(input_shape=(100, 1), output_shape=None):

    model = Sequential()
    model.add(LSTM(64, input_shape=input_shape, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(Flatten())
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(output_shape, activation='softmax'))

    # Integer word ids over a softmax output call for sparse categorical
    # cross-entropy; binary cross-entropy would be wrong for multiclass.
    metrics_list = [tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=metrics_list)
    return model

pipe = Pipeline([
    ('tokenizer', ModelTokenizer()),
    ('encoder', ModelEncoder()),
    ('model', KerasClassifier(build_fn=create_nn, epochs=10, output_shape=vocab_size)),
])

# Question: how to pass 'vocab_size'?

Imports:

import numpy as np
import tensorflow as tf
from sklearn import pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.utils import _print_elapsed_time
from sklearn.utils.validation import check_memory
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Flatten, Dense
from scikeras.wrappers import KerasClassifier


Comments (1)

私野 2025-01-20 07:30:04


KerasClassifier has its own internal transformer (see here, it is used to provide one-hot encoding and such) which has an API to pass metadata to the model (see here, that's how arguments such as n_outputs_ are passed into the model building function). Could you override that to pass this extra metadata to the model? It's stepping a bit outside of the Scikit-Learn API, but as you've noted the Scikit-Learn API doesn't have this functionality built in. If you want to propagate that information from a Transformer in your pipeline into SciKeras you could encode it into a feature and then use the above-mentioned hooks along with a custom encoder to remove that feature and convert it into metadata that can be passed into the model, but now you'd be really pushing the Scikit-Learn API.
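
For concreteness, a minimal sketch of the hook the answer describes, based on SciKeras's documented meta mechanism (exact keys and behaviour may vary across SciKeras versions): if the model-building function declares a meta parameter, SciKeras fills it at fit time with inferred attributes such as n_classes_ and n_features_in_, so output_shape no longer has to be supplied by hand.

def create_nn(meta):
    # meta is supplied by SciKeras during fit; for KerasClassifier it includes
    # n_classes_ (distinct target labels seen in y) and n_features_in_.
    n_classes = meta["n_classes_"]
    n_features = meta["n_features_in_"]
    model = Sequential()
    model.add(LSTM(64, input_shape=(n_features, 1), return_sequences=True))
    model.add(Dropout(0.3))
    model.add(Flatten())
    model.add(Dense(20, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
    return model

pipe = Pipeline([
    ('tokenizer', ModelTokenizer()),
    ('encoder', ModelEncoder()),
    ('model', KerasClassifier(model=create_nn, epochs=10)),  # no output_shape needed
])

One caveat: n_classes_ counts only the labels that actually occur in y, which can be smaller than the full vocabulary; propagating the true vocab_size would still require something like the feature-encoding trick described in the answer.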
