在 keras 模型中使用 `sentence-transformers`
我想在一个更大的 Keras 模型中使用 sentence-transformers 提供的模型。
这是完整的示例:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
# Model identifier handed to both from_pretrained() calls below.
MODEL_PATH = 'sentence-transformers/all-MiniLM-L6-v2'
# Tokenizer and TF model are both loaded from the same MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# NOTE(review): from_pt=True presumably loads the TF model from PyTorch
# weights -- confirm against the transformers documentation.
model = TFAutoModel.from_pretrained(MODEL_PATH, from_pt=True)
# Keras layer that wraps a tokenizer and a model and returns mean-pooled,
# normalized embeddings for its inputs.
# NOTE(review): the indentation of this snippet was lost in extraction; the
# nesting described in these comments is inferred from the def/return order.
class SBert(tf.keras.layers.Layer):
# Keep references to the tokenizer and model supplied by the caller.
def __init__(self, tokenizer, model):
super(SBert, self).__init__()
self.tokenizer = tokenizer
self.model = model
# Tokenize `inputs` by routing the tokenizer call through tf.py_function.
def tf_encode(self, inputs):
# Function executed by tf.py_function: forwards its argument to the tokenizer
# with padding and truncation enabled and TF tensors requested.
def encode(inputs):
return self.tokenizer(
inputs, padding=True, truncation=True, return_tensors='tf'
)
# NOTE(review): Tout declares a single tf.int64 output, while encode() returns
# the tokenizer's result object as-is. The traceback quoted in this post shows
# the tokenizer raising ValueError (text input must be str / List[str]) for the
# value it receives here -- confirm what type py_function passes to encode().
return tf.py_function(func=encode, inp=[inputs], Tout=[tf.int64])
# Mask-weighted mean over axis 1 of model_output[0], followed by normalization.
# NOTE(review): declared without `self`, yet call() invokes it as
# self.mean_pooling(model_output, mask) -- confirm whether it is meant to be
# an instance method.
def mean_pooling(model_output, attention_mask):
token_embeddings = model_output[0]
# Add a trailing axis to the mask, broadcast it to the embeddings' shape and
# cast to float32 so it can be multiplied with the embeddings.
input_mask_expanded = tf.cast(
tf.broadcast_to(tf.expand_dims(attention_mask, -1), token_embeddings.shape),
tf.float32
)
# a: sum over axis 1 of the masked embeddings.
a = tf.math.reduce_sum(token_embeddings * input_mask_expanded, axis=1)
# b: sum over axis 1 of the mask, clipped below at 1e-9 to avoid dividing by 0.
b = tf.clip_by_value(tf.math.reduce_sum(input_mask_expanded, axis=1), 1e-9, tf.float32.max)
embeddings = a / b
# tf.linalg.normalize returns a pair; only the first element is kept.
embeddings, _ = tf.linalg.normalize(embeddings, 2, axis=1)
return embeddings
# Layer entry point: tokenize, run the model, then pool with the attention mask.
def call(self, inputs):
encoded_input = self.tf_encode(inputs)
model_output = self.model(encoded_input)
# NOTE(review): encoded_input is whatever tf.py_function returned for a
# list-valued Tout; confirm that it supports lookup by 'attention_mask'.
embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
return embeddings
# Build the layer from the module-level tokenizer and model.
sbert = SBert(tokenizer, model)
# Call the layer directly on a Python list of two strings; this is the
# statement marked as failing in the traceback quoted in this post.
sbert(['some text', 'more text'])
在 TF / Keras 之外使用该模型和分词器(tokenizer)没有任何问题。问题似乎出现在构建计算图的时候:TF 会把符号张量传给分词器,从而引发错误——这就是我尝试用 tf.py_function 进行包装的原因,
但没有成功……
错误:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-20-a0c4a906e456> in <module>
44
45 sbert = SBert(tokenizer, model)
---> 46 sbert(['some text', 'more text'])
~/.pyenv/versions/3.7.8/lib/python3.7/site-packages/keras/utils/traceback_utils.py in error_handler(*args, **kwargs)
65 except Exception as e: # pylint: disable=broad-except
66 filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67 raise e.with_traceback(filtered_tb) from None
68 finally:
69 del filtered_tb
<ipython-input-20-a0c4a906e456> in call(self, inputs)
36 def call(self, inputs):
37 tf.print(inputs, output_stream=sys.stdout)
---> 38 encoded_input = self.tf_encode(inputs)
39 tf.print(encoded_input, output_stream=sys.stdout)
40 model_output = self.model(encoded_input)
<ipython-input-20-a0c4a906e456> in tf_encode(self, inputs)
20 inputs, padding=True, truncation=True, return_tensors='tf'
21 )
---> 22 return tf.py_function(func=encode, inp=[inputs], Tout=[tf.int64])
23
24 def mean_pooling(model_output, attention_mask):
InvalidArgumentError: Exception encountered when calling layer "s_bert_6" (type SBert).
ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
Traceback (most recent call last):
File "/Users/dennisyurkevich/.pyenv/versions/3.7.8/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 269, in __call__
return func(device, token, args)
File "/Users/dennisyurkevich/.pyenv/versions/3.7.8/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 147, in __call__
outputs = self._call(device, args)
File "/Users/dennisyurkevich/.pyenv/versions/3.7.8/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 154, in _call
ret = self._func(*args)
File "/Users/dennisyurkevich/.pyenv/versions/3.7.8/lib/python3.7/site-packages/tensorflow/python/autograph/impl/api.py", line 642, in wrapper
return func(*args, **kwargs)
File "<ipython-input-20-a0c4a906e456>", line 20, in encode
inputs, padding=True, truncation=True, return_tensors='tf'
File "/Users/dennisyurkevich/.pyenv/versions/3.7.8/lib/python3.7/site-packages/transformers/tokenization_utils_base.py", line 2378, in __call__
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
[Op:EagerPyFunc]
Call arguments received:
• inputs=["'some text'", "'more text'"]
I would like to use a model from sentence-transformers
inside of a larger Keras model.
Here is the full example:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
# Model identifier handed to both from_pretrained() calls below.
MODEL_PATH = 'sentence-transformers/all-MiniLM-L6-v2'
# Tokenizer and TF model are both loaded from the same MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# NOTE(review): from_pt=True presumably loads the TF model from PyTorch
# weights -- confirm against the transformers documentation.
model = TFAutoModel.from_pretrained(MODEL_PATH, from_pt=True)
# Keras layer that wraps a tokenizer and a model and returns mean-pooled,
# normalized embeddings for its inputs.
# NOTE(review): the indentation of this snippet was lost in extraction; the
# nesting described in these comments is inferred from the def/return order.
class SBert(tf.keras.layers.Layer):
# Keep references to the tokenizer and model supplied by the caller.
def __init__(self, tokenizer, model):
super(SBert, self).__init__()
self.tokenizer = tokenizer
self.model = model
# Tokenize `inputs` by routing the tokenizer call through tf.py_function.
def tf_encode(self, inputs):
# Function executed by tf.py_function: forwards its argument to the tokenizer
# with padding and truncation enabled and TF tensors requested.
def encode(inputs):
return self.tokenizer(
inputs, padding=True, truncation=True, return_tensors='tf'
)
# NOTE(review): Tout declares a single tf.int64 output, while encode() returns
# the tokenizer's result object as-is. The traceback quoted in this post shows
# the tokenizer raising ValueError (text input must be str / List[str]) for the
# value it receives here -- confirm what type py_function passes to encode().
return tf.py_function(func=encode, inp=[inputs], Tout=[tf.int64])
# Mask-weighted mean over axis 1 of model_output[0], followed by normalization.
# NOTE(review): declared without `self`, yet call() invokes it as
# self.mean_pooling(model_output, mask) -- confirm whether it is meant to be
# an instance method.
def mean_pooling(model_output, attention_mask):
token_embeddings = model_output[0]
# Add a trailing axis to the mask, broadcast it to the embeddings' shape and
# cast to float32 so it can be multiplied with the embeddings.
input_mask_expanded = tf.cast(
tf.broadcast_to(tf.expand_dims(attention_mask, -1), token_embeddings.shape),
tf.float32
)
# a: sum over axis 1 of the masked embeddings.
a = tf.math.reduce_sum(token_embeddings * input_mask_expanded, axis=1)
# b: sum over axis 1 of the mask, clipped below at 1e-9 to avoid dividing by 0.
b = tf.clip_by_value(tf.math.reduce_sum(input_mask_expanded, axis=1), 1e-9, tf.float32.max)
embeddings = a / b
# tf.linalg.normalize returns a pair; only the first element is kept.
embeddings, _ = tf.linalg.normalize(embeddings, 2, axis=1)
return embeddings
# Layer entry point: tokenize, run the model, then pool with the attention mask.
def call(self, inputs):
encoded_input = self.tf_encode(inputs)
model_output = self.model(encoded_input)
# NOTE(review): encoded_input is whatever tf.py_function returned for a
# list-valued Tout; confirm that it supports lookup by 'attention_mask'.
embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
return embeddings
# Build the layer from the module-level tokenizer and model.
sbert = SBert(tokenizer, model)
# Call the layer directly on a Python list of two strings; this is the
# statement marked as failing in the traceback quoted in this post.
sbert(['some text', 'more text'])
I am able to use the model and tokenizer outside of TF / Keras with no problems. The issue seems to happen when we try to build the graph and TF passes a symbolic tensor to the tokenizer, generating an error — this is why I have tried to wrap the tokenizer call in tf.py_function,
but with no success...
The error:
---------------------------------------------------------------------------
InvalidArgumentError Traceback (most recent call last)
<ipython-input-20-a0c4a906e456> in <module>
44
45 sbert = SBert(tokenizer, model)
---> 46 sbert(['some text', 'more text'])
~/.pyenv/versions/3.7.8/lib/python3.7/site-packages/keras/utils/traceback_utils.py in error_handler(*args, **kwargs)
65 except Exception as e: # pylint: disable=broad-except
66 filtered_tb = _process_traceback_frames(e.__traceback__)
---> 67 raise e.with_traceback(filtered_tb) from None
68 finally:
69 del filtered_tb
<ipython-input-20-a0c4a906e456> in call(self, inputs)
36 def call(self, inputs):
37 tf.print(inputs, output_stream=sys.stdout)
---> 38 encoded_input = self.tf_encode(inputs)
39 tf.print(encoded_input, output_stream=sys.stdout)
40 model_output = self.model(encoded_input)
<ipython-input-20-a0c4a906e456> in tf_encode(self, inputs)
20 inputs, padding=True, truncation=True, return_tensors='tf'
21 )
---> 22 return tf.py_function(func=encode, inp=[inputs], Tout=[tf.int64])
23
24 def mean_pooling(model_output, attention_mask):
InvalidArgumentError: Exception encountered when calling layer "s_bert_6" (type SBert).
ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
Traceback (most recent call last):
File "/Users/dennisyurkevich/.pyenv/versions/3.7.8/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 269, in __call__
return func(device, token, args)
File "/Users/dennisyurkevich/.pyenv/versions/3.7.8/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 147, in __call__
outputs = self._call(device, args)
File "/Users/dennisyurkevich/.pyenv/versions/3.7.8/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 154, in _call
ret = self._func(*args)
File "/Users/dennisyurkevich/.pyenv/versions/3.7.8/lib/python3.7/site-packages/tensorflow/python/autograph/impl/api.py", line 642, in wrapper
return func(*args, **kwargs)
File "<ipython-input-20-a0c4a906e456>", line 20, in encode
inputs, padding=True, truncation=True, return_tensors='tf'
File "/Users/dennisyurkevich/.pyenv/versions/3.7.8/lib/python3.7/site-packages/transformers/tokenization_utils_base.py", line 2378, in __call__
"text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).
[Op:EagerPyFunc]
Call arguments received:
• inputs=["'some text'", "'more text'"]
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。
绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论
评论(1)
`tf.py_function` 似乎不支持字典形式的输出,因此可以尝试改为返回三个独立的张量。另外,我对输入进行了解码,以去掉每个字符串前面的 `b`:
如果想使用 `Keras` 模型,则需要像下面这样做:
`tf.py_function` does not seem to work with a dict output; that's why you can try returning three separate tensors. Also, I am decoding the inputs to remove the `b` in front of each string:
If you want to use a `Keras` model, you will have to do something like this: