5. Tokenizer in Transformers


4.1 Training WordPiece from Scratch

  1. Code:

    
    
    from tokenizers import decoders, models, normalizers, \
        pre_tokenizers, processors, trainers, Tokenizer

    # use the WordPiece model
    model = models.WordPiece(unk_token="[UNK]")  # no vocab is passed, since the vocabulary will be learned from the training data
    tokenizer = Tokenizer(model)

    ################# Step1: Normalization ###################
    tokenizer.normalizer = normalizers.Sequence(
        [normalizers.NFD(),  # NFD Unicode normalizer; without it the StripAccents normalizer cannot recognize accented characters
         normalizers.Lowercase(),
         normalizers.StripAccents()]
    )  # as a whole, equivalent to normalizers.BertNormalizer(lowercase=True)
    print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
    # hello how are u?

    ################# Step2: Pre-tokenization ###################
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
    )  # as a whole, equivalent to pre_tokenizers.BertPreTokenizer()
    print(tokenizer.pre_tokenizer.pre_tokenize_str("This's me  ."))
    # [('This', (0, 4)), ("'", (4, 5)), ('s', (5, 6)), ('me', (7, 9)), ('.', (11, 12))]

    ################# Step3: Trainer ###################
    special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
    trainer = trainers.WordPieceTrainer(vocab_size=25000, special_tokens=special_tokens)

    ################# Step4: dataset ###################
    from datasets import load_dataset  # pip install datasets
    dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")

    def get_training_corpus():
        for i in range(0, len(dataset), 1000):
            yield dataset[i : i + 1000]["text"]  # batch size = 1000

    ################# Step5: train ####################
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
    # tokenizer.train(["wikitext-2.txt"], trainer=trainer)  # training from text files also works

    ## test the trained WordPiece
    encoding = tokenizer.encode("This's me  .")
    print(encoding)
    # Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
    print(encoding.ids)                  # [1511, 11, 61, 1607, 18]
    print(encoding.type_ids)             # [0, 0, 0, 0, 0]
    print(encoding.tokens)               # ['this', "'", 's', 'me', '.']
    print(encoding.offsets)              # [(0, 4), (4, 5), (5, 6), (7, 9), (11, 12)]
    print(encoding.attention_mask)       # [1, 1, 1, 1, 1]
    print(encoding.special_tokens_mask)  # [0, 0, 0, 0, 0]
    print(encoding.overflowing)          # []

    ################# Step6: Post-Processing ####################
    cls_token_id = tokenizer.token_to_id("[CLS]")
    sep_token_id = tokenizer.token_to_id("[SEP]")
    print(cls_token_id)  # 2
    print(sep_token_id)  # 3

    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
        special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)],
    )

    ## test the trained WordPiece (single sentence)
    encoding = tokenizer.encode("This's me  .")
    print(encoding)
    # Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
    print(encoding.ids)                  # [2, 1511, 11, 61, 1607, 18, 3]
    print(encoding.type_ids)             # [0, 0, 0, 0, 0, 0, 0]
    print(encoding.tokens)               # ['[CLS]', 'this', "'", 's', 'me', '.', '[SEP]']
    print(encoding.offsets)              # [(0, 0), (0, 4), (4, 5), (5, 6), (7, 9), (11, 12), (0, 0)]
    print(encoding.attention_mask)       # [1, 1, 1, 1, 1, 1, 1]
    print(encoding.special_tokens_mask)  # [1, 0, 0, 0, 0, 0, 1]
    print(encoding.overflowing)          # []

    ## test the trained WordPiece (sentence pair)
    encoding = tokenizer.encode("This's me  .", "That's is fine-tuning.")
    print(encoding)
    # Encoding(num_tokens=17, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
    print(encoding.ids)
    # [2, 1511, 11, 61, 1607, 18, 3, 1389, 11, 61, 1390, 6774, 17, 4992, 1343, 18, 3]
    print(encoding.type_ids)
    # [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    print(encoding.tokens)
    # ['[CLS]', 'this', "'", 's', 'me', '.', '[SEP]', 'that', "'", 's', 'is', 'fine', '-', 'tun', '##ing', '.', '[SEP]']
    print(encoding.offsets)
    # [(0, 0), (0, 4), (4, 5), (5, 6), (7, 9), (11, 12), (0, 0), (0, 4), (4, 5), (5, 6), (7, 9), (10, 14), (14, 15), (15, 18), (18, 21), (21, 22), (0, 0)]
    print(encoding.attention_mask)       # [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    print(encoding.special_tokens_mask)  # [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
    print(encoding.overflowing)          # []

    ################# Step7: Decode ####################
    tokenizer.decoder = decoders.WordPiece(prefix="##")
    tokenizer.decode(encoding.ids)  # note: the original whitespace is not restored
    # "this's me. that's is fine - tuning."

    ################# Step8: Save ####################
    tokenizer.save("tokenizer.json")
    new_tokenizer = Tokenizer.from_file("tokenizer.json")
    print(new_tokenizer.decode(encoding.ids))
    # this's me. that's is fine - tuning.
  2. To use this tokenizer in Transformers, we have to wrap it in a PreTrainedTokenizerFast class.

    • If the tokenizer corresponds to a model that Transformers already supports, such as BERT, we can use the matching PreTrainedTokenizerFast subclass, e.g. BertTokenizerFast:

      
      
      from transformers import BertTokenizerFast

      wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)
      # wrapped_tokenizer = BertTokenizerFast(tokenizer_file="tokenizer.json")
    • Alternatively, we can use PreTrainedTokenizerFast directly:

      
      
      from transformers import PreTrainedTokenizerFast

      wrapped_tokenizer = PreTrainedTokenizerFast(
          tokenizer_object=tokenizer,
          # tokenizer_file="tokenizer.json",  # or load from file
          unk_token="[UNK]",
          pad_token="[PAD]",
          cls_token="[CLS]",
          sep_token="[SEP]",
          mask_token="[MASK]",
      )

      Note: we must set all the special tokens manually, because PreTrainedTokenizerFast cannot infer them from the tokenizer object.

      Although the tokenizer object does record its special tokens, that attribute is only the set of all special tokens; it cannot tell which one serves as CLS and which one serves as SEP.

    Finally, such a wrapped_tokenizer can be saved locally with the save_pretrained() method or uploaded to the Hugging Face Hub with the push_to_hub() method. save_pretrained() writes three files: 'tokenizer_config.json', 'special_tokens_map.json', and 'tokenizer.json'; a minimal save-and-reload sketch follows below.
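    For example, a minimal save-and-reload sketch (the directory name my-wordpiece-tokenizer is just an illustrative placeholder, and the printed ids depend on the trained vocabulary):

      from transformers import AutoTokenizer

      # writes tokenizer_config.json, special_tokens_map.json and tokenizer.json
      wrapped_tokenizer.save_pretrained("my-wordpiece-tokenizer")  # hypothetical local directory

      # reload; AutoTokenizer picks up the fast tokenizer from tokenizer.json
      reloaded = AutoTokenizer.from_pretrained("my-wordpiece-tokenizer")
      print(reloaded("This's me  ."))  # input_ids / token_type_ids / attention_mask (ids depend on the trained vocab)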

4.2 Training BPE from Scratch

  1. Code:

    
    
    from tokenizers import decoders, models, normalizers, \
        pre_tokenizers, processors, trainers, Tokenizer

    # use the BPE model
    model = models.BPE()  # no vocab is passed (it will be learned from data); BPE does not need an unk_token
    tokenizer = Tokenizer(model)

    ################# GPT-2 Skip Normalization ##################
    ################# Step1: Pre-tokenization ###################
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    print(tokenizer.pre_tokenizer.pre_tokenize_str("This's me  ."))
    # [('This', (0, 4)), ("'s", (4, 6)), ('Ġme', (6, 9)), ('Ġ', (9, 10)), ('Ġ.', (10, 12))]

    ################# Step2: Trainer ###################
    special_tokens = ["<|endoftext|>"]  # end-of-text token
    trainer = trainers.BpeTrainer(vocab_size=25000, special_tokens=special_tokens)

    ################# Step3: dataset ###################
    from datasets import load_dataset  # pip install datasets
    dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")

    def get_training_corpus():
        for i in range(0, len(dataset), 1000):
            yield dataset[i : i + 1000]["text"]  # batch size = 1000

    ################# Step4: train ####################
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
    # tokenizer.train(["wikitext-2.txt"], trainer=trainer)  # training from text files also works

    ## test the trained BPE
    encoding = tokenizer.encode("This's me  .")
    print(encoding)
    # Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
    print(encoding.ids)                  # [52, 72, 215, 7, 83, 701, 159, 209]
    print(encoding.type_ids)             # [0, 0, 0, 0, 0, 0, 0, 0]
    print(encoding.tokens)               # ['T', 'h', 'is', "'", 's', 'Ġme', 'Ġ', 'Ġ.']
    print(encoding.offsets)              # [(0, 1), (1, 2), (2, 4), (4, 5), (5, 6), (6, 9), (9, 10), (10, 12)]
    print(encoding.attention_mask)       # [1, 1, 1, 1, 1, 1, 1, 1]
    print(encoding.special_tokens_mask)  # [0, 0, 0, 0, 0, 0, 0, 0]
    print(encoding.overflowing)          # []

    ################# Step5: Post-Processing ####################
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)  # keep the spaces represented by 'Ġ' inside the offsets

    ## test the trained BPE (single sentence)
    encoding = tokenizer.encode("This's me  .")
    print(encoding)
    # Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
    print(encoding.ids)                  # [52, 72, 215, 7, 83, 701, 159, 209]
    print(encoding.type_ids)             # [0, 0, 0, 0, 0, 0, 0, 0]
    print(encoding.tokens)               # ['T', 'h', 'is', "'", 's', 'Ġme', 'Ġ', 'Ġ.']
    print(encoding.offsets)              # [(0, 1), (1, 2), (2, 4), (4, 5), (5, 6), (6, 9), (9, 10), (10, 12)]
    print(encoding.attention_mask)       # [1, 1, 1, 1, 1, 1, 1, 1]
    print(encoding.special_tokens_mask)  # [0, 0, 0, 0, 0, 0, 0, 0]
    print(encoding.overflowing)          # []

    ## test the trained BPE (sentence pair)
    encoding = tokenizer.encode("This's me  .", "That's is fine-tuning.")
    print(encoding)
    # Encoding(num_tokens=19, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
    print(encoding.ids)
    # [52, 72, 215, 7, 83, 701, 159, 209, 52, 6312, 7, 83, 301, 7620, 13, 84, 302, 223, 14]
    print(encoding.type_ids)
    # [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    print(encoding.tokens)
    # ['T', 'h', 'is', "'", 's', 'Ġme', 'Ġ', 'Ġ.', 'T', 'hat', "'", 's', 'Ġis', 'Ġfine', '-', 't', 'un', 'ing', '.']
    print(encoding.offsets)
    # [(0, 1), (1, 2), (2, 4), (4, 5), (5, 6), (6, 9), (9, 10), (10, 12), (0, 1), (1, 4), (4, 5), (5, 6), (6, 9), (9, 14), (14, 15), (15, 16), (16, 18), (18, 21), (21, 22)]
    print(encoding.attention_mask)
    # [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    print(encoding.special_tokens_mask)
    # [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    print(encoding.overflowing)
    # []

    ################# Step6: Decode ####################
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.decode(encoding.ids)  # note: the whitespace is restored (byte-level decoding is lossless)
    # "This's me  .That's is fine-tuning."

    ################# Step7: Save ####################
    tokenizer.save("tokenizer.json")
    new_tokenizer = Tokenizer.from_file("tokenizer.json")
    print(new_tokenizer.decode(encoding.ids))
    # This's me  .That's is fine-tuning.
  2. We can wrap the trained tokenizer in a PreTrainedTokenizerFast class in order to use it in Transformers (a short usage sketch follows after the two options below):

    • Use GPT2TokenizerFast directly:

      
      
      from transformers import GPT2TokenizerFast

      wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)
      # wrapped_tokenizer = GPT2TokenizerFast(tokenizer_file="tokenizer.json")
    • Use the PreTrainedTokenizerFast class:

      
      
      from transformers import PreTrainedTokenizerFast

      wrapped_tokenizer = PreTrainedTokenizerFast(
          tokenizer_object=tokenizer,
          # tokenizer_file="tokenizer.json",  # or load from file
          bos_token="<|endoftext|>",
          eos_token="<|endoftext|>",
      )
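    Either way, the wrapped tokenizer can be called like any other Transformers tokenizer. A rough sketch, assuming the ByteLevel decoder was set as above; reusing the end-of-text token as padding token is a common GPT-2-style workaround that the code above does not set up, and the actual ids depend on the trained vocabulary:

      # GPT-2 style tokenizers have no pad token by default; reuse <|endoftext|> for padding (common workaround)
      wrapped_tokenizer.pad_token = wrapped_tokenizer.eos_token

      batch = wrapped_tokenizer(
          ["This's me  .", "That's is fine-tuning."],
          padding=True,                  # pad to the longest sequence in the batch
      )
      print(batch["input_ids"])          # ids depend on the trained vocabulary
      print(batch["attention_mask"])     # 0 marks the padded positions
      print(wrapped_tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True))
      # byte-level decoding should round-trip back to the original strings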

4.3 Training Unigram from Scratch

  1. Code:

    
    
    from tokenizers import decoders, models, normalizers, \
        pre_tokenizers, processors, trainers, Tokenizer, Regex

    # use the Unigram model
    model = models.Unigram()  # no vocab is passed, since the vocabulary will be learned from the training data
    tokenizer = Tokenizer(model)

    ################# Step1: Normalization ###################
    tokenizer.normalizer = normalizers.Sequence(
        [
            normalizers.Replace("``", '"'),
            normalizers.Replace("''", '"'),
            normalizers.NFKD(),  # NFKD Unicode normalizer; without it the StripAccents normalizer cannot recognize accented characters
            normalizers.StripAccents(),
            normalizers.Replace(Regex(" {2,}"), " "),  # ' {2,}' matches two or more spaces, so runs of spaces collapse into one
        ]
    )
    print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
    # Hello how are u?

    ################# Step2: Pre-tokenization ###################
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
    print(tokenizer.pre_tokenizer.pre_tokenize_str("This's me  ."))
    # [("▁This's", (0, 6)), ('▁me', (6, 9)), ('▁', (9, 10)), ('▁.', (10, 12))]

    ################# Step3: Trainer ###################
    special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]
    trainer = trainers.UnigramTrainer(
        vocab_size=25000, special_tokens=special_tokens, unk_token="<unk>"
    )

    ################# Step4: dataset ###################
    from datasets import load_dataset  # pip install datasets
    dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")

    def get_training_corpus():
        for i in range(0, len(dataset), 1000):
            yield dataset[i : i + 1000]["text"]  # batch size = 1000

    ################# Step5: train ####################
    tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)
    # tokenizer.train(["wikitext-2.txt"], trainer=trainer)  # training from text files also works

    ## test the trained Unigram
    encoding = tokenizer.encode("This's me  .")
    print(encoding)
    # Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
    print(encoding.ids)                  # [164, 8030, 9, 918, 7, 11]
    print(encoding.type_ids)             # [0, 0, 0, 0, 0, 0]
    print(encoding.tokens)               # ['▁This', "'", 's', '▁me', '▁', '.']
    print(encoding.offsets)              # [(0, 4), (4, 5), (5, 6), (6, 9), (10, 11), (11, 12)]
    print(encoding.attention_mask)       # [1, 1, 1, 1, 1, 1]
    print(encoding.special_tokens_mask)  # [0, 0, 0, 0, 0, 0]
    print(encoding.overflowing)          # []

    ################# Step6: Post-Processing ####################
    cls_token_id = tokenizer.token_to_id("<cls>")
    sep_token_id = tokenizer.token_to_id("<sep>")
    print(cls_token_id)  # 0
    print(sep_token_id)  # 1

    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A:0 <sep>:0 <cls>:2",
        pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
        special_tokens=[("<sep>", sep_token_id), ("<cls>", cls_token_id)],
    )

    ## test the trained Unigram (single sentence)
    encoding = tokenizer.encode("This's me  .")
    print(encoding)
    # Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
    print(encoding.ids)                  # [164, 8030, 9, 918, 7, 11, 1, 0]
    print(encoding.type_ids)             # [0, 0, 0, 0, 0, 0, 0, 2]
    print(encoding.tokens)               # ['▁This', "'", 's', '▁me', '▁', '.', '<sep>', '<cls>']
    print(encoding.offsets)              # [(0, 4), (4, 5), (5, 6), (6, 9), (10, 11), (11, 12), (0, 0), (0, 0)]
    print(encoding.attention_mask)       # [1, 1, 1, 1, 1, 1, 1, 1]
    print(encoding.special_tokens_mask)  # [0, 0, 0, 0, 0, 0, 1, 1]
    print(encoding.overflowing)          # []

    ## test the trained Unigram (sentence pair)
    encoding = tokenizer.encode("This's me  .", "That's is fine-tuning.")
    print(encoding)
    # Encoding(num_tokens=19, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
    print(encoding.ids)
    # [164, 8030, 9, 918, 7, 11, 1, 1126, 8030, 9, 41, 3030, 28, 37, 2669, 21, 11, 1, 0]
    print(encoding.type_ids)
    # [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2]
    print(encoding.tokens)
    # ['▁This', "'", 's', '▁me', '▁', '.', '<sep>', '▁That', "'", 's', '▁is', '▁fine', '-', 't', 'un', 'ing', '.', '<sep>', '<cls>']
    print(encoding.offsets)
    # [(0, 4), (4, 5), (5, 6), (6, 9), (10, 11), (11, 12), (0, 0), (0, 4), (4, 5), (5, 6), (6, 9), (9, 14), (14, 15), (15, 16), (16, 18), (18, 21), (21, 22), (0, 0), (0, 0)]
    print(encoding.attention_mask)
    # [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    print(encoding.special_tokens_mask)
    # [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
    print(encoding.overflowing)
    # []

    ################# Step7: Decode ####################
    tokenizer.decoder = decoders.Metaspace()
    tokenizer.decode(encoding.ids)  # note: the whitespace is not fully restored (the two spaces after 'me' collapse into one)
    # "This's me . That's is fine-tuning."

    ################# Step8: Save ####################
    tokenizer.save("tokenizer.json")
    new_tokenizer = Tokenizer.from_file("tokenizer.json")
    print(new_tokenizer.decode(encoding.ids))
    # This's me . That's is fine-tuning.
  2. We can wrap the trained tokenizer in a PreTrainedTokenizerFast class in order to use it in Transformers (a short padding sketch follows after the two options below):

    • Use XLNetTokenizerFast directly:

      
      
      from transformers import XLNetTokenizerFast

      wrapped_tokenizer = XLNetTokenizerFast(tokenizer_object=tokenizer)
      # wrapped_tokenizer = XLNetTokenizerFast(tokenizer_file="tokenizer.json")
    • Use the PreTrainedTokenizerFast class:

      
      
      from transformers import PreTrainedTokenizerFast

      wrapped_tokenizer = PreTrainedTokenizerFast(
          tokenizer_object=tokenizer,
          bos_token="<s>",
          eos_token="</s>",
          unk_token="<unk>",
          pad_token="<pad>",
          cls_token="<cls>",
          sep_token="<sep>",
          mask_token="<mask>",
          padding_side="left",
      )
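    As a quick check of the XLNet-style conventions, a small sketch (shown only for orientation; the exact tokens and ids depend on the trained vocabulary): with padding_side="left", a padded batch gets its <pad> tokens at the front, while every sequence still ends with <sep> <cls>:

      batch = wrapped_tokenizer(
          ["This's me  .", "That's is fine-tuning."],
          padding=True,  # pad to the longest sequence in the batch
      )
      for ids in batch["input_ids"]:
          print(wrapped_tokenizer.convert_ids_to_tokens(ids))
      # the shorter sentence is padded with '<pad>' at the front; both end with '<sep>', '<cls>'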
