3. Hugging Face Tokenizer Library


  1. For all three algorithms (BPE, WordPiece, and Unigram) we use the same corpus:

    corpus = [
        # The first sentences from the abstract of "<Attention Is All You Need>"
        "The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks that include an encoder and a decoder.",
        "The bestperforming models also connect the encoder and decoder through an attentionmechanism.",
        "We propose a new simple network architecture, the Transformer,based solely on attention mechanisms, dispensing with recurrence and convolutionsentirely."
    ]

2.1 BPE

  1. Training algorithm:

    from collections import defaultdict
    from tokenizers import decoders, models, normalizers, \
        pre_tokenizers, processors, trainers, Tokenizer

    corpus = [
        # The first sentences from the abstract of "<Attention Is All You Need>"
        "The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks that include an encoder and a decoder.",
        "The bestperforming models also connect the encoder and decoder through an attentionmechanism.",
        "We propose a new simple network architecture, the Transformer,based solely on attention mechanisms, dispensing with recurrence and convolutionsentirely."
    ]

    #################### Step1: word freq ################
    word_freqs = defaultdict(int)
    pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    for text in corpus:
        words_with_offsets = pre_tokenizer.pre_tokenize_str(text)
        new_words = [word for word, offset in words_with_offsets]
        for word in new_words:
            word_freqs[word] += 1
    print(word_freqs)
    # defaultdict(<class 'int'>, {'The': 2, 'Ġdominant': 1, 'Ġsequence': 1, 'Ġtransduction': 1, ...})

    #################### Step2: alphabet ################
    alphabet = []  # the initial alphabet
    for word in word_freqs.keys():
        for letter in word:
            if letter not in alphabet:
                alphabet.append(letter)
    alphabet.sort()
    print(alphabet)  # 'Ġ' represents the space character
    # [',', '.', 'T', 'W', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'Ġ']

    vocab = ["<|endoftext|>"] + alphabet.copy()  # add special token for GPT-2

    #################### Step3: split word to char ################
    splits = {word: [c for c in word] for word in word_freqs.keys()}
    print(splits)  # each character is one subword
    # {'The': ['T', 'h', 'e'], 'Ġdominant': ['Ġ', 'd', 'o', 'm', 'i', 'n', 'a', 'n', 't'],...}

    #################### Step4: find most freq and merge ################
    def compute_pair_freqs(splits):
        '''
        Count how often each pair of adjacent subwords occurs as a whole.

        :param splits: the current split of every word
        '''
        pair_freqs = defaultdict(int)
        for word, freq in word_freqs.items():
            split = splits[word]
            if len(split) == 1:
                continue
            for i in range(len(split) - 1):
                pair = (split[i], split[i + 1])
                pair_freqs[pair] += freq
        return pair_freqs

    def find_most_freq(pair_freqs):
        '''
        Find the most frequent pair.
        '''
        best_pair = ""
        max_freq = None
        for pair, freq in pair_freqs.items():
            if max_freq is None or max_freq < freq:
                best_pair = pair
                max_freq = freq
        print("\t Find most freq: pair[%s], freq[%s]" % (best_pair, max_freq))
        return best_pair

    def merge_pair(a, b, splits):
        '''
        Merge every occurrence of the adjacent pair "a b" in splits into "ab".
        '''
        combine_ab = "%s%s" % (a, b)
        for word in word_freqs:
            split = splits[word]  # the current split of word
            if len(split) == 1:   # only one subword, i.e. the word itself
                continue
            i = 0
            while i < len(split) - 1:
                if split[i] == a and split[i + 1] == b:  # a and b are adjacent, merge them
                    split = split[:i] + [combine_ab, ] + split[i + 2:]
                else:
                    i += 1
            splits[word] = split
        return splits

    merges = {}
    vocab_size = 50
    while len(vocab) < vocab_size:
        print("Current vocab size:%s" % len(vocab))
        pair_freqs = compute_pair_freqs(splits)
        print("\t Top3 Pair freq:%s" % sorted(pair_freqs.items(), key=lambda x: -x[1])[:3])  # sorted by frequency, descending
        current_pair = find_most_freq(pair_freqs)
        new_subword = "%s%s" % (current_pair[0], current_pair[1])
        splits = merge_pair(current_pair[0], current_pair[1], splits)
        print("\t Merge '%s %s' to '%s'" % (current_pair[0], current_pair[1], new_subword))
        merges[current_pair] = new_subword
        vocab.append(new_subword)

    # Current vocab size:30
    #     Top3 Pair freq:[(('Ġ', 'm'), 3), (('l', 's'), 3), (('Ġ', 'c'), 3)]
    #     Find most freq: pair[('Ġ', 'm')], freq[3]
    #     Merge 'Ġ m' to 'Ġm'
    # Current vocab size:31
    #     Top3 Pair freq:[(('l', 's'), 3), (('Ġ', 'c'), 3), (('l', 'e'), 3)]
    #     Find most freq: pair[('l', 's')], freq[3]
    #     Merge 'l s' to 'ls'
    # ...

    print(merges)  # the 20 learned merge rules
    # {('Ġ', 'm'): 'Ġm', ('l', 's'): 'ls', ('Ġ', 'c'): 'Ġc', ('l', 'e'): 'le', ...}

    print(vocab)  # the vocabulary = special tokens + initial alphabet + merge results
    # ['<|endoftext|>', ',', '.', 'T', 'W', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'Ġ', 'Ġm', 'ls', 'Ġc', 'le', 'lu', 'Ġand', 'is', 'The', 'Ġd', 'om', 'ence', 'ran', 'rans', 'Ġmode', 'Ġmodels', 'Ġar', 'Ġb', 'ase', 'ased', 'Ġon']
  2. To tokenize new text, we pre-tokenize it, split each word into individual characters, and then apply all the learned merge rules (item 3 below shows the equivalent workflow using the library directly).

    def tokenize(text, merges):
        '''
        Tokenization. text is the input text, merges is the set of learned merge rules.
        '''
        ################## step1: pre_tokenize ##################
        pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        pre_tokenize_result = pre_tokenizer.pre_tokenize_str(text)
        pre_tokenized_text = [word for word, offset in pre_tokenize_result]
        ################## step2: split ##################
        splits = [[ch for ch in word] for word in pre_tokenized_text]
        ################## step3: tokenize ##################
        for pair, merge in merges.items():  # apply the merge rules in the order they were learned (dict preserves insertion order)
            for idx, split in enumerate(splits):
                i = 0
                ########### process each split ########
                while i < len(split) - 1:
                    if split[i] == pair[0] and split[i + 1] == pair[1]:
                        split = split[:i] + [merge] + split[i + 2:]
                    else:
                        i += 1
                splits[idx] = split
        return sum(splits, [])

    print(tokenize("This's me .", merges))
    # ['T', 'h', 'is', "'", 's', 'Ġm', 'e', 'Ġ', 'Ġ', '.']
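  3. The manual walk-through above mirrors what the Hugging Face Tokenizers library does internally. For comparison, the following is a minimal sketch of training a byte-level BPE tokenizer on the same corpus with the library itself; the resulting merges and vocabulary may differ slightly from the manual version, since the library builds its own byte-level alphabet and applies its own tie-breaking, and the parameter choices (vocab_size=50, the <|endoftext|> special token) are simply copied from the example above.

    from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers

    # BPE model with the same byte-level pre-tokenization as the manual example
    bpe_tokenizer = Tokenizer(models.BPE())
    bpe_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    bpe_tokenizer.decoder = decoders.ByteLevel()

    # train on the in-memory corpus
    trainer = trainers.BpeTrainer(vocab_size=50, special_tokens=["<|endoftext|>"])
    bpe_tokenizer.train_from_iterator(corpus, trainer=trainer)

    print(bpe_tokenizer.encode("This's me .").tokens)  # subword tokens from the library-trained BPE model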

2.2 WordPiece

  1. Training algorithm:

    from collections import defaultdict
    from tokenizers import pre_tokenizers

    corpus = [
        # The first sentences from the abstract of "<Attention Is All You Need>"
        "The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks that include an encoder and a decoder.",
        "The bestperforming models also connect the encoder and decoder through an attentionmechanism.",
        "We propose a new simple network architecture, the Transformer,based solely on attention mechanisms, dispensing with recurrence and convolutionsentirely."
    ]

    #################### Step1: word freq ################
    word_freqs = defaultdict(int)
    pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    for text in corpus:
        words_with_offsets = pre_tokenizer.pre_tokenize_str(text)
        new_words = [word for word, offset in words_with_offsets]
        for word in new_words:
            word_freqs[word] += 1
    print(word_freqs)
    # defaultdict(<class 'int'>, {'The': 2, 'dominant': 1, 'sequence': 1, ...})

    #################### Step2: alphabet ################
    alphabet = []  # the initial alphabet
    for word in word_freqs.keys():
        if word[0] not in alphabet:  # the first letter of a word
            alphabet.append(word[0])
        for letter in word[1:]:  # letters other than the first letter of a word
            if f"##{letter}" not in alphabet:  # f-string syntax: {letter} is replaced by the value of letter
                alphabet.append(f"##{letter}")
    alphabet.sort()
    print(alphabet)
    # ['##a', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##q', '##r', '##s', '##t', '##u', '##v', '##w', '##x', '##y', ',', '.', 'T', 'W', 'a', 'b', 'c', 'd', 'e', 'i', 'm', 'n', 'o', 'p', 'r', 's', 't', 'w']

    vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()  # add special tokens

    #################### Step3: split word to char ################
    splits = {
        word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
        for word in word_freqs.keys()
    }
    print(splits)  # each character is one subword
    # {'The': ['T', '##h', '##e'], 'dominant': ['d', '##o', '##m', '##i', '##n', '##a', '##n', '##t'],...}

    #################### Step4: find highest score and merge ################
    def compute_pair_scores(splits):
        '''
        Compute the score of merging each pair of adjacent subwords.

        :param splits: the current split of every word
        '''
        letter_freqs = defaultdict(int)
        pair_freqs = defaultdict(int)
        for word, freq in word_freqs.items():
            split = splits[word]
            if len(split) == 1:  # only one subword (the word itself)
                letter_freqs[split[0]] += freq
                continue
            for i in range(len(split) - 1):  # several subwords
                pair = (split[i], split[i + 1])
                letter_freqs[split[i]] += freq
                pair_freqs[pair] += freq
            letter_freqs[split[-1]] += freq  # the last subword has no following pair but still needs counting
        scores = {
            pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
            for pair, freq in pair_freqs.items()
        }
        return scores

    def find_max_score(scores):
        '''
        Find the pair with the highest score.
        '''
        best_pair = ""
        max_score = None
        for pair, score in scores.items():
            if max_score is None or max_score < score:
                best_pair = pair
                max_score = score
        print("\t Find max score: pair[%s], freq[%s]" % (best_pair, max_score))
        return best_pair

    def merge_pair(a, b, splits):
        '''
        Merge every occurrence of the adjacent pair "a b" in splits into "ab".
        '''
        combine_ab = "%s%s" % (a, b[2:] if b.startswith("##") else b)
        for word in word_freqs:
            split = splits[word]  # the current split of word
            if len(split) == 1:   # only one subword, i.e. the word itself
                continue
            i = 0
            while i < len(split) - 1:
                if split[i] == a and split[i + 1] == b:  # a and b are adjacent, merge them
                    split = split[:i] + [combine_ab, ] + split[i + 2:]
                else:
                    i += 1
            splits[word] = split
        return splits

    vocab_size = 50
    while len(vocab) < vocab_size:
        print("Current vocab size:%s" % len(vocab))
        scores = compute_pair_scores(splits)
        print("\t Top3 Pair scores:%s" % sorted(scores.items(), key=lambda x: -x[1])[:3])  # sorted by score, descending
        current_pair = find_max_score(scores)
        new_subword = "%s%s" % (current_pair[0], current_pair[1][2:] if current_pair[1].startswith("##") else current_pair[1])
        splits = merge_pair(current_pair[0], current_pair[1], splits)
        print("\t Merge '%s %s' to '%s'" % (current_pair[0], current_pair[1], new_subword))
        vocab.append(new_subword)

    # Current vocab size:46
    #     Top3 Pair scores:[(('##q', '##u'), 0.1), (('##l', '##y'), 0.076923), (('t', '##h'), 0.072727)]
    #     Find max score: pair[('##q', '##u')], freq[0.1]
    #     Merge '##q ##u' to '##qu'
    # Current vocab size:47
    #     Top3 Pair scores:[(('##l', '##y'), 0.076923), (('t', '##h'), 0.072727), (('b', '##a'), 0.066667)]
    #     Find max score: pair[('##l', '##y')], freq[0.076923]
    #     Merge '##l ##y' to '##ly'
    # ...

    print(vocab)  # the vocabulary = special tokens + initial alphabet + merge results
    # ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '##a', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##k', '##l', '##m', '##n', '##o', '##p', '##q', '##r', '##s', '##t', '##u', '##v', '##w', '##x', '##y', ',', '.', 'T', 'W', 'a', 'b', 'c', 'd', 'e', 'i', 'm', 'n', 'o', 'p', 'r', 's', 't', 'w', '##qu', '##ly', 'th', 'Th']
  2. To tokenize new text, we pre-tokenize it, then for each word we repeatedly split off the longest vocabulary subword that matches the start of the remaining string (item 3 below shows the equivalent workflow using the library directly).

    def encode_word(word, vocab):
        '''
        Split a single word into subwords with WordPiece.
        '''
        tokens = []
        while len(word) > 0:
            i = len(word)
            while i > 0 and word[:i] not in vocab:  # longest match
                i -= 1
            if i == 0:
                return ["[UNK]"]
            tokens.append(word[:i])  # the longest matching subword
            word = word[i:]          # the remaining part
            if len(word) > 0:
                word = f"##{word}"
        return tokens

    def tokenize(text, vocab):
        '''
        Tokenize a text. vocab is the vocabulary.
        '''
        pre_tokenizer = pre_tokenizers.BertPreTokenizer()
        pre_tokenize_result = pre_tokenizer.pre_tokenize_str(text)
        pre_tokenized_text = [word for word, offset in pre_tokenize_result]
        encoded_words = [encode_word(word, vocab) for word in pre_tokenized_text]
        return sum(encoded_words, [])  # flatten the list of lists

    print(tokenize("This's me .", vocab))
    # ['Th', '##i', '##s', '[UNK]', 's', 'm', '##e', '.']
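  3. For comparison, here is a minimal sketch of the equivalent training with the library's WordPiece model and trainer; vocab_size=50 and the BERT special tokens are copied from the manual example, while the learned vocabulary and the handling of out-of-vocabulary words may differ from the hand-rolled version.

    from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers

    # WordPiece model with the same "##" continuation prefix and [UNK] token as the manual example
    wp_tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))
    wp_tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    wp_tokenizer.decoder = decoders.WordPiece(prefix="##")

    trainer = trainers.WordPieceTrainer(
        vocab_size=50,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    )
    wp_tokenizer.train_from_iterator(corpus, trainer=trainer)

    print(wp_tokenizer.encode("This's me .").tokens)  # a mix of word-initial and "##"-prefixed subwords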

2.3 Unigram

  1. Training algorithm:

    from collections import defaultdict
    from math import log
    import copy
    from tokenizers import pre_tokenizers

    corpus = [
        # The first sentences from the abstract of "<Attention Is All You Need>"
        "The dominant sequence transduction models are based on complex recurrent orconvolutional neural networks that include an encoder and a decoder.",
        "The bestperforming models also connect the encoder and decoder through an attentionmechanism.",
        "We propose a new simple network architecture, the Transformer,based solely on attention mechanisms, dispensing with recurrence and convolutionsentirely."
    ]

    #################### Step1: word freq ################
    word_freqs = defaultdict(int)
    pre_tokenizer = pre_tokenizers.Metaspace()
    for text in corpus:
        words_with_offsets = pre_tokenizer.pre_tokenize_str(text)
        new_words = [word for word, offset in words_with_offsets]
        for word in new_words:
            word_freqs[word] += 1
    print(word_freqs)
    # defaultdict(<class 'int'>, {'▁The': 2, '▁dominant': 1, '▁sequence': 1, ...})

    #################### Step2: initial vocab ################
    char_freqs = defaultdict(int)      # frequency of every character
    subwords_freqs = defaultdict(int)  # frequency of every substring
    for word, freq in word_freqs.items():
        for i in range(len(word)):
            char_freqs[word[i]] += freq
            # Loop through the subwords of length at least 2
            for j in range(i + 2, len(word) + 1):
                subwords_freqs[word[i:j]] += freq
    sorted_subwords = sorted(subwords_freqs.items(), key=lambda x: x[1], reverse=True)
    init_vocab_size = 300  # a fairly large initial vocabulary
    token_freqs = list(char_freqs.items()) + sorted_subwords[: init_vocab_size - len(char_freqs)]
    token_freqs = {token: freq for token, freq in token_freqs}
    print(sorted_subwords[:5])
    # [('▁a', 12), ('an', 10), ('on', 10), ('en', 9), ('de', 9)]

    #################### Step3: model ################
    total_sum = sum([freq for token, freq in token_freqs.items()])
    # model stores the negative log-likelihood of every candidate token
    model = {token: -log(freq * 1.0 / total_sum) for token, freq in token_freqs.items()}

    #################### Step4: encoding function and loss function ################
    def encode_word(word, model):
        '''
        Viterbi decoding via dynamic programming: segment a word according to the loss of each subword.
        '''
        best_segmentations = [{"start": 0, "score": 1}] + [
            {"start": None, "score": None} for _ in range(len(word))
        ]
        # core data structure holding the state at each position: element i describes the best
        # segmentation of the prefix word[:i] as (most recent split point, loss of the best segmentation)
        for start_idx in range(len(word)):
            # This should be properly filled by the previous steps of the loop
            best_score_at_start = best_segmentations[start_idx]["score"]  # best score of the prefix
            ######### look for the next split point #############
            for end_idx in range(start_idx + 1, len(word) + 1):
                token = word[start_idx:end_idx]
                if token in model and best_score_at_start is not None:
                    score = model[token] + best_score_at_start
                    if (
                        best_segmentations[end_idx]["score"] is None
                        or best_segmentations[end_idx]["score"] > score  # smaller loss
                    ):
                        best_segmentations[end_idx] = {"start": start_idx, "score": score}
        segmentation = best_segmentations[-1]  # the last position holds the final segmentation
        if segmentation["score"] is None:
            # We did not find a tokenization of the word -> unknown
            return ["<unk>"], None
        score = segmentation["score"]
        start = segmentation["start"]  # the previous split point
        end = len(word)
        tokens = []
        while start != 0:
            tokens.insert(0, word[start:end])
            next_start = best_segmentations[start]["start"]
            end = start
            start = next_start
        tokens.insert(0, word[start:end])
        return tokens, score

    def compute_loss(model):
        '''
        Compute the total loss of the corpus under the current model.
        '''
        loss = 0
        for word, freq in word_freqs.items():
            _, word_loss = encode_word(word, model)
            loss += freq * word_loss
        return loss

    def compute_scores(model):
        '''
        Score each token by how much the loss changes when that token is removed.
        '''
        scores = {}
        model_loss = compute_loss(model)
        for token, score in model.items():
            if len(token) == 1:  # always keep single characters
                continue
            model_without_token = copy.deepcopy(model)
            _ = model_without_token.pop(token)
            scores[token] = compute_loss(model_without_token) - model_loss
        return scores

    #################### Step5: shrink the vocabulary ################
    percent_to_remove = 0.1  # remove 10% per iteration
    max_vocab_size = 100     # maximum vocabulary size
    while len(model) > max_vocab_size:
        scores = compute_scores(model)
        sorted_scores = sorted(scores.items(), key=lambda x: x[1])
        print("Top3 scores:%s" % sorted_scores[-3:])
        for i in range(int(len(model) * percent_to_remove)):  # remove the 10% with the smallest scores
            _ = token_freqs.pop(sorted_scores[i][0])
        ### rebuild the model ###
        total_sum = sum([freq for token, freq in token_freqs.items()])
        model = {token: -log(freq * 1.0 / total_sum) for token, freq in token_freqs.items()}

    # Top3 scores:[('ing', 8.45913446432769), ('form', 9.041467278547316), ('▁and', 9.270398846926355)]
    # Top3 scores:[('form', 8.756385177048287), ('▁and', 8.84277569467804), ('tion', 9.158034534900253)]
    # Top3 scores:[('rans', 11.55887624144998), ('▁The', 13.833700317065222), ('▁models', 21.35200333126363)]
    # ...
  2. To tokenize new text, we pre-tokenize it and then run Viterbi decoding on each word (item 3 below shows the equivalent workflow using the library directly).

    def tokenize(text, model):
        '''
        Tokenize a text.
        '''
        words_with_offsets = pre_tokenizers.Metaspace().pre_tokenize_str(text)
        pre_tokenized_text = [word for word, offset in words_with_offsets]
        encoded_words = [encode_word(word, model)[0] for word in pre_tokenized_text]
        return sum(encoded_words, [])

    print(tokenize("This's me .", model))
    # ['<unk>', '▁', 'me', '▁', '▁', '.']
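  3. As with BPE and WordPiece, the library can train a Unigram model directly. The sketch below uses vocab_size=100 and "<unk>" to match the manual walk-through; the final vocabulary and segmentations may differ, because the library's trainer re-estimates token probabilities with an EM-style procedure rather than the simplified frequency-based model above.

    from tokenizers import Tokenizer, decoders, models, pre_tokenizers, trainers

    # Unigram model with Metaspace pre-tokenization, as in the manual example
    uni_tokenizer = Tokenizer(models.Unigram())
    uni_tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
    uni_tokenizer.decoder = decoders.Metaspace()

    trainer = trainers.UnigramTrainer(
        vocab_size=100,
        special_tokens=["<unk>"],
        unk_token="<unk>",
    )
    uni_tokenizer.train_from_iterator(corpus, trainer=trainer)

    print(uni_tokenizer.encode("This's me .").tokens)  # the Viterbi segmentation chosen by the trained model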
