Faster Python lemmatization
I have been testing different lemmatization methods since the chosen one will be used on a very large corpus. Below are my methods and results. Does anyone have any tips to speed any of these methods up? Spacy was the fastest with part-of-speech tags included (preferred), followed by lemminflect. Am I going about this the wrong way? These functions are applied with pandas .apply() on a DataFrame containing the text.
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def prepareString_nltk_current(x):
    lemmatizer = WordNetLemmatizer()
    x = re.sub(r"[^0-9a-z]", " ", x)  # keep only digits and lowercase letters
    if len(x) == 0:
        return ''
    tokens = word_tokenize(x)
    tokens = [lemmatizer.lemmatize(word).strip() for word in tokens if word not in stop_words]
    if len(tokens) == 0:
        return ''
    return ' '.join(tokens)
from pattern.en import lemma

def prepareString_pattern(x):
    error = 'Error'
    x = re.sub(r"[^0-9a-z.,;]", " ", x)
    if len(x) == 0:
        return ''
    try:
        return " ".join([lemma(wd) if wd not in ['this', 'his'] else wd for wd in x.split()])
    except StopIteration:  # pattern can raise StopIteration on its first call under newer Python versions
        return error
import spacy
nlp = spacy.load('en_core_web_sm')

def prepareString_spacy_pretrained(x):
    if len(x) == 0:
        return ''
    doc = nlp(x)
    # token.lemma_ is the lemma string; token.lemma is only the integer hash
    return re.sub(r"[^0-9a-zA-Z]", " ", " ".join(token.lemma_ for token in doc)).lower()
import nltk

def get_wordnet_pos(word):
    """Map the POS tag to the first character lemmatize() accepts, then lemmatize the word."""
    lemmatizer = WordNetLemmatizer()
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": 'a',
                "N": 'n',
                "V": 'v',
                "R": 'r'}
    return lemmatizer.lemmatize(word, tag_dict.get(tag, 'n'))
def prepareString_nltk_pos(x):
    if len(x) == 0:
        return ''
    tokens = word_tokenize(x)
    return " ".join(get_wordnet_pos(w) for w in tokens)
from textblob import TextBlob

def prepareString_textblob(x):
    sent = TextBlob(x)
    tag_dict = {"J": 'a',
                "N": 'n',
                "V": 'v',
                "R": 'r'}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]
    return " ".join([wd.lemmatize(tag) for wd, tag in words_and_tags])
from gensim.utils import lemmatize  # requires the pattern package; removed in gensim 4.x

def prepareString_gensim(x):
    return " ".join([wd.decode('utf-8').split('/')[0] for wd in lemmatize(x)])
import lemminflect  # registers the ._.lemma() extension on spaCy tokens

def prepareString_lemminflect(x):
    doc = nlp(x)
    return " ".join([token._.lemma() for token in doc])
from pattern.en import parsetree

def prepareString_pattern_pos(x):
    s = parsetree(x, tags=True, lemmata=True)
    lemmas = [word.lemma for sentence in s for word in sentence.words]
    return re.sub(r"[^0-9a-zA-Z]", " ", " ".join(lemmas)).lower()
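For reference, the call pattern described above looks roughly like this; the DataFrame and column names are placeholders, not part of the original code:

import pandas as pd

# hypothetical DataFrame with a text column, as described in the question
df = pd.DataFrame({"text": ["The striped bats were hanging on their feet."]})
df["lemmas"] = df["text"].apply(prepareString_spacy_pretrained)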
1 Answer
I think it's the Spacy parsing (creating the POS tags, etc.) that takes the time, not the actual lemmatization. From Lemminflect's README, that library takes on average 42 µs per lemma (not including parsing). It looks like you're spending more like 42 ms (i.e. 1044 s / 26536 lemmas). This means you really need to speed up Spacy's parsing.
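One way to attack that (a sketch, not code from this answer) is to disable the spaCy pipeline components you don't need for lemmatization and batch the texts with nlp.pipe() instead of calling nlp() once per row; the model name, column name, and batch size below are assumptions:

import spacy

# Lemmatization only needs the POS tagger, so parser and NER can usually be disabled
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def lemmatize_batch(texts):
    # nlp.pipe streams documents in batches, far cheaper than one nlp(text) call per row
    return [" ".join(token.lemma_ for token in doc)
            for doc in nlp.pipe(texts, batch_size=1000)]

df["lemmas"] = lemmatize_batch(df["text"].tolist())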
You can also speed up Lemminflect a bit by calling getLemma() with the param lemmatize_oov=False. This only does a dictionary lemma look-up, which is very fast. It will not lemmatize out-of-vocab words (i.e. misspellings, rare words, ...), which is much slower. Note that you'll have to parse the sentences to get the upos. In Spacy I think this is token.pos_. See lemminflect's Part-Of-Speech Tags documentation for what it expects, and Spacy's docs to verify that this is the .pos_ attribute. However, I think your big issue is the parsing, and small changes in lemmatization speed aren't going to impact you much.
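A minimal sketch of that dictionary-only path, assuming lemminflect's getLemma() signature and spaCy's .pos_ attribute, and falling back to the surface form when the lookup comes back empty:

import spacy
from lemminflect import getLemma

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def fast_lemmas(text):
    doc = nlp(text)
    out = []
    for token in doc:
        # Dictionary-only lookup; out-of-vocab words are left unchanged rather than guessed
        lemmas = getLemma(token.text, upos=token.pos_, lemmatize_oov=False)
        out.append(lemmas[0] if lemmas else token.text)
    return " ".join(out)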
I should also point out that parsing only works if the word appears in a sentence. From your code it looks like you're doing this correctly, but I can't tell for sure. Be sure you are, since the parser can't select the correct POS if you only give it a single word or a small fragment of text.