从关键字字符串中删除重音符号

发布于 2025-01-10 13:52:43 字数 4520 浏览 0 评论 0原文

这是一段聊天机器人的文本处理代码，它会删除一些冠词和介词，使机器人更容易理解输入

    import json
    from random import choice
        
    class ChatterMessage:
        """Turn one raw user message into a bot reply.

        Construction runs the whole pipeline: the message is lower-cased,
        reduced to keywords, matched against the entries loaded from the
        responses file, and one reply is picked at random from the best
        matching entry.

        Attributes set by ``__init__``:
            raw: the lower-cased input text.
            processed_str: list of keywords extracted from ``raw``.
            responses: entries loaded from the JSON responses file; each
                entry is read through its ``'keywords'`` and ``'response'``
                keys.
            data: the selected entry (or the fallback entry).
            response: the reply chosen with ``random.choice``.
        """

        def __init__(self, raw):
            self.raw = str(raw).lower()
            self.processed_str = self.reduce()
            self.responses = self.get_responses()
            self.data = self.process_response()
            self.response = choice(self.data['response'])

        def remove_unwanted_chars(self, string):
            """Return ``string`` with punctuation/symbol characters removed."""
            unwanted = set('?.,!@[]{}#$%*&()-_+=')
            return "".join(char for char in string if char not in unwanted)

        def get_responses(self, response_file="info.json"):
            """Load and return the parsed content of the JSON responses file."""
            # Explicit UTF-8 so accented keywords load the same on every platform.
            with open(response_file, 'r', encoding='utf-8') as file:
                return json.load(file)

        def reduce(self):
            """Return the keywords of ``self.raw``.

            Each whitespace-separated token is stripped of punctuation first
            and only then compared with the stopword list, so ``"que?"`` is
            filtered just like ``"que"``. Tokens that end up empty are dropped.
            """
            stopwords = {'de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria', 'teríamos', 'teriam'}
            custom_filter = set()  # extra project-specific words to ignore
            keywords_list = []
            # split() without arguments also collapses repeated whitespace.
            for token in self.raw.split():
                word = self.remove_unwanted_chars(token)
                if word and word not in stopwords and word not in custom_filter:
                    keywords_list.append(word)
            return keywords_list

        def process_response(self):
            """Select the response entry that shares most keywords with the input.

            The score of the best entry is its number of matching keywords
            expressed as a percentage of the total number of keywords over
            all entries. Below 6 percent a fallback entry is returned.

            Returns:
                The selected entry from ``self.responses`` (its ``'keywords'``
                list is extended in place with the input keywords it did not
                already contain), or a fallback dict whose ``'response'`` is
                a one-item list so ``random.choice`` yields the full sentence.
            """
            tokens = set(self.processed_str)
            total = sum(len(entry['keywords']) for entry in self.responses)
            most_acc = 0
            acc = 0
            response_data = None
            for value in self.responses:
                hits = sum(
                    1 for keyword in value['keywords']
                    if str(keyword).lower() in tokens
                )
                # Strict comparison: on a tie the earliest entry wins.
                if hits > most_acc:
                    most_acc = hits
                    acc = (100 * most_acc) / total
                    response_data = value
            if acc < 6:
                # Must be a list: choice() on a bare string returns one character.
                return {"response": ["Sorry, I do not understand. Be more clear please"]}
            for word in self.processed_str:
                if word not in response_data['keywords']:
                    response_data['keywords'].append(word)

            return response_data
    
    
    if __name__ == '__main__':
        # Console chat loop: read one line, build a ChatterMessage from it
        # and print the reply it selected. Runs until interrupted.
        while True:
            k = input("Você: ")
            res = ChatterMessage(k).response
            print("Bot:", res)

如何从关键字字符串中删除重音符号，以便“让聊天机器人更容易理解”？我找到了这个解释：《如何在 Python 3 中删除字符串中的重音符号？》，但我不知道该如何把它应用到这段代码中，因为我一改动，机器人就会停止响应

原文

This is text-processing code for a chatbot; it removes some articles and prepositions to make the input easier for the bot to read

    import json
    from random import choice
        
    class ChatterMessage:
        """Turn one raw user message into a bot reply.

        Construction runs the whole pipeline: the message is lower-cased,
        reduced to keywords, matched against the entries loaded from the
        responses file, and one reply is picked at random from the best
        matching entry.

        Attributes set by ``__init__``:
            raw: the lower-cased input text.
            processed_str: list of keywords extracted from ``raw``.
            responses: entries loaded from the JSON responses file; each
                entry is read through its ``'keywords'`` and ``'response'``
                keys.
            data: the selected entry (or the fallback entry).
            response: the reply chosen with ``random.choice``.
        """

        def __init__(self, raw):
            self.raw = str(raw).lower()
            self.processed_str = self.reduce()
            self.responses = self.get_responses()
            self.data = self.process_response()
            self.response = choice(self.data['response'])

        def remove_unwanted_chars(self, string):
            """Return ``string`` with punctuation/symbol characters removed."""
            # Includes "$", which was garbled into an unterminated literal.
            unwanted = set('?.,!@[]{}#$%*&()-_+=')
            return "".join(char for char in string if char not in unwanted)

        def get_responses(self, response_file="info.json"):
            """Load and return the parsed content of the JSON responses file."""
            # Explicit UTF-8 so accented keywords load the same on every platform.
            with open(response_file, 'r', encoding='utf-8') as file:
                return json.load(file)

        def reduce(self):
            """Return the keywords of ``self.raw``.

            Each whitespace-separated token is stripped of punctuation first
            and only then compared with the stopword list, so ``"que?"`` is
            filtered just like ``"que"``. Tokens that end up empty are dropped.
            """
            stopwords = {'de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria', 'teríamos', 'teriam'}
            custom_filter = set()  # extra project-specific words to ignore
            keywords_list = []
            # split() without arguments also collapses repeated whitespace.
            for token in self.raw.split():
                word = self.remove_unwanted_chars(token)
                if word and word not in stopwords and word not in custom_filter:
                    keywords_list.append(word)
            return keywords_list

        def process_response(self):
            """Select the response entry that shares most keywords with the input.

            The score of the best entry is its number of matching keywords
            expressed as a percentage of the total number of keywords over
            all entries. Below 6 percent a fallback entry is returned.

            Returns:
                The selected entry from ``self.responses`` (its ``'keywords'``
                list is extended in place with the input keywords it did not
                already contain), or a fallback dict whose ``'response'`` is
                a one-item list so ``random.choice`` yields the full sentence.
            """
            tokens = set(self.processed_str)
            total = sum(len(entry['keywords']) for entry in self.responses)
            most_acc = 0
            acc = 0
            response_data = None
            for value in self.responses:
                hits = sum(
                    1 for keyword in value['keywords']
                    if str(keyword).lower() in tokens
                )
                # Strict comparison: on a tie the earliest entry wins.
                if hits > most_acc:
                    most_acc = hits
                    acc = (100 * most_acc) / total
                    response_data = value
            if acc < 6:
                # Must be a list: choice() on a bare string returns one character.
                return {"response": ["Sorry, I do not understand. Be more clear please"]}
            for word in self.processed_str:
                if word not in response_data['keywords']:
                    response_data['keywords'].append(word)

            return response_data
    
    
    if __name__ == '__main__':
        # Console chat loop: read one line, build a ChatterMessage from it
        # and print the reply it selected. Runs until interrupted.
        while True:
            k = input("Você: ")
            res = ChatterMessage(k).response
            print("Bot:", res)

How can I remove accents from the keyword strings to "make it easier" for the chatbot to read them? I found this explanation: "How to remove string accents using Python 3?", but I don't know how to apply it to this code, because whenever I try, the bot stops responding

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

冰葑 2025-01-17 13:52:43

您可以使用 Python 包 unidecode 将特殊字符替换为 ASCII 等效字符。

# Minimal demonstration: unidecode() replaces special (non-ASCII) characters
# with ASCII equivalents.
from unidecode import unidecode
text = "Björn, Łukasz and Σωκράτης."
print(unidecode(text))
# Printed output:
# ==> Bjorn, Lukasz and Sokrates.

您可以将其应用于输入和关键字。

# In the function definition of reduce(), place this line of code after 
# stopwords = ['de', 'a', 'o', .....])
stopwords = [unidecode(s) for s in stopwords]

# In "__main__": replace k = input("Você: ") with the following line of code.
k = unidecode(input("Você: "))

如果有意义的话，您还可以强制字符串全部为小写。这将使您的字符串比较更加稳健。

k = unidecode(input("Você: ").lower())

因为您请求了完整的代码：

import json
from random import choice
from unidecode import unidecode
    
class ChatterMessage:
    """Turn one raw user message into a bot reply, ignoring accents.

    Construction runs the whole pipeline: the message is transliterated to
    ASCII with ``unidecode`` and lower-cased, reduced to keywords, matched
    against the entries loaded from the responses file, and one reply is
    picked at random from the best matching entry.

    Accent stripping is applied to the input, to the stopwords and to the
    stored keywords at comparison time, so all three sides of the matching
    use the same normalised form.

    Attributes set by ``__init__``:
        raw: the normalised (ASCII, lower-case) input text.
        processed_str: list of keywords extracted from ``raw``.
        responses: entries loaded from the JSON responses file; each entry
            is read through its ``'keywords'`` and ``'response'`` keys.
        data: the selected entry (or the fallback entry).
        response: the reply chosen with ``random.choice``.
    """

    def __init__(self, raw):
        # Normalise here so the class works no matter how the caller
        # obtained the text; unidecode leaves plain ASCII unchanged.
        self.raw = unidecode(str(raw)).lower()
        self.processed_str = self.reduce()
        self.responses = self.get_responses()
        self.data = self.process_response()
        self.response = choice(self.data['response'])

    def remove_unwanted_chars(self, string):
        """Return ``string`` with punctuation/symbol characters removed."""
        unwanted = set('?.,!@[]{}#$%*&()-_+=')
        return "".join(char for char in string if char not in unwanted)

    def get_responses(self, response_file="info.json"):
        """Load and return the parsed content of the JSON responses file."""
        # Explicit UTF-8 so accented keywords load the same on every platform.
        with open(response_file, 'r', encoding='utf-8') as file:
            return json.load(file)

    def reduce(self):
        """Return the keywords of ``self.raw``.

        Each whitespace-separated token is stripped of punctuation first and
        only then compared with the accent-free stopword list, so ``"que?"``
        is filtered just like ``"que"``. Tokens that end up empty are dropped.
        """
        stopwords = ['de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria', 'teríamos', 'teriam']
        # The input is accent-free, so the stopwords must be as well.
        stopwords = {unidecode(s) for s in stopwords}

        custom_filter = set()  # extra project-specific words to ignore
        keywords_list = []
        # split() without arguments also collapses repeated whitespace.
        for token in self.raw.split():
            word = self.remove_unwanted_chars(token)
            if word and word not in stopwords and word not in custom_filter:
                keywords_list.append(word)
        return keywords_list

    def process_response(self):
        """Select the response entry that shares most keywords with the input.

        Stored keywords are lower-cased and transliterated with ``unidecode``
        before comparison, so an accented keyword in the JSON file still
        matches the accent-free input. The score of the best entry is its
        number of matching keywords expressed as a percentage of the total
        number of keywords over all entries. Below 6 percent a fallback
        entry is returned.

        Returns:
            The selected entry from ``self.responses`` (its ``'keywords'``
            list is extended in place with the input keywords it did not
            already contain), or a fallback dict whose ``'response'`` is a
            one-item list so ``random.choice`` yields the full sentence.
        """
        tokens = set(self.processed_str)
        total = sum(len(entry['keywords']) for entry in self.responses)
        most_acc = 0
        acc = 0
        response_data = None
        for value in self.responses:
            hits = sum(
                1 for keyword in value['keywords']
                if unidecode(str(keyword).lower()) in tokens
            )
            # Strict comparison: on a tie the earliest entry wins.
            if hits > most_acc:
                most_acc = hits
                acc = (100 * most_acc) / total
                response_data = value
        if acc < 6:
            # Must be a list: choice() on a bare string returns one character.
            return {"response": ["Sorry, I do not understand. Be more clear please"]}
        for word in self.processed_str:
            if word not in response_data['keywords']:
                response_data['keywords'].append(word)

        return response_data


if __name__ == '__main__':
    # Console chat loop: each typed line is transliterated to ASCII and
    # handed to ChatterMessage; the reply it selected is printed.
    while True:
        user_text = input("Você: ")
        reply = ChatterMessage(unidecode(user_text)).response
        print("Bot:", reply)

You could use the Python package unidecode that replaces special characters with ASCII equivalents.

# Minimal demonstration: unidecode() replaces special (non-ASCII) characters
# with ASCII equivalents.
from unidecode import unidecode
text = "Björn, Łukasz and Σωκράτης."
print(unidecode(text))
# Printed output:
# ==> Bjorn, Lukasz and Sokrates.

You could apply this to both the input and keywords.

# In the function definition of reduce(), place this line of code after 
# stopwords = ['de', 'a', 'o', .....])
stopwords = [unidecode(s) for s in stopwords]

# In "__main__": replace k = input("Você: ") with the following line of code.
k = unidecode(input("Você: "))

If it makes sense, you could also force the strings to be all lowercase. This will make your string comparisons even more robust.

k = unidecode(input("Você: ").lower())

Because you requested the entire code:

import json
from random import choice
from unidecode import unidecode
    
class ChatterMessage:
    """Turn one raw user message into a bot reply, ignoring accents.

    Construction runs the whole pipeline: the message is transliterated to
    ASCII with ``unidecode`` and lower-cased, reduced to keywords, matched
    against the entries loaded from the responses file, and one reply is
    picked at random from the best matching entry.

    Accent stripping is applied to the input, to the stopwords and to the
    stored keywords at comparison time, so all three sides of the matching
    use the same normalised form.

    Attributes set by ``__init__``:
        raw: the normalised (ASCII, lower-case) input text.
        processed_str: list of keywords extracted from ``raw``.
        responses: entries loaded from the JSON responses file; each entry
            is read through its ``'keywords'`` and ``'response'`` keys.
        data: the selected entry (or the fallback entry).
        response: the reply chosen with ``random.choice``.
    """

    def __init__(self, raw):
        # Normalise here so the class works no matter how the caller
        # obtained the text; unidecode leaves plain ASCII unchanged.
        self.raw = unidecode(str(raw)).lower()
        self.processed_str = self.reduce()
        self.responses = self.get_responses()
        self.data = self.process_response()
        self.response = choice(self.data['response'])

    def remove_unwanted_chars(self, string):
        """Return ``string`` with punctuation/symbol characters removed."""
        # Includes "$", which was garbled into an unterminated literal.
        unwanted = set('?.,!@[]{}#$%*&()-_+=')
        return "".join(char for char in string if char not in unwanted)

    def get_responses(self, response_file="info.json"):
        """Load and return the parsed content of the JSON responses file."""
        # Explicit UTF-8 so accented keywords load the same on every platform.
        with open(response_file, 'r', encoding='utf-8') as file:
            return json.load(file)

    def reduce(self):
        """Return the keywords of ``self.raw``.

        Each whitespace-separated token is stripped of punctuation first and
        only then compared with the accent-free stopword list, so ``"que?"``
        is filtered just like ``"que"``. Tokens that end up empty are dropped.
        """
        stopwords = ['de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria', 'teríamos', 'teriam']
        # The input is accent-free, so the stopwords must be as well.
        stopwords = {unidecode(s) for s in stopwords}

        custom_filter = set()  # extra project-specific words to ignore
        keywords_list = []
        # split() without arguments also collapses repeated whitespace.
        for token in self.raw.split():
            word = self.remove_unwanted_chars(token)
            if word and word not in stopwords and word not in custom_filter:
                keywords_list.append(word)
        return keywords_list

    def process_response(self):
        """Select the response entry that shares most keywords with the input.

        Stored keywords are lower-cased and transliterated with ``unidecode``
        before comparison, so an accented keyword in the JSON file still
        matches the accent-free input. The score of the best entry is its
        number of matching keywords expressed as a percentage of the total
        number of keywords over all entries. Below 6 percent a fallback
        entry is returned.

        Returns:
            The selected entry from ``self.responses`` (its ``'keywords'``
            list is extended in place with the input keywords it did not
            already contain), or a fallback dict whose ``'response'`` is a
            one-item list so ``random.choice`` yields the full sentence.
        """
        tokens = set(self.processed_str)
        total = sum(len(entry['keywords']) for entry in self.responses)
        most_acc = 0
        acc = 0
        response_data = None
        for value in self.responses:
            hits = sum(
                1 for keyword in value['keywords']
                if unidecode(str(keyword).lower()) in tokens
            )
            # Strict comparison: on a tie the earliest entry wins.
            if hits > most_acc:
                most_acc = hits
                acc = (100 * most_acc) / total
                response_data = value
        if acc < 6:
            # Must be a list: choice() on a bare string returns one character.
            return {"response": ["Sorry, I do not understand. Be more clear please"]}
        for word in self.processed_str:
            if word not in response_data['keywords']:
                response_data['keywords'].append(word)

        return response_data


if __name__ == '__main__':
    # Console chat loop: each typed line is transliterated to ASCII and
    # handed to ChatterMessage; the reply it selected is printed.
    while True:
        user_text = input("Você: ")
        reply = ChatterMessage(unidecode(user_text)).response
        print("Bot:", reply)

回复收藏 0 原文

~没有更多了~