从大型结构化文本文件中提取信息

发布于 2024-07-12 05:33:31 字数 3357 浏览 9 评论 0原文

我需要读取一些大文件（从 50k 到 100k 行），这些文件按空行分隔的组进行结构。每组以相同的模式“No.999999999 dd/mm/yyyy ZZZ”开始。这是一些示例数据。

编号813829461 16/09/1987 270
Tit.SUZANO PAPEL E CELULOSE SA (BR/BA)
CNPJ/CIC/N INPI : 16404287000155
代理人：MARCELLO DO NASCIMENTO
编号815326777 28/12/1989 351
Tit.SIGLA SISTEMA GLOBO DE GRAVACOES AUDIO VISUAIS LTDA (BR/RJ)
CNPJ/CIC/N°INPI : 34162651000108
后：主名；名称：产品
马卡报：热带三重奏
分类产品/服务：09.40
*DEFERIDO 符合 123 DE 06/01/2006、PUBLICADA NA RPI 1829、DE 24/01/2006。
代理人：瓦尔德马·罗德里格斯·佩德拉
No.900148764 11/01/2007 LD3
Tiit.TIARA BOLSAS E CALÇADOS LTDA
总督：玛西娅·费雷拉·戈麦斯
*编辑：Marcas Marcantes e Patentes Ltda
*Exigência Formal não respondida Satisfatoriamente，Pedido de Registro de Marca 考虑不存在，de acordo com Art。 157 da LPI
*Protocolo da Petição de cumprimento de Exigência 正式：810080140197

我编写了一些代码来相应地解析它。有什么我可以改进的地方，以提高可读性或性能？到目前为止我的情况是这样的：

import re, pprint

class Despacho(object):
    """
    Class to parse each line, applying the regexp and storing the results
    for future use
    """
    regexp = {
        re.compile(r'No.([\d]{9})  ([\d]{2}/[\d]{2}/[\d]{4})  (.*)'): lambda self: self._processo,
        re.compile(r'Tit.(.*)'): lambda self: self._titular,
        re.compile(r'Procurador: (.*)'): lambda self: self._procurador,
        re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'): lambda self: self._documento,
        re.compile(r'Apres.: (.*) ; Nat.: (.*)'): lambda self: self._apresentacao,
        re.compile(r'Marca: (.*)'): lambda self: self._marca,
        re.compile(r'Clas.Prod/Serv: (.*)'): lambda self: self._classe,
        re.compile(r'\*(.*)'): lambda self: self._complemento,
    }

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single registry
        """
        self.complemento = []

    def _processo(self, matches):
        self.processo, self.data, self.despacho = matches.groups()

    def _titular(self, matches):
        self.titular = matches.group(1)

    def _procurador(self, matches):
        self.procurador = matches.group(1)

    def _documento(self, matches):
        self.documento = matches.group(1)

    def _apresentacao(self, matches):
        self.apresentacao, self.natureza = matches.groups()

    def _marca(self, matches):
        self.marca = matches.group(1)

    def _classe(self, matches):
        self.classe = matches.group(1)

    def _complemento(self, matches):
        self.complemento.append(matches.group(1))

    def read(self, line):
        for pattern in Despacho.regexp:
            m = pattern.match(line)
            if m:
                Despacho.regexp[pattern](self)(m)


def process(rpi):
    """
    read data and process each group
    """
    rpi = (line for line in rpi)
    group = False

    for line in rpi:
        if line.startswith('No.'):
            group = True
            d = Despacho()        

        if not line.strip() and group: # empty line - end of block
            yield d
            group = False

        d.read(line)


arquivo = open('rm1972.txt') # file to process
for desp in process(arquivo):
    pprint.pprint(desp.__dict__)
    print('--------------')

原文

I need to read some large files (from 50k to 100k lines), structured in groups separated by empty lines. Each group start at the same pattern "No.999999999 dd/mm/yyyy ZZZ". Here´s some sample data.

No.813829461 16/09/1987 270
Tit.SUZANO PAPEL E CELULOSE S.A. (BR/BA)
C.N.P.J./C.I.C./N INPI : 16404287000155
Procurador: MARCELLO DO NASCIMENTO
No.815326777 28/12/1989 351
Tit.SIGLA SISTEMA GLOBO DE GRAVACOES AUDIO VISUAIS LTDA (BR/RJ)
C.N.P.J./C.I.C./NºINPI : 34162651000108
Apres.: Nominativa ; Nat.: De Produto
Marca: TRIO TROPICAL
Clas.Prod/Serv: 09.40
*DEFERIDO CONFORME RESOLUÇÃO 123 DE 06/01/2006, PUBLICADA NA RPI 1829, DE 24/01/2006.
Procurador: WALDEMAR RODRIGUES PEDRA
No.900148764 11/01/2007 LD3
Tit.TIARA BOLSAS E CALÇADOS LTDA
Procurador: Marcia Ferreira Gomes
*Escritório: Marcas Marcantes e Patentes Ltda
*Exigência Formal não respondida Satisfatoriamente, Pedido de Registro de Marca considerado inexistente, de acordo com Art. 157 da LPI
*Protocolo da Petição de cumprimento de Exigência Formal: 810080140197

I wrote some code that´s parsing it accordingly. There´s anything that I can improve, to improve readability or performance? Here´s what I come so far:

import re, pprint

class Despacho(object):
    """
    Class to parse each line, applying the regexp and storing the results
    for future use
    """
    regexp = {
        re.compile(r'No.([\d]{9})  ([\d]{2}/[\d]{2}/[\d]{4})  (.*)'): lambda self: self._processo,
        re.compile(r'Tit.(.*)'): lambda self: self._titular,
        re.compile(r'Procurador: (.*)'): lambda self: self._procurador,
        re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'): lambda self: self._documento,
        re.compile(r'Apres.: (.*) ; Nat.: (.*)'): lambda self: self._apresentacao,
        re.compile(r'Marca: (.*)'): lambda self: self._marca,
        re.compile(r'Clas.Prod/Serv: (.*)'): lambda self: self._classe,
        re.compile(r'\*(.*)'): lambda self: self._complemento,
    }

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single registry
        """
        self.complemento = []

    def _processo(self, matches):
        self.processo, self.data, self.despacho = matches.groups()

    def _titular(self, matches):
        self.titular = matches.group(1)

    def _procurador(self, matches):
        self.procurador = matches.group(1)

    def _documento(self, matches):
        self.documento = matches.group(1)

    def _apresentacao(self, matches):
        self.apresentacao, self.natureza = matches.groups()

    def _marca(self, matches):
        self.marca = matches.group(1)

    def _classe(self, matches):
        self.classe = matches.group(1)

    def _complemento(self, matches):
        self.complemento.append(matches.group(1))

    def read(self, line):
        for pattern in Despacho.regexp:
            m = pattern.match(line)
            if m:
                Despacho.regexp[pattern](self)(m)


def process(rpi):
    """
    read data and process each group
    """
    rpi = (line for line in rpi)
    group = False

    for line in rpi:
        if line.startswith('No.'):
            group = True
            d = Despacho()        

        if not line.strip() and group: # empty line - end of block
            yield d
            group = False

        d.read(line)


arquivo = open('rm1972.txt') # file to process
for desp in process(arquivo):
    pprint.pprint(desp.__dict__)
    print('--------------')

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

挽容 2024-07-19 05:33:31

那是相当不错的。下面是一些建议，如果您喜欢它们，请告诉我：

import re
import pprint
import sys

class Despacho(object):
    """
    Class to parse each line, applying the regexp and storing the results
    for future use
    """
    #used a dict with the keys instead of functions.
    regexp = {
        ('processo', 
         'data', 
         'despacho'): re.compile(r'No.([\d]{9})  ([\d]{2}/[\d]{2}/[\d]{4})  (.*)'),
        ('titular',): re.compile(r'Tit.(.*)'),
        ('procurador',): re.compile(r'Procurador: (.*)'),
        ('documento',): re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'),
        ('apresentacao',
         'natureza'): re.compile(r'Apres.: (.*) ; Nat.: (.*)'),
        ('marca',): re.compile(r'Marca: (.*)'),
        ('classe',): re.compile(r'Clas.Prod/Serv: (.*)'),
        ('complemento',): re.compile(r'\*(.*)'),
    }

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single registry
        """
        self.complemento = []


    def read(self, line):
        for attrs, pattern in Despacho.regexp.iteritems():
            m = pattern.match(line)
            if m:
                for groupn, attr in enumerate(attrs):
                    # special case complemento:
                    if attr == 'complemento':
                        self.complemento.append(m.group(groupn + 1))
                    else:
                        # set the attribute on the object
                        setattr(self, attr, m.group(groupn + 1))

    def __repr__(self):
        # defines object printed representation
        d = {}
        for attrs in self.regexp:
            for attr in attrs:
                d[attr] = getattr(self, attr, None)
        return pprint.pformat(d)

def process(rpi):
    """
    read data and process each group
    """
    #Useless line, since you're doing a for anyway
    #rpi = (line for line in rpi)
    group = False

    for line in rpi:
        if line.startswith('No.'):
            group = True
            d = Despacho()        

        if not line.strip() and group: # empty line - end of block
            yield d
            group = False

        d.read(line)

def main():
    arquivo = open('rm1972.txt') # file to process
    for desp in process(arquivo):
        print desp # can print directly here.
        print('-' * 20)
    return 0

if __name__ == '__main__':
    main()

That is pretty good. Below some suggestions, let me know if you like'em:

import re
import pprint
import sys

class Despacho(object):
    """
    Class to parse each line, applying the regexp and storing the results
    for future use
    """
    #used a dict with the keys instead of functions.
    regexp = {
        ('processo', 
         'data', 
         'despacho'): re.compile(r'No.([\d]{9})  ([\d]{2}/[\d]{2}/[\d]{4})  (.*)'),
        ('titular',): re.compile(r'Tit.(.*)'),
        ('procurador',): re.compile(r'Procurador: (.*)'),
        ('documento',): re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'),
        ('apresentacao',
         'natureza'): re.compile(r'Apres.: (.*) ; Nat.: (.*)'),
        ('marca',): re.compile(r'Marca: (.*)'),
        ('classe',): re.compile(r'Clas.Prod/Serv: (.*)'),
        ('complemento',): re.compile(r'\*(.*)'),
    }

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single registry
        """
        self.complemento = []


    def read(self, line):
        for attrs, pattern in Despacho.regexp.iteritems():
            m = pattern.match(line)
            if m:
                for groupn, attr in enumerate(attrs):
                    # special case complemento:
                    if attr == 'complemento':
                        self.complemento.append(m.group(groupn + 1))
                    else:
                        # set the attribute on the object
                        setattr(self, attr, m.group(groupn + 1))

    def __repr__(self):
        # defines object printed representation
        d = {}
        for attrs in self.regexp:
            for attr in attrs:
                d[attr] = getattr(self, attr, None)
        return pprint.pformat(d)

def process(rpi):
    """
    read data and process each group
    """
    #Useless line, since you're doing a for anyway
    #rpi = (line for line in rpi)
    group = False

    for line in rpi:
        if line.startswith('No.'):
            group = True
            d = Despacho()        

        if not line.strip() and group: # empty line - end of block
            yield d
            group = False

        d.read(line)

def main():
    arquivo = open('rm1972.txt') # file to process
    for desp in process(arquivo):
        print desp # can print directly here.
        print('-' * 20)
    return 0

if __name__ == '__main__':
    main()

回复收藏 0 原文

清晰传感 2024-07-19 05:33:31

如果您有特定的疑虑，会更容易提供帮助。性能在很大程度上取决于您正在使用的特定正则表达式引擎的效率。单个文件中的 100K 行听起来并没有那么大，但这一切都取决于您的环境。

我在 .NET 开发中使用 Expresso 来测试表达式的准确性和性能。
Google 搜索显示了 Kodos，这是一个 GUI Python 正则表达式创作工具。

回复收藏 0 原文

南笙 2024-07-19 05:33:31

总体看起来不错，但为什么会有这样一行：

rpi = (line for line in rpi)

您已经可以迭代文件对象而无需此中间步骤。

It looks good overall, but why do you have the line:

rpi = (line for line in rpi)

You can already iterate over the file object without this intermediate step.

回复收藏 0 原文

牵你手 2024-07-19 05:33:31

我不会在这里使用正则表达式。如果您知道您的行将以固定字符串开头，为什么不检查这些字符串并围绕它编写逻辑呢？

for line in open(file):
    if line[0:3]=='No.':
        currIndex='No'
        map['No']=line[4:]
   ....
   ...
   else if line.strip()=='':
       //store the record in the map and clear the map
   else:
      //append line to the last index in map.. this is when the record overflows to the next line.
      Map[currIndex]=Map[currIndex]+"\n"+line

将上面的代码视为伪代码。

I wouldn't use regex here. If you know that your lines will be starting with fixed strings, why not check those strings and write a logic around it?

for line in open(file):
    if line[0:3]=='No.':
        currIndex='No'
        map['No']=line[4:]
   ....
   ...
   else if line.strip()=='':
       //store the record in the map and clear the map
   else:
      //append line to the last index in map.. this is when the record overflows to the next line.
      Map[currIndex]=Map[currIndex]+"\n"+line

Consider the above code as just the pseudocode.

回复收藏 0 原文

似最初 2024-07-19 05:33:31

另一个版本只有一个组合正则表达式：

#!/usr/bin/python

import re
import pprint
import sys

class Despacho(object):
    """
    Class to parse each line, applying the regexp and storing the results
    for future use
    """
    #used a dict with the keys instead of functions.
    regexp = re.compile(
        r'No.(?P<processo>[\d]{9})  (?P<data>[\d]{2}/[\d]{2}/[\d]{4})  (?P<despacho>.*)'
        r'|Tit.(?P<titular>.*)'
        r'|Procurador: (?P<procurador>.*)'
        r'|C.N.P.J./C.I.C./N INPI :(?P<documento>.*)'
        r'|Apres.: (?P<apresentacao>.*) ; Nat.: (?P<natureza>.*)'
        r'|Marca: (?P<marca>.*)'
        r'|Clas.Prod/Serv: (?P<classe>.*)'
        r'|\*(?P<complemento>.*)')

    simplefields = ('processo', 'data', 'despacho', 'titular', 'procurador',
                    'documento', 'apresentacao', 'natureza', 'marca', 'classe')

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single
        registry
        """
        self.__dict__ = dict.fromkeys(self.simplefields)
        self.complemento = []

    def parse(self, line):
        m = self.regexp.match(line)
        if m:
            gd = dict((k, v) for k, v in m.groupdict().items() if v)
            if 'complemento' in gd:
                self.complemento.append(gd['complemento'])
            else:
                self.__dict__.update(gd)

    def __repr__(self):
        # defines object printed representation
        return pprint.pformat(self.__dict__)

def process(rpi):
    """
    read data and process each group
    """
    d = None

    for line in rpi:
        if line.startswith('No.'):
            if d:
                yield d
            d = Despacho()
        d.parse(line)
    yield d

def main():
    arquivo = file('rm1972.txt') # file to process
    for desp in process(arquivo):
        print desp # can print directly here.
        print '-' * 20

if __name__ == '__main__':
    main()

Another version with only one combined regular expression:

#!/usr/bin/python

import re
import pprint
import sys

class Despacho(object):
    """
    Class to parse each line, applying the regexp and storing the results
    for future use
    """
    #used a dict with the keys instead of functions.
    regexp = re.compile(
        r'No.(?P<processo>[\d]{9})  (?P<data>[\d]{2}/[\d]{2}/[\d]{4})  (?P<despacho>.*)'
        r'|Tit.(?P<titular>.*)'
        r'|Procurador: (?P<procurador>.*)'
        r'|C.N.P.J./C.I.C./N INPI :(?P<documento>.*)'
        r'|Apres.: (?P<apresentacao>.*) ; Nat.: (?P<natureza>.*)'
        r'|Marca: (?P<marca>.*)'
        r'|Clas.Prod/Serv: (?P<classe>.*)'
        r'|\*(?P<complemento>.*)')

    simplefields = ('processo', 'data', 'despacho', 'titular', 'procurador',
                    'documento', 'apresentacao', 'natureza', 'marca', 'classe')

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single
        registry
        """
        self.__dict__ = dict.fromkeys(self.simplefields)
        self.complemento = []

    def parse(self, line):
        m = self.regexp.match(line)
        if m:
            gd = dict((k, v) for k, v in m.groupdict().items() if v)
            if 'complemento' in gd:
                self.complemento.append(gd['complemento'])
            else:
                self.__dict__.update(gd)

    def __repr__(self):
        # defines object printed representation
        return pprint.pformat(self.__dict__)

def process(rpi):
    """
    read data and process each group
    """
    d = None

    for line in rpi:
        if line.startswith('No.'):
            if d:
                yield d
            d = Despacho()
        d.parse(line)
    yield d

def main():
    arquivo = file('rm1972.txt') # file to process
    for desp in process(arquivo):
        print desp # can print directly here.
        print '-' * 20

if __name__ == '__main__':
    main()

回复收藏 0 原文

~没有更多了~