用于将 PDF 转换为文本的 Python 模块

发布于 2024-07-04 12:23:20 字数 1560 浏览 10 评论 0原文

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(13

甜柠檬 2024-07-11 12:23:20

PDFMiner 包自 codeape 发布。

编辑(再次):

PDFMiner 已在版本 20100213 中再次更新,

您可以使用以下命令检查已安装的版本:

>>> import pdfminer
>>> pdfminer.__version__
'20100213'

这是更新的版本(带有对我更改的内容的评论) /添加):

def pdf_to_csv(filename):
    from cStringIO import StringIO  #<-- added so you can copy/paste this to try it
    from pdfminer.converter import LTTextItem, TextConverter
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict
            lines = defaultdict(lambda : {})
            for child in self.cur_item.objs:
                if isinstance(child, LTTextItem):
                    (_,_,x,y) = child.bbox                   #<-- changed
                    line = lines[int(-y)]
                    line[x] = child.text.encode(self.codec)  #<-- changed

            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the 
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8")  #<-- changed 
        # becuase my test documents are utf-8 (note: utf-8 is the default codec)

    doc = PDFDocument()
    fp = open(filename, 'rb')
    parser = PDFParser(fp)       #<-- changed
    parser.set_document(doc)     #<-- added
    doc.set_parser(parser)       #<-- added
    doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)

    for i, page in enumerate(doc.get_pages()):
        outfp.write("START PAGE %d\n" % i)
        interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()

    return outfp.getvalue()

编辑(再次):

这是

def pdf_to_csv(filename):
    from cStringIO import StringIO  
    from pdfminer.converter import LTChar, TextConverter    #<-- changed
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict
            lines = defaultdict(lambda : {})
            for child in self.cur_item.objs:
                if isinstance(child, LTChar):               #<-- changed
                    (_,_,x,y) = child.bbox                   
                    line = lines[int(-y)]
                    line[x] = child.text.encode(self.codec)

            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the 
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())  #<-- changed
        # becuase my test documents are utf-8 (note: utf-8 is the default codec)

    doc = PDFDocument()
    fp = open(filename, 'rb')
    parser = PDFParser(fp)       
    parser.set_document(doc)     
    doc.set_parser(parser)       
    doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)

    for i, page in enumerate(doc.get_pages()):
        outfp.write("START PAGE %d\n" % i)
        if page is not None:
            interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()

    return outfp.getvalue()

编辑(再一次):

更新版本20110515(感谢 Oeufcoque Penteano!):

def pdf_to_csv(filename):
    from cStringIO import StringIO  
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict
            lines = defaultdict(lambda : {})
            for child in self.cur_item._objs:                #<-- changed
                if isinstance(child, LTChar):
                    (_,_,x,y) = child.bbox                   
                    line = lines[int(-y)]
                    line[x] = child._text.encode(self.codec) #<-- changed

            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the 
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
        # becuase my test documents are utf-8 (note: utf-8 is the default codec)

    doc = PDFDocument()
    fp = open(filename, 'rb')
    parser = PDFParser(fp)       
    parser.set_document(doc)     
    doc.set_parser(parser)       
    doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)

    for i, page in enumerate(doc.get_pages()):
        outfp.write("START PAGE %d\n" % i)
        if page is not None:
            interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()

    return outfp.getvalue()

The PDFMiner package has changed since codeape posted.

EDIT (again):

PDFMiner has been updated again in version 20100213

You can check the version you have installed with the following:

>>> import pdfminer
>>> pdfminer.__version__
'20100213'

Here's the updated version (with comments on what I changed/added):

def pdf_to_csv(filename):
    from cStringIO import StringIO  #<-- added so you can copy/paste this to try it
    from pdfminer.converter import LTTextItem, TextConverter
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict
            lines = defaultdict(lambda : {})
            for child in self.cur_item.objs:
                if isinstance(child, LTTextItem):
                    (_,_,x,y) = child.bbox                   #<-- changed
                    line = lines[int(-y)]
                    line[x] = child.text.encode(self.codec)  #<-- changed

            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the 
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8")  #<-- changed 
        # becuase my test documents are utf-8 (note: utf-8 is the default codec)

    doc = PDFDocument()
    fp = open(filename, 'rb')
    parser = PDFParser(fp)       #<-- changed
    parser.set_document(doc)     #<-- added
    doc.set_parser(parser)       #<-- added
    doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)

    for i, page in enumerate(doc.get_pages()):
        outfp.write("START PAGE %d\n" % i)
        interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()

    return outfp.getvalue()

Edit (yet again):

Here is an update for the latest version in pypi, 20100619p1. In short I replaced LTTextItem with LTChar and passed an instance of LAParams to the CsvConverter constructor.

def pdf_to_csv(filename):
    from cStringIO import StringIO  
    from pdfminer.converter import LTChar, TextConverter    #<-- changed
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict
            lines = defaultdict(lambda : {})
            for child in self.cur_item.objs:
                if isinstance(child, LTChar):               #<-- changed
                    (_,_,x,y) = child.bbox                   
                    line = lines[int(-y)]
                    line[x] = child.text.encode(self.codec)

            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the 
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())  #<-- changed
        # becuase my test documents are utf-8 (note: utf-8 is the default codec)

    doc = PDFDocument()
    fp = open(filename, 'rb')
    parser = PDFParser(fp)       
    parser.set_document(doc)     
    doc.set_parser(parser)       
    doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)

    for i, page in enumerate(doc.get_pages()):
        outfp.write("START PAGE %d\n" % i)
        if page is not None:
            interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()

    return outfp.getvalue()

EDIT (one more time):

Updated for version 20110515 (thanks to Oeufcoque Penteano!):

def pdf_to_csv(filename):
    from cStringIO import StringIO  
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict
            lines = defaultdict(lambda : {})
            for child in self.cur_item._objs:                #<-- changed
                if isinstance(child, LTChar):
                    (_,_,x,y) = child.bbox                   
                    line = lines[int(-y)]
                    line[x] = child._text.encode(self.codec) #<-- changed

            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the 
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
        # becuase my test documents are utf-8 (note: utf-8 is the default codec)

    doc = PDFDocument()
    fp = open(filename, 'rb')
    parser = PDFParser(fp)       
    parser.set_document(doc)     
    doc.set_parser(parser)       
    doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)

    for i, page in enumerate(doc.get_pages()):
        outfp.write("START PAGE %d\n" % i)
        if page is not None:
            interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()

    return outfp.getvalue()
蒲公英的约定 2024-07-11 12:23:20

尝试 PDFMiner。 它可以从 PDF 文件中提取 HTML、SGML 或“标记 PDF”格式的文本。

带标签的 PDF 格式似乎是最干净的,去掉 XML 标签就只剩下裸露的文本。

Python 3 版本位于:

Try PDFMiner. It can extract text from PDF files as HTML, SGML or "Tagged PDF" format.

The Tagged PDF format seems to be the cleanest, and stripping out the XML tags leaves just the bare text.

A Python 3 version is available under:

时光清浅 2024-07-11 12:23:20

PDFminer 在我尝试使用的 pdf 文件的每一页上都给了我一行 [第 1 页,共 7 页...]。

到目前为止我得到的最好的答案是 pdftoipe,或者它基于 Xpdf 的 C++ 代码。

请参阅我的问题了解以下内容的输出pdftoipe 看起来像。

PDFminer gave me perhaps one line [page 1 of 7...] on every page of a pdf file I tried with it.

The best answer I have so far is pdftoipe, or the c++ code it's based on Xpdf.

see my question for what the output of pdftoipe looks like.

温柔少女心 2024-07-11 12:23:20

此外,还有 PDFTextStream,它是一个商业 Java 库,也可以在 Python 中使用。

Additionally there is PDFTextStream which is a commercial Java library that can also be used from Python.

我ぃ本無心為│何有愛 2024-07-11 12:23:20

我已将 pdftohtml-xml 参数一起使用,使用 subprocess.Popen() 读取结果,这将为您提供 x 坐标、y 坐标pdf 中每个文本片段的宽度、高度和字体。 我认为这也是“evince”可能使用的,因为会出现相同的错误消息。

如果您需要处理柱状数据,它会变得稍微复杂一些,因为您必须发明一种适合您的 pdf 文件的算法。 问题在于,制作 PDF 文件的程序实际上并不一定以任何逻辑格式排列文本。 您可以尝试简单的排序算法,有时它会起作用,但可能会有一些“落后者”和“流浪者”,即没有按照您想象的顺序排列的文本片段。 所以你必须发挥创意。

我花了大约 5 个小时才为我正在处理的 pdf 找到一份。 但现在效果很好。 祝你好运。

I have used pdftohtml with the -xml argument, read the result with subprocess.Popen(), that will give you x coord, y coord, width, height, and font, of every snippet of text in the pdf. I think this is what 'evince' probably uses too because the same error messages spew out.

If you need to process columnar data, it gets slightly more complicated as you have to invent an algorithm that suits your pdf file. The problem is that the programs that make PDF files don't really necessarily lay out the text in any logical format. You can try simple sorting algorithms and it works sometimes, but there can be little 'stragglers' and 'strays', pieces of text that don't get put in the order you thought they would. So you have to get creative.

It took me about 5 hours to figure out one for the pdf's I was working on. But it works pretty good now. Good luck.

苦笑流年记忆 2024-07-11 12:23:20

今天找到了这个解决方案。 对我来说效果很好。 甚至将 PDF 页面渲染为 PNG 图像。
http://www.swftools.org/gfx_tutorial.html

Found that solution today. Works great for me. Even rendering PDF pages to PNG images.
http://www.swftools.org/gfx_tutorial.html

桃扇骨 2024-07-11 12:23:20

重新利用pdfminer自带的pdf2txt.py代码; 您可以创建一个函数来获取 pdf 的路径; 可选,输出类型 (txt|html|xml|tag) 并选择命令行 pdf2txt {'-o': '/path/to/outfile.txt' ...}。 默认情况下,您可以调用:

convert_pdf(path)

将创建一个文本文件,它是文件系统上原始 pdf 的同级文件。

def convert_pdf(path, outtype='txt', opts={}):
    import sys
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.cmapdb import CMapDB

    outfile = path[:-3] + outtype
    outdir = '/'.join(path.split('/')[:-1])

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    codec = 'utf-8'
    pageno = 1
    scale = 1
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'txt':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()

    fp = file(path, 'rb')
    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password)
    fp.close()
    device.close()

    outfp.close()
    return

Repurposing the pdf2txt.py code that comes with pdfminer; you can make a function that will take a path to the pdf; optionally, an outtype (txt|html|xml|tag) and opts like the commandline pdf2txt {'-o': '/path/to/outfile.txt' ...}. By default, you can call:

convert_pdf(path)

A text file will be created, a sibling on the filesystem to the original pdf.

def convert_pdf(path, outtype='txt', opts={}):
    import sys
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter, TagExtractor
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.cmapdb import CMapDB

    outfile = path[:-3] + outtype
    outdir = '/'.join(path.split('/')[:-1])

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    codec = 'utf-8'
    pageno = 1
    scale = 1
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'txt':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()

    fp = file(path, 'rb')
    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password)
    fp.close()
    device.close()

    outfp.close()
    return
世界等同你 2024-07-11 12:23:20

由于这些解决方案都不支持最新版本的 PDFMiner,因此我编写了一个简单的解决方案,它将使用 PDFMiner 返回 pdf 的文本。 这适用于那些使用 process_pdf 遇到导入错误的人,

import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO

def pdfparser(data):

    fp = file(data, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.

    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        data =  retstr.getvalue()

    print data

if __name__ == '__main__':
    pdfparser(sys.argv[1])  

请参阅以下适用于 Python 3 的代码:

import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io

def pdfparser(data):

    fp = open(data, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.

    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        data =  retstr.getvalue()

    print(data)

if __name__ == '__main__':
    pdfparser(sys.argv[1])  

Since none for these solutions support the latest version of PDFMiner I wrote a simple solution that will return text of a pdf using PDFMiner. This will work for those who are getting import errors with process_pdf

import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO

def pdfparser(data):

    fp = file(data, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.

    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        data =  retstr.getvalue()

    print data

if __name__ == '__main__':
    pdfparser(sys.argv[1])  

See below code that works for Python 3:

import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
from pdfminer.layout import LAParams
import io

def pdfparser(data):

    fp = open(data, 'rb')
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.

    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        data =  retstr.getvalue()

    print(data)

if __name__ == '__main__':
    pdfparser(sys.argv[1])  
浮光之海 2024-07-11 12:23:20

Pdftotext 一个开源程序(Xpdf的一部分),您可以从Python调用它(不是您所要求的)但可能有用)。 我用过没有任何问题。 我认为谷歌在谷歌桌面中使用它。

Pdftotext An open source program (part of Xpdf) which you could call from python (not what you asked for but might be useful). I've used it with no problems. I think google use it in google desktop.

流年已逝 2024-07-11 12:23:20

pyPDF 工作正常(假设您正在使用格式良好的 PDF)。 如果您想要的只是文本(带空格),您可以这样做:

import pyPdf
pdf = pyPdf.PdfFileReader(open(filename, "rb"))
for page in pdf.pages:
    print page.extractText()

您还可以轻松访问元数据、图像数据等。

extractText 代码中的注释指出:

找到所有文本绘制命令,位于
它们在中提供的顺序
内容流,并提取文本。
这对于某些 PDF 文件效果很好,
但对其他人来说很糟糕,取决于
使用的发电机。 这会是
未来将进一步完善。 不要依赖
文本的顺序
函数,因为如果这个它会改变
功能更加完善。

这是否是一个问题取决于您对文本执行的操作(例如,如果顺序无关紧要,那就没问题,或者如果生成器按照显示的顺序将文本添加到流中,那就没问题) 。 我日常使用pyPdf提取代码,没有任何问题。

pyPDF works fine (assuming that you're working with well-formed PDFs). If all you want is the text (with spaces), you can just do:

import pyPdf
pdf = pyPdf.PdfFileReader(open(filename, "rb"))
for page in pdf.pages:
    print page.extractText()

You can also easily get access to the metadata, image data, and so forth.

A comment in the extractText code notes:

Locate all text drawing commands, in
the order they are provided in the
content stream, and extract the text.
This works well for some PDF files,
but poorly for others, depending on
the generator used. This will be
refined in the future. Do not rely on
the order of text coming out of this
function, as it will change if this
function is made more sophisticated.

Whether or not this is a problem depends on what you're doing with the text (e.g. if the order doesn't matter, it's fine, or if the generator adds text to the stream in the order it will be displayed, it's fine). I have pyPdf extraction code in daily use, without any problems.

乄_柒ぐ汐 2024-07-11 12:23:20

您还可以非常轻松地使用 pdfminer 作为库。 您可以访问 pdf 的内容模型,并且可以创建您自己的文本提取。 我这样做是为了使用下面的代码将 pdf 内容转换为分号分隔的文本。

该函数只是根据 y 和 x 坐标对 TextItem 内容对象进行排序,并将 y 坐标相同的项目输出为一个文本行,并使用“;”分隔同一行上的对象。 人物。

使用这种方法,我能够从 pdf 中提取文本,而其他工具无法从中提取适合进一步解析的内容。 我尝试过的其他工具包括 pdftotext、ps2ascii 和在线工具 pdftextonline.com。

pdfminer 是一个非常有用的 pdf 抓取工具。


def pdf_to_csv(filename):
    from pdflib.page import TextItem, TextConverter
    from pdflib.pdfparser import PDFDocument, PDFParser
    from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict
            lines = defaultdict(lambda : {})
            for child in self.cur_item.objs:
                if isinstance(child, TextItem):
                    (_,_,x,y) = child.bbox
                    line = lines[int(-y)]
                    line[x] = child.text

            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the 
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, "ascii")

    doc = PDFDocument()
    fp = open(filename, 'rb')
    parser = PDFParser(doc, fp)
    doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)

    for i, page in enumerate(doc.get_pages()):
        outfp.write("START PAGE %d\n" % i)
        interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()

    return outfp.getvalue()

更新

上面的代码是针对旧版本的 API 编写的,请参阅下面的评论。

You can also quite easily use pdfminer as a library. You have access to the pdf's content model, and can create your own text extraction. I did this to convert pdf contents to semi-colon separated text, using the code below.

The function simply sorts the TextItem content objects according to their y and x coordinates, and outputs items with the same y coordinate as one text line, separating the objects on the same line with ';' characters.

Using this approach, I was able to extract text from a pdf that no other tool was able to extract content suitable for further parsing from. Other tools I tried include pdftotext, ps2ascii and the online tool pdftextonline.com.

pdfminer is an invaluable tool for pdf-scraping.


def pdf_to_csv(filename):
    from pdflib.page import TextItem, TextConverter
    from pdflib.pdfparser import PDFDocument, PDFParser
    from pdflib.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict
            lines = defaultdict(lambda : {})
            for child in self.cur_item.objs:
                if isinstance(child, TextItem):
                    (_,_,x,y) = child.bbox
                    line = lines[int(-y)]
                    line[x] = child.text

            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the 
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, "ascii")

    doc = PDFDocument()
    fp = open(filename, 'rb')
    parser = PDFParser(doc, fp)
    doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)

    for i, page in enumerate(doc.get_pages()):
        outfp.write("START PAGE %d\n" % i)
        interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()

    return outfp.getvalue()

UPDATE:

The code above is written against an old version of the API, see my comment below.

若无相欠,怎会相见 2024-07-11 12:23:20

slate 是一个可以非常简单地使用库中的 PDFMiner 的项目:

>>> with open('example.pdf') as f:
...    doc = slate.PDF(f)
...
>>> doc
[..., ..., ...]
>>> doc[1]
'Text from page 2...'   

slate is a project that makes it very simple to use PDFMiner from a library:

>>> with open('example.pdf') as f:
...    doc = slate.PDF(f)
...
>>> doc
[..., ..., ...]
>>> doc[1]
'Text from page 2...'   
苏璃陌 2024-07-11 12:23:20

我需要在 python 模块中将特定的 PDF 转换为纯文本。 在阅读了他们的 pdf2txt.py 工具我写了这个简单的片段:

from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams

def to_txt(pdf_path):
    input_ = file(pdf_path, 'rb')
    output = StringIO()

    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    process_pdf(manager, converter, input_)

    return output.getvalue() 

I needed to convert a specific PDF to plain text within a python module. I used PDFMiner 20110515, after reading through their pdf2txt.py tool I wrote this simple snippet:

from cStringIO import StringIO
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams

def to_txt(pdf_path):
    input_ = file(pdf_path, 'rb')
    output = StringIO()

    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    process_pdf(manager, converter, input_)

    return output.getvalue() 
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文