如何使用Python将多页PDF文件拆分为多个PDF文件?

发布于 2025-01-31 10:04:09 字数 213 浏览 7 评论 0原文

我想使用一个多页PDF文件,并每页创建单独的PDF文件。

我已经下载了 reportlab 并浏览了文档,但似乎旨在PDF生成。我还没有看到有关处理PDF文件本身的任何信息。

在Python中有一种简单的方法吗?

I would like to take a multi-page pdf file and create separate pdf files per page.

I have downloaded reportlab and have browsed the documentation, but it seems aimed at pdf generation. I haven't yet seen anything about processing PDF files themselves.

Is there an easy way to do this in python?

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(9)

再可℃爱ぅ一点好了 2025-02-07 10:04:09
from PyPDF2 import PdfWriter, PdfReader

# Hand the path to PdfReader instead of an open() handle that is never
# closed, so no file object is left dangling for the rest of the script.
inputpdf = PdfReader("document.pdf")

# Write every page to its own file; the file name carries the zero-based
# page index (document-page0.pdf, document-page1.pdf, ...).
for i, page in enumerate(inputpdf.pages):
    output = PdfWriter()
    output.add_page(page)
    with open("document-page%s.pdf" % i, "wb") as outputStream:
        output.write(outputStream)

等等。

from PyPDF2 import PdfWriter, PdfReader

# Hand the path to PdfReader instead of an open() handle that is never
# closed, so no file object is left dangling for the rest of the script.
inputpdf = PdfReader("document.pdf")

# Write every page to its own file; the file name carries the zero-based
# page index (document-page0.pdf, document-page1.pdf, ...).
for i, page in enumerate(inputpdf.pages):
    output = PdfWriter()
    output.add_page(page)
    with open("document-page%s.pdf" % i, "wb") as outputStream:
        output.write(outputStream)

etc.

过去的过去 2025-02-07 10:04:09

针对 PyPDF2 最新版本(3.0.0)更新的解决方案,并支持拆分指定范围的页面。

from PyPDF2 import PdfReader, PdfWriter

file_name = r'c:\temp\junk.pdf'
pages = (121, 130)  # first and last page to keep, 1-based and inclusive

reader = PdfReader(file_name)
writer = PdfWriter()
first_page, last_page = pages
page_range = range(first_page, last_page + 1)

# Walk the document with 1-based numbering and copy only the wanted pages.
for page_num, page in enumerate(reader.pages, start=1):
    if page_num not in page_range:
        continue
    writer.add_page(page)

# The output is written next to the input, named after it plus the range.
output_name = f'{file_name}_page_{pages[0]}-{pages[1]}.pdf'
with open(output_name, 'wb') as out:
    writer.write(out)

Updated solution for the latest release of PyPDF (3.0.0) and to split a range of pages.

from PyPDF2 import PdfReader, PdfWriter

file_name = r'c:\temp\junk.pdf'
pages = (121, 130)  # first and last page to keep, 1-based and inclusive

reader = PdfReader(file_name)
writer = PdfWriter()
first_page, last_page = pages
page_range = range(first_page, last_page + 1)

# Walk the document with 1-based numbering and copy only the wanted pages.
for page_num, page in enumerate(reader.pages, start=1):
    if page_num not in page_range:
        continue
    writer.add_page(page)

# The output is written next to the input, named after it plus the range.
output_name = f'{file_name}_page_{pages[0]}-{pages[1]}.pdf'
with open(output_name, 'wb') as out:
    writer.write(out)

往事随风而去 2025-02-07 10:04:09

我没有在这里看到把 PDF 拆分成两个部分(合起来包含所有页面)的解决方案,因此附上我的方案,供有同样需求的人参考:

from PyPDF2 import PdfFileWriter, PdfFileReader

def split_pdf_to_two(filename, page_number):
    """Split a PDF into ``part1.pdf`` and ``part2.pdf``.

    ``part1.pdf`` receives the first ``page_number`` pages and ``part2.pdf``
    receives all remaining pages.

    Args:
        filename: Path of the PDF to split.
        page_number: Number of pages that go into the first part; must be
            smaller than the page count of the document.

    Returns:
        None. If ``page_number`` is not smaller than the page count, an
        error message is printed and no file is written.
    """
    # Imported here so the function works with the PdfReader/PdfWriter names
    # without touching the file-level import of the older class names.
    from PyPDF2 import PdfReader, PdfWriter

    # Keep the source open until both parts are written (page content may
    # still be read from it while the writers run), then close it reliably.
    with open(filename, "rb") as source:
        pdf_reader = PdfReader(source)
        total_pages = len(pdf_reader.pages)

        # Explicit check instead of `assert`, which is skipped under
        # `python -O` and would let an out-of-range split point through.
        if page_number >= total_pages:
            print("Error: The PDF you are cutting has less pages than you want to cut!")
            return

        pdf_writer1 = PdfWriter()
        pdf_writer2 = PdfWriter()

        for page in range(page_number):
            pdf_writer1.add_page(pdf_reader.pages[page])

        for page in range(page_number, total_pages):
            pdf_writer2.add_page(pdf_reader.pages[page])

        with open("part1.pdf", 'wb') as file1:
            pdf_writer1.write(file1)

        with open("part2.pdf", 'wb') as file2:
            pdf_writer2.write(file2)

I did not find a solution here that splits the PDF into two parts which together contain all pages, so I am adding mine in case somebody is looking for the same thing:

from PyPDF2 import PdfFileWriter, PdfFileReader

def split_pdf_to_two(filename, page_number):
    """Split a PDF into ``part1.pdf`` and ``part2.pdf``.

    ``part1.pdf`` receives the first ``page_number`` pages and ``part2.pdf``
    receives all remaining pages.

    Args:
        filename: Path of the PDF to split.
        page_number: Number of pages that go into the first part; must be
            smaller than the page count of the document.

    Returns:
        None. If ``page_number`` is not smaller than the page count, an
        error message is printed and no file is written.
    """
    # Imported here so the function works with the PdfReader/PdfWriter names
    # without touching the file-level import of the older class names.
    from PyPDF2 import PdfReader, PdfWriter

    # Keep the source open until both parts are written (page content may
    # still be read from it while the writers run), then close it reliably.
    with open(filename, "rb") as source:
        pdf_reader = PdfReader(source)
        total_pages = len(pdf_reader.pages)

        # Explicit check instead of `assert`, which is skipped under
        # `python -O` and would let an out-of-range split point through.
        if page_number >= total_pages:
            print("Error: The PDF you are cutting has less pages than you want to cut!")
            return

        pdf_writer1 = PdfWriter()
        pdf_writer2 = PdfWriter()

        for page in range(page_number):
            pdf_writer1.add_page(pdf_reader.pages[page])

        for page in range(page_number, total_pages):
            pdf_writer2.add_page(pdf_reader.pages[page])

        with open("part1.pdf", 'wb') as file1:
            pdf_writer1.write(file1)

        with open("part2.pdf", 'wb') as file2:
            pdf_writer2.write(file2)
无所的.畏惧 2025-02-07 10:04:09

PYPDF2软件包使您能够将单个PDF拆分为多个PDF。

import os
from PyPDF2 import PdfFileReader, PdfFileWriter

# `path` (the PDF to split) and `fname` (prefix for the output names) are not
# defined in this snippet and must be set before it runs.
pdf = PdfFileReader(path)
page_total = pdf.getNumPages()

for page in range(page_total):
    # A fresh writer per page so that every output holds exactly one page.
    single_page = PdfFileWriter()
    single_page.addPage(pdf.getPage(page))

    # Output names use 1-based page numbers.
    output_filename = '{}_page_{}.pdf'.format(fname, page + 1)
    with open(output_filename, 'wb') as out:
        single_page.write(out)

    print('Created: {}'.format(output_filename))

PYPDF2 3.0.0的更改

import os
from PyPDF2 import PdfReader, PdfWriter
path = 'pdf_forms/myform.pdf'
fname = 'fname'
output_dir = 'pdf_forms/splitted'

# Create the target folder up front; open() below does not create directories.
os.makedirs(output_dir, exist_ok=True)

pdf = PdfReader(path)
for page, pdf_page in enumerate(pdf.pages):
    # A fresh writer per page so that every output holds exactly one page.
    pdf_writer = PdfWriter()
    pdf_writer.add_page(pdf_page)

    # Output names are 1-based: <fname>_page_1.pdf, <fname>_page_2.pdf, ...
    output_filename = '{}/{}_page_{}.pdf'.format(output_dir, fname, page + 1)

    with open(output_filename, 'wb') as out:
        pdf_writer.write(out)

    print('Created: {}'.format(output_filename))

来源: https://www.blog.pythonlibrary.org/2018/04/11/splitting-and-merging-pdfs-with-python/

The PyPDF2 package gives you the ability to split up a single PDF into multiple ones.

import os
from PyPDF2 import PdfFileReader, PdfFileWriter

# `path` (the PDF to split) and `fname` (prefix for the output names) are not
# defined in this snippet and must be set before it runs.
pdf = PdfFileReader(path)
page_total = pdf.getNumPages()

for page in range(page_total):
    # A fresh writer per page so that every output holds exactly one page.
    single_page = PdfFileWriter()
    single_page.addPage(pdf.getPage(page))

    # Output names use 1-based page numbers.
    output_filename = '{}_page_{}.pdf'.format(fname, page + 1)
    with open(output_filename, 'wb') as out:
        single_page.write(out)

    print('Created: {}'.format(output_filename))

Changes for PyPDF2 3.0.0

import os
from PyPDF2 import PdfReader, PdfWriter
path = 'pdf_forms/myform.pdf'
fname = 'fname'
output_dir = 'pdf_forms/splitted'

# Create the target folder up front; open() below does not create directories.
os.makedirs(output_dir, exist_ok=True)

pdf = PdfReader(path)
for page, pdf_page in enumerate(pdf.pages):
    # A fresh writer per page so that every output holds exactly one page.
    pdf_writer = PdfWriter()
    pdf_writer.add_page(pdf_page)

    # Output names are 1-based: <fname>_page_1.pdf, <fname>_page_2.pdf, ...
    output_filename = '{}/{}_page_{}.pdf'.format(output_dir, fname, page + 1)

    with open(output_filename, 'wb') as out:
        pdf_writer.write(out)

    print('Created: {}'.format(output_filename))

Source: https://www.blog.pythonlibrary.org/2018/04/11/splitting-and-merging-pdfs-with-python/

软糯酥胸 2025-02-07 10:04:09
import fitz

# Open the source inside a `with` block so it is closed once all pages have
# been written (previously it was never closed).
with fitz.open("source.pdf") as src:
    for page in src:
        # New, empty output PDF that receives exactly one page; the `with`
        # block closes it even if saving fails.
        with fitz.open() as tar:
            # copy over current page
            tar.insert_pdf(src, from_page=page.number, to_page=page.number)
            # file names use the zero-based page number
            tar.save(f"page-{page.number}.pdf")
import fitz

# Open the source inside a `with` block so it is closed once all pages have
# been written (previously it was never closed).
with fitz.open("source.pdf") as src:
    for page in src:
        # New, empty output PDF that receives exactly one page; the `with`
        # block closes it even if saving fails.
        with fitz.open() as tar:
            # copy over current page
            tar.insert_pdf(src, from_page=page.number, to_page=page.number)
            # file names use the zero-based page number
            tar.save(f"page-{page.number}.pdf")
無處可尋 2025-02-07 10:04:09

之前使用 PyPDF2 拆分 PDF 的答案在最新版本更新后已无法使用。作者建议改用 pypdf,而 PyPDF2==3.0.1 将是 PyPDF2 的最后一个版本。该函数需要修改如下:

import os
from PyPDF2 import PdfReader, PdfWriter

def split_pdfs(input_file_path):
    """Write every page of a PDF to its own file below ``outputs/``.

    Args:
        input_file_path: Path of the PDF to split.

    Returns:
        List with the paths of the single-page PDFs that were written, in
        page order. Each path is ``outputs/<input path without
        extension>_<index>.pdf`` with a zero-based page index.
    """
    # Strip the real extension instead of blindly cutting four characters.
    stem = os.path.splitext(input_file_path)[0]

    # Create the folder the outputs land in. When the input path has a
    # directory part this is a sub-folder of outputs/, which a bare
    # "outputs" check would not have created.
    out_dir = os.path.dirname(f"outputs/{stem}")
    os.makedirs(out_dir, exist_ok=True)

    out_paths = []
    # The `with` block closes the input handle once all pages are written.
    with open(input_file_path, "rb") as input_stream:
        inputpdf = PdfReader(input_stream)

        for i, page in enumerate(inputpdf.pages):
            output = PdfWriter()
            output.add_page(page)

            out_file_path = f"outputs/{stem}_{i}.pdf"
            with open(out_file_path, "wb") as output_stream:
                output.write(output_stream)

            out_paths.append(out_file_path)
    return out_paths

注意:相同的函数也适用于 pypdf,只需从 pypdf(而不是 PyPDF2)导入 PdfReader 和 PdfWriter。

The earlier answers with PyPDF2 for splitting pdfs are not working anymore with the latest version update. The authors recommend using pypdf instead and this version of PyPDF2==3.0.1 will be the last version of PyPDF2. The function needs to be modified as follows:

import os
from PyPDF2 import PdfReader, PdfWriter

def split_pdfs(input_file_path):
    """Write every page of a PDF to its own file below ``outputs/``.

    Args:
        input_file_path: Path of the PDF to split.

    Returns:
        List with the paths of the single-page PDFs that were written, in
        page order. Each path is ``outputs/<input path without
        extension>_<index>.pdf`` with a zero-based page index.
    """
    # Strip the real extension instead of blindly cutting four characters.
    stem = os.path.splitext(input_file_path)[0]

    # Create the folder the outputs land in. When the input path has a
    # directory part this is a sub-folder of outputs/, which a bare
    # "outputs" check would not have created.
    out_dir = os.path.dirname(f"outputs/{stem}")
    os.makedirs(out_dir, exist_ok=True)

    out_paths = []
    # The `with` block closes the input handle once all pages are written.
    with open(input_file_path, "rb") as input_stream:
        inputpdf = PdfReader(input_stream)

        for i, page in enumerate(inputpdf.pages):
            output = PdfWriter()
            output.add_page(page)

            out_file_path = f"outputs/{stem}_{i}.pdf"
            with open(out_file_path, "wb") as output_stream:
                output.write(output_stream)

            out_paths.append(out_file_path)
    return out_paths

Note: The same function will work with pypdf as well. Import PdfReader and PdfWriter from pypdf rather than PyPDF2.

飘逸的'云 2025-02-07 10:04:09

我知道这段代码与 Python 无关,但我还是想贴出这段 R 代码,它简单、灵活,而且效果很好。R 中的 pdftools 包可以轻松地拆分和合并 PDF。

library(pdftools) # R package (not Python) used for pdf_subset() below
# Extract pages 1 through 51 of the input file into "subset.pdf"
# (per the `pages` and `output` arguments).
pdf_subset('D:\\file\\20.02.20\\22 GT 2017.pdf',
           pages = 1:51, output = "subset.pdf")

I know that this code is not related to Python; however, I felt like posting this piece of R code, which is simple, flexible and works amazingly well. The pdftools package in R makes splitting and merging PDFs easy.

library(pdftools) # R package (not Python) used for pdf_subset() below
# Extract pages 1 through 51 of the input file into "subset.pdf"
# (per the `pages` and `output` arguments).
pdf_subset('D:\\file\\20.02.20\\22 GT 2017.pdf',
           pages = 1:51, output = "subset.pdf")
禾厶谷欠 2025-02-07 10:04:09
# pip install pymupdf     

import fitz  # Importing the fitz library

# specify the path of the PDF file
pdf_path = "2021.pdf"

# open the PDF
try:
    pdf = fitz.open(pdf_path)
except FileNotFoundError:
    # NOTE(review): some PyMuPDF releases appear to signal a missing file with
    # their own exception type rather than the builtin one - confirm for the
    # installed version.
    print("File not found")
    # Stop here: everything after this point uses `pdf`, which is undefined
    # when opening failed.
    raise SystemExit(1)
  • 导入fitz fitz库用于使用PDF文件和图像。
  • 打开PDF :脚本尝试打开位于“ 2021.pdf”的PDF文件。如果该文件在指定的路径上不存在,则将捕获filenotfounderror并打印“未找到文件”。
# iterate over the pages from index 6 onwards, i.e. starting with the 7th page
for i, page in enumerate(pdf[6:], start=6):
    try:
        # render the page as an image without an alpha (transparency) channel
        pix = page.get_pixmap(alpha=False)
        # save the image; Pixmap.save() is used because Pixmap has no
        # writeJPG() method, and the ".jpg" suffix selects JPEG output.
        # The file name carries the 1-based page number.
        pix.save(f"page_{i+1}.jpg")
    except Exception as e:
        # best effort: report the failure and continue with the next page
        print(e)
  • 遍历页面:脚本从索引 6 开始遍历 PDF 的页面。Python 使用从零开始的索引,因此 pdf[6:] 指的是从第七页开始的页面。
  • 渲染页面为图像:对于每个页面,它将页面转换为pixmap,这是页面的图像表示。 alpha = false参数用于指定图像不应包括透明度。
  • 保存图像:每个页面图像都保存为JPEG文件。生成文件名以反映页码(page_ {i+1} .jpg)。枚举从6开始,但我们将其增加1,以反映从7开始的实际页码。
# Report completion, then close the PDF document.
# Note: the message is printed unconditionally, even if saving a page failed.
print("Successful")
pdf.close()
# pip install pymupdf     

import fitz  # Importing the fitz library

# specify the path of the PDF file
pdf_path = "2021.pdf"

# open the PDF
try:
    pdf = fitz.open(pdf_path)
except FileNotFoundError:
    # NOTE(review): some PyMuPDF releases appear to signal a missing file with
    # their own exception type rather than the builtin one - confirm for the
    # installed version.
    print("File not found")
    # Stop here: everything after this point uses `pdf`, which is undefined
    # when opening failed.
    raise SystemExit(1)
  • Importing fitz: The fitz library is used for working with PDF files and images.
  • Open PDF: The script attempts to open a PDF file located at "2021.pdf". If the file doesn't exist at the specified path, it catches a FileNotFoundError and prints "File not found".
# iterate over the pages from index 6 onwards, i.e. starting with the 7th page
for i, page in enumerate(pdf[6:], start=6):
    try:
        # render the page as an image without an alpha (transparency) channel
        pix = page.get_pixmap(alpha=False)
        # save the image; Pixmap.save() is used because Pixmap has no
        # writeJPG() method, and the ".jpg" suffix selects JPEG output.
        # The file name carries the 1-based page number.
        pix.save(f"page_{i+1}.jpg")
    except Exception as e:
        # best effort: report the failure and continue with the next page
        print(e)
  • Iterating Over Pages: The script iterates over the pages of the PDF starting at index 6. Python uses zero-based indexing, so pdf[6:] refers to the 7th page onwards.
  • Rendering Pages as Images: For each page, it converts the page into a pixmap, which is an image representation of the page. The alpha=False parameter is used to specify that the image should not include transparency.
  • Saving Images: Each page image is saved as a JPEG file. The filenames are generated to reflect the page numbers (page_{i+1}.jpg). The enumeration starts at 6, but we increment it by 1 to reflect the actual page numbers starting from 7.
# Report completion, then close the PDF document.
# Note: the message is printed unconditionally, even if saving a page failed.
print("Successful")
pdf.close()
于我来说 2025-02-07 10:04:09
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import sys
import glob
# Work from the directory that contains this script, so the relative output
# names below end up next to it.
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)

# Folder to scan for PDFs: next to the executable when running frozen,
# otherwise next to this script. The name now matches the one used below
# (it was defined as `_location_` but read as `__location__`).
if getattr(sys, 'frozen', False):
    __location__ = os.path.dirname(os.path.realpath(sys.executable))
else:
    __location__ = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))

pdf_paths = glob.glob(__location__ + "/*.pdf")
if not pdf_paths:
    # Without this guard the code below would fail on an undefined reader.
    raise SystemExit("No PDF file found in " + __location__)

# Same selection as before: when several PDFs match, the last one returned
# by glob is the one that gets split. Only that file is opened, so no other
# handles are left open.
# NOTE(review): PdfFileReader/PdfFileWriter are the older PyPDF2 class names;
# confirm the installed PyPDF2 version still supports them.
with open(pdf_paths[-1], 'rb') as pdf_file:
    pdf_reader = PdfFileReader(pdf_file)
    pageNumbers = pdf_reader.getNumPages()

    for i in range(pageNumbers):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf_reader.getPage(i))
        # Output names use 1-based page numbers: "Page 1.pdf", "Page 2.pdf", ...
        with open('Page ' + str(i + 1) + '.pdf', 'wb') as split_motive:
            pdf_writer.write(split_motive)

文章链接

from PyPDF2 import PdfFileReader, PdfFileWriter
import os
import sys
import glob
# Work from the directory that contains this script, so the relative output
# names below end up next to it.
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)

# Folder to scan for PDFs: next to the executable when running frozen,
# otherwise next to this script. The name now matches the one used below
# (it was defined as `_location_` but read as `__location__`).
if getattr(sys, 'frozen', False):
    __location__ = os.path.dirname(os.path.realpath(sys.executable))
else:
    __location__ = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))

pdf_paths = glob.glob(__location__ + "/*.pdf")
if not pdf_paths:
    # Without this guard the code below would fail on an undefined reader.
    raise SystemExit("No PDF file found in " + __location__)

# Same selection as before: when several PDFs match, the last one returned
# by glob is the one that gets split. Only that file is opened, so no other
# handles are left open.
# NOTE(review): PdfFileReader/PdfFileWriter are the older PyPDF2 class names;
# confirm the installed PyPDF2 version still supports them.
with open(pdf_paths[-1], 'rb') as pdf_file:
    pdf_reader = PdfFileReader(pdf_file)
    pageNumbers = pdf_reader.getNumPages()

    for i in range(pageNumbers):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf_reader.getPage(i))
        # Output names use 1-based page numbers: "Page 1.pdf", "Page 2.pdf", ...
        with open('Page ' + str(i + 1) + '.pdf', 'wb') as split_motive:
            pdf_writer.write(split_motive)

Link to article

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文