使用Python压缩PDF的图像（帮助更换PDF中的图像）

发布于 2025-01-21 01:27:24 字数 9022 浏览 2 评论 0原文

我的目标是拥有一个 Python 脚本，它将获取 PDF、提取其图像、压缩/调整它们的大小，然后将这些新数据推送到 PDF，从而生成一个新的、更小的 PDF。

到目前为止，除了用正确的对应数据替换图像数据之外，我已经完成了所有操作。我找到了图像数据所在的正确位置（在“stream”和“endstream”关键字之间）。

这是我的代码（已更新）：

def crunchPdfImages(file_to_crunch, max_width=1200, max_height=628):
    """Re-save a PDF's images as JPEG and write them back into an uncompressed copy.

    The images are extracted into a "temp_working_dir" folder beside the input
    file and re-saved in place as optimised JPEGs. The PDF is then uncompressed
    with pypdftk and copied line by line into "new_images_pdf.pdf"; for every
    object carrying a "/Subtype /Image" line, the "/Length" line and the bytes
    between "stream" and "endstream" are replaced. Finally rebuildXrefTable is
    run on the new file.

    Args:
        file_to_crunch: Path of the PDF to process.
        max_width: Only used to compute page_ratio, which is never read.
        max_height: Only used to compute page_ratio, which is never read.
    """
    page_ratio = max_width / max_height  # NOTE(review): computed but never used below
    working_folder = os.path.dirname(file_to_crunch)
    working_dir = os.path.join(working_folder, "temp_working_dir")
    if not (os.path.exists(working_dir)): os.mkdir(working_dir)

    # Extract every image of the PDF into the working directory.
    extractPDFImages(file_to_crunch, working_dir)
    # Re-save each extracted image in place as an optimised RGB JPEG (no resize).
    # NOTE(review): os.scandir yields entries in arbitrary order; the loop further
    # down assumes this list is in the same order as the images appear in the PDF.
    all_image_list = [entry.path for entry in os.scandir(working_dir) if isImage(entry.path)]
    for an_image in all_image_list:
        img_picture = Image.open(an_image).convert("RGB")
        img_picture.save(an_image, "JPEG", optimize=True)

    # Uncompress the PDF so that its objects can be scanned line by line.
    pdf_folder = os.path.join(working_dir, "pdf_uncompressed")
    if not (os.path.exists(pdf_folder)): os.mkdir(pdf_folder)
    pdf_datain_file = os.path.join(pdf_folder, "uncompressed_pdf.pdf")
    pdf_dataout_file = os.path.join(pdf_folder, "new_images_pdf.pdf")
    # The input path is wrapped in literal double quotes before being handed to pypdftk.
    pypdftk.uncompress('"' + file_to_crunch + '"', pdf_datain_file)

    # Now get to work...
    #   The PDF is made up of objects, some of which are labelled as images.
    #   Each image has the line "/Subtype /Image" before the "stream" keyword, which is
    #   then closed by "endstream" and "endobj". The encoded image data sits between
    #   "stream" and "endstream"; it is replaced here in the order of all_image_list.
    picture_replace_count = 0
    pdf_openfile_in = open(pdf_datain_file, "rb")
    pdf_openfile_out = open(pdf_dataout_file, "wb")
    pdf_file_lines = pdf_openfile_in.readlines()

    # State flags of the line scanner.
    looking_for_next_stream = False    # inside an image object, before its "stream" line
    found_stream_and_removing = False  # inside the old image data, which is dropped
    updating_xref_stage = 0            # only ever set to 0 or 1 in this function
    skip_a_line = False                # the current line was already replaced

    for line in pdf_file_lines:
        new_line_addition = "" # NOTE(review): assigned in several places but never read
        current_line_val = line.decode("Latin-1").strip()

        if (looking_for_next_stream):
            # An image object was found; watch for its /Length and stream lines.
            if (current_line_val[:8] == "/Length "):
                # Replace the length line with the on-disk size of the new image file.
                skip_a_line = True
                new_img_size = str(os.path.getsize(all_image_list[picture_replace_count]))
                new_line = r"/Length " + new_img_size + "\n"
                pdf_openfile_out.write(new_line.encode("latin-1"))
            if (current_line_val == "stream"):
                print("Stream start found... skipping stream information")
                looking_for_next_stream = False # it's been found
                found_stream_and_removing = True # time to delete

                # Write the keyword explicitly: the final write below is suppressed
                # while found_stream_and_removing is set.
                new_line_addition = "stream\n".encode("latin-1")
                pdf_openfile_out.write(new_line_addition)

        elif (found_stream_and_removing):
            if (current_line_val == "endstream"):
                print("Stream end found")
                found_stream_and_removing = False # all old image data lines were skipped
                # Now add the new image data and continue.
                print("Adding new image data...")
                # NOTE(review): the /Length written above is the size of the file on
                # disk, while the bytes written here come from a fresh JPEG encode
                # into memory; the two sizes are not guaranteed to be equal.
                img = Image.open(all_image_list[picture_replace_count], mode='r')
                img_byte_arr = io.BytesIO()
                img.save(img_byte_arr, format='JPEG')
                img_byte_arr = img_byte_arr.getvalue()

                pdf_openfile_out.write(img_byte_arr)
                new_line_addition = img_byte_arr
                # The bare string literal below is an earlier experiment that was
                # disabled by quoting it; it has no effect at runtime.
                """
                for imgline in new_image_file.readlines():
                    #pdf_openfile_out.write(imgline.encode("Latin-1"))
                    #pdf_openfile_out.write(imgline)
                    pass
                #pdf_openfile_out.write("[IMAGEADD]".encode("latin-1")) # add new line
                """
                picture_replace_count += 1
                print("New image added.")
                # Separate the image bytes from the "endstream" line written below.
                pdf_openfile_out.write("\n".encode("latin-1"))

                if (picture_replace_count >= len(all_image_list)):
                    updating_xref_stage = 1 # All images replaced.

        elif (current_line_val == r"/Subtype /Image"):
            print("Found an image place, number " + str(picture_replace_count))
            print("Looking for stream start...")
            looking_for_next_stream = True

        # Copy the line through unless it belongs to a removed stream or was replaced.
        # NOTE(review): updating_xref_stage never reaches 4 here, so that test is always true.
        if not (found_stream_and_removing) and not (skip_a_line) and not (updating_xref_stage == 4): 
            pdf_openfile_out.write(line)

        skip_a_line = False


    pdf_openfile_in.close()
    pdf_openfile_out.close()

    print("Rebuilding xref table (post newfile creation)")
    rebuildXrefTable(pdf_dataout_file)

为了重建外部参照表（按照这里一条评论的建议），我运行以下函数。我确实尝试过在前一个函数中完成这一步，但输出数据的大小最终与预期不一致。我还没有弄清楚如何准确地将图像数据推送到 PDF 流中。

def rebuildXrefTable(pdf_file_in, pdf_file_out=None):
    """Copy an uncompressed PDF while regenerating its xref table and startxref value.

    Every line is copied to the output while a running byte count is kept. The
    offset of each line containing " obj" is recorded under the integer that
    starts the line; when the xref section is reached, its entries are replaced
    by the recorded offsets, and the value after "startxref" is replaced by the
    recorded offset of the "xref" line.

    Args:
        pdf_file_in: Path of the uncompressed PDF to read.
        pdf_file_out: Path of the file to write. Defaults to
            "rebuilt_xref_pdf.pdf" in the directory of pdf_file_in.
    """
    if (pdf_file_out == None): pdf_file_out = os.path.join(os.path.dirname(pdf_file_in), "rebuilt_xref_pdf.pdf")
    print("Updating xref table of: " + os.path.basename(pdf_file_in))

    byte_count = 0                  # number of bytes written to the output so far
    xref_start = 0                  # offset of the "xref" keyword, used for startxref
    object_location_reference = []  # (object index, byte offset) pairs
    updating_xref_stage = 1         # 1: before xref, 2-3: lines kept as is, 4: emit table, 5: drop old entries
    pdf_openfile_in = open(pdf_file_in, "rb")
    pdf_openfile_out = open(pdf_file_out, "wb")
    pdf_file_lines = pdf_openfile_in.readlines()

    for line in pdf_file_lines:
        current_line_val = line.decode("Latin-1").strip()
        if (" obj" in current_line_val):
            # Treat the line as an object header and record where it starts.
            # NOTE(review): any line containing " obj" lands here, and int() raises
            # ValueError when its first space-separated token is not a number.
            obj_ref_index = current_line_val.split(" ")[0]
            print("Found new object (index, location): (" + str(obj_ref_index) + ", " + str(byte_count) + ")")
            object_location_reference.append((int(obj_ref_index), byte_count))
        elif ("startxref" in current_line_val):
            # Last thing to edit: write the new xref offset and the end-of-file marker, then stop.
            print("Updating the xref start value with new data...")
            new_line = "startxref\n" + str(xref_start) + "\n" + r"%%EOF"
            pdf_openfile_out.write(new_line.encode("latin-1"))
            break
        elif ("xref" in current_line_val):
            # Remember the byte offset of the "xref" keyword itself.
            print("Recording the new xref byte location")
            preceeding_str = current_line_val.split("xref")[0]
            preceeding_count = len(preceeding_str.encode("latin-1"))
            xref_start = byte_count + preceeding_count # used at the end
            updating_xref_stage = 2 

        elif (updating_xref_stage == 2 or updating_xref_stage == 3): 
            # The first two lines after "xref" are copied to the new file unchanged.
            updating_xref_stage += 1
        elif (updating_xref_stage == 4):
            print("Creating new xref object byte location table...")
            object_location_reference.sort() # order the offsets by object index
            # Emit one 10-digit, zero-padded entry per recorded object.
            for xref_loc in object_location_reference:
                new_val = str(xref_loc[1]).zfill(10)
                new_val = new_val + " 00000 n \n"
                pdf_openfile_out.write(new_val.encode("latin-1"))
            updating_xref_stage = 5
        elif (updating_xref_stage == 5):
            # Stage 5 does not copy the lines it reads into the new file.
            # NOTE(review): this assigns 5 again, so the stage never leaves 5 and the
            # "trailer" line and everything after it (up to startxref) is not copied.
            if ("trailer" in current_line_val): updating_xref_stage = 5

        # Copy the line and advance the byte counter, except while in stage 5.
        if not (updating_xref_stage == 5):
            pdf_openfile_out.write(line)
            byte_count += len(line)

    pdf_openfile_in.close()
    pdf_openfile_out.close()

外部参照表是准确的并且指向正确的字节位置，我还确保它的顺序正确（使用对象索引号，而不是它在文件中出现的顺序 - 这与原始 PDF 文档匹配）。

如果我不尝试替换任何内容，只是将数据吐出到新的 PDF 文件中，它就可以工作，我可以打开新文件。但是，当插入替换 JPG 数据时，PDF 无法打开，因为它已损坏。

我不知道如何将正确的数据从压缩图像推送到 PDF 文件。

我也尝试过简单地推送 JPG 数据，如下所示：

# Copy the re-compressed JPEG file into the output PDF byte for byte; the
# with-statement closes the image file even if the write fails.
with open(all_image_list[picture_replace_count], 'rb') as image:
    pdf_openfile_out.write(image.read())

使用：Python 3.8

原文

My goal is to have a Python script that will take a PDF, extract its images, compress/resize them, and then push this new data to the PDF which will result in a new, smaller PDF.

So far I have done everything except replace the image data with the correct counterpart. I have found the correct place where the image data is located (between the "stream" and "endstream" keywords).

Here is my code (updated):

def crunchPdfImages(file_to_crunch, max_width=1200, max_height=628):
    """Re-save a PDF's images as JPEG and write them back into an uncompressed copy.

    The images are extracted into a "temp_working_dir" folder beside the input
    file and re-saved in place as optimised JPEGs. The PDF is then uncompressed
    with pypdftk and copied line by line into "new_images_pdf.pdf"; for every
    object carrying a "/Subtype /Image" line, the "/Length" line and the bytes
    between "stream" and "endstream" are replaced. Finally rebuildXrefTable is
    run on the new file.

    Args:
        file_to_crunch: Path of the PDF to process.
        max_width: Only used to compute page_ratio, which is never read.
        max_height: Only used to compute page_ratio, which is never read.
    """
    page_ratio = max_width / max_height  # NOTE(review): computed but never used below
    working_folder = os.path.dirname(file_to_crunch)
    working_dir = os.path.join(working_folder, "temp_working_dir")
    if not (os.path.exists(working_dir)): os.mkdir(working_dir)

    # Extract every image of the PDF into the working directory.
    extractPDFImages(file_to_crunch, working_dir)
    # Re-save each extracted image in place as an optimised RGB JPEG (no resize).
    # NOTE(review): os.scandir yields entries in arbitrary order; the loop further
    # down assumes this list is in the same order as the images appear in the PDF.
    all_image_list = [entry.path for entry in os.scandir(working_dir) if isImage(entry.path)]
    for an_image in all_image_list:
        img_picture = Image.open(an_image).convert("RGB")
        img_picture.save(an_image, "JPEG", optimize=True)

    # Uncompress the PDF so that its objects can be scanned line by line.
    pdf_folder = os.path.join(working_dir, "pdf_uncompressed")
    if not (os.path.exists(pdf_folder)): os.mkdir(pdf_folder)
    pdf_datain_file = os.path.join(pdf_folder, "uncompressed_pdf.pdf")
    pdf_dataout_file = os.path.join(pdf_folder, "new_images_pdf.pdf")
    # The input path is wrapped in literal double quotes before being handed to pypdftk.
    pypdftk.uncompress('"' + file_to_crunch + '"', pdf_datain_file)

    # Now get to work...
    #   The PDF is made up of objects, some of which are labelled as images.
    #   Each image has the line "/Subtype /Image" before the "stream" keyword, which is
    #   then closed by "endstream" and "endobj". The encoded image data sits between
    #   "stream" and "endstream"; it is replaced here in the order of all_image_list.
    picture_replace_count = 0
    pdf_openfile_in = open(pdf_datain_file, "rb")
    pdf_openfile_out = open(pdf_dataout_file, "wb")
    pdf_file_lines = pdf_openfile_in.readlines()

    # State flags of the line scanner.
    looking_for_next_stream = False    # inside an image object, before its "stream" line
    found_stream_and_removing = False  # inside the old image data, which is dropped
    updating_xref_stage = 0            # only ever set to 0 or 1 in this function
    skip_a_line = False                # the current line was already replaced

    for line in pdf_file_lines:
        new_line_addition = "" # NOTE(review): assigned in several places but never read
        current_line_val = line.decode("Latin-1").strip()

        if (looking_for_next_stream):
            # An image object was found; watch for its /Length and stream lines.
            if (current_line_val[:8] == "/Length "):
                # Replace the length line with the on-disk size of the new image file.
                skip_a_line = True
                new_img_size = str(os.path.getsize(all_image_list[picture_replace_count]))
                new_line = r"/Length " + new_img_size + "\n"
                pdf_openfile_out.write(new_line.encode("latin-1"))
            if (current_line_val == "stream"):
                print("Stream start found... skipping stream information")
                looking_for_next_stream = False # it's been found
                found_stream_and_removing = True # time to delete

                # Write the keyword explicitly: the final write below is suppressed
                # while found_stream_and_removing is set.
                new_line_addition = "stream\n".encode("latin-1")
                pdf_openfile_out.write(new_line_addition)

        elif (found_stream_and_removing):
            if (current_line_val == "endstream"):
                print("Stream end found")
                found_stream_and_removing = False # all old image data lines were skipped
                # Now add the new image data and continue.
                print("Adding new image data...")
                # NOTE(review): the /Length written above is the size of the file on
                # disk, while the bytes written here come from a fresh JPEG encode
                # into memory; the two sizes are not guaranteed to be equal.
                img = Image.open(all_image_list[picture_replace_count], mode='r')
                img_byte_arr = io.BytesIO()
                img.save(img_byte_arr, format='JPEG')
                img_byte_arr = img_byte_arr.getvalue()

                pdf_openfile_out.write(img_byte_arr)
                new_line_addition = img_byte_arr
                # The bare string literal below is an earlier experiment that was
                # disabled by quoting it; it has no effect at runtime.
                """
                for imgline in new_image_file.readlines():
                    #pdf_openfile_out.write(imgline.encode("Latin-1"))
                    #pdf_openfile_out.write(imgline)
                    pass
                #pdf_openfile_out.write("[IMAGEADD]".encode("latin-1")) # add new line
                """
                picture_replace_count += 1
                print("New image added.")
                # Separate the image bytes from the "endstream" line written below.
                pdf_openfile_out.write("\n".encode("latin-1"))

                if (picture_replace_count >= len(all_image_list)):
                    updating_xref_stage = 1 # All images replaced.

        elif (current_line_val == r"/Subtype /Image"):
            print("Found an image place, number " + str(picture_replace_count))
            print("Looking for stream start...")
            looking_for_next_stream = True

        # Copy the line through unless it belongs to a removed stream or was replaced.
        # NOTE(review): updating_xref_stage never reaches 4 here, so that test is always true.
        if not (found_stream_and_removing) and not (skip_a_line) and not (updating_xref_stage == 4): 
            pdf_openfile_out.write(line)

        skip_a_line = False


    pdf_openfile_in.close()
    pdf_openfile_out.close()

    print("Rebuilding xref table (post newfile creation)")
    rebuildXrefTable(pdf_dataout_file)

To rebuild the xref table (as suggested in a comment here), I run the following function. I did try to do this inside the previous function, but the data ended up being written out with a different size. I haven't worked out how to accurately push the image data into the PDF stream.

def rebuildXrefTable(pdf_file_in, pdf_file_out=None):
    """Copy an uncompressed PDF while regenerating its xref table and startxref value.

    Every line is copied to the output while a running byte count is kept. The
    offset of each line containing " obj" is recorded under the integer that
    starts the line; when the xref section is reached, its entries are replaced
    by the recorded offsets, and the value after "startxref" is replaced by the
    recorded offset of the "xref" line.

    Args:
        pdf_file_in: Path of the uncompressed PDF to read.
        pdf_file_out: Path of the file to write. Defaults to
            "rebuilt_xref_pdf.pdf" in the directory of pdf_file_in.
    """
    if (pdf_file_out == None): pdf_file_out = os.path.join(os.path.dirname(pdf_file_in), "rebuilt_xref_pdf.pdf")
    print("Updating xref table of: " + os.path.basename(pdf_file_in))

    byte_count = 0                  # number of bytes written to the output so far
    xref_start = 0                  # offset of the "xref" keyword, used for startxref
    object_location_reference = []  # (object index, byte offset) pairs
    updating_xref_stage = 1         # 1: before xref, 2-3: lines kept as is, 4: emit table, 5: drop old entries
    pdf_openfile_in = open(pdf_file_in, "rb")
    pdf_openfile_out = open(pdf_file_out, "wb")
    pdf_file_lines = pdf_openfile_in.readlines()

    for line in pdf_file_lines:
        current_line_val = line.decode("Latin-1").strip()
        if (" obj" in current_line_val):
            # Treat the line as an object header and record where it starts.
            # NOTE(review): any line containing " obj" lands here, and int() raises
            # ValueError when its first space-separated token is not a number.
            obj_ref_index = current_line_val.split(" ")[0]
            print("Found new object (index, location): (" + str(obj_ref_index) + ", " + str(byte_count) + ")")
            object_location_reference.append((int(obj_ref_index), byte_count))
        elif ("startxref" in current_line_val):
            # Last thing to edit: write the new xref offset and the end-of-file marker, then stop.
            print("Updating the xref start value with new data...")
            new_line = "startxref\n" + str(xref_start) + "\n" + r"%%EOF"
            pdf_openfile_out.write(new_line.encode("latin-1"))
            break
        elif ("xref" in current_line_val):
            # Remember the byte offset of the "xref" keyword itself.
            print("Recording the new xref byte location")
            preceeding_str = current_line_val.split("xref")[0]
            preceeding_count = len(preceeding_str.encode("latin-1"))
            xref_start = byte_count + preceeding_count # used at the end
            updating_xref_stage = 2 

        elif (updating_xref_stage == 2 or updating_xref_stage == 3): 
            # The first two lines after "xref" are copied to the new file unchanged.
            updating_xref_stage += 1
        elif (updating_xref_stage == 4):
            print("Creating new xref object byte location table...")
            object_location_reference.sort() # order the offsets by object index
            # Emit one 10-digit, zero-padded entry per recorded object.
            for xref_loc in object_location_reference:
                new_val = str(xref_loc[1]).zfill(10)
                new_val = new_val + " 00000 n \n"
                pdf_openfile_out.write(new_val.encode("latin-1"))
            updating_xref_stage = 5
        elif (updating_xref_stage == 5):
            # Stage 5 does not copy the lines it reads into the new file.
            # NOTE(review): this assigns 5 again, so the stage never leaves 5 and the
            # "trailer" line and everything after it (up to startxref) is not copied.
            if ("trailer" in current_line_val): updating_xref_stage = 5

        # Copy the line and advance the byte counter, except while in stage 5.
        if not (updating_xref_stage == 5):
            pdf_openfile_out.write(line)
            byte_count += len(line)

    pdf_openfile_in.close()
    pdf_openfile_out.close()

The xref table is accurate and points to the right byte-location, I also made sure it was in the proper order (using the object index number, not the order it appears in the file--this matches the original PDF document).

If I don't try to replace anything and just spit out the data into the new PDF file, it works and I can open the new file. However, when the replacement JPG data is inserted the PDF can't open because it is broken.

I don't know how to push the right data to the PDF file from the compressed images.

I have also tried to simply push the JPG data like this:

# Copy the re-compressed JPEG file into the output PDF byte for byte; the
# with-statement closes the image file even if the write fails.
with open(all_image_list[picture_replace_count], 'rb') as image:
    pdf_openfile_out.write(image.read())

Using: Python 3.8

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

戴着白色围巾的女孩 2025-01-28 01:27:24

感谢这里的评论，我已经解决了这个问题。必须重建外部参照表，并将 JPG 数据作为一个整体放入。这是工作代码：

import os
import pypdftk # pdftk main prog has to be installed and added to path too...
import pdf2image
from PIL import Image

def crunchPdfImages(file_to_crunch):
    """Shrink a PDF by replacing its embedded images with optimised JPEG versions.

    Steps:
      1. Extract the images into "temp_working_dir" beside the input file.
      2. Re-save every extracted image in place as an optimised RGB JPEG.
      3. Uncompress the PDF with pdftk so it can be scanned line by line.
      4. Copy the uncompressed PDF, swapping each image stream for its JPEG.
      5. Rebuild the xref table of the copy (see rebuildXrefTable).

    Args:
        file_to_crunch: Path of the PDF to process.

    NOTE(review): the replacement relies on the extracted files being listed in
    the same order as the image objects appear in the PDF, and the object's
    other dictionary entries (e.g. its filter) are left untouched -- confirm
    both hold for the PDFs this is used on.
    """
    working_folder = os.path.dirname(file_to_crunch)
    working_dir = os.path.join(working_folder, "temp_working_dir")
    os.makedirs(working_dir, exist_ok=True)

    # Get all the images...
    extractPDFImages(file_to_crunch, working_dir)
    with os.scandir(working_dir) as dir_entries:
        all_image_list = [entry.path for entry in dir_entries if isImage(entry.path)]

    # Compress all the images... (no resize, just optimise)
    if all_image_list:
        for an_image in all_image_list:
            # Close the source file before overwriting the same path.
            with Image.open(an_image) as source_picture:
                img_picture = source_picture.convert("RGB")
            img_picture.save(an_image, "JPEG", optimize=True)
    else:
        print("No images found in PDF...")

    # Uncompress the PDF
    pdf_folder = os.path.join(working_dir, "pdf_uncompressed")
    os.makedirs(pdf_folder, exist_ok=True)
    pdf_datain_file = os.path.join(pdf_folder, "uncompressed_pdf.pdf")
    pdf_dataout_file = os.path.join(pdf_folder, "new_images_pdf.pdf")
    print("Uncompressing PDF...")
    # The paths are wrapped in literal double quotes so that paths containing
    # spaces survive being passed on to pdftk.
    pypdftk.uncompress('"' + file_to_crunch + '"', '"' + pdf_datain_file + '"')

    _replaceImageStreams(pdf_datain_file, pdf_dataout_file, all_image_list)

    print("Rebuilding xref table (post newfile creation)")
    rebuildXrefTable(pdf_dataout_file)


def _replaceImageStreams(pdf_file_in, pdf_file_out, image_paths):
    """Copy pdf_file_in to pdf_file_out, swapping each image stream for a file from image_paths.

    An image object is recognised by a "/Subtype /Image" line. Its "/Length"
    line and everything between its "stream" and "endstream" lines are replaced
    by the size and the raw bytes of the next file in image_paths. Image objects
    found after image_paths is exhausted are copied through unchanged.
    """
    picture_replace_count = 0
    looking_for_next_stream = False    # inside an image object, before "stream"
    found_stream_and_removing = False  # inside the old image data being dropped
    replacement_data = b""

    with open(pdf_file_in, "rb") as pdf_openfile_in, \
            open(pdf_file_out, "wb") as pdf_openfile_out:
        for line in pdf_openfile_in:
            current_line_val = line.decode("latin-1").strip()

            if looking_for_next_stream:
                if current_line_val.startswith("/Length "):
                    # Replace the old length; it is taken from the very bytes that
                    # will be written, so it always matches the stream body.
                    new_line = "/Length " + str(len(replacement_data)) + "\n"
                    pdf_openfile_out.write(new_line.encode("latin-1"))
                    continue
                if current_line_val == "stream":
                    print("Stream start found... skipping stream information")
                    looking_for_next_stream = False
                    found_stream_and_removing = True
                    pdf_openfile_out.write(b"stream\n")
                    continue
            elif found_stream_and_removing:
                if current_line_val != "endstream":
                    continue  # old image data: drop it
                print("Stream end found")
                found_stream_and_removing = False
                print("Adding new image data...")
                # The JPEG goes in as a whole, followed by a newline so that the
                # "endstream" line written below starts on its own line.
                pdf_openfile_out.write(replacement_data)
                pdf_openfile_out.write(b"\n")
                picture_replace_count += 1
            elif current_line_val == "/Subtype /Image":
                print("Found an image place, number " + str(picture_replace_count))
                if picture_replace_count < len(image_paths):
                    print("Looking for stream start...")
                    with open(image_paths[picture_replace_count], "rb") as image:
                        replacement_data = image.read()
                    looking_for_next_stream = True
                else:
                    # More image objects than extracted files: keep the original.
                    print("No replacement image available, keeping original stream")

            pdf_openfile_out.write(line)

def rebuildXrefTable(pdf_file_in, pdf_file_out=None):
    """Copy an uncompressed PDF while regenerating its xref table and startxref value.

    The file is copied line by line while the number of bytes written is
    tallied. The offset of every "N G obj" header is recorded; when the xref
    section is reached, its entries are replaced by the recorded offsets
    (sorted by object number), and the number after "startxref" is replaced by
    the offset of the "xref" keyword. Copying stops after "startxref".

    Assumes an uncompressed PDF with a single classic xref section whose first
    two lines after "xref" (subsection header and free entry) can be kept.

    Args:
        pdf_file_in: Path of the uncompressed PDF to read.
        pdf_file_out: Path of the file to write. Defaults to
            "rebuilt_xref_pdf.pdf" in the directory of pdf_file_in.
    """
    if pdf_file_out is None:
        pdf_file_out = os.path.join(os.path.dirname(pdf_file_in), "rebuilt_xref_pdf.pdf")
    print("Updating xref table of: " + os.path.basename(pdf_file_in))

    byte_count = 0                  # bytes written to the output so far
    xref_start = 0                  # offset of the "xref" keyword
    object_location_reference = []  # (object number, byte offset) pairs
    # 1: before xref, 2-3: keep two lines, 4: emit new table,
    # 5: drop old entries, 6: copy trailer
    updating_xref_stage = 1

    with open(pdf_file_in, "rb") as pdf_openfile_in, \
            open(pdf_file_out, "wb") as pdf_openfile_out:
        for line in pdf_openfile_in:
            current_line_val = line.decode("latin-1").strip()

            # Only a line that really starts with "<number> <generation> obj" is an
            # object header; stream data or text that merely contains " obj" must
            # not be parsed as one (int() on its first token would raise).
            tokens = current_line_val.split() if " obj" in current_line_val else []
            is_object_header = (
                len(tokens) >= 3
                and tokens[0].isdecimal()
                and tokens[1].isdecimal()
                and tokens[2].startswith("obj")
            )

            if is_object_header:
                obj_ref_index = int(tokens[0])
                print("Found new object (index, location): (" + str(obj_ref_index) + ", " + str(byte_count) + ")")
                object_location_reference.append((obj_ref_index, byte_count))
            elif "startxref" in current_line_val:
                # Last thing to edit: write the new xref offset and the file end.
                print("Updating the xref start value with new data...")
                new_line = "startxref\n" + str(xref_start) + "\n" + "%%EOF"
                pdf_openfile_out.write(new_line.encode("latin-1"))
                break
            elif "xref" in current_line_val:
                print("Recording the new xref byte location")
                # Account for anything that precedes the keyword on its line.
                preceeding_str = current_line_val.split("xref")[0]
                xref_start = byte_count + len(preceeding_str.encode("latin-1"))
                updating_xref_stage = 2
            elif updating_xref_stage in (2, 3):
                # The first two lines after "xref" are copied through unchanged.
                updating_xref_stage += 1
            elif updating_xref_stage == 4:
                print("Creating new xref object byte location table...")
                object_location_reference.sort()  # order by object number
                for _, object_offset in object_location_reference:
                    new_val = str(object_offset).zfill(10) + " 00000 n \n"
                    pdf_openfile_out.write(new_val.encode("latin-1"))
                updating_xref_stage = 5
            elif updating_xref_stage == 5:
                # Old table entries are dropped until the trailer begins.
                if "trailer" in current_line_val:
                    updating_xref_stage = 6

            # Copy the line and advance the byte tally, except for dropped entries.
            if updating_xref_stage != 5:
                pdf_openfile_out.write(line)
                byte_count += len(line)

# To use the PDF compression, call crunchPdfImages with the path of the PDF.
# The result is written to temp_working_dir/pdf_uncompressed/rebuilt_xref_pdf.pdf
# beside the input file. A raw string keeps the Windows backslashes literal.
crunchPdfImages(r"C:\Users\Person\Desktop\Test Folder\Pdf File.pdf")

Thanks to the comments here, I have resolved the issue. The xref table had to be rebuilt, and the JPG data placed in as a whole. Here is the working code:

import os
import pypdftk # pdftk main prog has to be installed and added to path too...
import pdf2image
from PIL import Image

def crunchPdfImages(file_to_crunch):
    """Shrink a PDF by replacing its embedded images with optimised JPEG versions.

    Steps:
      1. Extract the images into "temp_working_dir" beside the input file.
      2. Re-save every extracted image in place as an optimised RGB JPEG.
      3. Uncompress the PDF with pdftk so it can be scanned line by line.
      4. Copy the uncompressed PDF, swapping each image stream for its JPEG.
      5. Rebuild the xref table of the copy (see rebuildXrefTable).

    Args:
        file_to_crunch: Path of the PDF to process.

    NOTE(review): the replacement relies on the extracted files being listed in
    the same order as the image objects appear in the PDF, and the object's
    other dictionary entries (e.g. its filter) are left untouched -- confirm
    both hold for the PDFs this is used on.
    """
    working_folder = os.path.dirname(file_to_crunch)
    working_dir = os.path.join(working_folder, "temp_working_dir")
    os.makedirs(working_dir, exist_ok=True)

    # Get all the images...
    extractPDFImages(file_to_crunch, working_dir)
    with os.scandir(working_dir) as dir_entries:
        all_image_list = [entry.path for entry in dir_entries if isImage(entry.path)]

    # Compress all the images... (no resize, just optimise)
    if all_image_list:
        for an_image in all_image_list:
            # Close the source file before overwriting the same path.
            with Image.open(an_image) as source_picture:
                img_picture = source_picture.convert("RGB")
            img_picture.save(an_image, "JPEG", optimize=True)
    else:
        print("No images found in PDF...")

    # Uncompress the PDF
    pdf_folder = os.path.join(working_dir, "pdf_uncompressed")
    os.makedirs(pdf_folder, exist_ok=True)
    pdf_datain_file = os.path.join(pdf_folder, "uncompressed_pdf.pdf")
    pdf_dataout_file = os.path.join(pdf_folder, "new_images_pdf.pdf")
    print("Uncompressing PDF...")
    # The paths are wrapped in literal double quotes so that paths containing
    # spaces survive being passed on to pdftk.
    pypdftk.uncompress('"' + file_to_crunch + '"', '"' + pdf_datain_file + '"')

    _replaceImageStreams(pdf_datain_file, pdf_dataout_file, all_image_list)

    print("Rebuilding xref table (post newfile creation)")
    rebuildXrefTable(pdf_dataout_file)


def _replaceImageStreams(pdf_file_in, pdf_file_out, image_paths):
    """Copy pdf_file_in to pdf_file_out, swapping each image stream for a file from image_paths.

    An image object is recognised by a "/Subtype /Image" line. Its "/Length"
    line and everything between its "stream" and "endstream" lines are replaced
    by the size and the raw bytes of the next file in image_paths. Image objects
    found after image_paths is exhausted are copied through unchanged.
    """
    picture_replace_count = 0
    looking_for_next_stream = False    # inside an image object, before "stream"
    found_stream_and_removing = False  # inside the old image data being dropped
    replacement_data = b""

    with open(pdf_file_in, "rb") as pdf_openfile_in, \
            open(pdf_file_out, "wb") as pdf_openfile_out:
        for line in pdf_openfile_in:
            current_line_val = line.decode("latin-1").strip()

            if looking_for_next_stream:
                if current_line_val.startswith("/Length "):
                    # Replace the old length; it is taken from the very bytes that
                    # will be written, so it always matches the stream body.
                    new_line = "/Length " + str(len(replacement_data)) + "\n"
                    pdf_openfile_out.write(new_line.encode("latin-1"))
                    continue
                if current_line_val == "stream":
                    print("Stream start found... skipping stream information")
                    looking_for_next_stream = False
                    found_stream_and_removing = True
                    pdf_openfile_out.write(b"stream\n")
                    continue
            elif found_stream_and_removing:
                if current_line_val != "endstream":
                    continue  # old image data: drop it
                print("Stream end found")
                found_stream_and_removing = False
                print("Adding new image data...")
                # The JPEG goes in as a whole, followed by a newline so that the
                # "endstream" line written below starts on its own line.
                pdf_openfile_out.write(replacement_data)
                pdf_openfile_out.write(b"\n")
                picture_replace_count += 1
            elif current_line_val == "/Subtype /Image":
                print("Found an image place, number " + str(picture_replace_count))
                if picture_replace_count < len(image_paths):
                    print("Looking for stream start...")
                    with open(image_paths[picture_replace_count], "rb") as image:
                        replacement_data = image.read()
                    looking_for_next_stream = True
                else:
                    # More image objects than extracted files: keep the original.
                    print("No replacement image available, keeping original stream")

            pdf_openfile_out.write(line)

def rebuildXrefTable(pdf_file_in, pdf_file_out=None):
    """Copy an uncompressed PDF while regenerating its xref table and startxref value.

    The file is copied line by line while the number of bytes written is
    tallied. The offset of every "N G obj" header is recorded; when the xref
    section is reached, its entries are replaced by the recorded offsets
    (sorted by object number), and the number after "startxref" is replaced by
    the offset of the "xref" keyword. Copying stops after "startxref".

    Assumes an uncompressed PDF with a single classic xref section whose first
    two lines after "xref" (subsection header and free entry) can be kept.

    Args:
        pdf_file_in: Path of the uncompressed PDF to read.
        pdf_file_out: Path of the file to write. Defaults to
            "rebuilt_xref_pdf.pdf" in the directory of pdf_file_in.
    """
    if pdf_file_out is None:
        pdf_file_out = os.path.join(os.path.dirname(pdf_file_in), "rebuilt_xref_pdf.pdf")
    print("Updating xref table of: " + os.path.basename(pdf_file_in))

    byte_count = 0                  # bytes written to the output so far
    xref_start = 0                  # offset of the "xref" keyword
    object_location_reference = []  # (object number, byte offset) pairs
    # 1: before xref, 2-3: keep two lines, 4: emit new table,
    # 5: drop old entries, 6: copy trailer
    updating_xref_stage = 1

    with open(pdf_file_in, "rb") as pdf_openfile_in, \
            open(pdf_file_out, "wb") as pdf_openfile_out:
        for line in pdf_openfile_in:
            current_line_val = line.decode("latin-1").strip()

            # Only a line that really starts with "<number> <generation> obj" is an
            # object header; stream data or text that merely contains " obj" must
            # not be parsed as one (int() on its first token would raise).
            tokens = current_line_val.split() if " obj" in current_line_val else []
            is_object_header = (
                len(tokens) >= 3
                and tokens[0].isdecimal()
                and tokens[1].isdecimal()
                and tokens[2].startswith("obj")
            )

            if is_object_header:
                obj_ref_index = int(tokens[0])
                print("Found new object (index, location): (" + str(obj_ref_index) + ", " + str(byte_count) + ")")
                object_location_reference.append((obj_ref_index, byte_count))
            elif "startxref" in current_line_val:
                # Last thing to edit: write the new xref offset and the file end.
                print("Updating the xref start value with new data...")
                new_line = "startxref\n" + str(xref_start) + "\n" + "%%EOF"
                pdf_openfile_out.write(new_line.encode("latin-1"))
                break
            elif "xref" in current_line_val:
                print("Recording the new xref byte location")
                # Account for anything that precedes the keyword on its line.
                preceeding_str = current_line_val.split("xref")[0]
                xref_start = byte_count + len(preceeding_str.encode("latin-1"))
                updating_xref_stage = 2
            elif updating_xref_stage in (2, 3):
                # The first two lines after "xref" are copied through unchanged.
                updating_xref_stage += 1
            elif updating_xref_stage == 4:
                print("Creating new xref object byte location table...")
                object_location_reference.sort()  # order by object number
                for _, object_offset in object_location_reference:
                    new_val = str(object_offset).zfill(10) + " 00000 n \n"
                    pdf_openfile_out.write(new_val.encode("latin-1"))
                updating_xref_stage = 5
            elif updating_xref_stage == 5:
                # Old table entries are dropped until the trailer begins.
                if "trailer" in current_line_val:
                    updating_xref_stage = 6

            # Copy the line and advance the byte tally, except for dropped entries.
            if updating_xref_stage != 5:
                pdf_openfile_out.write(line)
                byte_count += len(line)

# To use the PDF compression, call crunchPdfImages with the path of the PDF.
# The result is written to temp_working_dir/pdf_uncompressed/rebuilt_xref_pdf.pdf
# beside the input file. A raw string keeps the Windows backslashes literal.
crunchPdfImages(r"C:\Users\Person\Desktop\Test Folder\Pdf File.pdf")

回复收藏 0 原文

~没有更多了~