使用Python将AutoCAD SHX文本转换为PDF中的可搜索文本

发布于 2025-02-06 01:56:34 字数 4238 浏览 3 评论 0原文

无法搜索具有 AutoCAD SHX文本的PDF 无法搜索。我们有一个带注释的PDF文件。我的目标是将所有AutoCAD SHX文本从PDF转换为文本，以便能够搜索它。我发现“ nofollow noreferrer”>转换autocad shx shx shx pdf shx pdf注释到可搜索的pdf 显示了如何转换pdf 发短信。

但是，我无法获得确切的位置，将转换后的文本放在注释的顶部，并且以90度的定向与原始文本定向。我能够将所有页面带有内容，并将它们合并到新的PDF文件中。

这是我的代码

import io,os
import PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4, landscape
from datetime import date, time, datetime,timedelta

def convert_annot(pdf_name):
    pdf = PdfFileReader(open(pdf_name, "rb"))
    numPages = pdf.getNumPages()
    output = PdfFileWriter()

    # Loop to pages
    for i in range(0, numPages):
        page = pdf.getPage(i) 
        objs=[]
        try :
            for annot in page['/Annots']:
                obj=annot.getObject()
                if 'AutoCAD SHX Text' in obj.values():
                    print (obj['/Contents'],obj['/Rect'])
                    objs.append(obj)
        
            page_size=pdf.getPage(0).mediaBox.upperRight
            w, h =page_size
            packet = io.BytesIO()
            c = canvas.Canvas(packet,pagesize=landscape(A4))
            c.setFont('Helvetica-Bold',12)
            c.setFillColor(colors.red)
            for obj in objs:
                if '/Contents' in obj:
                    xy=tuple(obj['/Rect'])
                    llx,lly,urx,ury=xy   #LowerLeftX,LowerLeftY,UpperRightX, UpperRightY
                    text=obj['/Contents']
                    x1=int(urx)
                    y1=int(lly)
                    c.drawString(x1,y1,text)
            c.save()

            #buffer start from 0
            packet.seek(0)
            new_pdf = PdfFileReader(packet)
            page=None
            new_pdf_file_name=None

            #read existing pdf
            page = pdf.pages[i]
            page.merge_page(new_pdf.pages[0])
            output.add_page(page)
        except :
            page_size=pdf.getPage(0).mediaBox.upperRight
            w, h =page_size
            packet = io.BytesIO()
            c = canvas.Canvas(packet,pagesize=landscape(A4))
            c.setFont('Helvetica-Bold',12)
            c.setFillColor(colors.red)
            for obj in objs:
                if '/Contents' in obj:
                    xy=tuple(obj['/Rect'])
                    llx,lly,urx,ury=xy   #LowerLeftX,LowerLeftY,UpperRightX, UpperRightY
                    text=obj['/Contents']
                    x1=int(urx)
                    y1=int(lly)
                    c.drawString(x1,y1,text)
            c.save()
            #buffer start from 0
            packet.seek(0)
            new_pdf = PdfFileReader(packet)
            page=None
            new_pdf_file_name=None
            #read existing pdf
            page = pdf.pages[i]
            page.merge_page(new_pdf.pages[0])
            output.add_page(page)

    # Finally output new pdf
    new_pdf_file_name=os.path.splitext(pdf_name)[0]+".annot.pdf"
    outputStream = open(new_pdf_file_name, "wb")
    output.write(outputStream)
    outputStream.close()

if __name__ == '__main__':
    pdf_name='samplePdf.pdf'
    convert_annot(pdf_name)

示例PDF带有注释（无法选择或搜索）：

输出结果：

文本 notes （红色）是不完全放置在文本的位置注释（黑色| AutoCAD SHX文本）。

我是Python的新手。我正在尝试获取注释的确切位置，以便可以将其放置在转换后的文本上。我仍在考虑使用 pdfminer 获得坐标的方法。问题是我不知道如何在对象的循环中使用它（在页面中选择的AutoCAD SHX文本注释），以便每个对象都有一个精确的坐标，用于放置转换后的文本。

预期结果应为：

感谢特定转换文本的位置（使用PDFMiner）的任何想法或概念。

原文

PDF with AutoCAD SHX Text cannot be searched. We have a PDF files with annotation. My goal is to convert all AutoCAD SHX Text from PDF into text to be able to search it. I found Convert AutoCAD SHX PDF annotation into searchable PDF that shows how to convert annotation to text.

However, I cannot get the exact location to place the converted text at the top of annotation and it is oriented in 90 degrees to the original text. I am able to get all the pages with contents and merge them to a new PDF file.

Here is my code

import io,os
import PyPDF2
from PyPDF2 import PdfFileWriter, PdfFileReader
from reportlab.pdfgen import canvas
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4, landscape
from datetime import date, time, datetime,timedelta

def convert_annot(pdf_name):
    pdf = PdfFileReader(open(pdf_name, "rb"))
    numPages = pdf.getNumPages()
    output = PdfFileWriter()

    # Loop to pages
    for i in range(0, numPages):
        page = pdf.getPage(i) 
        objs=[]
        try :
            for annot in page['/Annots']:
                obj=annot.getObject()
                if 'AutoCAD SHX Text' in obj.values():
                    print (obj['/Contents'],obj['/Rect'])
                    objs.append(obj)
        
            page_size=pdf.getPage(0).mediaBox.upperRight
            w, h =page_size
            packet = io.BytesIO()
            c = canvas.Canvas(packet,pagesize=landscape(A4))
            c.setFont('Helvetica-Bold',12)
            c.setFillColor(colors.red)
            for obj in objs:
                if '/Contents' in obj:
                    xy=tuple(obj['/Rect'])
                    llx,lly,urx,ury=xy   #LowerLeftX,LowerLeftY,UpperRightX, UpperRightY
                    text=obj['/Contents']
                    x1=int(urx)
                    y1=int(lly)
                    c.drawString(x1,y1,text)
            c.save()

            #buffer start from 0
            packet.seek(0)
            new_pdf = PdfFileReader(packet)
            page=None
            new_pdf_file_name=None

            #read existing pdf
            page = pdf.pages[i]
            page.merge_page(new_pdf.pages[0])
            output.add_page(page)
        except :
            page_size=pdf.getPage(0).mediaBox.upperRight
            w, h =page_size
            packet = io.BytesIO()
            c = canvas.Canvas(packet,pagesize=landscape(A4))
            c.setFont('Helvetica-Bold',12)
            c.setFillColor(colors.red)
            for obj in objs:
                if '/Contents' in obj:
                    xy=tuple(obj['/Rect'])
                    llx,lly,urx,ury=xy   #LowerLeftX,LowerLeftY,UpperRightX, UpperRightY
                    text=obj['/Contents']
                    x1=int(urx)
                    y1=int(lly)
                    c.drawString(x1,y1,text)
            c.save()
            #buffer start from 0
            packet.seek(0)
            new_pdf = PdfFileReader(packet)
            page=None
            new_pdf_file_name=None
            #read existing pdf
            page = pdf.pages[i]
            page.merge_page(new_pdf.pages[0])
            output.add_page(page)

    # Finally output new pdf
    new_pdf_file_name=os.path.splitext(pdf_name)[0]+".annot.pdf"
    outputStream = open(new_pdf_file_name, "wb")
    output.write(outputStream)
    outputStream.close()

if __name__ == '__main__':
    pdf_name='samplePdf.pdf'
    convert_annot(pdf_name)

Sample PDF with annotation (cannot be selected or search):

Output result:

The text NOTES (red color) is not placed exactly in the position of the text NOTES (black color | AutoCAD SHX Text).

I am new to Python. I am trying to get the exact location of annotation so that I can place to it the converted text. I'm still thinking a way on how do it by using PdfMiner to get the coordinates. The problem is I don't know how can I use it inside the loop of the object (selected AutoCAD SHX Text annotation in a page) so that every object will have an exact coordinates for placing the converted text.

Expected result should be: