使用 Google Codelabs 中的 Document AI Python 示例代码时，返回错误或空结果

发布于 2025-01-28 18:06:44 字数 3200 浏览 5 评论 0原文

我尝试了Codelabs.developers.google.com的以下代码：

import pandas as pd
from google.cloud import documentai_v1 as documentai


def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    Send one local file to a Document AI processor and return the result.

    Args:
        project_id: Project identifier used to build the processor name.
        location: Processor location; also used as the prefix of the API
            endpoint host name.
        processor_id: Identifier of the processor. Processors have to be
            created in the Cloud Console before they can be called.
        file_path: Path of the local file to upload.
        mime_type: MIME type declared for the uploaded bytes.

    Returns:
        The ``document`` attribute of the online processing response.
    """
    # The endpoint host name is derived from the processor location.
    client = documentai.DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Fully-qualified processor name assembled from its three components.
    processor_name = client.processor_path(project_id, location, processor_id)

    # The whole file is read as raw bytes and sent inline with the request.
    with open(file_path, "rb") as source:
        payload = source.read()

    response = client.process_document(
        request=documentai.ProcessRequest(
            name=processor_name,
            raw_document=documentai.RawDocument(content=payload, mime_type=mime_type),
        )
    )
    return response.document


# Processor coordinates -- replace the placeholders with real values.
PROJECT_ID = "YOUR_PROJECT_ID"
LOCATION = "YOUR_PROJECT_LOCATION"  # Format is 'us' or 'eu'
PROCESSOR_ID = "PROCUREMENT_SPLITTER_ID"  # Create processor in Cloud Console

# Input file, looked up relative to the current working directory.
FILE_PATH = "procurement_multi_document.pdf"
# Supported file types are listed at
# https://cloud.google.com/document-ai/docs/processors-list
MIME_TYPE = "application/pdf"

document = online_process(PROJECT_ID, LOCATION, PROCESSOR_ID, FILE_PATH, MIME_TYPE)
print("Document processing complete.")

# One table row per entity: the script treats every entity as a classification.
types, confidence, pages = [], [], []
for entity in document.entities:
    types.append(entity.type_)
    confidence.append(f"{entity.confidence:.0%}")
    # page_anchor.page_refs lists the pages matched by this classification.
    pages.append([page_ref.page for page_ref in entity.page_anchor.page_refs])

# Show the collected columns in tabular form.
df = pd.DataFrame({"Classification": types, "Confidence": confidence, "Pages": pages})
print(df)

预期输出（类似）：

$ python3 classification.py
Document processing complete.
         Classification Confidence Pages
0     invoice_statement       100%   [0]
1     receipt_statement        98%   [1]
2                 other        81%   [2]
3     utility_statement       100%   [3]
4  restaurant_statement       100%   [4]

结果输出：

Document processing complete.
Empty DataFrame
Columns: [Classification, Confidence, Pages]
Index: []

我检查了传给 Google Document AI 的其他所有输入，一切看起来都没有问题。而且我可以通过 document.text 顺利获取文本（OCR）。会不会是我的 Document AI 配额已经用完了？还有其他可能的原因吗？如果可以的话，该如何解决？谢谢。

原文

I tried the following code from codelabs.developers.google.com:

import pandas as pd
from google.cloud import documentai_v1 as documentai


def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    Send one local file to a Document AI processor and return the result.

    Args:
        project_id: Project identifier used to build the processor name.
        location: Processor location; also used as the prefix of the API
            endpoint host name.
        processor_id: Identifier of the processor. Processors have to be
            created in the Cloud Console before they can be called.
        file_path: Path of the local file to upload.
        mime_type: MIME type declared for the uploaded bytes.

    Returns:
        The ``document`` attribute of the online processing response.
    """
    # The endpoint host name is derived from the processor location.
    client = documentai.DocumentProcessorServiceClient(
        client_options={"api_endpoint": f"{location}-documentai.googleapis.com"}
    )

    # Fully-qualified processor name assembled from its three components.
    processor_name = client.processor_path(project_id, location, processor_id)

    # The whole file is read as raw bytes and sent inline with the request.
    with open(file_path, "rb") as source:
        payload = source.read()

    response = client.process_document(
        request=documentai.ProcessRequest(
            name=processor_name,
            raw_document=documentai.RawDocument(content=payload, mime_type=mime_type),
        )
    )
    return response.document


# Processor coordinates -- replace the placeholders with real values.
PROJECT_ID = "YOUR_PROJECT_ID"
LOCATION = "YOUR_PROJECT_LOCATION"  # Format is 'us' or 'eu'
PROCESSOR_ID = "PROCUREMENT_SPLITTER_ID"  # Create processor in Cloud Console

# Input file, looked up relative to the current working directory.
FILE_PATH = "procurement_multi_document.pdf"
# Supported file types are listed at
# https://cloud.google.com/document-ai/docs/processors-list
MIME_TYPE = "application/pdf"

document = online_process(PROJECT_ID, LOCATION, PROCESSOR_ID, FILE_PATH, MIME_TYPE)
print("Document processing complete.")

# One table row per entity: the script treats every entity as a classification.
types, confidence, pages = [], [], []
for entity in document.entities:
    types.append(entity.type_)
    confidence.append(f"{entity.confidence:.0%}")
    # page_anchor.page_refs lists the pages matched by this classification.
    pages.append([page_ref.page for page_ref in entity.page_anchor.page_refs])

# Show the collected columns in tabular form.
df = pd.DataFrame({"Classification": types, "Confidence": confidence, "Pages": pages})
print(df)

Expected output (something like this):

$ python3 classification.py
Document processing complete.
         Classification Confidence Pages
0     invoice_statement       100%   [0]
1     receipt_statement        98%   [1]
2                 other        81%   [2]
3     utility_statement       100%   [3]
4  restaurant_statement       100%   [4]

Resulting output:

Document processing complete.
Empty DataFrame
Columns: [Classification, Confidence, Pages]
Index: []

I checked every other input to Google Document AI and everything else seems fine. I can, however, get the text (OCR) using document.text without any problem. Maybe I have reached the end of my Document AI quota? Is there any other possible reason, and how can I resolve this if possible? Thank you.

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

最佳男配角 2025-02-04 18:06:44

Document AI 是一种文档理解解决方案，它利用机器学习让非结构化数据更易于理解和分析。您使用的代码针对的是专用处理器 Procurement DocAI，它以发票、收据或水电费账单等非结构化文档作为输入，并将其转换为结构化数据。Document AI 支持的文件类型列在此文档中。问题似乎出在您使用的文件类型上。请尝试使用文档中列出的文件类型。有关更多信息，您可以查看此 link 。

我尝试了以下代码以获取预期的输出。

代码

import pandas as pd
from google.cloud import documentai_v1 as documentai


def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    Process a local file with the Document AI online processing API.

    Args:
        project_id: Project identifier used to build the processor name.
        location: Processor location; also used as the prefix of the API
            endpoint host name.
        processor_id: Identifier of the processor to call.
        file_path: Path of the local file to upload.
        mime_type: MIME type declared for the uploaded bytes.

    Returns:
        The ``document`` attribute of the processing response.
    """
    # The endpoint host name is derived from the processor location.
    opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    # Instantiates a client
    documentai_client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Fully-qualified processor name assembled from its three components.
    resource_name = documentai_client.processor_path(project_id, location, processor_id)

    # Read the file into memory
    with open(file_path, "rb") as file:
        file_content = file.read()

    # Load Binary Data into Document AI RawDocument Object
    raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)

    # Configure the process request
    request = documentai.ProcessRequest(name=resource_name, raw_document=raw_document)

    # Use the Document AI client to process the uploaded file
    result = documentai_client.process_document(request=request)

    return result.document


# Processor coordinates -- replace the placeholders with real values.
PROJECT_ID = "project"
LOCATION = "us"
PROCESSOR_ID = "processor-id"

# Input file, looked up relative to the current working directory.
FILE_PATH = "file_path"
# Supported file types are listed at
# https://cloud.google.com/document-ai/docs/processors-list
MIME_TYPE = "mime_type"

document = online_process(PROJECT_ID, LOCATION, PROCESSOR_ID, FILE_PATH, MIME_TYPE)
print("Document processing complete.")

# One table row per entity: the script treats every entity as a classification.
types, confidence, pages = [], [], []
for entity in document.entities:
    types.append(entity.type_)
    confidence.append(f"{entity.confidence:.0%}")
    # page_anchor.page_refs lists the pages matched by this classification.
    pages.append([page_ref.page for page_ref in entity.page_anchor.page_refs])

# Show the collected columns in tabular form.
df = pd.DataFrame({"Classification": types, "Confidence": confidence, "Pages": pages})
print(df)

不同文件的输出：

procurement_multi_document.pdf
invoice.pdf
sample.pdf - 仅包含段落的 PDF 文件
kitten.png - 包含单个图像

Document AI is a document understanding solution that uses machine learning to make unstructured data easier to understand and analyze. The code that you have used is for the specialized processor Procurement DocAI, which takes unstructured documents such as invoices, receipts or utility statements as input and turns them into structured data. The file types which are supported in Document AI are listed in this document. The problem seems to be the type of file you are using. Try to use a file type listed in the documentation. For more information you can check this link .

I have tried the below code to get the expected output.

Code

import pandas as pd
from google.cloud import documentai_v1 as documentai


def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    Process a local file with the Document AI online processing API.

    Args:
        project_id: Project identifier used to build the processor name.
        location: Processor location; also used as the prefix of the API
            endpoint host name.
        processor_id: Identifier of the processor to call.
        file_path: Path of the local file to upload.
        mime_type: MIME type declared for the uploaded bytes.

    Returns:
        The ``document`` attribute of the processing response.
    """
    # The endpoint host name is derived from the processor location.
    opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    # Instantiates a client
    documentai_client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # Fully-qualified processor name assembled from its three components.
    resource_name = documentai_client.processor_path(project_id, location, processor_id)

    # Read the file into memory
    with open(file_path, "rb") as file:
        file_content = file.read()

    # Load Binary Data into Document AI RawDocument Object
    raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)

    # Configure the process request
    request = documentai.ProcessRequest(name=resource_name, raw_document=raw_document)

    # Use the Document AI client to process the uploaded file
    result = documentai_client.process_document(request=request)

    return result.document


# Processor coordinates -- replace the placeholders with real values.
PROJECT_ID = "project"
LOCATION = "us"
PROCESSOR_ID = "processor-id"

# Input file, looked up relative to the current working directory.
FILE_PATH = "file_path"
# Supported file types are listed at
# https://cloud.google.com/document-ai/docs/processors-list
MIME_TYPE = "mime_type"

document = online_process(PROJECT_ID, LOCATION, PROCESSOR_ID, FILE_PATH, MIME_TYPE)
print("Document processing complete.")

# One table row per entity: the script treats every entity as a classification.
types, confidence, pages = [], [], []
for entity in document.entities:
    types.append(entity.type_)
    confidence.append(f"{entity.confidence:.0%}")
    # page_anchor.page_refs lists the pages matched by this classification.
    pages.append([page_ref.page for page_ref in entity.page_anchor.page_refs])

# Show the collected columns in tabular form.
df = pd.DataFrame({"Classification": types, "Confidence": confidence, "Pages": pages})
print(df)

Outputs for different files :