@Dharma_Teja_Dhulipud,
I have tried with pretrained-ocr-v2.0-2023-06-02.
15 pages → working.
16 pages → not working.
The error was:
Couldn’t preview the document: Document pages in non-imageless mode exceed the limit: 15 got 16. Try using imageless mode to increase the limit to 30.
This behavior should be expected as it’s written in the documentation:
Note: To extend the maximum page limit for online and synchronous requests up to 30, be sure to enable imageless_mode in the ProcessRequest.
That said, you’re saying that it is not working with batch mode. I have tried with 10 documents with more than 16 pages and had no trouble.
Can you share with us how you’re requesting a batch to operate so we can see if there is an issue?
Otherwise, here is the Python code I used to generate a successful batch:
"""Test batch processing of documents using Google Cloud Document AI."""
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
from google.cloud.documentai_toolbox import gcs_utilities
# --- Configuration: replace the bracketed placeholders before running. ---
project_id = "[YOUR_PROJECT_ID]" # TODO: Update this to your GCP project ID
location = "[YOUR_PROCESSOR_LOCATION]" # TODO: Update this to your processor location (e.g., us or eu)
processor_id = "[YOUR_PROCESSOR_ID]" # TODO: Update this to your processor ID
gcs_bucket_name = "[YOUR_BUCKET_NAME]" # TODO: Update this to your GCS bucket name
gcs_prefix = "input/" # Folder in the bucket where input documents are stored
gcs_output_uri = f"gs://{gcs_bucket_name}/output/" # GCS location for output files
batch_size = 10 # Number of documents to process in each batch
def process_document_batch(
    project_id: str,
    location: str,
    processor_id: str,
    gcs_bucket_name: str,
    gcs_prefix: str,
    gcs_output_uri: str,
    batch_size: int,
) -> None:
    """Run asynchronous batch processing on documents stored in a GCS bucket.

    Args:
        project_id: GCP project ID that owns the Document AI processor.
        location: Processor location (e.g. "us" or "eu"); also selects the
            regional API endpoint.
        processor_id: Document AI processor ID.
        gcs_bucket_name: Bucket containing the input documents.
        gcs_prefix: Folder (object prefix) in the bucket to read inputs from.
        gcs_output_uri: gs:// URI where the JSON results are written.
        batch_size: Maximum number of documents per batch request.

    Raises:
        google.api_core.exceptions.GoogleAPICallError: if a batch operation
            fails on the service side.
        concurrent.futures.TimeoutError: if an operation does not finish
            within the per-batch timeout.
    """
    # 1. Initialize the Document AI client.
    # The regional endpoint must match the processor location (us or eu).
    client_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(client_options=client_options)

    # 2. Build the full processor resource name:
    # projects/{project_id}/locations/{location}/processors/{processor_id}
    name = client.processor_path(project_id, location, processor_id)

    # 3. Create batches of documents using the Toolbox utility.
    # It discovers the files under the prefix and splits them into input
    # configs of at most batch_size documents each.
    batches = gcs_utilities.create_batches(
        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix, batch_size=batch_size
    )
    print(f"Total batches created: {len(batches)}")

    # 4. Define the output configuration once — it is identical for every
    # batch, so there is no reason to rebuild it inside the loop. Document AI
    # writes each operation's JSON results under its own operation-id
    # subfolder of this URI, so successive batches do not overwrite each other.
    output_config = documentai.DocumentOutputConfig(
        gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
            gcs_uri=gcs_output_uri
        )
    )

    for i, batch in enumerate(batches, start=1):
        # 5. Build the BatchProcessRequest for this slice of documents.
        request = documentai.BatchProcessRequest(
            name=name,
            input_documents=batch,
            document_output_config=output_config,
        )

        # 6. Start the asynchronous batch processing operation.
        print(f"Executing batch {i}/{len(batches)}...")
        operation = client.batch_process_documents(request=request)

        # Block until the long-running operation finishes.
        # Note: Large batches may take several minutes.
        print(f"Waiting for operation {operation.operation.name} to complete...")
        operation.result(timeout=600)
        print(f"Batch {i} completed successfully.")
if __name__ == "__main__":
    # Collect the module-level configuration and hand it to the batch runner.
    run_kwargs = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "gcs_bucket_name": gcs_bucket_name,
        "gcs_prefix": gcs_prefix,
        "gcs_output_uri": gcs_output_uri,
        "batch_size": batch_size,
    }
    process_document_batch(**run_kwargs)