-
Notifications
You must be signed in to change notification settings - Fork 6.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
docs(samples): add OCR, form, quality, splitter and specialized proce…
…ssing samples (#239) * docs(samples): add processing samples for OCR, quality, splitter and specialized * Update quality, specialized and splitter samples * Fix lint issues * Fix snippet tags * update library from v1 to v1beta3 * restore previous processing sample to avoid sample tag breakage
- Loading branch information
Showing
14 changed files
with
700 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
# [START documentai_process_form_document] | ||
|
||
# TODO(developer): Uncomment these variables before running the sample. | ||
# project_id= 'YOUR_PROJECT_ID' | ||
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu' | ||
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console | ||
# file_path = '/path/to/local/pdf' | ||
|
||
def process_document_form_sample(
    project_id: str, location: str, processor_id: str, file_path: str
) -> None:
    """Process a local PDF with a Document AI form processor and print a summary.

    Prints the full document text, the page count and, for every page, the
    size of each detected table plus every form field as a name/value pair.

    Args:
        project_id: Google Cloud project ID used in the processor resource name.
        location: Processor location, e.g. 'us' or 'eu'.
        processor_id: ID of a processor already created in the Cloud Console.
        file_path: Path to a local PDF file; it is read fully into memory.
    """
    from google.cloud import documentai_v1beta3 as documentai

    # You must set the api_endpoint if you use a location other than 'us'.
    # Regional endpoints follow '<location>-documentai.googleapis.com'.
    opts = {}
    if location != "us":
        opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    raw_document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": raw_document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)

    print("Document processing complete.")

    # Read the table and form fields output from the processor
    # The form processor also contains OCR data. For more information
    # on how to parse OCR data please see the OCR sample.
    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
    document = result.document
    text = document.text
    print(f"Full document text: {repr(text)}\n")
    print(f"There are {len(document.pages)} page(s) in this document.")

    # Read the text recognition output from the processor
    for page in document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        print(f"Found {len(page.tables)} table(s):")
        for table in page.tables:
            # Report 0 columns instead of raising IndexError when a table
            # has no header row.
            num_columns = (
                len(table.header_rows[0].cells) if table.header_rows else 0
            )
            num_rows = len(table.body_rows)
            print(f"Table with {num_columns} columns and {num_rows} rows:")
            print_table_info(table, text)
        print(f"Found {len(page.form_fields)} form fields:")
        for field in page.form_fields:
            # Distinct local names so the processor resource name above is
            # not overwritten by a form field's label.
            field_name = layout_to_text(field.field_name, text)
            field_value = layout_to_text(field.field_value, text)
            print(f" * {repr(field_name.strip())}: {repr(field_value.strip())}")
|
||
|
||
def print_table_info(table: dict, text: str) -> None:
    """Print the header cells and the first body row of a table.

    Cell texts are stripped, shown via ``repr`` and separated by ' | '.
    A table without header rows or without body rows prints an empty
    listing for that part instead of raising IndexError.

    Args:
        table: Table object exposing ``header_rows`` and ``body_rows``; it is
            accessed by attribute, despite the ``dict`` annotation.
        text: Full document text that the cell layouts index into.
    """
    # Print header row
    header_cells = table.header_rows[0].cells if table.header_rows else []
    header_row_text = " | ".join(
        repr(layout_to_text(header_cell.layout, text).strip())
        for header_cell in header_cells
    )
    print(f"Columns: {header_row_text}")
    # Print first body row
    body_cells = table.body_rows[0].cells if table.body_rows else []
    body_row_text = " | ".join(
        repr(layout_to_text(body_cell.layout, text).strip())
        for body_cell in body_cells
    )
    print(f"First row data: {body_row_text}\n")
|
||
|
||
def layout_to_text(layout: dict, text: str) -> str:
    """
    Document AI identifies form fields by their offsets in the entirety of the
    document's text. This function converts offsets to a string.

    Args:
        layout: Object exposing ``text_anchor.text_segments``; it is accessed
            by attribute, despite the ``dict`` annotation.
        text: Full document text that the segment offsets index into.

    Returns:
        The concatenation of ``text[start_index:end_index]`` for every
        segment, in order; an empty string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index):int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )
|
||
|
||
# [END documentai_process_form_document] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
import os | ||
|
||
from samples.snippets import process_document_form_sample | ||
|
||
|
||
# Fixture settings passed to the form sample by the test below.
file_path = "resources/invoice.pdf"
processor_id = "90484cfdedb024f6"
location = "us"
# Read from the environment; a missing variable raises KeyError at import time.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
|
||
|
||
def test_process_documents(capsys):
    """Run the form sample on the invoice fixture and check its stdout."""
    sample_kwargs = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "file_path": file_path,
    }
    process_document_form_sample.process_document_form_sample(**sample_kwargs)
    out = capsys.readouterr()[0]

    # Each fragment must appear somewhere in the captured output.
    for fragment in (
        "There are 1 page(s) in this document.",
        "Table with 4 columns and 6 rows",
        "Found 13 form fields",
        "'BALANCE DUE': '$2140.00'",
    ):
        assert fragment in out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
# [START documentai_process_ocr_document] | ||
|
||
# TODO(developer): Uncomment these variables before running the sample. | ||
# project_id= 'YOUR_PROJECT_ID' | ||
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu' | ||
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console | ||
# file_path = '/path/to/local/pdf' | ||
|
||
def process_document_ocr_sample(
    project_id: str, location: str, processor_id: str, file_path: str
) -> None:
    """Process a local PDF with a Document AI OCR processor and print a summary.

    Prints the full document text, the page count and, for every page, its
    dimensions, detected languages, paragraphs, blocks, lines and tokens.

    Args:
        project_id: Google Cloud project ID used in the processor resource name.
        location: Processor location, e.g. 'us' or 'eu'.
        processor_id: ID of a processor already created in the Cloud Console.
        file_path: Path to a local PDF file; it is read fully into memory.
    """
    from google.cloud import documentai_v1beta3 as documentai

    # You must set the api_endpoint if you use a location other than 'us'.
    # Regional endpoints follow '<location>-documentai.googleapis.com'.
    opts = {}
    if location != "us":
        opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    # You must create new processors in the Cloud Console first
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    raw_document = {"content": image_content, "mime_type": "application/pdf"}

    # Configure the process request
    request = {"name": name, "raw_document": raw_document}

    # Recognizes text entities in the PDF document
    result = client.process_document(request=request)

    print("Document processing complete.")

    # Read the text recognition output from the processor
    # For a full list of Document object attributes, please reference this page: https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document
    document = result.document
    text = document.text
    print(f"Full document text: {repr(text)}\n")
    print(f"There are {len(document.pages)} page(s) in this document.\n")

    for page in document.pages:
        print(f"Page {page.page_number}:")
        print_page_dimensions(page.dimension)
        print_detected_langauges(page.detected_languages)
        print_paragraphs(page.paragraphs, text)
        print_blocks(page.blocks, text)
        print_lines(page.lines, text)
        print_tokens(page.tokens, text)
|
||
|
||
def print_page_dimensions(dimension: dict) -> None:
    """Print the ``width`` and ``height`` attributes of a page dimension.

    ``dimension`` is accessed by attribute, despite the ``dict`` annotation.
    """
    width_line = f" Width: {dimension.width!s}"
    print(width_line)
    height_line = f" Height: {dimension.height!s}"
    print(height_line)
|
||
|
||
def print_detected_langauges(detected_languages: dict) -> None:
    """Print each detected language code with its confidence as a percentage.

    ``detected_languages`` is iterated, and every item is read through its
    ``language_code`` and ``confidence`` attributes.
    """
    print(" Detected languages:")
    for entry in detected_languages:
        # ':.1%' scales the confidence by 100 and keeps one decimal place.
        print(f" {entry.language_code} ({entry.confidence:.1%} confidence)")
|
||
|
||
def print_paragraphs(paragraphs: dict, text: str) -> None:
    """Print the paragraph count and the text of the first and last paragraph.

    Args:
        paragraphs: Collection of paragraph objects, each with a ``layout``;
            it is indexed positionally, despite the ``dict`` annotation.
        text: Full document text that the layouts index into.
    """
    print(f" {len(paragraphs)} paragraphs detected:")
    # An empty collection has no first/last entry; indexing it would raise
    # IndexError, so only the count is printed.
    if not paragraphs:
        return
    for label, paragraph in (("First", paragraphs[0]), ("Last", paragraphs[-1])):
        paragraph_text = layout_to_text(paragraph.layout, text)
        print(f" {label} paragraph text: {repr(paragraph_text)}")
|
||
|
||
def print_blocks(blocks: dict, text: str) -> None:
    """Print the block count and the text of the first and last block.

    Args:
        blocks: Collection of block objects, each with a ``layout``; it is
            indexed positionally, despite the ``dict`` annotation.
        text: Full document text that the layouts index into.
    """
    print(f" {len(blocks)} blocks detected:")
    # An empty collection has no first/last entry; indexing it would raise
    # IndexError, so only the count is printed.
    if not blocks:
        return
    for label, block in (("First", blocks[0]), ("Last", blocks[-1])):
        block_text = layout_to_text(block.layout, text)
        print(f" {label} text block: {repr(block_text)}")
|
||
|
||
def print_lines(lines: dict, text: str) -> None:
    """Print the line count and the text of the first and last line.

    Args:
        lines: Collection of line objects, each with a ``layout``; it is
            indexed positionally, despite the ``dict`` annotation.
        text: Full document text that the layouts index into.
    """
    print(f" {len(lines)} lines detected:")
    # An empty collection has no first/last entry; indexing it would raise
    # IndexError, so only the count is printed.
    if not lines:
        return
    for label, line in (("First", lines[0]), ("Last", lines[-1])):
        line_text = layout_to_text(line.layout, text)
        print(f" {label} line text: {repr(line_text)}")
|
||
|
||
def print_tokens(tokens: dict, text: str) -> None:
    """Print the token count plus text and break type of the first and last token.

    Args:
        tokens: Collection of token objects, each with a ``layout`` and a
            ``detected_break``; it is indexed positionally, despite the
            ``dict`` annotation.
        text: Full document text that the layouts index into.
    """
    print(f" {len(tokens)} tokens detected:")
    # An empty collection has no first/last entry; indexing it would raise
    # IndexError, so only the count is printed.
    if not tokens:
        return
    for label, token in (("First", tokens[0]), ("Last", tokens[-1])):
        token_text = layout_to_text(token.layout, text)
        # Name of the enum member describing the break detected for the token.
        token_break_type = token.detected_break.type_.name
        print(f" {label} token text: {repr(token_text)}")
        print(f" {label} token break type: {repr(token_break_type)}")
|
||
|
||
def layout_to_text(layout: dict, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document's text. This function converts
    offsets to a string.

    Args:
        layout: Object exposing ``text_anchor.text_segments``; it is accessed
            by attribute, despite the ``dict`` annotation.
        text: Full document text that the segment offsets index into.

    Returns:
        The concatenation of ``text[start_index:end_index]`` for every
        segment, in order; an empty string when there are no segments.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index):int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )
|
||
|
||
# [END documentai_process_ocr_document] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# Copyright 2020 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
import os | ||
|
||
from samples.snippets import process_document_ocr_sample | ||
|
||
# Fixture settings passed to the OCR sample by the test below.
file_path = "resources/handwritten_form.pdf"
processor_id = "91e072f8626a76b7"
location = "us"
# Read from the environment; a missing variable raises KeyError at import time.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
|
||
|
||
def test_process_documents(capsys):
    """Run the OCR sample on the handwritten-form fixture and check its stdout."""
    sample_kwargs = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "file_path": file_path,
    }
    process_document_ocr_sample.process_document_ocr_sample(**sample_kwargs)
    out = capsys.readouterr()[0]

    # NOTE(review): "en" is a very short fragment and probably matches text
    # other than the detected language code; consider a stricter check.
    for fragment in ("Page 1", "en", "FakeDoc"):
        assert fragment in out
Oops, something went wrong.