Skip to content

Commit

Permalink
samples: new Doc AI samples for v1beta3 (#206)
Browse files Browse the repository at this point in the history
* samples: new Doc AI samples for v1beta3

* disabled batch parsetable test

* updated poms
  • Loading branch information
munkhuushmgl authored Oct 15, 2020
1 parent 3f8be86 commit b8b9fca
Show file tree
Hide file tree
Showing 13 changed files with 655 additions and 4 deletions.
1 change: 1 addition & 0 deletions document-ai/snippets/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
<dependency>
<groupId>com.google.cloud</groupId>
<artifactId>google-cloud-document-ai</artifactId>
<version>0.3.0</version>
</dependency>
<!-- [END documentai_install_with_bom] -->
<dependency>
Expand Down
Binary file added document-ai/snippets/resources/invoice.pdf
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
import com.google.cloud.documentai.v1beta2.BatchProcessDocumentsResponse;
import com.google.cloud.documentai.v1beta2.Document;
import com.google.cloud.documentai.v1beta2.DocumentUnderstandingServiceClient;
import com.google.cloud.documentai.v1beta2.DocumentUnderstandingServiceSettings;
import com.google.cloud.documentai.v1beta2.FormExtractionParams;
import com.google.cloud.documentai.v1beta2.GcsDestination;
import com.google.cloud.documentai.v1beta2.GcsSource;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
/*
* Copyright 2020 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package documentai.v1beta3;

// [START documentai_batch_process_document]

import com.google.api.gax.longrunning.OperationFuture;
import com.google.api.gax.paging.Page;
import com.google.api.gax.rpc.UnknownException;
import com.google.cloud.documentai.v1beta3.BatchProcessMetadata;
import com.google.cloud.documentai.v1beta3.BatchProcessRequest;
import com.google.cloud.documentai.v1beta3.BatchProcessResponse;
import com.google.cloud.documentai.v1beta3.Document;
import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
import com.google.cloud.storage.Blob;
import com.google.cloud.storage.BlobId;
import com.google.cloud.storage.Bucket;
import com.google.cloud.storage.Storage;
import com.google.cloud.storage.StorageOptions;
import com.google.protobuf.util.JsonFormat;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

public class BatchProcessDocumentBeta {
public static void batchProcessDocument()
throws IOException, InterruptedException, TimeoutException, ExecutionException {
// TODO(developer): Replace these variables before running the sample.
String projectId = "your-project-id";
String location = "your-project-location"; // Format is "us" or "eu".
String processerId = "your-processor-id";
String outputGcsBucketName = "your-gcs-bucket-name";
String outputGcsPrefix = "PREFIX";
String inputGcsUri = "gs://your-gcs-bucket/path/to/input/file.pdf";
batchProcessDocument(
projectId, location, processerId, inputGcsUri, outputGcsBucketName, outputGcsPrefix);
}

public static void batchProcessDocument(
String projectId,
String location,
String processorId,
String gcsInputUri,
String gcsOutputBucketName,
String gcsOutputUriPrefix)
throws IOException, InterruptedException, TimeoutException, ExecutionException {
// Initialize client that will be used to send requests. This client only needs to be created
// once, and can be reused for multiple requests. After completing all of your requests, call
// the "close" method on the client to safely clean up any remaining background resources.
try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
// The full resource name of the processor, e.g.:
// projects/project-id/locations/location/processor/processor-id
// You must create new processors in the Cloud Console first
String name =
String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);

BatchProcessRequest.BatchInputConfig batchInputConfig =
BatchProcessRequest.BatchInputConfig.newBuilder()
.setGcsSource(gcsInputUri)
.setMimeType("application/pdf")
.build();

String fullGcsPath = String.format("gs://%s/%s/", gcsOutputBucketName, gcsOutputUriPrefix);
BatchProcessRequest.BatchOutputConfig outputConfig =
BatchProcessRequest.BatchOutputConfig.newBuilder().setGcsDestination(fullGcsPath).build();

// Configure the batch process request.
BatchProcessRequest request =
BatchProcessRequest.newBuilder()
.setName(name)
.addInputConfigs(batchInputConfig)
.setOutputConfig(outputConfig)
.build();

OperationFuture<BatchProcessResponse, BatchProcessMetadata> future =
client.batchProcessDocumentsAsync(request);

// Batch process document using a long-running operation.
// You can wait for now, or get results later.
// Note: first request to the service takes longer than subsequent
// requests.
System.out.println("Waiting for operation to complete...");
future.get(120, TimeUnit.SECONDS);

System.out.println("Document processing complete.");

Storage storage = StorageOptions.newBuilder().setProjectId(projectId).build().getService();
Bucket bucket = storage.get(gcsOutputBucketName);

// List all of the files in the Storage bucket.
Page<Blob> blobs = bucket.list(Storage.BlobListOption.prefix(gcsOutputUriPrefix + "/"));
int idx = 0;
for (Blob blob : blobs.iterateAll()) {
if (!blob.isDirectory()) {
System.out.printf("Fetched file #%d\n", ++idx);
// Read the results

// Download and store json data in a temp file.
File tempFile = File.createTempFile("file", ".json");
Blob fileInfo = storage.get(BlobId.of(gcsOutputBucketName, blob.getName()));
fileInfo.downloadTo(tempFile.toPath());

// Parse json file into Document.
FileReader reader = new FileReader(tempFile);
Document.Builder builder = Document.newBuilder();
JsonFormat.parser().merge(reader, builder);

Document document = builder.build();

// Get all of the document text as one big string.
String text = document.getText();

// Read the text recognition output from the processor
System.out.println("The document contains the following paragraphs:");
Document.Page page1 = document.getPages(0);
List<Document.Page.Paragraph> paragraphList = page1.getParagraphsList();
for (Document.Page.Paragraph paragraph : paragraphList) {
String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
System.out.printf("Paragraph text:%s\n", paragraphText);
}

// Form parsing provides additional output about
// form-formatted PDFs. You must create a form
// processor in the Cloud Console to see full field details.
System.out.println("The following form key/value pairs were detected:");

for (Document.Page.FormField field : page1.getFormFieldsList()) {
String fieldName = getText(field.getFieldName().getTextAnchor(), text);
String fieldValue = getText(field.getFieldValue().getTextAnchor(), text);

System.out.println("Extracted form fields pair:");
System.out.printf("\t(%s, %s))", fieldName, fieldValue);
}

// Clean up temp file.
tempFile.deleteOnExit();
}
}
}
}

// Extract shards from the text field
private static String getText(Document.TextAnchor textAnchor, String text) {
if (textAnchor.getTextSegmentsList().size() > 0) {
int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
return text.substring(startIdx, endIdx);
}
return "[NO TEXT]";
}
}
// [END documentai_batch_process_document]
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*
* Copyright 2020 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package documentai.v1beta3;

// [START documentai_process_document]

import com.google.cloud.documentai.v1beta3.Document;
import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
import com.google.cloud.documentai.v1beta3.ProcessRequest;
import com.google.cloud.documentai.v1beta3.ProcessResponse;
import com.google.protobuf.ByteString;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;

public class ProcessDocumentBeta {
public static void processDocument()
throws IOException, InterruptedException, ExecutionException, TimeoutException {
// TODO(developer): Replace these variables before running the sample.
String projectId = "your-project-id";
String location = "your-project-location"; // Format is "us" or "eu".
String processerId = "your-processor-id";
String filePath = "path/to/input/file.pdf";
processDocument(projectId, location, processerId, filePath);
}

public static void processDocument(
String projectId, String location, String processorId, String filePath)
throws IOException, InterruptedException, ExecutionException, TimeoutException {
// Initialize client that will be used to send requests. This client only needs to be created
// once, and can be reused for multiple requests. After completing all of your requests, call
// the "close" method on the client to safely clean up any remaining background resources.
try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
// The full resource name of the processor, e.g.:
// projects/project-id/locations/location/processor/processor-id
// You must create new processors in the Cloud Console first
String name =
String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);

// Read the file.
byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));

// Convert the image data to a Buffer and base64 encode it.
ByteString content = ByteString.copyFrom(imageFileData);

Document document =
Document.newBuilder().setContent(content).setMimeType("application/pdf").build();

// Configure the process request.
ProcessRequest request =
ProcessRequest.newBuilder().setName(name).setDocument(document).build();

// Recognizes text entities in the PDF document
ProcessResponse result = client.processDocument(request);
Document documentResponse = result.getDocument();

// Get all of the document text as one big string
String text = documentResponse.getText();

// Read the text recognition output from the processor
System.out.println("The document contains the following paragraphs:");
Document.Page firstPage = documentResponse.getPages(0);
List<Document.Page.Paragraph> paragraphs = firstPage.getParagraphsList();

for (Document.Page.Paragraph paragraph : paragraphs) {
String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
System.out.printf("Paragraph text:\n%s\n", paragraphText);
}

// Form parsing provides additional output about
// form-formatted PDFs. You must create a form
// processor in the Cloud Console to see full field details.
System.out.println("The following form key/value pairs were detected:");

for (Document.Page.FormField field : firstPage.getFormFieldsList()) {
String fieldName = getText(field.getFieldName().getTextAnchor(), text);
String fieldValue = getText(field.getFieldValue().getTextAnchor(), text);

System.out.println("Extracted form fields pair:");
System.out.printf("\t(%s, %s))\n", fieldName, fieldValue);
}
}
}

// Extract shards from the text field
private static String getText(Document.TextAnchor textAnchor, String text) {
if (textAnchor.getTextSegmentsList().size() > 0) {
int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
return text.substring(startIdx, endIdx);
}
return "[NO TEXT]";
}
}
// [END documentai_process_document]
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*
* Copyright 2020 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package documentai.v1beta3;

// [START documentai_quickstart]

import com.google.cloud.documentai.v1beta3.Document;
import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
import com.google.cloud.documentai.v1beta3.ProcessRequest;
import com.google.cloud.documentai.v1beta3.ProcessResponse;
import com.google.protobuf.ByteString;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;

public class QuickStart {
public static void quickStart()
throws IOException, InterruptedException, ExecutionException, TimeoutException {
// TODO(developer): Replace these variables before running the sample.
String projectId = "your-project-id";
String location = "your-project-location"; // Format is "us" or "eu".
String processerId = "your-processor-id";
String filePath = "path/to/input/file.pdf";
quickStart(projectId, location, processerId, filePath);
}

public static void quickStart(
String projectId, String location, String processorId, String filePath)
throws IOException, InterruptedException, ExecutionException, TimeoutException {
// Initialize client that will be used to send requests. This client only needs to be created
// once, and can be reused for multiple requests. After completing all of your requests, call
// the "close" method on the client to safely clean up any remaining background resources.
try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
// The full resource name of the processor, e.g.:
// projects/project-id/locations/location/processor/processor-id
// You must create new processors in the Cloud Console first
String name =
String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);

// Read the file.
byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));

// Convert the image data to a Buffer and base64 encode it.
ByteString content = ByteString.copyFrom(imageFileData);

Document document =
Document.newBuilder().setContent(content).setMimeType("application/pdf").build();

// Configure the process request.
ProcessRequest request =
ProcessRequest.newBuilder().setName(name).setDocument(document).build();

// Recognizes text entities in the PDF document
ProcessResponse result = client.processDocument(request);
Document documentResponse = result.getDocument();

// Get all of the document text as one big string
String text = documentResponse.getText();

// Read the text recognition output from the processor
System.out.println("The document contains the following paragraphs:");
Document.Page firstPage = documentResponse.getPages(0);
List<Document.Page.Paragraph> paragraphs = firstPage.getParagraphsList();

for (Document.Page.Paragraph paragraph : paragraphs) {
String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
System.out.printf("Paragraph text:\n%s\n", paragraphText);
}
}
}

// Extract shards from the text field
private static String getText(Document.TextAnchor textAnchor, String text) {
if (textAnchor.getTextSegmentsList().size() > 0) {
int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
return text.substring(startIdx, endIdx);
}
return "[NO TEXT]";
}
}
// [END documentai_quickstart]
Loading

0 comments on commit b8b9fca

Please sign in to comment.