samples: new Doc AI samples for v1beta3 (#206)

* samples: new Doc AI samples for v1beta3 * disabled batch parse table test * updated poms
GoogleCloudPlatform · Oct 15, 2020 · b8b9fca · b8b9fca
1 parent 3f8be86
commit b8b9fca
Show file tree

Hide file tree

Showing 13 changed files with 655 additions and 4 deletions.
diff --git a/document-ai/snippets/pom.xml b/document-ai/snippets/pom.xml
@@ -41,6 +41,7 @@
     <dependency>
       <groupId>com.google.cloud</groupId>
       <artifactId>google-cloud-document-ai</artifactId>
+      <version>0.3.0</version>
     </dependency>
     <!-- [END documentai_install_with_bom] -->
     <dependency>

diff --git a/document-ai/snippets/resources/invoice.pdf b/document-ai/snippets/resources/invoice.pdf
diff --git a/document-ai/snippets/src/main/java/documentai/v1beta2/BatchParseFormBeta.java b/document-ai/snippets/src/main/java/documentai/v1beta2/BatchParseFormBeta.java
@@ -24,7 +24,6 @@
 import com.google.cloud.documentai.v1beta2.BatchProcessDocumentsResponse;
 import com.google.cloud.documentai.v1beta2.Document;
 import com.google.cloud.documentai.v1beta2.DocumentUnderstandingServiceClient;
-import com.google.cloud.documentai.v1beta2.DocumentUnderstandingServiceSettings;
 import com.google.cloud.documentai.v1beta2.FormExtractionParams;
 import com.google.cloud.documentai.v1beta2.GcsDestination;
 import com.google.cloud.documentai.v1beta2.GcsSource;

diff --git a/document-ai/snippets/src/main/java/documentai/v1beta3/BatchProcessDocumentBeta.java b/document-ai/snippets/src/main/java/documentai/v1beta3/BatchProcessDocumentBeta.java
@@ -0,0 +1,170 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+// [START documentai_batch_process_document]
+
+import com.google.api.gax.longrunning.OperationFuture;
+import com.google.api.gax.paging.Page;
+import com.google.api.gax.rpc.UnknownException;
+import com.google.cloud.documentai.v1beta3.BatchProcessMetadata;
+import com.google.cloud.documentai.v1beta3.BatchProcessRequest;
+import com.google.cloud.documentai.v1beta3.BatchProcessResponse;
+import com.google.cloud.documentai.v1beta3.Document;
+import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
+import com.google.cloud.storage.Blob;
+import com.google.cloud.storage.BlobId;
+import com.google.cloud.storage.Bucket;
+import com.google.cloud.storage.Storage;
+import com.google.cloud.storage.StorageOptions;
+import com.google.protobuf.util.JsonFormat;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+public class BatchProcessDocumentBeta {
+  public static void batchProcessDocument()
+      throws IOException, InterruptedException, TimeoutException, ExecutionException {
+    // TODO(developer): Replace these variables before running the sample.
+    String projectId = "your-project-id";
+    String location = "your-project-location"; // Format is "us" or "eu".
+    String processerId = "your-processor-id";
+    String outputGcsBucketName = "your-gcs-bucket-name";
+    String outputGcsPrefix = "PREFIX";
+    String inputGcsUri = "gs://your-gcs-bucket/path/to/input/file.pdf";
+    batchProcessDocument(
+        projectId, location, processerId, inputGcsUri, outputGcsBucketName, outputGcsPrefix);
+  }
+
+  public static void batchProcessDocument(
+      String projectId,
+      String location,
+      String processorId,
+      String gcsInputUri,
+      String gcsOutputBucketName,
+      String gcsOutputUriPrefix)
+      throws IOException, InterruptedException, TimeoutException, ExecutionException {
+    // Initialize client that will be used to send requests. This client only needs to be created
+    // once, and can be reused for multiple requests. After completing all of your requests, call
+    // the "close" method on the client to safely clean up any remaining background resources.
+    try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
+      // The full resource name of the processor, e.g.:
+      // projects/project-id/locations/location/processor/processor-id
+      // You must create new processors in the Cloud Console first
+      String name =
+          String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
+
+      BatchProcessRequest.BatchInputConfig batchInputConfig =
+          BatchProcessRequest.BatchInputConfig.newBuilder()
+              .setGcsSource(gcsInputUri)
+              .setMimeType("application/pdf")
+              .build();
+
+      String fullGcsPath = String.format("gs://%s/%s/", gcsOutputBucketName, gcsOutputUriPrefix);
+      BatchProcessRequest.BatchOutputConfig outputConfig =
+          BatchProcessRequest.BatchOutputConfig.newBuilder().setGcsDestination(fullGcsPath).build();
+
+      // Configure the batch process request.
+      BatchProcessRequest request =
+          BatchProcessRequest.newBuilder()
+              .setName(name)
+              .addInputConfigs(batchInputConfig)
+              .setOutputConfig(outputConfig)
+              .build();
+
+      OperationFuture<BatchProcessResponse, BatchProcessMetadata> future =
+          client.batchProcessDocumentsAsync(request);
+
+      // Batch process document using a long-running operation.
+      // You can wait for now, or get results later.
+      // Note: first request to the service takes longer than subsequent
+      // requests.
+      System.out.println("Waiting for operation to complete...");
+      future.get(120, TimeUnit.SECONDS);
+
+      System.out.println("Document processing complete.");
+
+      Storage storage = StorageOptions.newBuilder().setProjectId(projectId).build().getService();
+      Bucket bucket = storage.get(gcsOutputBucketName);
+
+      // List all of the files in the Storage bucket.
+      Page<Blob> blobs = bucket.list(Storage.BlobListOption.prefix(gcsOutputUriPrefix + "/"));
+      int idx = 0;
+      for (Blob blob : blobs.iterateAll()) {
+        if (!blob.isDirectory()) {
+          System.out.printf("Fetched file #%d\n", ++idx);
+          // Read the results
+
+          // Download and store json data in a temp file.
+          File tempFile = File.createTempFile("file", ".json");
+          Blob fileInfo = storage.get(BlobId.of(gcsOutputBucketName, blob.getName()));
+          fileInfo.downloadTo(tempFile.toPath());
+
+          // Parse json file into Document.
+          FileReader reader = new FileReader(tempFile);
+          Document.Builder builder = Document.newBuilder();
+          JsonFormat.parser().merge(reader, builder);
+
+          Document document = builder.build();
+
+          // Get all of the document text as one big string.
+          String text = document.getText();
+
+          // Read the text recognition output from the processor
+          System.out.println("The document contains the following paragraphs:");
+          Document.Page page1 = document.getPages(0);
+          List<Document.Page.Paragraph> paragraphList = page1.getParagraphsList();
+          for (Document.Page.Paragraph paragraph : paragraphList) {
+            String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
+            System.out.printf("Paragraph text:%s\n", paragraphText);
+          }
+
+          // Form parsing provides additional output about
+          // form-formatted PDFs. You  must create a form
+          // processor in the Cloud Console to see full field details.
+          System.out.println("The following form key/value pairs were detected:");
+
+          for (Document.Page.FormField field : page1.getFormFieldsList()) {
+            String fieldName = getText(field.getFieldName().getTextAnchor(), text);
+            String fieldValue = getText(field.getFieldValue().getTextAnchor(), text);
+
+            System.out.println("Extracted form fields pair:");
+            System.out.printf("\t(%s, %s))", fieldName, fieldValue);
+          }
+
+          // Clean up temp file.
+          tempFile.deleteOnExit();
+        }
+      }
+    }
+  }
+
+  // Extract shards from the text field
+  private static String getText(Document.TextAnchor textAnchor, String text) {
+    if (textAnchor.getTextSegmentsList().size() > 0) {
+      int startIdx = (int) textAnchor.getTextSegments(0).getStartIndex();
+      int endIdx = (int) textAnchor.getTextSegments(0).getEndIndex();
+      return text.substring(startIdx, endIdx);
+    }
+    return "[NO TEXT]";
+  }
+}
+// [END documentai_batch_process_document]
diff --git a/document-ai/snippets/src/main/java/documentai/v1beta3/ProcessDocumentBeta.java b/document-ai/snippets/src/main/java/documentai/v1beta3/ProcessDocumentBeta.java
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+// [START documentai_process_document]
+
+import com.google.cloud.documentai.v1beta3.Document;
+import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
+import com.google.cloud.documentai.v1beta3.ProcessRequest;
+import com.google.cloud.documentai.v1beta3.ProcessResponse;
+import com.google.protobuf.ByteString;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
+/**
+ * Sample that sends a local PDF to a Document AI processor in a single synchronous request and
+ * prints the paragraphs and form fields found in the response.
+ */
+public class ProcessDocumentBeta {
+  /**
+   * Runs the sample with placeholder values. Replace the placeholders before running.
+   *
+   * @throws IOException if the client cannot be created or the input file cannot be read
+   */
+  public static void processDocument()
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // TODO(developer): Replace these variables before running the sample.
+    String projectId = "your-project-id";
+    String location = "your-project-location"; // Format is "us" or "eu".
+    String processorId = "your-processor-id";
+    String filePath = "path/to/input/file.pdf";
+    processDocument(projectId, location, processorId, filePath);
+  }
+
+  /**
+   * Processes one local PDF and prints the recognized paragraphs and form key/value pairs.
+   *
+   * @param projectId project that owns the processor
+   * @param location processor location, "us" or "eu"
+   * @param processorId id of a processor that was already created in the Cloud Console
+   * @param filePath path of the PDF file to upload
+   * @throws IOException if the client cannot be created or the input file cannot be read
+   */
+  public static void processDocument(
+      String projectId, String location, String processorId, String filePath)
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // Initialize client that will be used to send requests. This client only needs to be created
+    // once, and can be reused for multiple requests. After completing all of your requests, call
+    // the "close" method on the client to safely clean up any remaining background resources.
+    try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
+      // The full resource name of the processor, e.g.:
+      // projects/project-id/locations/location/processors/processor-id
+      // You must create new processors in the Cloud Console first
+      String name =
+          String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
+
+      // Read the file.
+      byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));
+
+      // Wrap the raw file bytes in a ByteString for the request.
+      ByteString content = ByteString.copyFrom(imageFileData);
+
+      Document document =
+          Document.newBuilder().setContent(content).setMimeType("application/pdf").build();
+
+      // Configure the process request.
+      ProcessRequest request =
+          ProcessRequest.newBuilder().setName(name).setDocument(document).build();
+
+      // Recognizes text entities in the PDF document
+      ProcessResponse result = client.processDocument(request);
+      Document documentResponse = result.getDocument();
+
+      // Get all of the document text as one big string
+      String text = documentResponse.getText();
+
+      // Read the text recognition output from the processor. Iterating the page list (rather
+      // than reading index 0) also covers responses with no pages or with several pages.
+      System.out.println("The document contains the following paragraphs:");
+      for (Document.Page page : documentResponse.getPagesList()) {
+        List<Document.Page.Paragraph> paragraphs = page.getParagraphsList();
+        for (Document.Page.Paragraph paragraph : paragraphs) {
+          String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
+          System.out.printf("Paragraph text:\n%s\n", paragraphText);
+        }
+      }
+
+      // Form parsing provides additional output about
+      // form-formatted PDFs. You must create a form
+      // processor in the Cloud Console to see full field details.
+      System.out.println("The following form key/value pairs were detected:");
+
+      for (Document.Page page : documentResponse.getPagesList()) {
+        for (Document.Page.FormField field : page.getFormFieldsList()) {
+          String fieldName = getText(field.getFieldName().getTextAnchor(), text);
+          String fieldValue = getText(field.getFieldValue().getTextAnchor(), text);
+
+          System.out.println("Extracted form fields pair:");
+          System.out.printf("\t(%s, %s))\n", fieldName, fieldValue);
+        }
+      }
+    }
+  }
+
+  /**
+   * Returns the part of {@code text} that {@code textAnchor} points at.
+   *
+   * <p>An anchor may reference several segments of the document text; they are concatenated in
+   * the order they are listed.
+   *
+   * @param textAnchor anchor whose segments hold start/end offsets into {@code text}
+   * @param text full document text
+   * @return the referenced text, or "[NO TEXT]" when the anchor has no segments
+   */
+  private static String getText(Document.TextAnchor textAnchor, String text) {
+    if (textAnchor.getTextSegmentsList().isEmpty()) {
+      return "[NO TEXT]";
+    }
+    StringBuilder result = new StringBuilder();
+    for (Document.TextAnchor.TextSegment segment : textAnchor.getTextSegmentsList()) {
+      int startIdx = (int) segment.getStartIndex();
+      int endIdx = (int) segment.getEndIndex();
+      result.append(text, startIdx, endIdx);
+    }
+    return result.toString();
+  }
+}
+// [END documentai_process_document]
diff --git a/document-ai/snippets/src/main/java/documentai/v1beta3/QuickStart.java b/document-ai/snippets/src/main/java/documentai/v1beta3/QuickStart.java
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2020 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package documentai.v1beta3;
+
+// [START documentai_quickstart]
+
+import com.google.cloud.documentai.v1beta3.Document;
+import com.google.cloud.documentai.v1beta3.DocumentProcessorServiceClient;
+import com.google.cloud.documentai.v1beta3.ProcessRequest;
+import com.google.cloud.documentai.v1beta3.ProcessResponse;
+import com.google.protobuf.ByteString;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.List;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
+/**
+ * Quickstart sample that sends a local PDF to a Document AI processor and prints the paragraphs
+ * found in the response.
+ */
+public class QuickStart {
+  /**
+   * Runs the quickstart with placeholder values. Replace the placeholders before running.
+   *
+   * @throws IOException if the client cannot be created or the input file cannot be read
+   */
+  public static void quickStart()
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // TODO(developer): Replace these variables before running the sample.
+    String projectId = "your-project-id";
+    String location = "your-project-location"; // Format is "us" or "eu".
+    String processorId = "your-processor-id";
+    String filePath = "path/to/input/file.pdf";
+    quickStart(projectId, location, processorId, filePath);
+  }
+
+  /**
+   * Processes one local PDF and prints the recognized paragraphs.
+   *
+   * @param projectId project that owns the processor
+   * @param location processor location, "us" or "eu"
+   * @param processorId id of a processor that was already created in the Cloud Console
+   * @param filePath path of the PDF file to upload
+   * @throws IOException if the client cannot be created or the input file cannot be read
+   */
+  public static void quickStart(
+      String projectId, String location, String processorId, String filePath)
+      throws IOException, InterruptedException, ExecutionException, TimeoutException {
+    // Initialize client that will be used to send requests. This client only needs to be created
+    // once, and can be reused for multiple requests. After completing all of your requests, call
+    // the "close" method on the client to safely clean up any remaining background resources.
+    try (DocumentProcessorServiceClient client = DocumentProcessorServiceClient.create()) {
+      // The full resource name of the processor, e.g.:
+      // projects/project-id/locations/location/processors/processor-id
+      // You must create new processors in the Cloud Console first
+      String name =
+          String.format("projects/%s/locations/%s/processors/%s", projectId, location, processorId);
+
+      // Read the file.
+      byte[] imageFileData = Files.readAllBytes(Paths.get(filePath));
+
+      // Wrap the raw file bytes in a ByteString for the request.
+      ByteString content = ByteString.copyFrom(imageFileData);
+
+      Document document =
+          Document.newBuilder().setContent(content).setMimeType("application/pdf").build();
+
+      // Configure the process request.
+      ProcessRequest request =
+          ProcessRequest.newBuilder().setName(name).setDocument(document).build();
+
+      // Recognizes text entities in the PDF document
+      ProcessResponse result = client.processDocument(request);
+      Document documentResponse = result.getDocument();
+
+      // Get all of the document text as one big string
+      String text = documentResponse.getText();
+
+      // Read the text recognition output from the processor. Iterating the page list (rather
+      // than reading index 0) also covers responses with no pages or with several pages.
+      System.out.println("The document contains the following paragraphs:");
+      for (Document.Page page : documentResponse.getPagesList()) {
+        List<Document.Page.Paragraph> paragraphs = page.getParagraphsList();
+        for (Document.Page.Paragraph paragraph : paragraphs) {
+          String paragraphText = getText(paragraph.getLayout().getTextAnchor(), text);
+          System.out.printf("Paragraph text:\n%s\n", paragraphText);
+        }
+      }
+    }
+  }
+
+  /**
+   * Returns the part of {@code text} that {@code textAnchor} points at.
+   *
+   * <p>An anchor may reference several segments of the document text; they are concatenated in
+   * the order they are listed.
+   *
+   * @param textAnchor anchor whose segments hold start/end offsets into {@code text}
+   * @param text full document text
+   * @return the referenced text, or "[NO TEXT]" when the anchor has no segments
+   */
+  private static String getText(Document.TextAnchor textAnchor, String text) {
+    if (textAnchor.getTextSegmentsList().isEmpty()) {
+      return "[NO TEXT]";
+    }
+    StringBuilder result = new StringBuilder();
+    for (Document.TextAnchor.TextSegment segment : textAnchor.getTextSegmentsList()) {
+      int startIdx = (int) segment.getStartIndex();
+      int endIdx = (int) segment.getEndIndex();
+      result.append(text, startIdx, endIdx);
+    }
+    return result.toString();
+  }
+}
+// [END documentai_quickstart]