docs(samples): new Doc AI samples for v1beta3 (#101)

* docs(samples): new Doc AI samples for v1beta3 * feat: add a new version to the library * fix: adds processor ID as literal in tests * fix: removed apiEndpoint from client instantiation Co-authored-by: Sofia Leon <[email protected]> Co-authored-by: sofisl <[email protected]> Co-authored-by: Justin Beckwith <[email protected]>
GoogleCloudPlatform · Nov 14, 2022 · d8678f2 · d8678f2
1 parent be4aacd
commit d8678f2
Show file tree

Hide file tree

Showing 9 changed files with 435 additions and 34 deletions.
diff --git a/document-ai/.eslintrc.yml b/document-ai/.eslintrc.yml
@@ -1,6 +1,4 @@
 ---
 rules:
   no-console: off
-  node/no-missing-require: off
-  node/no-extraneous-require: off
-
+  node/no-unsupported-features/node-builtins: off
diff --git a/document-ai/batch-process-document.v1beta3.js b/document-ai/batch-process-document.v1beta3.js
@@ -0,0 +1,145 @@
+/**
+ * Copyright 2020 Google LLC
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+const uuid = require('uuid');
+
+async function main(
+  projectId = 'YOUR_PROJECT_ID',
+  location = 'YOUR_PROJECT_LOCATION',
+  processorId = 'YOUR_PROCESSOR_ID', // Create this in the Cloud Console
+  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf',
+  gcsOutputUri = 'output-bucket',
+  gcsOutputUriPrefix = uuid.v4()
+) {
+  // [START documentai_batch_process_document]
+  /**
+   * TODO(developer): Uncomment these variables before running the sample.
+   */
+  // const projectId = 'YOUR_PROJECT_ID';
+  // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
+  // const processorId = 'YOUR_PROCESSOR_ID';
+  // const gcsInputUri = 'YOUR_SOURCE_PDF';
+  // const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
+  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';
+
+  // Imports the Google Cloud client library
+  const {
+    DocumentProcessorServiceClient,
+  } = require('@google-cloud/documentai').v1beta3;
+  const {Storage} = require('@google-cloud/storage');
+
+  // Instantiates Document AI, Storage clients
+  const client = new DocumentProcessorServiceClient();
+  const storage = new Storage();
+
+  const {default: PQueue} = require('p-queue');
+
+  async function batchProcessDocument() {
+    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;
+
+    // Configure the batch process request.
+    const request = {
+      name,
+      inputConfigs: [
+        {
+          gcsSource: gcsInputUri,
+          mimeType: 'application/pdf',
+        },
+      ],
+      outputConfig: {
+        gcsDestination: `${gcsOutputUri}/${gcsOutputUriPrefix}/`,
+      },
+    };
+
+    // Batch process document using a long-running operation.
+    // You can wait for now, or get results later.
+    // Note: first request to the service takes longer than subsequent
+    // requests.
+    const [operation] = await client.batchProcessDocuments(request);
+
+    // Wait for operation to complete.
+    await operation.promise();
+
+    console.log('Document processing complete.');
+
+    // Query Storage bucket for the results file(s).
+    const query = {
+      prefix: gcsOutputUriPrefix,
+    };
+
+    console.log('Fetching results ...');
+
+    // List all of the files in the Storage bucket
+    const [files] = await storage.bucket(gcsOutputUri).getFiles(query);
+
+    // Add all asynchronous downloads to queue for execution.
+    const queue = new PQueue({concurrency: 15});
+    const tasks = files.map((fileInfo, index) => async () => {
+      // Get the file as a buffer
+      const [file] = await fileInfo.download();
+
+      console.log(`Fetched file #${index + 1}:`);
+
+      // The results stored in the output Storage location
+      // are formatted as a document object.
+      const document = JSON.parse(file.toString());
+      const {text} = document;
+
+      // Extract shards from the text field
+      const getText = textAnchor => {
+        if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) {
+          return '';
+        }
+
+        // First shard in document doesn't have startIndex property
+        const startIndex = textAnchor.textSegments[0].startIndex || 0;
+        const endIndex = textAnchor.textSegments[0].endIndex;
+
+        return text.substring(startIndex, endIndex);
+      };
+
+      // Read the text recognition output from the processor
+      console.log('The document contains the following paragraphs:');
+
+      const [page1] = document.pages;
+      const {paragraphs} = page1;
+      for (const paragraph of paragraphs) {
+        const paragraphText = getText(paragraph.layout.textAnchor);
+        console.log(`Paragraph text:\n${paragraphText}`);
+      }
+
+      // Form parsing provides additional output about
+      // form-formatted PDFs. You  must create a form
+      // processor in the Cloud Console to see full field details.
+      console.log('\nThe following form key/value pairs were detected:');
+
+      const {formFields} = page1;
+      for (const field of formFields) {
+        const fieldName = getText(field.fieldName.textAnchor);
+        const fieldValue = getText(field.fieldValue.textAnchor);
+
+        console.log('Extracted key value pair:');
+        console.log(`\t(${fieldName}, ${fieldValue})`);
+      }
+    });
+    await queue.addAll(tasks);
+  }
+  // [END documentai_batch_process_document]
+
+  batchProcessDocument();
+}
+main(...process.argv.slice(2));
diff --git a/document-ai/package.json b/document-ai/package.json
@@ -4,7 +4,7 @@
   "license": "Apache-2.0",
   "author": "Google LLC",
   "engines": {
-    "node": ">=8"
+    "node": ">= 10.17.0"
   },
   "files": [
     "*.js"
@@ -14,7 +14,9 @@
   },
   "dependencies": {
     "@google-cloud/documentai": "^2.1.1",
-    "@google-cloud/storage": "^5.0.0"
+    "@google-cloud/storage": "^5.0.0",
+    "p-queue": "^6.6.2",
+    "uuid": "^8.3.1"
   },
   "devDependencies": {
     "chai": "^4.2.0",

diff --git a/document-ai/process-document.v1beta3.js b/document-ai/process-document.v1beta3.js
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2020, Google, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+async function main(projectId, location, processorId, filePath) {
+  // [START documentai_process_document]
+  /**
+   * TODO(developer): Uncomment these variables before running the sample.
+   */
+  // const projectId = 'YOUR_PROJECT_ID';
+  // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
+  // const processor = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console
+  // const filePath = '/path/to/local/pdf';
+
+  const {
+    DocumentProcessorServiceClient,
+  } = require('@google-cloud/documentai').v1beta3;
+
+  // Instantiates a client
+  const client = new DocumentProcessorServiceClient();
+
+  async function processDocument() {
+    // The full resource name of the processor, e.g.:
+    // projects/project-id/locations/location/processor/processor-id
+    // You must create new processors in the Cloud Console first
+    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;
+
+    // Read the file into memory.
+    const fs = require('fs').promises;
+    const imageFile = await fs.readFile(filePath);
+
+    // Convert the image data to a Buffer and base64 encode it.
+    const encodedImage = Buffer.from(imageFile).toString('base64');
+
+    const request = {
+      name,
+      document: {
+        content: encodedImage,
+        mimeType: 'application/pdf',
+      },
+    };
+
+    // Recognizes text entities in the PDF document
+    const [result] = await client.processDocument(request);
+    const {document} = result;
+
+    // Get all of the document text as one big string
+    const {text} = document;
+
+    // Extract shards from the text field
+    const getText = textAnchor => {
+      if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) {
+        return '';
+      }
+
+      // First shard in document doesn't have startIndex property
+      const startIndex = textAnchor.textSegments[0].startIndex || 0;
+      const endIndex = textAnchor.textSegments[0].endIndex;
+
+      return text.substring(startIndex, endIndex);
+    };
+
+    // Read the text recognition output from the processor
+    console.log('The document contains the following paragraphs:');
+    const [page1] = document.pages;
+    const {paragraphs} = page1;
+
+    for (const paragraph of paragraphs) {
+      const paragraphText = getText(paragraph.layout.textAnchor);
+      console.log(`Paragraph text:\n${paragraphText}`);
+    }
+
+    // Form parsing provides additional output about
+    // form-formatted PDFs. You  must create a form
+    // processor in the Cloud Console to see full field details.
+    console.log('\nThe following form key/value pairs were detected:');
+
+    const {formFields} = page1;
+    for (const field of formFields) {
+      const fieldName = getText(field.fieldName.textAnchor);
+      const fieldValue = getText(field.fieldValue.textAnchor);
+
+      console.log('Extracted key value pair:');
+      console.log(`\t(${fieldName}, ${fieldValue})`);
+    }
+  }
+  // [END documentai_process_document]
+  await processDocument();
+}
+
+main(...process.argv.slice(2)).catch(err => {
+  console.error(err);
+  process.exitCode = 1;
+});
diff --git a/document-ai/quickstart.js b/document-ai/quickstart.js
@@ -15,56 +15,72 @@
 
 'use strict';
 
-async function main(
-  projectId,
-  location,
-  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
-) {
+async function main(projectId, location, processorId, filePath) {
   // [START documentai_quickstart]
   /**
    * TODO(developer): Uncomment these variables before running the sample.
    */
   // const projectId = 'YOUR_PROJECT_ID';
   // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
-  // const gcsInputUri = 'YOUR_SOURCE_PDF';
+  // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console
+  // const filePath = '/path/to/local/pdf';
 
   const {
-    DocumentUnderstandingServiceClient,
-  } = require('@google-cloud/documentai').v1beta2;
-  const client = new DocumentUnderstandingServiceClient();
+    DocumentProcessorServiceClient,
+  } = require('@google-cloud/documentai').v1beta3;
+
+  // Instantiates a client
+  const client = new DocumentProcessorServiceClient();
 
   async function quickstart() {
-    // Configure the request for processing the PDF
-    const parent = `projects/${projectId}/locations/${location}`;
+    // The full resource name of the processor, e.g.:
+    // projects/project-id/locations/location/processors/processor-id
+    // You must create new processors in the Cloud Console first
+    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;
+
+    // Read the file into memory.
+    const fs = require('fs').promises;
+    const imageFile = await fs.readFile(filePath);
+
+    // Convert the image data to a Buffer and base64 encode it.
+    const encodedImage = Buffer.from(imageFile).toString('base64');
+
     const request = {
-      parent,
-      inputConfig: {
-        gcsSource: {
-          uri: gcsInputUri,
-        },
+      name,
+      document: {
+        content: encodedImage,
         mimeType: 'application/pdf',
       },
     };
 
     // Recognizes text entities in the PDF document
     const [result] = await client.processDocument(request);
+    const {document} = result;
 
     // Get all of the document text as one big string
-    const {text} = result;
+    const {text} = document;
 
     // Extract shards from the text field
-    function extractText(textAnchor) {
+    const getText = textAnchor => {
+      if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) {
+        return '';
+      }
+
       // First shard in document doesn't have startIndex property
       const startIndex = textAnchor.textSegments[0].startIndex || 0;
       const endIndex = textAnchor.textSegments[0].endIndex;
 
       return text.substring(startIndex, endIndex);
-    }
+    };
+
+    // Read the text recognition output from the processor
+    console.log('The document contains the following paragraphs:');
+    const [page1] = document.pages;
+    const {paragraphs} = page1;
 
-    for (const entity of result.entities) {
-      console.log(`\nEntity text: ${extractText(entity.textAnchor)}`);
-      console.log(`Entity type: ${entity.type}`);
-      console.log(`Entity mention text: ${entity.mentionText}`);
+    for (const paragraph of paragraphs) {
+      const paragraphText = getText(paragraph.layout.textAnchor);
+      console.log(`Paragraph text:\n${paragraphText}`);
     }
   }
   // [END documentai_quickstart]

diff --git a/document-ai/resources/invoice.pdf b/document-ai/resources/invoice.pdf