diff --git a/document-ai/.eslintrc.yml b/document-ai/.eslintrc.yml
index eef2752ae8..98634adbef 100644
--- a/document-ai/.eslintrc.yml
+++ b/document-ai/.eslintrc.yml
@@ -1,6 +1,4 @@
 ---
 rules:
   no-console: off
-  node/no-missing-require: off
-  node/no-extraneous-require: off
-  
\ No newline at end of file
+  node/no-unsupported-features/node-builtins: off
diff --git a/document-ai/batch-process-document.v1beta3.js b/document-ai/batch-process-document.v1beta3.js
new file mode 100644
index 0000000000..75921524d0
--- /dev/null
+++ b/document-ai/batch-process-document.v1beta3.js
@@ -0,0 +1,163 @@
+/**
+ * Copyright 2020 Google LLC
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+const uuid = require('uuid');
+
+/**
+ * Batch-processes a PDF stored in Cloud Storage with a Document AI processor,
+ * then downloads the JSON results and prints the paragraphs and form fields
+ * found on the first page of each result file.
+ *
+ * @param {string} projectId Google Cloud project ID.
+ * @param {string} location Processor location, 'us' or 'eu'.
+ * @param {string} processorId ID of a processor created in the Cloud Console.
+ * @param {string} gcsInputUri Cloud Storage URI of the source PDF.
+ * @param {string} gcsOutputUri Bucket that receives the result files.
+ * @param {string} gcsOutputUriPrefix Object prefix for the result files.
+ */
+async function main(
+  projectId = 'YOUR_PROJECT_ID',
+  location = 'YOUR_PROJECT_LOCATION',
+  processorId = 'YOUR_PROCESSOR_ID', // Create this in the Cloud Console
+  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf',
+  gcsOutputUri = 'output-bucket',
+  gcsOutputUriPrefix = uuid.v4()
+) {
+  // [START documentai_batch_process_document]
+  /**
+   * TODO(developer): Uncomment these variables before running the sample.
+   */
+  // const projectId = 'YOUR_PROJECT_ID';
+  // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
+  // const processorId = 'YOUR_PROCESSOR_ID';
+  // const gcsInputUri = 'YOUR_SOURCE_PDF';
+  // const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
+  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';
+
+  // Imports the Google Cloud client library
+  const {
+    DocumentProcessorServiceClient,
+  } = require('@google-cloud/documentai').v1beta3;
+  const {Storage} = require('@google-cloud/storage');
+
+  // Instantiates Document AI, Storage clients
+  const client = new DocumentProcessorServiceClient();
+  const storage = new Storage();
+
+  const {default: PQueue} = require('p-queue');
+
+  async function batchProcessDocument() {
+    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;
+
+    // Configure the batch process request.
+    const request = {
+      name,
+      inputConfigs: [
+        {
+          gcsSource: gcsInputUri,
+          mimeType: 'application/pdf',
+        },
+      ],
+      outputConfig: {
+        gcsDestination: `${gcsOutputUri}/${gcsOutputUriPrefix}/`,
+      },
+    };
+
+    // Batch process document using a long-running operation.
+    // You can wait for now, or get results later.
+    // Note: first request to the service takes longer than subsequent
+    // requests.
+    const [operation] = await client.batchProcessDocuments(request);
+
+    // Wait for operation to complete.
+    await operation.promise();
+
+    console.log('Document processing complete.');
+
+    // Query Storage bucket for the results file(s).
+    const query = {
+      prefix: gcsOutputUriPrefix,
+    };
+
+    console.log('Fetching results ...');
+
+    // List all of the files in the Storage bucket
+    const [files] = await storage.bucket(gcsOutputUri).getFiles(query);
+
+    // Add all asynchronous downloads to queue for execution.
+    const queue = new PQueue({concurrency: 15});
+    const tasks = files.map((fileInfo, index) => async () => {
+      // Get the file as a buffer
+      const [file] = await fileInfo.download();
+
+      console.log(`Fetched file #${index + 1}:`);
+
+      // The results stored in the output Storage location
+      // are formatted as a document object.
+      const document = JSON.parse(file.toString());
+      const {text} = document;
+
+      // Extract shards from the text field
+      const getText = textAnchor => {
+        if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) {
+          return '';
+        }
+
+        // First shard in document doesn't have startIndex property
+        const startIndex = textAnchor.textSegments[0].startIndex || 0;
+        const endIndex = textAnchor.textSegments[0].endIndex;
+
+        return text.substring(startIndex, endIndex);
+      };
+
+      // Read the text recognition output from the processor
+      console.log('The document contains the following paragraphs:');
+
+      const [page1] = document.pages;
+      const {paragraphs} = page1;
+      for (const paragraph of paragraphs) {
+        const paragraphText = getText(paragraph.layout.textAnchor);
+        console.log(`Paragraph text:\n${paragraphText}`);
+      }
+
+      // Form parsing provides additional output about
+      // form-formatted PDFs. You must create a form
+      // processor in the Cloud Console to see full field details.
+      console.log('\nThe following form key/value pairs were detected:');
+
+      const {formFields} = page1;
+      for (const field of formFields) {
+        const fieldName = getText(field.fieldName.textAnchor);
+        const fieldValue = getText(field.fieldValue.textAnchor);
+
+        console.log('Extracted key value pair:');
+        console.log(`\t(${fieldName}, ${fieldValue})`);
+      }
+    });
+    await queue.addAll(tasks);
+  }
+  // [END documentai_batch_process_document]
+
+  // Await the run so a failure rejects main() instead of being dropped.
+  await batchProcessDocument();
+}
+
+// Report any failure and set a non-zero exit code so callers can detect it.
+main(...process.argv.slice(2)).catch(err => {
+  console.error(err);
+  process.exitCode = 1;
+});
diff --git a/document-ai/package.json b/document-ai/package.json
index e84be17d21..8d166d1dd4 100644
--- a/document-ai/package.json
+++ b/document-ai/package.json
@@ -4,7 +4,7 @@
   "license": "Apache-2.0",
   "author": "Google LLC",
   "engines": {
-    "node": ">=8"
+    "node": ">= 10.17.0"
   },
   "files": [
     "*.js"
@@ -14,7 +14,9 @@
   },
   "dependencies": {
     "@google-cloud/documentai": "^2.1.1",
-    "@google-cloud/storage": "^5.0.0"
+    "@google-cloud/storage": "^5.0.0",
+    "p-queue": "^6.6.2",
+    "uuid": "^8.3.1"
   },
   "devDependencies": {
     "chai": "^4.2.0",
diff --git a/document-ai/process-document.v1beta3.js b/document-ai/process-document.v1beta3.js
new file mode 100644
index 0000000000..3213f3a563
--- /dev/null
+++ b/document-ai/process-document.v1beta3.js
@@ -0,0 +1,107 @@
+/**
+ * Copyright 2020, Google, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+async function main(projectId, location, processorId, filePath) {
+  // [START documentai_process_document]
+  /**
+   * TODO(developer): Uncomment these variables before running the sample.
+   */
+  // const projectId = 'YOUR_PROJECT_ID';
+  // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
+  // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console
+  // const filePath = '/path/to/local/pdf';
+
+  const {
+    DocumentProcessorServiceClient,
+  } = require('@google-cloud/documentai').v1beta3;
+
+  // Instantiates a client
+  const client = new DocumentProcessorServiceClient();
+
+  async function processDocument() {
+    // The full resource name of the processor, e.g.:
+    // projects/project-id/locations/location/processors/processor-id
+    // You must create new processors in the Cloud Console first
+    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;
+
+    // Read the file into memory.
+    const fs = require('fs').promises;
+    const imageFile = await fs.readFile(filePath);
+
+    // Convert the image data to a Buffer and base64 encode it.
+    const encodedImage = Buffer.from(imageFile).toString('base64');
+
+    const request = {
+      name,
+      document: {
+        content: encodedImage,
+        mimeType: 'application/pdf',
+      },
+    };
+
+    // Recognizes text entities in the PDF document
+    const [result] = await client.processDocument(request);
+    const {document} = result;
+
+    // Get all of the document text as one big string
+    const {text} = document;
+
+    // Extract shards from the text field
+    const getText = textAnchor => {
+      if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) {
+        return '';
+      }
+
+      // First shard in document doesn't have startIndex property
+      const startIndex = textAnchor.textSegments[0].startIndex || 0;
+      const endIndex = textAnchor.textSegments[0].endIndex;
+
+      return text.substring(startIndex, endIndex);
+    };
+
+    // Read the text recognition output from the processor
+    console.log('The document contains the following paragraphs:');
+    const [page1] = document.pages;
+    const {paragraphs} = page1;
+
+    for (const paragraph of paragraphs) {
+      const paragraphText = getText(paragraph.layout.textAnchor);
+      console.log(`Paragraph text:\n${paragraphText}`);
+    }
+
+    // Form parsing provides additional output about
+    // form-formatted PDFs. You must create a form
+    // processor in the Cloud Console to see full field details.
+    console.log('\nThe following form key/value pairs were detected:');
+
+    const {formFields} = page1;
+    for (const field of formFields) {
+      const fieldName = getText(field.fieldName.textAnchor);
+      const fieldValue = getText(field.fieldValue.textAnchor);
+
+      console.log('Extracted key value pair:');
+      console.log(`\t(${fieldName}, ${fieldValue})`);
+    }
+  }
+  // [END documentai_process_document]
+  await processDocument();
+}
+
+main(...process.argv.slice(2)).catch(err => {
+  console.error(err);
+  process.exitCode = 1;
+});
diff --git a/document-ai/quickstart.js b/document-ai/quickstart.js
index de56ea4597..2d6c4a6fbb 100644
--- a/document-ai/quickstart.js
+++ b/document-ai/quickstart.js
@@ -15,56 +15,72 @@
 
 'use strict';
 
-async function main(
-  projectId,
-  location,
-  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
-) {
+async function main(projectId, location, processorId, filePath) {
   // [START documentai_quickstart]
   /**
    * TODO(developer): Uncomment these variables before running the sample.
    */
   // const projectId = 'YOUR_PROJECT_ID';
   // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
-  // const gcsInputUri = 'YOUR_SOURCE_PDF';
+  // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console
+  // const filePath = '/path/to/local/pdf';
 
   const {
-    DocumentUnderstandingServiceClient,
-  } = require('@google-cloud/documentai').v1beta2;
-  const client = new DocumentUnderstandingServiceClient();
+    DocumentProcessorServiceClient,
+  } = require('@google-cloud/documentai').v1beta3;
+
+  // Instantiates a client
+  const client = new DocumentProcessorServiceClient();
 
   async function quickstart() {
-    // Configure the request for processing the PDF
-    const parent = `projects/${projectId}/locations/${location}`;
+    // The full resource name of the processor, e.g.:
+    // projects/project-id/locations/location/processors/processor-id
+    // You must create new processors in the Cloud Console first
+    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;
+
+    // Read the file into memory.
+    const fs = require('fs').promises;
+    const imageFile = await fs.readFile(filePath);
+
+    // Convert the image data to a Buffer and base64 encode it.
+    const encodedImage = Buffer.from(imageFile).toString('base64');
+
     const request = {
-      parent,
-      inputConfig: {
-        gcsSource: {
-          uri: gcsInputUri,
-        },
+      name,
+      document: {
+        content: encodedImage,
         mimeType: 'application/pdf',
       },
     };
 
     // Recognizes text entities in the PDF document
     const [result] = await client.processDocument(request);
+    const {document} = result;
 
     // Get all of the document text as one big string
-    const {text} = result;
+    const {text} = document;
 
     // Extract shards from the text field
-    function extractText(textAnchor) {
+    const getText = textAnchor => {
+      if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) {
+        return '';
+      }
+
       // First shard in document doesn't have startIndex property
       const startIndex = textAnchor.textSegments[0].startIndex || 0;
       const endIndex = textAnchor.textSegments[0].endIndex;
 
       return text.substring(startIndex, endIndex);
-    }
+    };
+
+    // Read the text recognition output from the processor
+    console.log('The document contains the following paragraphs:');
+    const [page1] = document.pages;
+    const {paragraphs} = page1;
 
-    for (const entity of result.entities) {
-      console.log(`\nEntity text: ${extractText(entity.textAnchor)}`);
-      console.log(`Entity type: ${entity.type}`);
-      console.log(`Entity mention text: ${entity.mentionText}`);
+    for (const paragraph of paragraphs) {
+      const paragraphText = getText(paragraph.layout.textAnchor);
+      console.log(`Paragraph text:\n${paragraphText}`);
     }
   }
   // [END documentai_quickstart]
diff --git a/document-ai/resources/invoice.pdf b/document-ai/resources/invoice.pdf
new file mode 100644
index 0000000000..7722734a43
Binary files /dev/null and b/document-ai/resources/invoice.pdf differ
diff --git a/document-ai/test/batch-process-document.v1beta3.test.js b/document-ai/test/batch-process-document.v1beta3.test.js
new file mode 100644
index 0000000000..5cd8b41147
--- /dev/null
+++ b/document-ai/test/batch-process-document.v1beta3.test.js
@@ -0,0 +1,62 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+'use strict';
+
+const {Storage} = require('@google-cloud/storage');
+const {
+  DocumentProcessorServiceClient,
+} = require('@google-cloud/documentai').v1beta3;
+const client = new DocumentProcessorServiceClient({
+  apiEndpoint: 'us-documentai.googleapis.com',
+});
+
+const cp = require('child_process');
+const assert = require('assert');
+const {describe, it, before, after} = require('mocha');
+const uuid = require('uuid');
+
+const execSync = cmd => cp.execSync(cmd, {encoding: 'utf-8'});
+
+const storage = new Storage();
+const bucketName = `nodejs-docs-samples-test-${uuid.v4()}`;
+const cmd = 'node batch-process-document.v1beta3.js';
+
+const testProcessDocument = {
+  projectId: '',
+  location: 'us',
+  processorId: '8f1123c1b125e0b7',
+  gcsInputUri: 'gs://cloud-samples-data/documentai/invoice.pdf',
+  gcsOutputUriPrefix: uuid.v4(),
+};
+
+describe('Document AI batch parse form', () => {
+  before(async () => {
+    testProcessDocument.projectId = await client.getProjectId();
+    await storage.createBucket(bucketName);
+  });
+
+  after(async () => {
+    const bucket = storage.bucket(bucketName);
+    await bucket.deleteFiles({force: true});
+    await bucket.delete();
+  });
+
+  it('should parse the GCS invoice example as a form', async () => {
+    const output = execSync(
+      `${cmd} ${testProcessDocument.projectId} ${testProcessDocument.location} ${testProcessDocument.processorId} ${testProcessDocument.gcsInputUri} gs://${bucketName} ${testProcessDocument.gcsOutputUriPrefix}`
+    );
+    assert.notStrictEqual(output.indexOf('Extracted'), -1);
+  });
+});
diff --git a/document-ai/test/process-document.v1beta3.test.js b/document-ai/test/process-document.v1beta3.test.js
new file mode 100644
index 0000000000..48a46f0916
--- /dev/null
+++ b/document-ai/test/process-document.v1beta3.test.js
@@ -0,0 +1,55 @@
+/**
+ * Copyright 2019, Google, Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+'use strict';
+
+const path = require('path');
+const assert = require('assert');
+const cp = require('child_process');
+
+const {
+  DocumentProcessorServiceClient,
+} = require('@google-cloud/documentai').v1beta3;
+const client = new DocumentProcessorServiceClient({
+  apiEndpoint: 'us-documentai.googleapis.com',
+});
+
+// Forward caller options (such as cwd) but always return utf-8 strings.
+const execSync = (cmd, opts) =>
+  cp.execSync(cmd, Object.assign({}, opts, {encoding: 'utf-8'}));
+
+const cwd = path.join(__dirname, '..');
+const LOCATION = 'us';
+const PROCESSOR_ID = '8f1123c1b125e0b7';
+
+const fileName = 'invoice.pdf';
+const filePath = path.resolve(path.join(__dirname, `../resources/${fileName}`));
+
+describe('Process document', () => {
+  let projectId;
+  before(async () => {
+    projectId = await client.getProjectId();
+  });
+  it('should run document (process invoice)', async () => {
+    const stdout = execSync(
+      `node ./process-document.v1beta3.js ${projectId} ${LOCATION} ${PROCESSOR_ID} ${filePath}`,
+      {
+        cwd,
+      }
+    );
+    assert.notStrictEqual(stdout.indexOf('Paragraph'), -1);
+    assert.notStrictEqual(stdout.indexOf('Extracted'), -1);
+  });
+});
diff --git a/document-ai/test/quickstart.test.js b/document-ai/test/quickstart.test.js
index 8a44c2d6c8..d26e563275 100644
--- a/document-ai/test/quickstart.test.js
+++ b/document-ai/test/quickstart.test.js
@@ -16,20 +16,40 @@
 'use strict';
 
 const path = require('path');
-const {assert} = require('chai');
+const assert = require('assert');
 const cp = require('child_process');
 
+const {
+  DocumentProcessorServiceClient,
+} = require('@google-cloud/documentai').v1beta3;
+const client = new DocumentProcessorServiceClient({
+  apiEndpoint: 'us-documentai.googleapis.com',
+});
+
-const execSync = cmd => cp.execSync(cmd, {encoding: 'utf-8'});
+// Forward caller options (such as cwd) but always return utf-8 strings.
+const execSync = (cmd, opts) =>
+  cp.execSync(cmd, Object.assign({}, opts, {encoding: 'utf-8'}));
 
 const cwd = path.join(__dirname, '..');
-const projectId = process.env.GCLOUD_PROJECT;
 const LOCATION = 'us';
+const PROCESSOR_ID = '8f1123c1b125e0b7';
+
+const fileName = 'invoice.pdf';
+const filePath = path.resolve(path.join(__dirname, `../resources/${fileName}`));
 
 describe('Quickstart', () => {
+  let projectId;
+  before(async () => {
+    projectId = await client.getProjectId();
+  });
+
   it('should run quickstart', async () => {
-    const stdout = execSync(`node ./quickstart.js ${projectId} ${LOCATION}`, {
-      cwd,
-    });
-    assert.match(stdout, /Entity/);
+    const stdout = execSync(
+      `node ./quickstart.js ${projectId} ${LOCATION} ${PROCESSOR_ID} ${filePath}`,
+      {
+        cwd,
+      }
+    );
+    assert.notStrictEqual(stdout.indexOf('Paragraph'), -1);
   });
 });