Skip to content

Commit

Permalink
docs(samples): new Doc AI samples for v1beta3 (#101)
Browse files Browse the repository at this point in the history
* docs(samples): new Doc AI samples for v1beta3
* feat: add a new version to the library
* fix: adds processor ID as literal in tests
* fix: removed apiEndpoint from client instantiation

Co-authored-by: Sofia Leon <[email protected]>
Co-authored-by: sofisl <[email protected]>
Co-authored-by: Justin Beckwith <[email protected]>
  • Loading branch information
4 people authored and Ace Nassri committed Nov 14, 2022
1 parent be4aacd commit d8678f2
Show file tree
Hide file tree
Showing 9 changed files with 435 additions and 34 deletions.
4 changes: 1 addition & 3 deletions document-ai/.eslintrc.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
---
rules:
no-console: off
node/no-missing-require: off
node/no-extraneous-require: off

node/no-unsupported-features/node-builtins: off
145 changes: 145 additions & 0 deletions document-ai/batch-process-document.v1beta3.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
/**
* Copyright 2020 Google LLC
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

'use strict';

const uuid = require('uuid');

/**
 * Sends one PDF stored in Cloud Storage through a Document AI (v1beta3)
 * processor as a batch job, waits for the long-running operation to finish,
 * then downloads every result file written under the output prefix and prints
 * the paragraphs and form fields found on the first page of each result.
 *
 * @param {string} projectId Google Cloud project ID.
 * @param {string} location Processor location; format is 'us' or 'eu'.
 * @param {string} processorId ID of a processor created in the Cloud Console.
 * @param {string} gcsInputUri Cloud Storage URI of the PDF to process.
 * @param {string} gcsOutputUri Storage bucket that receives the results.
 * @param {string} gcsOutputUriPrefix Object prefix for the results; defaults
 *     to a freshly generated UUID.
 * @returns {Promise<void>} Resolves once every result file has been printed;
 *     rejects if the batch request, a download, or JSON parsing fails.
 */
async function main(
  projectId = 'YOUR_PROJECT_ID',
  location = 'YOUR_PROJECT_LOCATION',
  processorId = 'YOUR_PROCESSOR_ID', // Create this in the Cloud Console
  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf',
  gcsOutputUri = 'output-bucket',
  gcsOutputUriPrefix = uuid.v4()
) {
  // [START documentai_batch_process_document]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
  // const processorId = 'YOUR_PROCESSOR_ID';
  // const gcsInputUri = 'YOUR_SOURCE_PDF';
  // const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';

  // Imports the Google Cloud client library
  const {
    DocumentProcessorServiceClient,
  } = require('@google-cloud/documentai').v1beta3;
  const {Storage} = require('@google-cloud/storage');

  // Instantiates Document AI, Storage clients
  const client = new DocumentProcessorServiceClient();
  const storage = new Storage();

  const {default: PQueue} = require('p-queue');

  async function batchProcessDocument() {
    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;

    // Configure the batch process request.
    const request = {
      name,
      inputConfigs: [
        {
          gcsSource: gcsInputUri,
          mimeType: 'application/pdf',
        },
      ],
      outputConfig: {
        gcsDestination: `${gcsOutputUri}/${gcsOutputUriPrefix}/`,
      },
    };

    // Batch process document using a long-running operation.
    // You can wait for now, or get results later.
    // Note: first request to the service takes longer than subsequent
    // requests.
    const [operation] = await client.batchProcessDocuments(request);

    // Wait for operation to complete.
    await operation.promise();

    console.log('Document processing complete.');

    // Query Storage bucket for the results file(s).
    const query = {
      prefix: gcsOutputUriPrefix,
    };

    console.log('Fetching results ...');

    // List all of the files in the Storage bucket
    const [files] = await storage.bucket(gcsOutputUri).getFiles(query);

    // Add all asynchronous downloads to queue for execution.
    // The queue caps the number of simultaneous downloads at 15.
    const queue = new PQueue({concurrency: 15});
    const tasks = files.map((fileInfo, index) => async () => {
      // Get the file as a buffer
      const [file] = await fileInfo.download();

      console.log(`Fetched file #${index + 1}:`);

      // The results stored in the output Storage location
      // are formatted as a document object.
      const document = JSON.parse(file.toString());

      // The result is plain parsed JSON, so a field that the service did not
      // emit is simply undefined here. Default the ones read below instead of
      // crashing on destructuring.
      const {text = '', pages = []} = document;

      // Extract shards from the text field. An anchor may reference several
      // segments of the text; all of them are concatenated in order.
      const getText = textAnchor => {
        if (!textAnchor || !Array.isArray(textAnchor.textSegments)) {
          return '';
        }

        return textAnchor.textSegments
          .map(segment =>
            // First shard in document doesn't have startIndex property.
            // substring() coerces its arguments to integers, so indices that
            // arrive as numeric strings are handled as well.
            text.substring(segment.startIndex || 0, segment.endIndex)
          )
          .join('');
      };

      // Resolve the text of a layout-like object, tolerating a missing one.
      const getLayoutText = layout => getText(layout && layout.textAnchor);

      // Read the text recognition output from the processor
      console.log('The document contains the following paragraphs:');

      const [page1 = {}] = pages;
      const {paragraphs = [], formFields = []} = page1;
      for (const paragraph of paragraphs) {
        const paragraphText = getLayoutText(paragraph.layout);
        console.log(`Paragraph text:\n${paragraphText}`);
      }

      // Form parsing provides additional output about
      // form-formatted PDFs. You must create a form
      // processor in the Cloud Console to see full field details.
      console.log('\nThe following form key/value pairs were detected:');

      for (const field of formFields) {
        const fieldName = getLayoutText(field.fieldName);
        const fieldValue = getLayoutText(field.fieldValue);

        console.log('Extracted key value pair:');
        console.log(`\t(${fieldName}, ${fieldValue})`);
      }
    });
    await queue.addAll(tasks);
  }
  // [END documentai_batch_process_document]

  // Await the work so that a failure rejects main() instead of floating.
  await batchProcessDocument();
}

// Report any failure and exit non-zero rather than leaving an unhandled
// promise rejection.
main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});
6 changes: 4 additions & 2 deletions document-ai/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"license": "Apache-2.0",
"author": "Google LLC",
"engines": {
"node": ">=8"
"node": ">= 10.17.0"
},
"files": [
"*.js"
Expand All @@ -14,7 +14,9 @@
},
"dependencies": {
"@google-cloud/documentai": "^2.1.1",
"@google-cloud/storage": "^5.0.0"
"@google-cloud/storage": "^5.0.0",
"p-queue": "^6.6.2",
"uuid": "^8.3.1"
},
"devDependencies": {
"chai": "^4.2.0",
Expand Down
107 changes: 107 additions & 0 deletions document-ai/process-document.v1beta3.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/**
* Copyright 2020, Google, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

'use strict';

/**
 * Reads a local PDF, sends it to a Document AI (v1beta3) processor in a single
 * synchronous request, and prints the paragraphs and form key/value pairs
 * detected on the first page of the returned document.
 *
 * @param {string} projectId Google Cloud project ID.
 * @param {string} location Processor location; format is 'us' or 'eu'.
 * @param {string} processorId ID of a processor created in the Cloud Console.
 * @param {string} filePath Path to the local PDF file to process.
 * @returns {Promise<void>} Resolves after the results are printed; rejects if
 *     the file cannot be read or the request fails.
 */
async function main(projectId, location, processorId, filePath) {
  // [START documentai_process_document]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
  // const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console
  // const filePath = '/path/to/local/pdf';

  const {
    DocumentProcessorServiceClient,
  } = require('@google-cloud/documentai').v1beta3;

  // Instantiates a client
  const client = new DocumentProcessorServiceClient();

  async function processDocument() {
    // The full resource name of the processor, e.g.:
    // projects/project-id/locations/location/processors/processor-id
    // You must create new processors in the Cloud Console first
    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;

    // Read the file into memory.
    const fs = require('fs').promises;
    const imageFile = await fs.readFile(filePath);

    // readFile() without an encoding already yields a Buffer, so it can be
    // base64 encoded directly without copying it into a second Buffer.
    const encodedImage = imageFile.toString('base64');

    const request = {
      name,
      document: {
        content: encodedImage,
        mimeType: 'application/pdf',
      },
    };

    // Recognizes text entities in the PDF document
    const [result] = await client.processDocument(request);
    const {document} = result;

    // Get all of the document text as one big string
    const {text} = document;

    // Extract shards from the text field. An anchor may reference several
    // segments of the text; all of them are concatenated in order.
    const getText = textAnchor => {
      if (!textAnchor || !Array.isArray(textAnchor.textSegments)) {
        return '';
      }

      return textAnchor.textSegments
        .map(segment =>
          // First shard in document doesn't have startIndex property
          text.substring(segment.startIndex || 0, segment.endIndex)
        )
        .join('');
    };

    // Resolve the text of a layout-like object, tolerating a missing one.
    const getLayoutText = layout => getText(layout && layout.textAnchor);

    // Read the text recognition output from the processor
    console.log('The document contains the following paragraphs:');

    // Fall back to empty collections so a document without pages prints the
    // headers and nothing else instead of throwing on destructuring.
    const [page1 = {}] = document.pages || [];
    const {paragraphs = [], formFields = []} = page1;

    for (const paragraph of paragraphs) {
      const paragraphText = getLayoutText(paragraph.layout);
      console.log(`Paragraph text:\n${paragraphText}`);
    }

    // Form parsing provides additional output about
    // form-formatted PDFs. You must create a form
    // processor in the Cloud Console to see full field details.
    console.log('\nThe following form key/value pairs were detected:');

    for (const field of formFields) {
      const fieldName = getLayoutText(field.fieldName);
      const fieldValue = getLayoutText(field.fieldValue);

      console.log('Extracted key value pair:');
      console.log(`\t(${fieldName}, ${fieldValue})`);
    }
  }
  // [END documentai_process_document]
  await processDocument();
}

// Report any failure and exit non-zero rather than leaving an unhandled
// promise rejection.
main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});
62 changes: 39 additions & 23 deletions document-ai/quickstart.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,56 +15,72 @@

'use strict';

async function main(
projectId,
location,
gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
) {
async function main(projectId, location, processorId, filePath) {
// [START documentai_quickstart]
/**
* TODO(developer): Uncomment these variables before running the sample.
*/
// const projectId = 'YOUR_PROJECT_ID';
// const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
// const gcsInputUri = 'YOUR_SOURCE_PDF';
// const processor = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console
// const filePath = '/path/to/local/pdf';

const {
DocumentUnderstandingServiceClient,
} = require('@google-cloud/documentai').v1beta2;
const client = new DocumentUnderstandingServiceClient();
DocumentProcessorServiceClient,
} = require('@google-cloud/documentai').v1beta3;

// Instantiates a client
const client = new DocumentProcessorServiceClient();

async function quickstart() {
// Configure the request for processing the PDF
const parent = `projects/${projectId}/locations/${location}`;
// The full resource name of the processor, e.g.:
// projects/project-id/locations/location/processor/processor-id
// You must create new processors in the Cloud Console first
const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;

// Read the file into memory.
const fs = require('fs').promises;
const imageFile = await fs.readFile(filePath);

// Convert the image data to a Buffer and base64 encode it.
const encodedImage = Buffer.from(imageFile).toString('base64');

const request = {
parent,
inputConfig: {
gcsSource: {
uri: gcsInputUri,
},
name,
document: {
content: encodedImage,
mimeType: 'application/pdf',
},
};

// Recognizes text entities in the PDF document
const [result] = await client.processDocument(request);
const {document} = result;

// Get all of the document text as one big string
const {text} = result;
const {text} = document;

// Extract shards from the text field
function extractText(textAnchor) {
const getText = textAnchor => {
if (!textAnchor.textSegments || textAnchor.textSegments.length === 0) {
return '';
}

// First shard in document doesn't have startIndex property
const startIndex = textAnchor.textSegments[0].startIndex || 0;
const endIndex = textAnchor.textSegments[0].endIndex;

return text.substring(startIndex, endIndex);
}
};

// Read the text recognition output from the processor
console.log('The document contains the following paragraphs:');
const [page1] = document.pages;
const {paragraphs} = page1;

for (const entity of result.entities) {
console.log(`\nEntity text: ${extractText(entity.textAnchor)}`);
console.log(`Entity type: ${entity.type}`);
console.log(`Entity mention text: ${entity.mentionText}`);
for (const paragraph of paragraphs) {
const paragraphText = getText(paragraph.layout.textAnchor);
console.log(`Paragraph text:\n${paragraphText}`);
}
}
// [END documentai_quickstart]
Expand Down
Binary file added document-ai/resources/invoice.pdf
Binary file not shown.
Loading

0 comments on commit d8678f2

Please sign in to comment.