Skip to content

Commit

Permalink
feat(samples): adds Document AI parse table, forms samples (#8)
Browse files Browse the repository at this point in the history
  • Loading branch information
telpirion authored and Ace Nassri committed Nov 14, 2022
1 parent 0edc349 commit 7d64ced
Show file tree
Hide file tree
Showing 5 changed files with 381 additions and 3 deletions.
7 changes: 4 additions & 3 deletions document-ai/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@
"*.js"
],
"scripts": {
"test": "mocha test/*.js"
"test": "mocha test/*.js --timeout 600000"
},
"dependencies": {
"@google-cloud/documentai": "^0.0.1"
"@google-cloud/documentai": "^0.0.1",
"@google-cloud/storage": "^4.2.0"
},
"devDependencies": {
"chai": "^4.2.0",
"mocha": "^6.2.0"
}
}
}
135 changes: 135 additions & 0 deletions document-ai/parseForm.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/**
* Copyright 2020 Google LLC
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

'use strict';

const uuid = require('uuid');

/**
 * Document AI sample: batch-processes a PDF stored in Cloud Storage with form
 * extraction enabled, then downloads the JSON results and prints the
 * key/value pairs found on the first page of each result file.
 *
 * @param {string} projectId Google Cloud project ID used as the request parent.
 * @param {string} gcsOutputUri Output location (e.g. `gs://my-bucket/`). It is
 *     concatenated with the prefix to form the destination URI and is also
 *     handed to `storage.bucket()` when listing the results.
 * @param {string} gcsOutputUriPrefix Object-name prefix for the result files.
 * @param {string} gcsInputUri Cloud Storage URI of the PDF to process.
 * @returns {Promise<void>} Settles once every result file has been printed;
 *     rejects if processing, listing, downloading or parsing fails.
 */
async function main(
  projectId = 'YOUR_PROJECT_ID',
  gcsOutputUri = 'output-bucket',
  gcsOutputUriPrefix = uuid.v4(),
  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
) {
  // [START document_parse_form]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';

  // Imports the Google Cloud client library
  const {
    DocumentUnderstandingServiceClient,
  } = require('@google-cloud/documentai');
  const {Storage} = require('@google-cloud/storage');

  const client = new DocumentUnderstandingServiceClient();
  const storage = new Storage();

  async function parseFormGCS(inputUri, outputUri, outputUriPrefix) {
    // Configure the batch process request.
    const request = {
      inputConfig: {
        gcsSource: {
          uri: inputUri,
        },
        mimeType: 'application/pdf',
      },
      outputConfig: {
        gcsDestination: {
          uri: `${outputUri}${outputUriPrefix}`,
        },
        pagesPerShard: 1,
      },
      formExtractionParams: {
        enabled: true,
        keyValuePairHints: [
          {
            key: 'Phone',
            valueTypes: ['PHONE_NUMBER'],
          },
          {
            key: 'Contact',
            valueTypes: ['EMAIL', 'NAME'],
          },
        ],
      },
    };

    // Configure the request for batch process
    const requests = {
      parent: `projects/${projectId}`,
      requests: [request],
    };

    // Batch process document using a long-running operation.
    // You can wait for now, or get results later.
    const [operation] = await client.batchProcessDocuments(requests);

    // Wait for operation to complete.
    await operation.promise();

    console.log('Document processing complete.');

    // Query Storage bucket for the results file(s).
    const query = {
      prefix: outputUriPrefix,
    };

    console.log('Fetching results ...');

    // List all of the files in the Storage bucket
    const [files] = await storage.bucket(outputUri).getFiles(query);

    // Process the files one at a time. An async callback passed to forEach()
    // is never awaited, which would leave failures unhandled and let the
    // output of different files interleave.
    for (const [index, fileInfo] of files.entries()) {
      // Get the file as a buffer
      const [file] = await fileInfo.download();

      console.log(`Fetched file #${index + 1}:`);

      // Read the results
      const results = JSON.parse(file.toString());

      // Get all of the document text as one big string.
      const text = results.text || '';

      // Utility to extract the text referenced by a layout's first segment.
      // Layouts without any text segment yield an empty (tab-prefixed) string.
      const getText = layout => {
        const textAnchor = (layout && layout.textAnchor) || {};
        const [segment] = textAnchor.textSegments || [];
        if (!segment) {
          return '\t';
        }

        // The first segment in a document has no startIndex property.
        const startIndex = segment.startIndex || 0;
        const endIndex = segment.endIndex;

        return `\t${text.substring(startIndex, endIndex)}`;
      };

      // Process the output. Pages without form fields simply print nothing.
      const [page1] = results.pages || [];
      const formFields = (page1 && page1.formFields) || [];

      formFields.forEach(field => {
        const fieldName = getText(field.fieldName);
        const fieldValue = getText(field.fieldValue);

        console.log('Extracted key value pair:');
        console.log(`\t(${fieldName}, ${fieldValue})`);
      });
    }
  }
  // [END document_parse_form]

  await parseFormGCS(gcsInputUri, gcsOutputUri, gcsOutputUriPrefix);
}

// Surface any failure and make the process exit with a non-zero status.
main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});
140 changes: 140 additions & 0 deletions document-ai/parseTable.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
/**
* Copyright 2020 Google LLC
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

'use strict';

const uuid = require('uuid');

/**
 * Document AI sample: batch-processes a PDF stored in Cloud Storage with
 * table extraction enabled, then downloads the JSON results and prints the
 * detected language and the header row of the first table found on the first
 * page of each result file.
 *
 * @param {string} projectId Google Cloud project ID used as the request parent.
 * @param {string} gcsOutputUri Output location (e.g. `gs://my-bucket/`). It is
 *     concatenated with the prefix to form the destination URI and is also
 *     handed to `storage.bucket()` when listing the results.
 * @param {string} gcsOutputUriPrefix Object-name prefix for the result files.
 * @param {string} gcsInputUri Cloud Storage URI of the PDF to process.
 * @returns {Promise<void>} Settles once every result file has been printed;
 *     rejects if processing, listing, downloading or parsing fails.
 */
async function main(
  projectId = 'YOUR_PROJECT_ID',
  gcsOutputUri = 'output-bucket',
  gcsOutputUriPrefix = uuid.v4(),
  gcsInputUri = 'gs://cloud-samples-data/documentai/invoice.pdf'
) {
  // [START document_parse_table]
  /**
   * TODO(developer): Uncomment these variables before running the sample.
   */
  // const projectId = 'YOUR_PROJECT_ID';
  // const gcsOutputUri = 'YOUR_STORAGE_BUCKET';
  // const gcsOutputUriPrefix = 'YOUR_STORAGE_PREFIX';

  // Imports the Google Cloud client library
  const {
    DocumentUnderstandingServiceClient,
  } = require('@google-cloud/documentai');
  const {Storage} = require('@google-cloud/storage');

  const client = new DocumentUnderstandingServiceClient();
  const storage = new Storage();

  async function parseTableGCS(inputUri, outputUri, outputUriPrefix) {
    // Configure the batch process request.
    const request = {
      inputConfig: {
        gcsSource: {
          uri: inputUri,
        },
        mimeType: 'application/pdf',
      },
      outputConfig: {
        gcsDestination: {
          uri: `${outputUri}${outputUriPrefix}`,
        },
        pagesPerShard: 1,
      },
      tableExtractionParams: {
        enabled: true,
        tableBoundHints: [
          {
            // Hint that a table may appear anywhere on the page.
            boundingBox: {
              normalizedVertices: [
                {x: 0, y: 0},
                {x: 1, y: 0},
                {x: 1, y: 1},
                {x: 0, y: 1},
              ],
            },
          },
        ],
      },
    };

    // Configure the request for batch process
    const requests = {
      parent: `projects/${projectId}`,
      requests: [request],
    };

    // Batch process document using a long-running operation.
    // You can wait for now, or get results later.
    const [operation] = await client.batchProcessDocuments(requests);

    // Wait for operation to complete.
    await operation.promise();

    console.log('Document processing complete.');

    // Query Storage bucket for the results file(s).
    const query = {
      prefix: outputUriPrefix,
    };

    console.log('Fetching results ...');

    // List all of the files in the Storage bucket
    const [files] = await storage.bucket(outputUri).getFiles(query);

    // Process the files one at a time. An async callback passed to forEach()
    // is never awaited, which would leave failures unhandled and let the
    // output of different files interleave.
    for (const [index, fileInfo] of files.entries()) {
      // Get the file as a buffer
      const [file] = await fileInfo.download();

      console.log(`Fetched file #${index + 1}:`);

      // Read the results
      const results = JSON.parse(file.toString());

      // Get all of the document text as one big string
      const text = results.text || '';

      // Get the first table in the document. Not every result page contains
      // one, so skip such files instead of failing on them.
      const [page1] = results.pages || [];
      const [table] = (page1 && page1.tables) || [];
      if (!table) {
        console.log('No table detected on the first page.');
        continue;
      }

      console.log('Results from first table processed:');

      const [firstLanguage] = page1.detectedLanguages || [];
      if (firstLanguage) {
        console.log(`First detected language: ${firstLanguage.languageCode}`);
      }

      const [headerRow] = table.headerRows || [];
      if (!headerRow) {
        continue;
      }

      console.log('Header row:');
      (headerRow.cells || []).forEach(tableCell => {
        const textAnchor = (tableCell.layout && tableCell.layout.textAnchor) || {};
        const [segment] = textAnchor.textSegments || [];

        if (segment) {
          // Extract shards from the text field
          // First shard in document doesn't have startIndex property
          const startIndex = segment.startIndex || 0;
          const endIndex = segment.endIndex;

          console.log(`\t${text.substring(startIndex, endIndex)}`);
        }
      });
    }
  }
  // [END document_parse_table]

  await parseTableGCS(gcsInputUri, gcsOutputUri, gcsOutputUriPrefix);
}

// Surface any failure and make the process exit with a non-zero status.
main(...process.argv.slice(2)).catch(err => {
  console.error(err);
  process.exitCode = 1;
});
51 changes: 51 additions & 0 deletions document-ai/test/parseForm.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

'use strict';

const {Storage} = require('@google-cloud/storage');
const cp = require('child_process');
const {assert} = require('chai');
const {describe, it, before, after} = require('mocha');
const uuid = require('uuid');

// Runs a shell command synchronously and returns its stdout as a string.
const execSync = cmd => cp.execSync(cmd, {encoding: 'utf-8'});

const storage = new Storage();
// A uniquely named bucket is created for this run and removed afterwards.
const bucketName = `nodejs-docs-samples-test-${uuid.v4()}`;
const cmd = `node parseForm.js`;

// Arguments handed to the sample on the command line.
const testParseForm = {
  projectId: process.env.GCLOUD_PROJECT,
  gcsOutputUriPrefix: uuid.v4(),
};

describe(`Document AI parse form`, () => {
  before(async () => {
    await storage.createBucket(bucketName);
  });

  after(async () => {
    // The bucket must be emptied before it can be deleted.
    const bucket = storage.bucket(bucketName);
    await bucket.deleteFiles({force: true});
    await bucket.delete();
  });

  // The sample runs in a synchronous child process, so no async callback is
  // needed here.
  it(`should parse the GCS invoice example as a form`, () => {
    // Pass the output prefix explicitly so the sample's third argument is
    // exercised instead of silently falling back to its default.
    const output = execSync(
      `${cmd} ${testParseForm.projectId} gs://${bucketName}/ ${testParseForm.gcsOutputUriPrefix}`
    );
    assert.match(output, /Extracted key value pair:/);
  });
});
51 changes: 51 additions & 0 deletions document-ai/test/parseTable.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

'use strict';

const {Storage} = require('@google-cloud/storage');
const cp = require('child_process');
const {assert} = require('chai');
const {describe, it, before, after} = require('mocha');
const uuid = require('uuid');

// Runs a shell command synchronously and returns its stdout as a string.
const execSync = cmd => cp.execSync(cmd, {encoding: 'utf-8'});

const storage = new Storage();
// A uniquely named bucket is created for this run and removed afterwards.
const bucketName = `nodejs-docs-samples-test-${uuid.v4()}`;
const cmd = `node parseTable.js`;

// Arguments handed to the sample on the command line.
const testParseTable = {
  projectId: process.env.GCLOUD_PROJECT,
  gcsOutputUriPrefix: uuid.v4(),
};

describe(`Document AI parse table`, () => {
  before(async () => {
    await storage.createBucket(bucketName);
  });

  after(async () => {
    // The bucket must be emptied before it can be deleted.
    const bucket = storage.bucket(bucketName);
    await bucket.deleteFiles({force: true});
    await bucket.delete();
  });

  // The sample runs in a synchronous child process, so no async callback is
  // needed here.
  it(`should parse the GCS invoice example as a table`, () => {
    // Pass the output prefix explicitly so the sample's third argument is
    // exercised instead of silently falling back to its default.
    const output = execSync(
      `${cmd} ${testParseTable.projectId} gs://${bucketName}/ ${testParseTable.gcsOutputUriPrefix}`
    );
    assert.match(output, /First detected language: en/);
  });
});

0 comments on commit 7d64ced

Please sign in to comment.