[ai-form-recognizer] Lazy iterator for words of a line (#18444)

* [ai-form-recognizer] Lazy iterator for words of a line
* Use method instead of property
* Regenerate API
* Polished, wrote changelog, added some more tests, samples
* Updated API MD
* Improved docs
* Apply changes from review
Azure · Nov 10, 2021 · 12c6a81 · 12c6a81
1 parent 6ba071d
commit 12c6a81
Show file tree

Hide file tree

Showing 10 changed files with 396 additions and 13 deletions.
diff --git a/sdk/formrecognizer/ai-form-recognizer/CHANGELOG.md b/sdk/formrecognizer/ai-form-recognizer/CHANGELOG.md
@@ -1,15 +1,13 @@
 # Release History
 
-## 4.0.0-beta.2 (Unreleased)
+## 4.0.0-beta.2 (2021-11-09)
 
 ### Features Added
 
-### Breaking Changes
+- Added a `words` method to `DocumentLine`. This method produces an `IterableIterator` that will yield all of the `DocumentWord`s that are contained by the line's `spans`. This allows accessing the words that are related to the line from the line itself.
 
 ### Bugs Fixed
 
-### Other Changes
-
 ## 4.0.0-beta.1 (2021-10-07)
 
 This new major version beta introduces a full redesign of the Azure Form Recognizer client library. To leverage features of the newest Form Recognizer service API (version "2021-09-30-preview" and newer), the new SDK is required, and application code must be changed to use the new clients. Please see the [Migration Guide](https://github.com/azure/azure-sdk-for-js/blob/main/sdk/formrecognizer/ai-form-recognizer/MIGRATION-v3_v4.md) for detailed instructions on how to update application code from version 3.x of the Form Recognizer SDK to the new version (4.x). The following sections contain an outline of the changes.

diff --git a/sdk/formrecognizer/ai-form-recognizer/review/ai-form-recognizer.api.md b/sdk/formrecognizer/ai-form-recognizer/review/ai-form-recognizer.api.md
@@ -337,6 +337,7 @@ export interface DocumentLine {
     boundingBox?: number[];
     content: string;
     spans: DocumentSpan[];
+    words: () => IterableIterator<DocumentWord>;
 }
 
 // @public

diff --git a/sdk/formrecognizer/ai-form-recognizer/samples-dev/buildModel.ts b/sdk/formrecognizer/ai-form-recognizer/samples-dev/buildModel.ts
@@ -15,6 +15,9 @@
 
 import { AzureKeyCredential, DocumentModelAdministrationClient } from "@azure/ai-form-recognizer";
 
+import * as dotenv from "dotenv";
+dotenv.config();
+
 async function main() {
   const endpoint = process.env.FORM_RECOGNIZER_ENDPOINT ?? "<endpoint>";
   const credential = new AzureKeyCredential(process.env.FORM_RECOGNIZER_API_KEY ?? "<api key>");

diff --git a/sdk/formrecognizer/ai-form-recognizer/samples-dev/composeModel.ts b/sdk/formrecognizer/ai-form-recognizer/samples-dev/composeModel.ts
@@ -14,7 +14,6 @@
 
 import { DocumentModelAdministrationClient, AzureKeyCredential } from "@azure/ai-form-recognizer";
 
-// Load the .env file if it exists
 import * as dotenv from "dotenv";
 dotenv.config();
 

diff --git a/sdk/formrecognizer/ai-form-recognizer/samples-dev/copyModel.ts b/sdk/formrecognizer/ai-form-recognizer/samples-dev/copyModel.ts
@@ -10,6 +10,9 @@
 
 import { AzureKeyCredential, DocumentModelAdministrationClient } from "@azure/ai-form-recognizer";
 
+import * as dotenv from "dotenv";
+dotenv.config();
+
 async function main() {
   const endpoint = process.env.FORM_RECOGNIZER_ENDPOINT ?? "<endpoint>";
   const credential = new AzureKeyCredential(process.env.FORM_RECOGNIZER_API_KEY ?? "<api key>");

diff --git a/sdk/formrecognizer/ai-form-recognizer/samples-dev/extractLayout.ts b/sdk/formrecognizer/ai-form-recognizer/samples-dev/extractLayout.ts
@@ -37,6 +37,20 @@ async function main() {
       console.log("- Page", page.pageNumber, `(unit: ${page.unit})`);
       console.log(`  ${page.width}x${page.height}, angle: ${page.angle}`);
       console.log(`  ${page.lines.length} lines, ${page.words.length} words`);
+
+      if (page.lines.length > 0) {
+        console.log("  Lines:");
+
+        for (const line of page.lines) {
+          console.log(`  - "${line.content}"`);
+
+          // The words of the line can also be iterated independently. The words are computed based on their
+          // corresponding spans.
+          for (const word of line.words()) {
+            console.log(`    - "${word.content}"`);
+          }
+        }
+      }
     }
   }
 

diff --git a/sdk/formrecognizer/ai-form-recognizer/src/index.ts b/sdk/formrecognizer/ai-form-recognizer/src/index.ts
@@ -22,8 +22,6 @@ export {
   DocumentFieldType,
   DocumentKeyValueElement,
   DocumentKeyValuePair,
-  DocumentLine,
-  DocumentPage,
   DocumentSelectionMark,
   DocumentSignatureType,
   DocumentSpan,
@@ -49,6 +47,8 @@ export {
 export {
   AnalysisPoller,
   AnalyzeResult,
+  DocumentPage,
+  DocumentLine,
   DocumentAnalysisPollOperationState,
   AnalyzedDocument,
   FormRecognizerRequestBody,

diff --git a/sdk/formrecognizer/ai-form-recognizer/src/lro/analyze.ts b/sdk/formrecognizer/ai-form-recognizer/src/lro/analyze.ts
@@ -11,10 +11,14 @@ import {
   Document as GeneratedDocument,
   DocumentEntity,
   DocumentKeyValuePair,
-  DocumentPage,
+  DocumentPage as GeneratedDocumentPage,
+  DocumentLine as GeneratedDocumentLine,
+  DocumentSelectionMark,
   DocumentSpan,
   DocumentStyle,
   DocumentTable,
+  DocumentWord,
+  LengthUnit,
 } from "../generated";
 import { DocumentField, toAnalyzedDocumentFieldsFromGenerated } from "../models/fields";
 import { FormRecognizerApiVersion, PollerOptions } from "../options";
@@ -67,7 +71,6 @@ export interface AnalyzedDocument {
  * Transform a REST-level Document response object into the more strongly-typed AnalyzedDocument.
  *
  * @internal
- *
  * @param document - a REST-level document response object
  * @returns an AnalyzedDocument (which has had its fields mapped to stronger DocumentField types)
  */
@@ -132,6 +135,236 @@ export interface AnalyzeResult<Document = AnalyzedDocument> {
   documents: Document[];
 }
 
+/**
+ * A page within an analysis result.
+ *
+ * This is the convenience-layer shape of a page: its `lines` are `DocumentLine`s, which additionally expose a `words`
+ * method for iterating over the words of each line.
+ */
+export interface DocumentPage {
+  /**
+   * 1-based page number in the input document.
+   */
+  pageNumber: number;
+
+  /**
+   * The general orientation of the content in clockwise direction, measured in degrees between (-180, 180].
+   */
+  angle: number;
+
+  /**
+   * The width of the image/PDF in pixels/inches, respectively.
+   */
+  width: number;
+
+  /**
+   * The height of the image/PDF in pixels/inches, respectively.
+   */
+  height: number;
+
+  /**
+   * The unit used by the width, height, and boundingBox properties. For images, the unit is "pixel". For PDF, the unit is "inch".
+   */
+  unit: LengthUnit;
+
+  /**
+   * Location of the page in the reading order concatenated content.
+   */
+  spans: DocumentSpan[];
+
+  /**
+   * Extracted words from the page.
+   */
+  words: DocumentWord[];
+
+  /**
+   * Extracted selection marks from the page.
+   */
+  selectionMarks?: DocumentSelectionMark[];
+
+  /**
+   * Extracted lines from the page, potentially containing both textual and visual elements. Each line can enumerate
+   * the words it contains through its `words` method.
+   */
+  lines: DocumentLine[];
+}
+
+/**
+ * Convert a REST-level DocumentPage into its convenience layer counterpart.
+ *
+ * The conversion happens in place: the `lines` array of the input page is replaced with an array of converted lines,
+ * and the input object itself is returned.
+ *
+ * @internal
+ * @param generated - a REST-level DocumentPage.
+ * @returns the same page object, with each of its lines converted to a convenience layer DocumentLine
+ */
+export function toDocumentPageFromGenerated(generated: GeneratedDocumentPage): DocumentPage {
+  // Every line needs a reference to its page, because the words of a line are looked up in the page's word list.
+  const convertedLines = generated.lines.map(function (line) {
+    return toDocumentLineFromGenerated(line, generated);
+  });
+
+  // Overwrite the property on the existing object instead of allocating a copy of the page.
+  generated.lines = convertedLines;
+
+  return generated as DocumentPage;
+}
+
+/**
+ * A line of adjacent content elements on a page.
+ */
+export interface DocumentLine {
+  /**
+   * Concatenated content of the contained elements in reading order.
+   */
+  content: string;
+
+  /**
+   * Bounding box of the line.
+   */
+  boundingBox?: number[];
+
+  /**
+   * Location of the line in the reading order concatenated content.
+   */
+  spans: DocumentSpan[];
+
+  /**
+   * Compute the `DocumentWord`s that are related to this line, i.e. the words of the line's page whose span is
+   * contained in one of the line's `spans`.
+   *
+   * This function produces a lazy iterator that will yield one word before computing the next. Every call starts a
+   * new iteration from the beginning of the line.
+   */
+  words: () => IterableIterator<DocumentWord>;
+}
+
+/**
+ * Checks whether the `inner` span lies entirely within the `outer` span: `inner` must not start before `outer` starts,
+ * and must not end after `outer` ends. Two identical spans contain each other.
+ *
+ * @internal
+ * @param outer - the outer (potentially containing) span
+ * @param inner - the span to test if `outer` contains
+ * @returns true if `inner` is contained inside of `outer`.
+ */
+export function contains(outer: DocumentSpan, inner: DocumentSpan): boolean {
+  const startsInside = outer.offset <= inner.offset;
+  const endsInside = inner.offset + inner.length <= outer.offset + outer.length;
+
+  return startsInside && endsInside;
+}
+
+/**
+ * Make an empty generator. This might seem silly, but it's useful for satisfying invariants.
+ */
+function* empty(): Generator<never> {
+  /* intentionally empty */
+}
+
+/**
+ * Produces an iterator of the given items starting from the given index.
+ *
+ * @param items - the items to iterate over
+ * @param idx - the index of the first item to begin iterating from
+ */
+function* iterFrom<T>(items: T[], idx: number): Generator<T> {
+  let i = idx;
+
+  while (i < items.length) {
+    yield items[i++];
+  }
+}
+
+/**
+ * Binary search through an array of items to find the first item that could possibly be contained by the given span,
+ * i.e. the first item whose span starts at or after the start of `span`, then return an iterator beginning from that
+ * item.
+ *
+ * This allows a program to quickly find the first relevant item in the array for consideration when testing for span
+ * inclusion.
+ *
+ * INVARIANT: `items` MUST BE SORTED INCREASING by span _offset_.
+ *
+ * @internal
+ * @param span - the span to use when testing each individual item
+ * @param items - an array of items to binary search through (may be empty)
+ * @returns an iterator beginning from the item identified by the search, or an empty iterator if `items` is empty or
+ *          no item starts at or after the start of `span`
+ */
+export function iteratorFromFirstMatchBinarySearch<Spanned extends { span: DocumentSpan }>(
+  span: DocumentSpan,
+  items: Spanned[]
+): IterableIterator<Spanned> {
+  // Lower-bound search over the half-open index range [low, high). Every item before `low` starts before the span,
+  // and every item at or after `high` starts at or after it. An empty array never enters the loop, so no element is
+  // dereferenced in that case.
+  let low = 0;
+  let high = items.length;
+
+  while (low < high) {
+    const mid = low + Math.floor((high - low) / 2);
+
+    if (items[mid].span.offset < span.offset) {
+      low = mid + 1;
+    } else {
+      high = mid;
+    }
+  }
+
+  // `low` is now the index of the first candidate, or `items.length` if there is none. Returning the empty generator
+  // in the latter case keeps the return type simple.
+  return low < items.length ? iterFrom(items, low) : empty();
+}
+
+/**
+ * This fast algorithm tests the elements of `childrenArray` for inclusion in any of the given `spans`, assuming that
+ * both the spans and child items are sorted. Children are yielded lazily, in array order.
+ *
+ * INVARIANT: the items in both the `spans` iterator and `childrenArray` MUST BE SORTED INCREASING by span _offset_.
+ *
+ * @internal
+ * @param spans - the spans that contain the child elements
+ * @param childrenArray - an array of child items (items that have spans) to test for inclusion in the spans
+ * @returns - an IterableIterator of child items that are included in any span in the `spans` iterator
+ */
+export function* fastGetChildren<Spanned extends { span: DocumentSpan }>(
+  spans: Iterator<DocumentSpan>,
+  childrenArray: Spanned[]
+): Generator<Spanned> {
+  let curSpan = spans.next();
+
+  // Need to exit early if there are no spans.
+  if (curSpan.done) {
+    return;
+  }
+
+  // Skip directly to the first child that could be contained by the first span.
+  const children = iteratorFromFirstMatchBinarySearch(curSpan.value as DocumentSpan, childrenArray);
+  let curChild = children.next();
+
+  while (!(curChild.done || curSpan.done)) {
+    if (contains(curSpan.value, curChild.value.span)) {
+      // The span is contained, so yield the current child and advance it.
+      yield curChild.value;
+      curChild = children.next();
+    } else if (curSpan.value.offset + curSpan.value.length <= curChild.value.span.offset) {
+      // A span covers the characters [offset, offset + length), so a child that starts at or after the end of the
+      // current span lies outside of it. Advance the span and keep the child, so that it is still tested against the
+      // next span (which may begin exactly where this one ends).
+      curSpan = spans.next();
+    } else {
+      // The current child was not contained in the current span, so advance to the next child.
+      curChild = children.next();
+    }
+  }
+}
+
+/**
+ * Upgrades a REST-level document line to the convenience layer shape by attaching a `words` method to it. The input
+ * object is modified in place and returned.
+ *
+ * @param generated - a REST-level DocumentLine
+ * @param page - the page where the DocumentLine appeared
+ * @returns a convenience layer DocumentLine
+ */
+function toDocumentLineFromGenerated(
+  generated: GeneratedDocumentLine,
+  page: GeneratedDocumentPage
+): DocumentLine {
+  const line = generated as DocumentLine;
+
+  // Each call to `words` starts a new lazy search through the words of the page for those within the line's spans.
+  line.words = () => fastGetChildren(iterFrom(line.spans, 0), page.words);
+
+  // Hide the method from property enumeration.
+  Object.defineProperty(line, "words", {
+    enumerable: false,
+  });
+
+  return line;
+}
+
 /**
  * The state of an analysis operation, which will eventually produce the result type that corresponds to the model.
  */
@@ -192,7 +425,7 @@ export function toAnalyzeResultFromGenerated<
     apiVersion: result.apiVersion as FormRecognizerApiVersion,
     modelId: result.modelId,
     content: result.content,
-    pages: result.pages,
+    pages: result.pages.map((page) => toDocumentPageFromGenerated(page)),
     tables: result.tables ?? [],
     keyValuePairs: result.keyValuePairs ?? [],
     entities: result.entities ?? [],

diff --git a/sdk/formrecognizer/ai-form-recognizer/src/models/LayoutResult.ts b/sdk/formrecognizer/ai-form-recognizer/src/models/LayoutResult.ts
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-import { DocumentPage, DocumentStyle, DocumentTable } from "../generated";
-import { AnalyzeResult } from "../lro/analyze";
+import { DocumentStyle, DocumentTable } from "../generated";
+import { AnalyzeResult, DocumentPage, toDocumentPageFromGenerated } from "../lro/analyze";
 
 /**
  * Extract from an AnalyzeResult the fields that are produced from layout analysis.
@@ -12,7 +12,7 @@ export function toLayoutResult(analyzeResult: AnalyzeResult<unknown>): LayoutRes
   const { pages, tables, styles } = analyzeResult;
 
   return {
-    pages,
+    pages: pages.map(toDocumentPageFromGenerated),
     tables,
     styles,
   };