From 19a52b77230d860dc01d1c06b4460e9befa3fcd4 Mon Sep 17 00:00:00 2001 From: Mathieu Lajoie Date: Wed, 10 May 2023 15:18:13 -0400 Subject: [PATCH] feat: 1494 Adds accessible semantics --- __mocks__/_failing_page.ts | 1 + src/Page/PageCanvas.spec.tsx | 12 +++++++- src/Page/PageCanvas.tsx | 21 +++++++++++-- src/Page/TextLayer.spec.tsx | 57 ++++++++++++---------------------- src/Page/TextLayer.tsx | 2 +- src/StructTree/StructTree.tsx | 35 +++++++++++++++++++++ src/StructTree/constants.ts | 58 +++++++++++++++++++++++++++++++++++ src/StructTree/index.ts | 1 + src/StructTree/types.ts | 23 ++++++++++++++ src/StructTree/utils.ts | 42 +++++++++++++++++++++++++ 10 files changed, 210 insertions(+), 42 deletions(-) create mode 100644 src/StructTree/StructTree.tsx create mode 100644 src/StructTree/constants.ts create mode 100644 src/StructTree/index.ts create mode 100644 src/StructTree/types.ts create mode 100644 src/StructTree/utils.ts diff --git a/__mocks__/_failing_page.ts b/__mocks__/_failing_page.ts index 0609673fa..1c677fcd5 100644 --- a/__mocks__/_failing_page.ts +++ b/__mocks__/_failing_page.ts @@ -12,6 +12,7 @@ export default { getAnnotations: () => new Promise((resolve, reject) => reject(new Error())), getOperatorList: () => new Promise((resolve, reject) => reject(new Error())), getTextContent: () => new Promise((resolve, reject) => reject(new Error())), + getStructTree: () => new Promise((resolve) => resolve()), getViewport: () => ({ width: 600, height: 800, diff --git a/src/Page/PageCanvas.spec.tsx b/src/Page/PageCanvas.spec.tsx index 6de544ee2..fbf3a6d43 100644 --- a/src/Page/PageCanvas.spec.tsx +++ b/src/Page/PageCanvas.spec.tsx @@ -1,6 +1,6 @@ import { beforeAll, describe, expect, it, vi } from 'vitest'; import React from 'react'; -import { render } from '@testing-library/react'; +import { render, waitFor } from '@testing-library/react'; import { pdfjs } from '../index.test'; @@ -103,5 +103,15 @@ describe('PageCanvas', () => { expect(canvasRef).toHaveBeenCalled(); expect(canvasRef).toHaveBeenCalledWith(expect.any(HTMLElement)); }); + + it('generates a struct tree inside the canvas', async () => { + renderWithContext(, { + page, + scale: 1, + }); + + const canvas = document.querySelector('canvas'); + await waitFor(() => expect(canvas?.children.length).not.toBe(0)); + }); }); }); diff --git a/src/Page/PageCanvas.tsx b/src/Page/PageCanvas.tsx index 046e3eef2..1c5327bb1 100644 --- a/src/Page/PageCanvas.tsx +++ b/src/Page/PageCanvas.tsx @@ -1,4 +1,4 @@ -import React, { useCallback, useContext, useEffect, useMemo, useRef } from 'react'; +import React, { useCallback, useContext, useEffect, useMemo, useRef, useState } from 'react'; import mergeRefs from 'merge-refs'; import invariant from 'tiny-invariant'; import warning from 'tiny-warning'; @@ -15,7 +15,8 @@ import { import { isRef } from '../shared/propTypes'; -import type { RenderParameters } from 'pdfjs-dist/types/src/display/api'; +import type { RenderParameters, StructTreeNode } from 'pdfjs-dist/types/src/display/api'; +import StructTree from '../StructTree'; const ANNOTATION_MODE = pdfjs.AnnotationMode; @@ -34,6 +35,7 @@ export default function PageCanvas(props: PageCanvasProps) { devicePixelRatio: devicePixelRatioProps, onRenderError: onRenderErrorProps, onRenderSuccess: onRenderSuccessProps, + customTextRenderer, page, renderForms, rotate, @@ -45,6 +47,17 @@ export default function PageCanvas(props: PageCanvasProps) { invariant(page, 'Attempted to render page canvas, but no page was specified.'); + const [structTree, setStructTree] = useState(null); + + useEffect(() => { + if (!customTextRenderer) { + page.getStructTree().then((tree) => { + setStructTree(tree); + }); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); + const devicePixelRatio = devicePixelRatioProps || getDevicePixelRatio(); /** @@ -169,7 +182,9 @@ export default function PageCanvas(props: PageCanvasProps) { display: 'block', userSelect: 'none', }} - /> + > + {!!structTree && } + ); } diff --git a/src/Page/TextLayer.spec.tsx b/src/Page/TextLayer.spec.tsx index fb581ad9f..5ec53fc31 100644 --- a/src/Page/TextLayer.spec.tsx +++ b/src/Page/TextLayer.spec.tsx @@ -34,6 +34,18 @@ function renderWithContext(children: React.ReactNode, context: Partial { // Loaded page let page: PDFPageProxy; @@ -43,6 +55,9 @@ describe('TextLayer', () => { let desiredTextItems: TextContent['items']; let desiredTextItems2: TextContent['items']; + let markedRenderedTextLayer: HTMLDivElement; + let unmarkedRenderedTextLayer: HTMLDivElement; + beforeAll(async () => { const pdf = await pdfjs.getDocument({ data: pdfFile.arrayBuffer }).promise; @@ -53,6 +68,9 @@ describe('TextLayer', () => { page2 = await pdf.getPage(2); const textContent2 = await page2.getTextContent(); desiredTextItems2 = textContent2.items; + + markedRenderedTextLayer = await getRenderedTextLayer(page, true); + unmarkedRenderedTextLayer = await getRenderedTextLayer(page, false); }); describe('loading', () => { @@ -139,7 +157,7 @@ describe('TextLayer', () => { const wrapper = container.firstElementChild as HTMLDivElement; const textItems = wrapper.children; - expect(textItems).toHaveLength(desiredTextItems.length + 1); + expect(textItems).toHaveLength(markedRenderedTextLayer.children.length + 1); }); it('renders text content properly given customTextRenderer', async () => { @@ -161,42 +179,7 @@ describe('TextLayer', () => { const wrapper = container.firstElementChild as HTMLDivElement; const textItems = wrapper.children; - expect(textItems).toHaveLength(desiredTextItems.length + 1); - }); - - it('maps textContent items to actual TextLayer children properly', async () => { - const { func: onRenderTextLayerSuccess, promise: onRenderTextLayerSuccessPromise } = - makeAsyncCallback(); - - const { container, rerender } = renderWithContext(, { - onRenderTextLayerSuccess, - page, - }); - - expect.assertions(1); - - await onRenderTextLayerSuccessPromise; - - const wrapper = container.firstElementChild as HTMLDivElement; - const innerHTML = wrapper.innerHTML; - - const { func: onRenderTextLayerSuccess2, promise: onRenderTextLayerSuccessPromise2 } = - makeAsyncCallback(); - - const customTextRenderer = (item: { str: string }) => item.str; - - rerender(, { - customTextRenderer, - onRenderTextLayerSuccess: onRenderTextLayerSuccess2, - page, - }); - - await onRenderTextLayerSuccessPromise2; - - const wrapper2 = container.firstElementChild as HTMLDivElement; - const innerHTML2 = wrapper2.innerHTML; - - expect(innerHTML).toEqual(innerHTML2); + expect(textItems).toHaveLength(unmarkedRenderedTextLayer.children.length + 1); }); it('calls customTextRenderer with necessary arguments', async () => { diff --git a/src/Page/TextLayer.tsx b/src/Page/TextLayer.tsx index 16cad4a5a..9d03a4ff3 100644 --- a/src/Page/TextLayer.tsx +++ b/src/Page/TextLayer.tsx @@ -184,7 +184,7 @@ export default function TextLayer() { layer.innerHTML = ''; - const textContentSource = page.streamTextContent(); + const textContentSource = page.streamTextContent({ includeMarkedContent: !customTextRenderer }); const parameters = { container: layer, diff --git a/src/StructTree/StructTree.tsx b/src/StructTree/StructTree.tsx new file mode 100644 index 000000000..5181dd962 --- /dev/null +++ b/src/StructTree/StructTree.tsx @@ -0,0 +1,35 @@ +import React, { useMemo } from 'react'; +import PropTypes from 'prop-types'; +import { getAttributes } from './utils'; +import type { StructTreeProps } from './types'; +import type { StructTreeNode } from 'pdfjs-dist/types/src/display/api'; + +export default function StructTree({ node }: StructTreeProps) { + const attributes = useMemo(() => getAttributes(node), [node]); + + const childNodes = useMemo(() => { + if ( + node.children && + !(node.children.length === 1 && node.children[0] && 'id' in node.children[0]) + ) { + return node.children.map((child, index) => ( + // Safe to use index for key as the array is bound to the pdf structure + // eslint-disable-next-line react/no-array-index-key + + )); + } + return null; + }, [node]); + + return {childNodes}; +} + +StructTree.propTypes = { + node: PropTypes.shape({ + children: PropTypes.array, + role: PropTypes.string, + alt: PropTypes.string, + lang: PropTypes.string, + id: PropTypes.string, + }).isRequired, +}; diff --git a/src/StructTree/constants.ts b/src/StructTree/constants.ts new file mode 100644 index 000000000..cb5e46ead --- /dev/null +++ b/src/StructTree/constants.ts @@ -0,0 +1,58 @@ +// From pdfjs-dist/lib/web/struct_tree_layer_builder.js +export const PDF_ROLE_TO_HTML_ROLE = { + // Document level structure types + Document: null, // There's a "document" role, but it doesn't make sense here. + DocumentFragment: null, + // Grouping level structure types + Part: 'group', + Sect: 'group', // XXX: There's a "section" role, but it's abstract. + Div: 'group', + Aside: 'note', + NonStruct: 'none', + // Block level structure types + P: null, + // H, + H: 'heading', + Title: null, + FENote: 'note', + // Sub-block level structure type + Sub: 'group', + // General inline level structure types + Lbl: null, + Span: null, + Em: null, + Strong: null, + Link: 'link', + Annot: 'note', + Form: 'form', + // Ruby and Warichu structure types + Ruby: null, + RB: null, + RT: null, + RP: null, + Warichu: null, + WT: null, + WP: null, + // List standard structure types + L: 'list', + LI: 'listitem', + LBody: null, + // Table standard structure types + Table: 'table', + TR: 'row', + TH: 'columnheader', + TD: 'cell', + THead: 'columnheader', + TBody: null, + TFoot: null, + // Standard structure type Caption + Caption: null, + // Standard structure type Figure + Figure: 'figure', + // Standard structure type Formula + Formula: null, + // standard structure type Artifact + Artifact: null, +}; + +export const HEADING_PATTERN = /^H(\d+)$/; diff --git a/src/StructTree/index.ts b/src/StructTree/index.ts new file mode 100644 index 000000000..87a0488d7 --- /dev/null +++ b/src/StructTree/index.ts @@ -0,0 +1 @@ +export { default } from './StructTree'; diff --git a/src/StructTree/types.ts b/src/StructTree/types.ts new file mode 100644 index 000000000..42760184d --- /dev/null +++ b/src/StructTree/types.ts @@ -0,0 +1,23 @@ +import { PDF_ROLE_TO_HTML_ROLE } from './constants'; + +export type PdfTagRole = keyof typeof PDF_ROLE_TO_HTML_ROLE; + +export type StructTreeNode = { + children?: StructTreeNode[]; + role?: string; + id?: string; + lang?: string; + alt?: string; +}; + +export type StructTreeProps = { + node: StructTreeNode; +}; + +export type StructTreeAttributes = { + lang?: string; + role?: string; + 'aria-level'?: number; + 'aria-label'?: string; + 'aria-owns'?: string; +}; diff --git a/src/StructTree/utils.ts b/src/StructTree/utils.ts new file mode 100644 index 000000000..439d33d02 --- /dev/null +++ b/src/StructTree/utils.ts @@ -0,0 +1,42 @@ +/* eslint-disable no-bitwise */ +/* eslint-disable prefer-destructuring */ +import { HEADING_PATTERN, PDF_ROLE_TO_HTML_ROLE } from './constants'; +import type { StructTreeAttributes, StructTreeNode, PdfTagRole } from './types'; + +export const getRoleAttributes = (node: StructTreeNode) => { + const attributes: StructTreeAttributes = {}; + if ('role' in node) { + const { role } = node; + const match = role?.match(HEADING_PATTERN); + if (match) { + attributes.role = 'heading'; + attributes['aria-level'] = Number(match[1]); + } else if (role && PDF_ROLE_TO_HTML_ROLE[role as PdfTagRole]) { + attributes.role = PDF_ROLE_TO_HTML_ROLE[role as PdfTagRole] ?? undefined; + } + } + return attributes; +}; + +export const getStandardAttributes = (node: StructTreeNode): StructTreeAttributes => { + const attributes: StructTreeAttributes = {}; + if (node.alt !== undefined) { + attributes['aria-label'] = node.alt; + } + if (node.lang !== undefined) { + attributes.lang = node.lang; + } + if (node.id !== undefined) { + attributes['aria-owns'] = node.id; + } + if (node.children?.length === 1 && node.children[0] && 'id' in node.children[0]) { + return { ...attributes, ...getStandardAttributes(node.children[0]) }; + } + return attributes; +}; + +export const getAttributes = (node: StructTreeNode) => { + if (node) { + return { ...getRoleAttributes(node), ...getStandardAttributes(node) }; + } +};