Skip to content

Commit

Permalink
Allow specifying custom match logic in PDFFindController
Browse files Browse the repository at this point in the history
This patch allows embedders of PDF.js to provide custom match
logic for searching in PDFs. This is done by subclassing the
PDFFindController class and overriding the `match` method.

`match` is called once per PDF page, receives as parameters the
search query, the page contents, and the page index, and returns
an array of { index, length } objects representing the search
results.
  • Loading branch information
nicolo-ribaudo committed Aug 2, 2024
1 parent b80e552 commit 2847d84
Show file tree
Hide file tree
Showing 2 changed files with 154 additions and 63 deletions.
87 changes: 85 additions & 2 deletions test/unit/pdf_find_controller_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ class MockLinkService extends SimpleLinkService {

async function initPdfFindController(
filename,
updateMatchesCountOnProgress = true
updateMatchesCountOnProgress = true,
matcher = undefined
) {
const loadingTask = getDocument(
buildGetDocumentParams(filename || tracemonkeyFileName, {
Expand All @@ -65,7 +66,13 @@ async function initPdfFindController(
const linkService = new MockLinkService();
linkService.setDocument(pdfDocument);

const pdfFindController = new PDFFindController({
let FindControllerClass = PDFFindController;
if (matcher !== undefined) {
FindControllerClass = class extends PDFFindController {};
FindControllerClass.prototype.match = matcher;
}

const pdfFindController = new FindControllerClass({
linkService,
eventBus,
updateMatchesCountOnProgress,
Expand Down Expand Up @@ -1054,4 +1061,80 @@ describe("pdf_find_controller", function () {
const { eventBus } = await initPdfFindController();
await testOnFind({ eventBus });
});

// Tests for a PDFFindController subclass whose `match` method is replaced by
// a jasmine spy (passed as the third argument of `initPdfFindController`).
describe("custom matcher", () => {
// Checks that the matcher is invoked once per page with
// (query, pageContent, pageIndex).
it("calls to the matcher with the right arguments", async () => {
const QUERY = "Foo bar";

// Report exactly one (dummy) match for every call, so that each page is
// expected to contain a single match.
const spy = jasmine
.createSpy("custom find matcher")
.and.callFake(() => [{ index: 0, length: 1 }]);

// `null` selects the default test PDF; `false` is passed as
// `updateMatchesCountOnProgress`.
const { eventBus, pdfFindController } = await initPdfFindController(
null,
false,
spy
);

// Number of pages the test expects in the default PDF; must agree with the
// length of `matchesPerPage` below.
const PAGES_COUNT = 14;

await testSearch({
eventBus,
pdfFindController,
state: { query: QUERY },
selectedMatch: { pageIndex: 0, matchIndex: 0 },
matchesPerPage: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
});

expect(spy).toHaveBeenCalledTimes(PAGES_COUNT);

// The assertions below assume the i-th call corresponds to page i, i.e.
// that pages are processed in order.
for (let i = 0; i < PAGES_COUNT; i++) {
const args = spy.calls.argsFor(i);
expect(args[0]).withContext(`page ${i}`).toBe(QUERY);
expect(args[2]).withContext(`page ${i}`).toBe(i);
}

// Spot-check the second argument (the page text) on a few pages.
expect(spy.calls.argsFor(0)[1]).toMatch(/^Trace-based /);
expect(spy.calls.argsFor(1)[1]).toMatch(/^Hence, recording and /);
expect(spy.calls.argsFor(12)[1]).toMatch(/Figure 12. Fraction of time /);
expect(spy.calls.argsFor(13)[1]).toMatch(/^not be interpreted as /);
});

// Checks that the per-page match counts reported by the controller are the
// ones produced by the matcher.
it("uses the results returned by the custom matcher", async () => {
const QUERY = "Foo bar";

// The spy returns `undefined` by default and a fixed list of matches only
// for page indexes 0, 2 and 13; pages with an `undefined` result are
// expected to have zero matches (see `matchesPerPage` below).
// prettier-ignore
const spy = jasmine.createSpy("custom find matcher")
.and.returnValue(undefined)
.withArgs(QUERY, jasmine.anything(), 0)
.and.returnValue([
{ index: 20, length: 3 },
{ index: 50, length: 8 },
])
.withArgs(QUERY, jasmine.anything(), 2)
.and.returnValue([
{ index: 7, length: 19 }
])
.withArgs(QUERY, jasmine.anything(), 13)
.and.returnValue([
{ index: 50, length: 2 },
{ index: 54, length: 9 },
{ index: 80, length: 4 },
]);

const { eventBus, pdfFindController } = await initPdfFindController(
null,
false,
spy
);

// 2 matches on page 0, 1 on page 2, 3 on page 13, none elsewhere.
await testSearch({
eventBus,
pdfFindController,
state: { query: QUERY },
selectedMatch: { pageIndex: 0, matchIndex: 0 },
matchesPerPage: [2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3],
});
});
});
});
130 changes: 69 additions & 61 deletions web/pdf_find_controller.js
Original file line number Diff line number Diff line change
Expand Up @@ -670,37 +670,6 @@ class PDFFindController {
return true;
}

/**
 * Collects all matches of `query` in `pageContent` and stores them in
 * `this._pageMatches[pageIndex]` (positions) and
 * `this._pageMatchesLength[pageIndex]` (lengths). Both arrays are reset to
 * empty on every call, even when there is no query.
 *
 * @param {RegExp|null} query - The compiled search expression, or a falsy
 *   value when there is nothing to search for. The loop below relies on
 *   `exec` advancing through the text, so the RegExp is expected to be
 *   global (`g` flag).
 * @param {boolean} entireWord - When truthy, matches rejected by
 *   `#isEntireWord` are skipped.
 * @param {number} pageIndex - Index used to select the per-page arrays.
 * @param {string} pageContent - The page text that is searched.
 */
#calculateRegExpMatch(query, entireWord, pageIndex, pageContent) {
const matches = (this._pageMatches[pageIndex] = []);
const matchesLength = (this._pageMatchesLength[pageIndex] = []);
if (!query) {
// The query can be empty because some chars like diacritics could have
// been stripped out.
return;
}
const diffs = this._pageDiffs[pageIndex];
let match;
while ((match = query.exec(pageContent)) !== null) {
if (
entireWord &&
!this.#isEntireWord(pageContent, match.index, match[0].length)
) {
continue;
}

// Convert the position/length found in `pageContent` using the page's
// diffs (presumably back to offsets in the original, non-normalized text;
// confirm against `getOriginalIndex`).
const [matchPos, matchLen] = getOriginalIndex(
diffs,
match.index,
match[0].length
);

// Matches whose converted length is 0 are dropped.
if (matchLen) {
matches.push(matchPos);
matchesLength.push(matchLen);
}
}
}

#convertToRegExpString(query, hasDiacritics) {
const { matchDiacritics } = this.#state;
let isUnicode = false;
Expand Down Expand Up @@ -771,13 +740,65 @@ class PDFFindController {
return [isUnicode, query];
}

#calculateMatch(pageIndex) {
let query = this.#query;
/**
 * Computes the matches for one page: calls `this.match` with the current
 * query and the page text, converts every returned `{ index, length }` entry
 * with `getOriginalIndex`, stores the results in
 * `this._pageMatches[pageIndex]` / `this._pageMatchesLength[pageIndex]`, and
 * then updates highlighting, pending navigation and the match counters.
 *
 * `match` is awaited, so it may return either a plain value or a promise.
 *
 * @param {number} pageIndex - Index of the page to process.
 * @returns {Promise<undefined>} Settles once the page has been processed.
 */
async #calculateMatch(pageIndex) {
const query = this.#query;
if (query.length === 0) {
return; // Do nothing: the matches should be wiped out already.
}
const { caseSensitive, entireWord } = this.#state;
const pageContent = this._pageContents[pageIndex];
const matcherResult = await this.match(query, pageContent, pageIndex);

// The per-page result arrays are reset only after `match` has settled.
const matches = (this._pageMatches[pageIndex] = []);
const matchesLength = (this._pageMatchesLength[pageIndex] = []);
const diffs = this._pageDiffs[pageIndex];

// A nullish result (e.g. `undefined`) is treated as "no matches" thanks to
// the optional call; entries whose converted length is 0 are dropped.
matcherResult?.forEach(({ index, length }) => {
const [matchPos, matchLen] = getOriginalIndex(diffs, index, length);
if (matchLen) {
matches.push(matchPos);
matchesLength.push(matchLen);
}
});

// When `highlightAll` is set, ensure that the matches on previously
// rendered (and still active) pages are correctly highlighted.
if (this.#state.highlightAll) {
this.#updatePage(pageIndex);
}
// If this is the page recorded in `_resumePageIdx`, clear the marker and
// call `#nextPageMatch()` (presumably to continue a search that was
// waiting for this page; confirm against `#nextPageMatch`).
if (this._resumePageIdx === pageIndex) {
this._resumePageIdx = null;
this.#nextPageMatch();
}

// Update the match count.
const pageMatchesCount = this._pageMatches[pageIndex].length;
this._matchesCountTotal += pageMatchesCount;
if (this.#updateMatchesCountOnProgress) {
if (pageMatchesCount > 0) {
this.#updateUIResultsCount();
}
} else if (++this.#visitedPagesCount === this._linkService.pagesCount) {
// For example, in GeckoView we want to have only the final update because
// the Java side provides only one object to update the counts.
this.#updateUIResultsCount();
}
}

/**
* @typedef {Object} SingleFindMatch
* @property {number} index - The start of the matched text in the page's string

Check failure on line 789 in web/pdf_find_controller.js

View workflow job for this annotation

GitHub Actions / Lint (lts/*)

This line has a comment length of 82. Maximum allowed is 80
* contents.
* @property {number} length - The length of the matched text.
*/

/**
* @param {string | string[]} query - The search query.
* @param {string} pageContent - The text content of the page to search in.
* @param {number} pageIndex - The index of the page that is being processed.
* @returns {Promise<SingleFindMatch[]> | SingleFindMatch[] | undefined} An
* array of matches in the provided page.
*/
match(query, pageContent, pageIndex) {
const hasDiacritics = this._hasDiacritics[pageIndex];

let isUnicode = false;
Expand All @@ -799,34 +820,22 @@ class PDFFindController {
})
.join("|");
}
if (!query) {
return undefined;
}

const { caseSensitive, entireWord } = this.#state;
const flags = `g${isUnicode ? "u" : ""}${caseSensitive ? "" : "i"}`;
query = query ? new RegExp(query, flags) : null;

this.#calculateRegExpMatch(query, entireWord, pageIndex, pageContent);
query = new RegExp(query, flags);

// When `highlightAll` is set, ensure that the matches on previously
// rendered (and still active) pages are correctly highlighted.
if (this.#state.highlightAll) {
this.#updatePage(pageIndex);
}
if (this._resumePageIdx === pageIndex) {
this._resumePageIdx = null;
this.#nextPageMatch();
}

// Update the match count.
const pageMatchesCount = this._pageMatches[pageIndex].length;
this._matchesCountTotal += pageMatchesCount;
if (this.#updateMatchesCountOnProgress) {
if (pageMatchesCount > 0) {
this.#updateUIResultsCount();
const matches = [];
for (const { index, 0: match } of pageContent.matchAll(query)) {
if (entireWord && !this.#isEntireWord(pageContent, index, match.length)) {
continue;
}
} else if (++this.#visitedPagesCount === this._linkService.pagesCount) {
// For example, in GeckoView we want to have only the final update because
// the Java side provides only one object to update the counts.
this.#updateUIResultsCount();
matches.push({ index, length: match.length });
}
return matches;
}

#extractText() {
Expand Down Expand Up @@ -930,10 +939,9 @@ class PDFFindController {
continue;
}
this._pendingFindMatches.add(i);
this._extractTextPromises[i].then(() => {
this._pendingFindMatches.delete(i);
this.#calculateMatch(i);
});
this._extractTextPromises[i]
.then(() => this.#calculateMatch(i))
.finally(() => this._pendingFindMatches.delete(i));
}
}

Expand Down

0 comments on commit 2847d84

Please sign in to comment.