When updating, write the xref table in the same format as the previou…

…s one (bug 1878916) The specs are unclear about what kind of xref table format must be used. By checking the validity of some PDFs in Acrobat's preflight tool, we can guess that using the same format is the correct approach. The PDF in the mentioned bug, after having been changed, wasn't correctly displayed in either Chrome or Acrobat: it's now fixed.
mozilla · Feb 13, 2024 · 2133da1 · 2133da1
1 parent e60329c
commit 2133da1
Show file tree

Hide file tree

Showing 5 changed files with 204 additions and 77 deletions.
diff --git a/src/core/core_utils.js b/src/core/core_utils.js
@@ -611,6 +611,19 @@ function getRotationMatrix(rotation, width, height) {
   }
 }
 
+/**
+ * Get the number of bytes to use to represent the given non-negative integer.
+ * If x is zero, the function returns 0 which means that we don't need to waste
+ * a byte to represent it.
+ * @param {number} x - a non-negative integer.
+ * @returns {number} the byte count, i.e. ceil(ceil(log2(1 + x)) / 8).
+ */
+function getSizeInBytes(x) {
+  // n bits are required for numbers up to 2^n - 1.
+  // So for a number x, we need ceil(log2(1 + x)) bits, rounded up to bytes.
+  return Math.ceil(Math.ceil(Math.log2(1 + x)) / 8);
+}
+
 export {
   arrayBuffersToBytes,
   codePointIter,
@@ -622,6 +635,7 @@ export {
   getLookupTableFactory,
   getNewAnnotationsMap,
   getRotationMatrix,
+  getSizeInBytes,
   isAscii,
   isWhiteSpace,
   log2,

diff --git a/src/core/worker.js b/src/core/worker.js
@@ -35,7 +35,7 @@ import {
   getNewAnnotationsMap,
   XRefParseException,
 } from "./core_utils.js";
-import { Dict, Ref } from "./primitives.js";
+import { Dict, isDict, Ref } from "./primitives.js";
 import { LocalPdfManager, NetworkPdfManager } from "./pdf_manager.js";
 import { AnnotationFactory } from "./annotation.js";
 import { clearGlobalCaches } from "./cleanup_helper.js";
@@ -726,6 +726,8 @@ class WorkerMessageHandler {
           acroFormRef,
           acroForm,
           xfaData,
+          // Use the same kind of XRef as the previous one.
+          useXrefStream: isDict(xref.topDict, "XRef"),
         }).finally(() => {
           xref.resetNewTemporaryRef();
         });

diff --git a/src/core/writer.js b/src/core/writer.js
@@ -18,12 +18,14 @@ import { Dict, isName, Name, Ref } from "./primitives.js";
 import {
   escapePDFName,
   escapeString,
+  getSizeInBytes,
   numberToString,
   parseXFAPath,
 } from "./core_utils.js";
 import { SimpleDOMNode, SimpleXMLParser } from "./xml_parser.js";
 import { BaseStream } from "./base_stream.js";
 import { calculateMD5 } from "./crypto.js";
+import { Stream } from "./stream.js";
 
 async function writeObject(ref, obj, buffer, { encrypt = null }) {
   const transform = encrypt?.createCipherTransform(ref.num, ref.gen);
@@ -281,6 +283,112 @@ function updateXFA({ xfaData, xfaDatasetsRef, newRefs, xref }) {
   newRefs.push({ ref: xfaDatasetsRef, data });
 }
 
+async function getXRefTable(xrefInfo, baseOffset, newRefs, newXref, buffer) { // Pushes a classic xref table, the trailer dict and the startxref/EOF footer onto buffer.
+  buffer.push("xref\n");
+  const indexes = getIndexes(newRefs); // Flat list of (first object number, count) pairs.
+  let indexesPosition = 0;
+  for (const { ref, data } of newRefs) {
+    if (ref.num === indexes[indexesPosition]) { // This ref opens the next subsection: emit its header line.
+      buffer.push(
+        `${indexes[indexesPosition]} ${indexes[indexesPosition + 1]}\n`
+      );
+      indexesPosition += 2; // Move on to the next (first, count) pair.
+    }
+    // The EOL is \r\n to make sure that every entry is exactly 20 bytes long.
+    // (see 7.5.4 - Cross-Reference Table).
+    buffer.push(
+      `${baseOffset.toString().padStart(10, "0")} ${Math.min(ref.gen, 0xffff).toString().padStart(5, "0")} n\r\n`
+    );
+    baseOffset += data.length; // The next entry's offset is this one plus the length of this data.
+  }
+  computeIDs(baseOffset, xrefInfo, newXref); // May set ID on newXref before it is serialized below.
+  buffer.push("trailer\n");
+  await writeDict(newXref, buffer);
+  buffer.push("\nstartxref\n", baseOffset.toString(), "\n%%EOF\n"); // baseOffset is now the initial offset plus the length of every data entry.
+}
+
+function getIndexes(newRefs) { // Returns a flat array [first, count, first, count, ...] describing runs of consecutive ref.num values, in iteration order.
+  const indexes = [];
+  for (const { ref } of newRefs) {
+    if (ref.num === indexes.at(-2) + indexes.at(-1)) { // True when ref.num is first + count of the last run; on an empty array the sum is NaN, so this is false.
+      indexes[indexes.length - 1] += 1; // Extend the current run by one.
+    } else {
+      indexes.push(ref.num, 1); // Start a new run of length 1.
+    }
+  }
+  return indexes;
+}
+
+async function getXRefStreamTable( // Writes the xref data as a stream object (with newXref as its dict) plus the startxref/EOF footer onto buffer.
+  xrefInfo,
+  baseOffset,
+  newRefs,
+  newXref,
+  buffer
+) {
+  const xrefTableData = [];
+  let maxOffset = 0;
+  let maxGen = 0;
+  for (const { ref, data } of newRefs) {
+    maxOffset = Math.max(maxOffset, baseOffset); // Largest offset seen; used below to size the offset field.
+    const gen = Math.min(ref.gen, 0xffff); // Generation is capped at 0xffff.
+    maxGen = Math.max(maxGen, gen);
+    xrefTableData.push([1, baseOffset, gen]); // One [type, offset, generation] triple per ref; type is always 1 here.
+    baseOffset += data.length;
+  }
+  newXref.set("Index", getIndexes(newRefs));
+  const offsetSize = getSizeInBytes(maxOffset);
+  const maxGenSize = getSizeInBytes(maxGen); // Evaluates to 0 when every generation is 0.
+  const sizes = [1, offsetSize, maxGenSize]; // Byte widths of the three fields of an entry.
+  newXref.set("W", sizes);
+  computeIDs(baseOffset, xrefInfo, newXref);
+
+  const structSize = sizes.reduce((a, x) => a + x, 0); // Bytes per entry.
+  const data = new Uint8Array(structSize * xrefTableData.length);
+  const stream = new Stream(data);
+  stream.dict = newXref;
+
+  let offset = 0;
+  for (const [type, objOffset, gen] of xrefTableData) { // Serialize each entry field by field into data.
+    offset = writeInt(type, sizes[0], offset, data);
+    offset = writeInt(objOffset, sizes[1], offset, data);
+    offset = writeInt(gen, sizes[2], offset, data);
+  }
+
+  await writeObject(xrefInfo.newRef, stream, buffer, {}); // Empty options object: no encrypt value is passed.
+  buffer.push("startxref\n", baseOffset.toString(), "\n%%EOF\n");
+}
+
+function computeIDs(baseOffset, xrefInfo, newXref) { // Sets the ID entry on newXref, but only when xrefInfo.fileIds is a non-empty array.
+  if (Array.isArray(xrefInfo.fileIds) && xrefInfo.fileIds.length > 0) {
+    const md5 = computeMD5(baseOffset, xrefInfo);
+    newXref.set("ID", [xrefInfo.fileIds[0], md5]); // The first id is kept; the second is the value returned by computeMD5.
+  }
+}
+
+function getTrailerDict(xrefInfo, newRefs, useXrefStream) { // Builds and returns the Dict holding Prev, Size and the optional Root/Info/Encrypt entries (plus Type when useXrefStream is set).
+  const newXref = new Dict(null);
+  newXref.set("Prev", xrefInfo.startXRef);
+  const refForXrefTable = xrefInfo.newRef;
+  if (useXrefStream) {
+    newRefs.push({ ref: refForXrefTable, data: "" }); // NOTE: mutates the caller's array; adds an entry with empty data for the xref stream's own ref.
+    newXref.set("Size", refForXrefTable.num + 1);
+    newXref.set("Type", Name.get("XRef"));
+  } else {
+    newXref.set("Size", refForXrefTable.num); // No entry is added to newRefs in this mode, and there is no + 1.
+  }
+  if (xrefInfo.rootRef !== null) {
+    newXref.set("Root", xrefInfo.rootRef);
+  }
+  if (xrefInfo.infoRef !== null) {
+    newXref.set("Info", xrefInfo.infoRef);
+  }
+  if (xrefInfo.encryptRef !== null) {
+    newXref.set("Encrypt", xrefInfo.encryptRef);
+  }
+  return newXref;
+}
+
 async function incrementalUpdate({
   originalData,
   xrefInfo,
@@ -293,6 +401,7 @@ async function incrementalUpdate({
   acroFormRef = null,
   acroForm = null,
   xfaData = null,
+  useXrefStream = false,
 }) {
   await updateAcroform({
     xref,
@@ -314,9 +423,6 @@ async function incrementalUpdate({
     });
   }
 
-  const newXref = new Dict(null);
-  const refForXrefTable = xrefInfo.newRef;
-
   let buffer, baseOffset;
   const lastByte = originalData.at(-1);
   if (lastByte === /* \n */ 0x0a || lastByte === /* \r */ 0x0d) {
@@ -328,60 +434,23 @@ async function incrementalUpdate({
     baseOffset = originalData.length + 1;
   }
 
-  newXref.set("Size", refForXrefTable.num + 1);
-  newXref.set("Prev", xrefInfo.startXRef);
-  newXref.set("Type", Name.get("XRef"));
-
-  if (xrefInfo.rootRef !== null) {
-    newXref.set("Root", xrefInfo.rootRef);
-  }
-  if (xrefInfo.infoRef !== null) {
-    newXref.set("Info", xrefInfo.infoRef);
-  }
-  if (xrefInfo.encryptRef !== null) {
-    newXref.set("Encrypt", xrefInfo.encryptRef);
-  }
-
-  // Add a ref for the new xref and sort them
-  newRefs.push({ ref: refForXrefTable, data: "" });
+  const newXref = getTrailerDict(xrefInfo, newRefs, useXrefStream);
   newRefs = newRefs.sort(
     (a, b) => /* compare the refs */ a.ref.num - b.ref.num
   );
-
-  const xrefTableData = [[0, 1, 0xffff]];
-  const indexes = [0, 1];
-  let maxOffset = 0;
-  for (const { ref, data } of newRefs) {
-    maxOffset = Math.max(maxOffset, baseOffset);
-    xrefTableData.push([1, baseOffset, Math.min(ref.gen, 0xffff)]);
-    baseOffset += data.length;
-    indexes.push(ref.num, 1);
+  for (const { data } of newRefs) {
     buffer.push(data);
   }
 
-  newXref.set("Index", indexes);
-
-  if (Array.isArray(xrefInfo.fileIds) && xrefInfo.fileIds.length > 0) {
-    const md5 = computeMD5(baseOffset, xrefInfo);
-    newXref.set("ID", [xrefInfo.fileIds[0], md5]);
-  }
-
-  const offsetSize = Math.ceil(Math.log2(maxOffset) / 8);
-  const sizes = [1, offsetSize, 2];
-  const structSize = sizes[0] + sizes[1] + sizes[2];
-  const tableLength = structSize * xrefTableData.length;
-  newXref.set("W", sizes);
-  newXref.set("Length", tableLength);
-
-  buffer.push(`${refForXrefTable.num} ${refForXrefTable.gen} obj\n`);
-  await writeDict(newXref, buffer, null);
-  buffer.push(" stream\n");
+  await (useXrefStream
+    ? getXRefStreamTable(xrefInfo, baseOffset, newRefs, newXref, buffer)
+    : getXRefTable(xrefInfo, baseOffset, newRefs, newXref, buffer));
 
-  const bufferLen = buffer.reduce((a, str) => a + str.length, 0);
-  const footer = `\nendstream\nendobj\nstartxref\n${baseOffset}\n%%EOF\n`;
-  const array = new Uint8Array(
-    originalData.length + bufferLen + tableLength + footer.length
+  const totalLength = buffer.reduce(
+    (a, str) => a + str.length,
+    originalData.length
   );
+  const array = new Uint8Array(totalLength);
 
   // Original data
   array.set(originalData);
@@ -393,16 +462,6 @@ async function incrementalUpdate({
     offset += str.length;
   }
 
-  // New xref table
-  for (const [type, objOffset, gen] of xrefTableData) {
-    offset = writeInt(type, sizes[0], offset, array);
-    offset = writeInt(objOffset, sizes[1], offset, array);
-    offset = writeInt(gen, sizes[2], offset, array);
-  }
-
-  // Add the footer
-  writeString(footer, offset, array);
-
   return array;
 }
 

diff --git a/test/unit/core_utils_spec.js b/test/unit/core_utils_spec.js
@@ -19,6 +19,7 @@ import {
   escapePDFName,
   escapeString,
   getInheritableProperty,
+  getSizeInBytes,
   isAscii,
   isWhiteSpace,
   log2,
@@ -468,4 +469,21 @@ describe("core_utils", function () {
       );
     });
   });
+
+  describe("getSizeInBytes", function () { // Covers the 0-, 1-, 2- and 3-byte ranges.
+    it("should get the size in bytes to use to represent a positive integer", function () {
+      expect(getSizeInBytes(0)).toEqual(0); // Zero needs no byte at all.
+      for (let i = 1; i <= 0xff; i++) { // Every value in the 1-byte range is checked.
+        expect(getSizeInBytes(i)).toEqual(1);
+      }
+
+      for (let i = 0x100; i <= 0xffff; i += 0x100) { // 2-byte range, sampled every 0x100 values.
+        expect(getSizeInBytes(i)).toEqual(2);
+      }
+
+      for (let i = 0x10000; i <= 0xffffff; i += 0x10000) { // 3-byte range, sampled every 0x10000 values.
+        expect(getSizeInBytes(i)).toEqual(3);
+      }
+    });
+  });
 });