Fetch all hyperlinks in document and replace them #334

miqmago · 2018-10-11T19:02:50Z

Hi, first many thanks for this amazing library.

I'm trying to fetch all links from a document. So far, I've managed to retrieve the structure of the document using the example here: https://github.com/galkahana/HummusJS/blob/master/tests/PDFParser.js

I've found that in some documents the links appear as LiteralString values, but other documents have hyperlinks that I cannot find in any LiteralString.

Is there any way to get them all directly? I've seen here, on page 394, that Link Annotations are the objects that might contain them all, but I haven't found a way to get the Link Annotation objects from the structure of the PDF...

The final purpose is to replace them with another destination, so I've been reading here: #71 (comment). Would it be possible to replace them? Any thoughts on how to achieve it?

The text was updated successfully, but these errors were encountered:

miqmago · 2018-10-22T17:36:58Z

What I've achieved so far is to fetch the links via Annots dictionary like described here: #329 (https://github.com/galkahana/HummusJSSamples/blob/master/appending-pages-with-comments/appendWithComments.js) and here: #193

I've been reading https://github.com/galkahana/HummusJS/wiki/Embedding-pdf#low-levels and tried to copy links. It works. I also tried to replace the links with new ones, without success: the new document does not have any links at all... Any help would be really appreciated. Here is the code I'm using:

import hummus from 'hummus';

// Path of the PDF to process, read from process.argv[2].
const sourcePath = process.argv[2];
// The result is written to the source path with ".new.pdf" appended.
const pdfWriter = hummus.createWriter(`${sourcePath}.new.pdf`);

// Handles shared by the functions below: the writer's objects context (used to
// emit the Annots array), a copying context bound to the source PDF, and that
// copying context's parser for reading the source document.
const objCxt = pdfWriter.getObjectsContext();
const cpyCxt = pdfWriter.createPDFCopyingContext(sourcePath);
const cpyCxtParser = cpyCxt.getSourceDocumentParser();

// When true, linkEditor never rewrites Link annotations and copies every
// annotation through cpyCxt.copyObject instead.
const IS_ONLY_COPY = false;

/**
 * Reducer applied to each entry of a page's Annots array.
 *
 * Link annotations (unless IS_ONLY_COPY is set) are converted to a plain JS
 * dictionary whose action URI is overwritten with a fixed URL, and stored in
 * the accumulator under the annotation's source object ID. Every other
 * annotation is copied via the copying context and the result of
 * cpyCxt.copyObject is appended to `replacements.copied`.
 *
 * @param {Object} replacements - accumulator; mutated and returned.
 * @param {Object} linkObjRef - parsed PDF object for one Annots entry; it is
 *     converted with toPDFIndirectObjectReference(), so it is presumably
 *     always an indirect reference — TODO confirm for inline annotations.
 * @returns {Object} the same `replacements` accumulator.
 */
function linkEditor(replacements, linkObjRef) {
    const linkId = linkObjRef.toPDFIndirectObjectReference().getObjectID();
    const annotation = cpyCxtParser.parseNewObject(linkId).toPDFDictionary().toJSObject();
    const shouldRewrite = annotation.Subtype.value === 'Link' && !IS_ONLY_COPY;

    if (!shouldRewrite) {
        // Not a link (or copy-only mode): copy the object and remember what
        // copyObject returned so the caller can reference it in the new Annots array.
        replacements.copied = replacements.copied || [];
        replacements.copied.push(cpyCxt.copyObject(linkId));
        return replacements;
    }

    // Build a shallow copy of the annotation dictionary, expanding only the
    // action entry ("A") so that its URI can be swapped.
    const rewritten = {};
    for (const key of Object.getOwnPropertyNames(annotation)) {
        if (key !== 'A') {
            rewritten[key] = annotation[key];
            continue;
        }
        const action = annotation[key].toPDFDictionary().toJSObject();
        if (Object.keys(action).includes('URI')) {
            const newUri = action.URI.toPDFLiteralString();
            // NOTE(review): assumes `value` on the parsed literal string is
            // writable — confirm against the HummusJS parser objects.
            newUri.value = 'http://google.com';
            action.URI = newUri;
        }
        rewritten[key] = action;
    }
    replacements[linkId] = rewritten;

    return replacements;
}

/**
 * Appends every page of the source PDF to the writer. For pages that carry
 * an `Annots` entry, the annotations are first run through `linkEditor` and
 * a new `Annots` array is written from the page's 'OnPageWrite' event.
 *
 * Takes no arguments and returns nothing; it works on the file-level
 * `pdfWriter`, `objCxt`, `cpyCxt` and `cpyCxtParser` handles.
 */
function appendPDFPageFromPDFWithAnnotations() {
    // for each page
    for (let i = 0; i < cpyCxtParser.getPagesCount(); i += 1) {
        // grab page dictionary
        const pageDictionary = cpyCxtParser.parsePageDictionary(i);
        if (!pageDictionary.exists('Annots')) {
            // no annotation. append as is
            console.log(`No annotations on page ${i + 1}`);
        } else {
            console.log(`Processing links on page ${i + 1}`);
            // get the annotations array
            const linksArr = cpyCxtParser.queryDictionaryObject(pageDictionary, 'Annots').toJSArray();

            // iterate the array and transform the annotations
            const targetAnnotations = linksArr.reduce(linkEditor, {});
            const { copied } = targetAnnotations;
            // Strip the bookkeeping key from the accumulator itself, so that only
            // "source object ID -> replacement" entries reach replaceSourceObjects.
            // (The previous code deleted the property from the linkEditor function,
            // which left `copied` inside targetAnnotations.)
            delete targetAnnotations.copied;

            // `once`: the listener must fire only for the page appended just below.
            pdfWriter.getEvents().once('OnPageWrite', (event) => {
                // using the page write event, write the new annotations
                event.pageDictionaryContext.writeKey('Annots');
                objCxt.startArray();
                if (copied) {
                    copied.forEach(objectID => objCxt.writeIndirectObjectReference(objectID));
                }
                // Only register replacements when linkEditor actually produced some.
                // NOTE(review): the HummusJS wiki describes replaceSourceObjects as
                // mapping source object IDs to *target object IDs*; here the values
                // are JS dictionaries and nothing is added to the Annots array for
                // them, which likely explains the missing links — verify.
                if (Object.keys(targetAnnotations).length > 0) {
                    cpyCxt.replaceSourceObjects(targetAnnotations);
                }
                objCxt.endArray(hummus.eTokenSeparatorEndLine);
            });
            // write page. this will trigger the event
        }
        cpyCxt.appendPDFPageFromPDF(i);
    }
}


// Append every source page via appendPDFPageFromPDFWithAnnotations (which
// rewrites the Annots array on pages that have one), then end the writer.
appendPDFPageFromPDFWithAnnotations();

pdfWriter.end();

When IS_ONLY_COPY is true, it would be using objCxt.writeIndirectObjectReference(objectID), the same as https://github.com/galkahana/HummusJSSamples/blob/master/appending-pages-with-comments/appendWithComments.js

This successfully copies the links, but can't modify them.

ebdrup · 2019-05-31T13:48:06Z

@miqmago Did you ever find a way to accomplish replacing all links? I need this :-)
Also @galkahana What an awesome library, thank you so much!

miqmago · 2019-05-31T18:03:26Z

Nope, I ended up inserting links at predefined places, but this does not allow automation. Maybe with a mix of the previous code and this one, one could achieve link replacement, but I never tried:

import hummus from 'hummus';

// Path of the PDF to modify, read from process.argv[3].
const filePath = process.argv[3];
// Modify the input PDF, writing the result to "<input>.new.pdf".
const writer = hummus.createWriterToModify(filePath, {
    modifiedFilePath: `${process.argv[3]}.new.pdf`,
});
// Separate reader on the same file, used below to query page media boxes.
const reader = hummus.createReader(filePath);

// 720 x 540 = 25.4 x 19.05
// i.e. a 720 x 540 page corresponds to 25.4 x 19.05 cm; cm2Px uses the same ratio.
// NOTE(review): fw and fh are not referenced by any code visible in this snippet.
const fw = 25.4;
const fh = 19.05;

/**
 * Converts a length in centimetres to page units, using the ratio
 * 720 units per 25.4 cm.
 *
 * @param {number} cm - length in centimetres.
 * @returns {number} the length scaled by 720 / 25.4.
 */
function cm2Px(cm) {
    const unitsAcross = 720;
    const cmAcross = 25.4;
    // Multiply first, then divide, so results match the original rounding.
    const scaled = unitsAcross * cm;
    return scaled / cmAcross;
}

// Links to add, grouped per page.
//   page  - page index handed to hummus.PDFPageModifier and reader.parsePage
//   links - rectangles to turn into URL links; x, y, w and h are in centimetres
//           (they are converted with cm2Px), and y is subtracted from the
//           media box's fourth value, so it is measured downward from there
const modifications = [
    {
        page: 0,
        links: [{
            url: '<<yourlink>>',
            x: 8.66,
            y: 14.65,
            w: 7.68,
            h: 2.97,
        }],
    },
];

// For each configured page, open a page modifier, attach one URL link per
// configured rectangle, and write the modified page back.
for (const { page, links } of modifications) {
    const modifier = new hummus.PDFPageModifier(writer, page);
    // Fourth media-box value; used as the page height, which presumably
    // assumes the box starts at y = 0 — TODO confirm for cropped pages.
    const mediaBox = reader.parsePage(page).getMediaBox();
    const pHeight = mediaBox[3];

    for (const link of links) {
        // Convert from centimetres and flip y, since the configured y is
        // measured downward from pHeight.
        const left = cm2Px(link.x);
        const top = pHeight - cm2Px(link.y);
        const width = cm2Px(link.w);
        const height = cm2Px(link.h);

        modifier.startContext();
        modifier.attachURLLinktoCurrentPage(link.url, left, top, left + width, top - height);
    }

    modifier.writePage();
}

// End the modifying writer once all pages have been processed.
writer.end();

// Print a Ghostscript command the user can run to shrink the generated file;
// the command is only logged here, not executed.
console.log('Reduce file size:');
console.log(`gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4  -dPDFSETTINGS=/printer -dNOPAUSE -dQUIET -dBATCH -sOutputFile=${filePath}-small.pdf ${filePath}.new.pdf`);

miqmago · 2019-05-31T18:04:22Z

@ebdrup please let me know if you manage to do so!

ebdrup · 2019-06-11T07:25:15Z

@miqmago I didn't have time to look at this. Instead we had a person manually change all the links in the pdf sources.

This was referenced Feb 15, 2021

Annotation modification #455

Open

Annotation editing chunyenHuang/hummusRecipe#208

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Fetch all hyperlinks in document and replace them #334

Fetch all hyperlinks in document and replace them #334

miqmago commented Oct 11, 2018 •

edited

Loading

miqmago commented Oct 22, 2018 •

edited

Loading

ebdrup commented May 31, 2019

miqmago commented May 31, 2019

miqmago commented May 31, 2019

ebdrup commented Jun 11, 2019

Fetch all hyperlinks in document and replace them #334

Fetch all hyperlinks in document and replace them #334

Comments

miqmago commented Oct 11, 2018 • edited Loading

miqmago commented Oct 22, 2018 • edited Loading

ebdrup commented May 31, 2019

miqmago commented May 31, 2019

miqmago commented May 31, 2019

ebdrup commented Jun 11, 2019

miqmago commented Oct 11, 2018 •

edited

Loading

miqmago commented Oct 22, 2018 •

edited

Loading