From f62f537d9d7c97ca5f489a77c213c7d28cbf6ac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Glorieux?= Date: Thu, 15 Jun 2023 12:52:59 +0200 Subject: [PATCH] Import html rich text as TEI elements Tested with malformed html in Zotero fields. --- TEI.js | 98 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 22 deletions(-) diff --git a/TEI.js b/TEI.js index 16e903f304..f6dd744ff7 100644 --- a/TEI.js +++ b/TEI.js @@ -1,13 +1,11 @@ { "translatorID": "032ae9b7-ab90-9205-a479-baf81f49184a", - "translatorType": 2, "label": "TEI", "creator": "Stefan Majewski", "target": "xml", "minVersion": "4.0.27", - "maxVersion": null, + "maxVersion": "", "priority": 25, - "inRepository": true, "configOptions": { "dataMode": "xml/dom", "getCollections": "true" @@ -19,7 +17,9 @@ "Full TEI Document": false, "Export Collections": false }, - "lastUpdated": "2022-09-30 10:56:50" + "inRepository": true, + "translatorType": 2, + "lastUpdated": "2023-06-15 10:46:23" } // ******************************************************************** @@ -59,17 +59,27 @@ // Zotero.addOption("exportNotes", false); // Zotero.addOption("generateXMLIds", true); -var ns = { +const ns = { tei: "http://www.tei-c.org/ns/1.0", xml: "http://www.w3.org/XML/1998/namespace" }; -var exportedXMLIds = {}; -var generatedItems = {}; -var allItems = {}; +const exportedXMLIds = {}; +const generatedItems = {}; +const allItems = {}; + + +// build one time +const xmlparser = new DOMParser(); +const xmlser = new XMLSerializer(); -// replace formatting with TEI tags +/** + * Replace formatting with TEI tags. + * [2023-06 FG] Legacy, @see appendXML(), to have formatting as element node + * @param {*} title + * @returns + */ function replaceFormatting(title) { var titleText = title; // italics @@ -94,6 +104,50 @@ function replaceFormatting(title) { return titleText; } +/** + * Append possible rich text html as TEI element () + * @param {*} node + * @param {*} html + */ +function appendXML (node, html) { + if (!html) return; + // import html as dom and export is as xml to avoid xml malformations + let dom = xmlparser.parseFromString(html, "text/html"); + html = xmlser.serializeToString(dom.getElementsByTagName("body").item(0)); + // transform html to tei, xslt not available + // https://forums.zotero.org/discussion/comment/344940/ + // tags supported by Zotero + // https://www.zotero.org/support/kb/rich_text_bibliography + let xml = html + // changing namespace of root element + .replace(/http:\/\/www.w3.org\/1999\/xhtml/g, 'http://www.tei-c.org/ns/1.0') + // bold + .replace(//g, '') + .replace(/<\/b>/g, '') + // italics + .replace(//g, '') + .replace(/<\/i>/g, '') + // nocase + .replace(/(.*?)<\/span>/g, '$1') + // seems no more supported in Zotero desk client + // small-caps + .replace(/(.*?)<\/span>/g, '$1') + // subscript + .replace(//g, '') + .replace(/<\/sup>/g, '') + // superscript + .replace(//g, '') + .replace(/<\/sup>/g, '') + ; + dom = xmlparser.parseFromString(xml, "text/xml"); + const children = dom.documentElement.childNodes + for(let i = 0, len = children.length; i < len; i++) { + const child = children[i]; + const imported = node.ownerDocument.importNode(child, true); + node.appendChild(imported); + } +} + function genXMLId(item) { // use Better BibTeX for Zotero citation key if available if (item.extra) { @@ -225,9 +279,7 @@ function generateItem(item, teiDoc) { var analyticTitle = teiDoc.createElementNS(ns.tei, "title"); analyticTitle.setAttribute("level", "a"); analytic.appendChild(analyticTitle); - if (item.title) { - analyticTitle.appendChild(teiDoc.createTextNode(replaceFormatting(item.title))); - } + appendXML(analyticTitle, item.title); // A DOI is presumably for the article, not the journal. if (item.DOI) { idno = teiDoc.createElementNS(ns.tei, "idno"); @@ -246,7 +298,7 @@ function generateItem(item, teiDoc) { else { pubTitle.setAttribute("level", "m"); } - pubTitle.appendChild(teiDoc.createTextNode(replaceFormatting(publicationTitle))); + appendXML(pubTitle, publicationTitle); monogr.appendChild(pubTitle); } @@ -263,7 +315,7 @@ function generateItem(item, teiDoc) { if (item.title) { title = teiDoc.createElementNS(ns.tei, "title"); title.setAttribute("level", "m"); - title.appendChild(teiDoc.createTextNode(replaceFormatting(item.title))); + appendXML(title, item.title); monogr.appendChild(title); } else if (!item.conferenceName) { @@ -291,7 +343,7 @@ function generateItem(item, teiDoc) { if (item.conferenceName) { var conferenceName = teiDoc.createElementNS(ns.tei, "title"); conferenceName.setAttribute("type", "conferenceName"); - conferenceName.appendChild(teiDoc.createTextNode(replaceFormatting(item.conferenceName))); + appendXML(conferenceName, item.conferenceName); monogr.appendChild(conferenceName); } @@ -304,14 +356,14 @@ function generateItem(item, teiDoc) { if (item.series) { title = teiDoc.createElementNS(ns.tei, "title"); title.setAttribute("level", "s"); - title.appendChild(teiDoc.createTextNode(replaceFormatting(item.series))); + appendXML(title, item.series); series.appendChild(title); } if (item.seriesTitle) { var seriesTitle = teiDoc.createElementNS(ns.tei, "title"); seriesTitle.setAttribute("level", "s"); seriesTitle.setAttribute("type", "alternative"); - seriesTitle.appendChild(teiDoc.createTextNode(replaceFormatting(item.seriesTitle))); + appendXML(seriesTitle, item.seriesTitle); series.appendChild(seriesTitle); } if (item.seriesText) { @@ -581,9 +633,8 @@ function doExport() { // Initialize XML Doc - var parser = new DOMParser(); var teiDoc // - = parser.parseFromString('Exported from Zotero

unpublished

Generated from Zotero database

', 'application/xml'); + = xmlparser.parseFromString('Exported from Zotero

unpublished

Generated from Zotero database

', 'application/xml'); var item = null; while (item = Zotero.nextItem()) { // eslint-disable-line no-cond-assign @@ -638,11 +689,14 @@ function doExport() { outputElement = listBibls[0]; } else { - outputElement = teiDoc.createElement("empty"); + outputElement = teiDoc.createElement("empty"); } // write to file. Zotero.write('\n'); - var serializer = new XMLSerializer(); - Zotero.write(serializer.serializeToString(outputElement)); + Zotero.write(xmlser.serializeToString(outputElement)); } +/** BEGIN TEST CASES **/ +var testCases = [ +] +/** END TEST CASES **/