From cfa58e3e99979fa8a9e8290e4125d0b0db27e444 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Sat, 27 Jan 2024 21:37:37 +0100 Subject: [PATCH] #4469 - Support for MHTML web page archives - Added experimental support for MHTML files - Added documentation - Remove duplicate CasXmlHandler implementation - Allow embedded images to pass through the default HTML policy - they are still blocked by the safety net though unless explicitly enabled in the settings file --- .../webanno/api/format/FormatSupport.java | 43 +-- inception/inception-dependencies/pom.xml | 13 + .../META-INF/asciidoc/user-guide.adoc | 2 + .../META-INF/asciidoc/user-guide/formats.adoc | 4 + .../XmlDocumentViewControllerImplBase.java | 58 +++- .../config/ExternalEditorPropertiesImpl.java | 30 +++ .../policy/DefaultHtmlDocumentPolicy.yaml | 20 +- .../policy/SafetyNetDocumentPolicy.java | 9 +- .../XHtmlXmlDocumentViewControllerImpl.java | 34 ++- .../src/main/ts/src/ExternalEditorFactory.ts | 12 +- .../policy/DefaultHtmlDocumentPolicyTest.java | 4 +- ...eAnnotatorHtmlAnnotationEditorFactory.java | 2 + inception/inception-io-html/pom.xml | 27 +- .../io/html/MHtmlDocumentReader.java | 106 ++++++++ .../inception/io/html/MHtmlFormatSupport.java | 131 +++++++++ .../config/HtmlSupportAutoConfiguration.java | 9 + .../io/html/dkprocore/CasXmlHandler.java | 248 ------------------ .../io/html/dkprocore/CasXmlNodeVisitor.java | 140 ++++++++++ .../io/html/dkprocore/HtmlDocumentReader.java | 112 +------- .../asciidoc/user-guide/formats-mhtml.adoc | 42 +++ .../io/xml/dkprocore/CasXmlHandler.java | 9 +- .../ukp/inception/support/io/ZipUtils.java | 39 +++ ...NamspaceDecodingContentHandlerAdapter.java | 179 +++++++++++++ .../support/xml/sanitizer/ElementAction.java | 1 - 24 files changed, 841 insertions(+), 433 deletions(-) create mode 100644 inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/MHtmlDocumentReader.java create mode 100644 
inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/MHtmlFormatSupport.java delete mode 100644 inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/CasXmlHandler.java create mode 100644 inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/CasXmlNodeVisitor.java create mode 100644 inception/inception-io-html/src/main/resources/META-INF/asciidoc/user-guide/formats-mhtml.adoc create mode 100644 inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/NamspaceDecodingContentHandlerAdapter.java diff --git a/inception/inception-api-formats/src/main/java/de/tudarmstadt/ukp/clarin/webanno/api/format/FormatSupport.java b/inception/inception-api-formats/src/main/java/de/tudarmstadt/ukp/clarin/webanno/api/format/FormatSupport.java index 7629c01b7d3..7fa9f24c549 100644 --- a/inception/inception-api-formats/src/main/java/de/tudarmstadt/ukp/clarin/webanno/api/format/FormatSupport.java +++ b/inception/inception-api-formats/src/main/java/de/tudarmstadt/ukp/clarin/webanno/api/format/FormatSupport.java @@ -30,14 +30,12 @@ import java.io.File; import java.io.FileNotFoundException; -import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Collections; import java.util.List; import java.util.Optional; import java.util.Set; -import java.util.zip.ZipFile; import org.apache.commons.io.FilenameUtils; import org.apache.commons.lang3.StringUtils; @@ -57,6 +55,7 @@ import de.tudarmstadt.ukp.clarin.webanno.model.Project; import de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument; +import de.tudarmstadt.ukp.inception.support.io.ZipUtils; import de.tudarmstadt.ukp.inception.support.xml.sanitizer.PolicyCollection; public interface FormatSupport @@ -110,50 +109,14 @@ default boolean hasResources() return false; } - default boolean isAccessibelResource(File aDocFile, String aResourcePath) + default boolean 
isAccessibleResource(File aDocFile, String aResourcePath) { return DEFAULT_PERMITTED_RESOURCE_EXTENSIONS.contains(getExtension(aResourcePath)); } default InputStream openResourceStream(File aDocFile, String aResourcePath) throws IOException { - if (!hasResources() || !isAccessibelResource(aDocFile, aResourcePath)) { - throw new FileNotFoundException("Resource not found [" + aResourcePath + "]"); - } - - if (aResourcePath.contains("..") || aResourcePath.contains("//")) { - throw new FileNotFoundException("Resource not found [" + aResourcePath + "]"); - } - - // var path = prependIfMissing(normalize(aResourcePath, true), "/"); - - ZipFile zipFile = null; - var success = false; - try { - zipFile = new ZipFile(aDocFile); - var entry = zipFile.getEntry(aResourcePath); - if (entry == null) { - throw new FileNotFoundException("Resource not found [" + aResourcePath + "]"); - } - - var finalZipFile = zipFile; - var is = new FilterInputStream(zipFile.getInputStream(entry)) - { - @Override - public void close() throws IOException - { - super.close(); - finalZipFile.close(); - } - }; - success = true; - return is; - } - finally { - if (!success && zipFile != null) { - zipFile.close(); - } - } + return ZipUtils.openResourceStream(aDocFile, aResourcePath); } /** diff --git a/inception/inception-dependencies/pom.xml b/inception/inception-dependencies/pom.xml index 2a89db2996f..ff5e8f8c79e 100644 --- a/inception/inception-dependencies/pom.xml +++ b/inception/inception-dependencies/pom.xml @@ -877,6 +877,19 @@ 1.16.1 + + + + org.apache.james + apache-mime4j-core + 0.8.9 + + + org.apache.james + apache-mime4j-dom + 0.8.9 + + diff --git a/inception/inception-doc/src/main/resources/META-INF/asciidoc/user-guide.adoc b/inception/inception-doc/src/main/resources/META-INF/asciidoc/user-guide.adoc index b44bc9451f1..e875e943bbd 100644 --- a/inception/inception-doc/src/main/resources/META-INF/asciidoc/user-guide.adoc +++ 
b/inception/inception-doc/src/main/resources/META-INF/asciidoc/user-guide.adoc @@ -256,6 +256,8 @@ include::{include-dir}formats-html.adoc[leveloffset=+2] include::{include-dir}formats-htmldoc.adoc[leveloffset=+2] +include::{include-dir}formats-mhtml.adoc[leveloffset=+2] + include::{include-dir}formats-imscwb.adoc[leveloffset=+2] // include::{include-dir}formats-lif.adoc[leveloffset=+2] diff --git a/inception/inception-doc/src/main/resources/META-INF/asciidoc/user-guide/formats.adoc b/inception/inception-doc/src/main/resources/META-INF/asciidoc/user-guide/formats.adoc index d4b85cef195..136edd8f6c6 100644 --- a/inception/inception-doc/src/main/resources/META-INF/asciidoc/user-guide/formats.adoc +++ b/inception/inception-doc/src/main/resources/META-INF/asciidoc/user-guide/formats.adoc @@ -75,6 +75,10 @@ data in a particular format. The **feature flag** column shows which flags you c | `htmldoc` | `format.html.enabled` +| <> +| `mhtml` +| `format.mhtml.enabled` + | <> | `html` | `format.html-legacy.enabled` diff --git a/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/XmlDocumentViewControllerImplBase.java b/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/XmlDocumentViewControllerImplBase.java index 9296e9d9345..0d181d52c89 100644 --- a/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/XmlDocumentViewControllerImplBase.java +++ b/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/XmlDocumentViewControllerImplBase.java @@ -17,16 +17,24 @@ */ package de.tudarmstadt.ukp.inception.externaleditor; +import static java.nio.charset.StandardCharsets.UTF_8; + import java.io.IOException; +import java.net.URLEncoder; import java.util.Optional; +import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; import 
de.tudarmstadt.ukp.clarin.webanno.api.export.DocumentImportExportService; +import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport; import de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument; import de.tudarmstadt.ukp.inception.editor.AnnotationEditorRegistry; import de.tudarmstadt.ukp.inception.externaleditor.policy.DefaultHtmlDocumentPolicy; import de.tudarmstadt.ukp.inception.externaleditor.policy.SafetyNetDocumentPolicy; +import de.tudarmstadt.ukp.inception.support.xml.NamspaceDecodingContentHandlerAdapter; import de.tudarmstadt.ukp.inception.support.xml.sanitizer.PolicyCollection; import de.tudarmstadt.ukp.inception.support.xml.sanitizer.SanitizingContentHandler; @@ -47,7 +55,7 @@ public XmlDocumentViewControllerImplBase(DefaultHtmlDocumentPolicy aDefaultPolic annotationEditorRegistry = aAnnotationEditorRegistry; } - protected ContentHandler applySanitizers(Optional aEditor, SourceDocument doc, + protected ContentHandler applySanitizers(Optional aEditor, SourceDocument aDoc, ContentHandler aCh) throws IOException { @@ -55,7 +63,7 @@ protected ContentHandler applySanitizers(Optional aEditor, SourceDocumen var ch = new SanitizingContentHandler(aCh, safetyNetPolicy.getPolicy()); // Apply format policy if it exists - var formatPolicy = formatRegistry.getFormatPolicy(doc); + var formatPolicy = formatRegistry.getFormatPolicy(aDoc); if (formatPolicy.isPresent()) { ch = new SanitizingContentHandler(ch, formatPolicy.get()); } @@ -73,6 +81,52 @@ protected ContentHandler applySanitizers(Optional aEditor, SourceDocumen return ch; } + protected ContentHandler applyHtmlResourceUrlFilter(SourceDocument aDoc, + ContentHandler aDelegate) + { + var hasResources = formatRegistry.getFormatById(aDoc.getFormat()) + .map(FormatSupport::hasResources).orElse(false); + if (!hasResources) { + return aDelegate; + } + + return new NamspaceDecodingContentHandlerAdapter(aDelegate) + { + @Override + public void startElement(String aUri, String aLocalName, String aQName, + 
Attributes aAtts) + throws SAXException + { + var atts = aAtts; + + var element = toQName(aUri, aLocalName, aQName); + + if ("a".equalsIgnoreCase(element.getLocalPart()) + || "link".equalsIgnoreCase(element.getLocalPart())) { + atts = filterResourceUrl(aAtts, "href"); + } + + if ("img".equalsIgnoreCase(element.getLocalPart())) { + atts = filterResourceUrl(aAtts, "src"); + } + + super.startElement(aUri, aLocalName, aQName, atts); + } + + private Attributes filterResourceUrl(Attributes aAtts, String attribute) + { + var attributes = new AttributesImpl(aAtts); + var index = attributes.getIndex(attribute); + if (index != -1) { + var value = attributes.getValue(index); + value = "res?resId=" + URLEncoder.encode(value, UTF_8); + attributes.setValue(index, value); + } + return attributes; + } + }; + } + private Optional getEditorPolicy(Optional aEditor) throws IOException { if (!aEditor.isPresent()) { diff --git a/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/config/ExternalEditorPropertiesImpl.java b/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/config/ExternalEditorPropertiesImpl.java index 6b6623a3550..3ba8ca23a4a 100644 --- a/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/config/ExternalEditorPropertiesImpl.java +++ b/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/config/ExternalEditorPropertiesImpl.java @@ -37,33 +37,63 @@ public boolean isBlockStyle() return blockStyle; } + public void setBlockStyle(boolean aBlockStyle) + { + blockStyle = aBlockStyle; + } + @Override public boolean isBlockImg() { return blockImg; } + public void setBlockImg(boolean aBlockImg) + { + blockImg = aBlockImg; + } + @Override public boolean isBlockEmbed() { return blockEmbed; } + public void setBlockEmbed(boolean aBlockEmbed) + { + blockEmbed = aBlockEmbed; + } + @Override public boolean isBlockAudio() 
{ return blockAudio; } + public void setBlockAudio(boolean aBlockAudio) + { + blockAudio = aBlockAudio; + } + @Override public boolean isBlockObject() { return blockObject; } + public void setBlockObject(boolean aBlockObject) + { + blockObject = aBlockObject; + } + @Override public boolean isBlockVideo() { return blockVideo; } + + public void setBlockVideo(boolean aBlockVideo) + { + blockVideo = aBlockVideo; + } } diff --git a/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/policy/DefaultHtmlDocumentPolicy.yaml b/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/policy/DefaultHtmlDocumentPolicy.yaml index 05d88bb1033..bb4917ec92c 100644 --- a/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/policy/DefaultHtmlDocumentPolicy.yaml +++ b/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/policy/DefaultHtmlDocumentPolicy.yaml @@ -24,18 +24,22 @@ match_without_namespace: true debug: true policies: - action: PASS - elements: ["html", "head", "body", "title"] + elements: ["html", "head", "body", "title"] # Content sectioning - action: PASS elements: ["address", "article", "aside", "footer", "header", "h1", "h2", "h3", "h4", "h5", "h6", "main", "section"] + # Navigation sectioning (probably not relevant for annotation) + - action: PRUNE + elements: ["nav"] # Text content - action: PASS elements: ["blockquote", "dd", "div", "dl", "dt", "figcaption", "figure", "hr", "li", "menu", "ol", "p", "pre", "ul"] - # Inline text semantics + # Links / anchors - action: SKIP elements: ["a"] + # Inline text semantics - action: PASS elements: ["abbr", "b", "bdi", "bdo", "br", "cite", "code", "data", "dfn", "em", "i", "kbd", "mark", "q", "rp", "rt", "ruby", "s", "samp", "small", @@ -63,5 +67,15 @@ policies: - action: PASS attributes: ["class"] matching: "[a-zA-Z0-9\\s,\\-_]+" - + # Images - depending on the security 
configurations, the safety net may block these + - action: PASS + elements: ["img"] + - on_elements: ["img"] + action: PASS + attributes: ["decoding", "alt", "width", "height"] + - on_elements: ["img"] + action: PASS + attributes: ["src"] + matching: "res[?]resId=.*" # Allow only access to embedded resources + \ No newline at end of file diff --git a/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/policy/SafetyNetDocumentPolicy.java b/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/policy/SafetyNetDocumentPolicy.java index 0d524e6c262..cf36d197e21 100644 --- a/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/policy/SafetyNetDocumentPolicy.java +++ b/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/policy/SafetyNetDocumentPolicy.java @@ -35,6 +35,10 @@ public class SafetyNetDocumentPolicy { static final String SAFETY_NET_POLICY_OVERRIDE_YAML = "safety-net.yaml"; + private static final String[] JAVASCRIPT_ACTIVE_ATTRIBUTES = { "href", "src", "codebase", + "cite", "background", "action", "longdesc", "profile", "classid", "data", "usemap", + "formaction", "icon", "manifest", "poster", "srcset", "archive" }; + private static final String[] JAVASCRIPT_EVENT_ATTRIBUTES = { "onafterprint", "onbeforeprint", "onbeforeunload", "onerror", "onhashchange", "onload", "onmessage", "onoffline", "ononline", "onpagehide", "onpageshow", "onpopstate", "onresize", "onstorage", @@ -95,9 +99,8 @@ private PolicyCollection makeDefaultPolicy() builder.disallowElements("video"); } - builder.disallowAttributes("href", "src", "codebase", "cite", "background", "action", - "longdesc", "profile", "classid", "data", "usemap", "formaction", "icon", - "manifest", "poster", "srcset", "archive").matching(compile("\\s*javascript:.*")) + builder.disallowAttributes(JAVASCRIPT_ACTIVE_ATTRIBUTES) // + 
.matching(compile("\\s*javascript:.*")) // .globally(); builder.disallowAttributes(JAVASCRIPT_EVENT_ATTRIBUTES).globally(); diff --git a/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/xhtml/XHtmlXmlDocumentViewControllerImpl.java b/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/xhtml/XHtmlXmlDocumentViewControllerImpl.java index 903666c2331..3258242c927 100644 --- a/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/xhtml/XHtmlXmlDocumentViewControllerImpl.java +++ b/inception/inception-external-editor/src/main/java/de/tudarmstadt/ukp/inception/externaleditor/xhtml/XHtmlXmlDocumentViewControllerImpl.java @@ -124,11 +124,11 @@ public ResponseEntity getDocument(@PathVariable("projectId") long aProje @RequestParam("editor") Optional aEditor, Principal principal) throws Exception { - SourceDocument doc = documentService.getSourceDocument(aProjectId, aDocumentId); + var doc = documentService.getSourceDocument(aProjectId, aDocumentId); - CAS cas = documentService.createOrReadInitialCas(doc); + var cas = documentService.createOrReadInitialCas(doc); - try (StringWriter out = new StringWriter()) { + try (var out = new StringWriter()) { Optional maybeXmlDocument; if (cas.getTypeSystem().getType(XmlDocument._TypeName) != null) { maybeXmlDocument = cas.select(XmlDocument.class).findFirst(); @@ -144,14 +144,16 @@ public ResponseEntity getDocument(@PathVariable("projectId") long aProje var rawHandler = XmlCas2SaxEvents.makeSerializer(out); var sanitizingHandler = applySanitizers(aEditor, doc, rawHandler); + var resourceFilteringHandler = applyHtmlResourceUrlFilter(doc, sanitizingHandler); + var finalHandler = resourceFilteringHandler; // If the CAS contains an actual HTML structure, then we send that. Mind that we do // not inject format-specific CSS then! 
if (casContainsHtml) { - XmlDocument xml = maybeXmlDocument.get(); + var xml = maybeXmlDocument.get(); startXHtmlDocument(rawHandler); - var serializer = new XmlCas2SaxEvents(xml, sanitizingHandler); + var serializer = new XmlCas2SaxEvents(xml, finalHandler); serializer.process(xml.getRoot()); endXHtmlDocument(rawHandler); @@ -168,21 +170,21 @@ public ResponseEntity getDocument(@PathVariable("projectId") long aProje if (maybeXmlDocument.isEmpty()) { // Gracefully handle the case that the CAS does not contain any XML structure at all // and show only the document text in this case. - renderTextContent(cas, sanitizingHandler); + renderTextContent(cas, finalHandler); } else { var formatPolicy = formatRegistry.getFormatPolicy(doc); var defaultNamespace = formatPolicy.flatMap(policy -> policy.getDefaultNamespace()); if (defaultNamespace.isPresent()) { - sanitizingHandler.startPrefixMapping(XMLConstants.DEFAULT_NS_PREFIX, + finalHandler.startPrefixMapping(XMLConstants.DEFAULT_NS_PREFIX, defaultNamespace.get()); } - renderXmlContent(doc, sanitizingHandler, aEditor, maybeXmlDocument.get()); + renderXmlContent(doc, finalHandler, aEditor, maybeXmlDocument.get()); if (defaultNamespace.isPresent()) { - sanitizingHandler.endPrefixMapping(XMLConstants.DEFAULT_NS_PREFIX); + finalHandler.endPrefixMapping(XMLConstants.DEFAULT_NS_PREFIX); } } rawHandler.endElement(null, null, BODY); @@ -261,7 +263,7 @@ public ResponseEntity getResource( var srcDoc = documentService.getSourceDocument(aProjectId, aDocumentId); var maybeFormatSupport = formatRegistry.getFormatById(srcDoc.getFormat()); - if (!maybeFormatSupport.isPresent()) { + if (maybeFormatSupport.isEmpty()) { return ResponseEntity.notFound().build(); } @@ -269,17 +271,23 @@ public ResponseEntity getResource( var formatSupport = maybeFormatSupport.get(); + if (!formatSupport.hasResources() + || !formatSupport.isAccessibleResource(srcDocFile, aResourceId)) { + LOG.debug("Resource [{}] for document {} not found", aResourceId, srcDoc); 
+ return ResponseEntity.notFound().build(); + } + try { var inputStream = formatSupport.openResourceStream(srcDocFile, aResourceId); - HttpHeaders httpHeaders = new HttpHeaders(); + var httpHeaders = new HttpHeaders(); return new ResponseEntity<>(new InputStreamResource(inputStream), httpHeaders, OK); } catch (FileNotFoundException e) { - LOG.error("Resource [{}] for document {} not found", aResourceId, srcDoc); + LOG.debug("Resource [{}] for document {} not found", aResourceId, srcDoc); return ResponseEntity.notFound().build(); } catch (Exception e) { - LOG.error("Unable to load resource [{}] for document {}", aResourceId, srcDoc, e); + LOG.debug("Unable to load resource [{}] for document {}", aResourceId, srcDoc, e); return ResponseEntity.notFound().build(); } } diff --git a/inception/inception-external-editor/src/main/ts/src/ExternalEditorFactory.ts b/inception/inception-external-editor/src/main/ts/src/ExternalEditorFactory.ts index 775ecbaeff5..1c005c7d882 100644 --- a/inception/inception-external-editor/src/main/ts/src/ExternalEditorFactory.ts +++ b/inception/inception-external-editor/src/main/ts/src/ExternalEditorFactory.ts @@ -189,7 +189,11 @@ export class ExternalEditorFactory implements AnnotationEditorFactory { css.onload = null resolve() } - document.getElementsByTagName('head')[0].appendChild(css) + + const headElements = document.getElementsByTagName('head') + if (headElements.length > 0) { + headElements[0].appendChild(css) + } }) } @@ -204,7 +208,11 @@ export class ExternalEditorFactory implements AnnotationEditorFactory { script.onload = null resolve() } - document.getElementsByTagName('head')[0].appendChild(script) + + const headElements = document.getElementsByTagName('head') + if (headElements.length > 0) { + document.getElementsByTagName('head')[0].appendChild(script) + } }) } diff --git a/inception/inception-external-editor/src/test/java/de/tudarmstadt/ukp/inception/externaleditor/policy/DefaultHtmlDocumentPolicyTest.java 
b/inception/inception-external-editor/src/test/java/de/tudarmstadt/ukp/inception/externaleditor/policy/DefaultHtmlDocumentPolicyTest.java index 0193875ca14..fd26776d8fa 100644 --- a/inception/inception-external-editor/src/test/java/de/tudarmstadt/ukp/inception/externaleditor/policy/DefaultHtmlDocumentPolicyTest.java +++ b/inception/inception-external-editor/src/test/java/de/tudarmstadt/ukp/inception/externaleditor/policy/DefaultHtmlDocumentPolicyTest.java @@ -41,7 +41,7 @@ void thatOverrideFileIsPickedUp(@TempDir Path aTemp) throws Exception var sut = new DefaultHtmlDocumentPolicy(); - assertThat(sut.getPolicy().getElementPolicies()).hasSize(148); + assertThat(sut.getPolicy().getElementPolicies()).hasSize(152); write(policyFile.toFile(), "policies: []", UTF_8); assertThat(policyFile).exists(); @@ -54,6 +54,6 @@ void thatOverrideFileIsPickedUp(@TempDir Path aTemp) throws Exception Files.delete(policyFile); assertThat(policyFile).doesNotExist(); - assertThat(sut.getPolicy().getElementPolicies()).hasSize(148); + assertThat(sut.getPolicy().getElementPolicies()).hasSize(152); } } diff --git a/inception/inception-html-apache-annotator-editor/src/main/java/de/tudarmstadt/ukp/inception/apacheannotatoreditor/ApacheAnnotatorHtmlAnnotationEditorFactory.java b/inception/inception-html-apache-annotator-editor/src/main/java/de/tudarmstadt/ukp/inception/apacheannotatoreditor/ApacheAnnotatorHtmlAnnotationEditorFactory.java index 03340fb00a3..3311baf1d19 100644 --- a/inception/inception-html-apache-annotator-editor/src/main/java/de/tudarmstadt/ukp/inception/apacheannotatoreditor/ApacheAnnotatorHtmlAnnotationEditorFactory.java +++ b/inception/inception-html-apache-annotator-editor/src/main/java/de/tudarmstadt/ukp/inception/apacheannotatoreditor/ApacheAnnotatorHtmlAnnotationEditorFactory.java @@ -33,6 +33,7 @@ import de.tudarmstadt.ukp.inception.editor.AnnotationEditorFactoryImplBase; import de.tudarmstadt.ukp.inception.editor.action.AnnotationActionHandler; import 
de.tudarmstadt.ukp.inception.io.html.HtmlFormatSupport; +import de.tudarmstadt.ukp.inception.io.html.MHtmlFormatSupport; import de.tudarmstadt.ukp.inception.io.xml.CustomXmlFormatLoader; import de.tudarmstadt.ukp.inception.io.xml.XmlFormatSupport; import de.tudarmstadt.ukp.inception.preferences.ClientSidePreferencesKey; @@ -80,6 +81,7 @@ public int accepts(Project aProject, String aFormat) switch (aFormat) { case HtmlFormatSupport.ID: // fall-through + case MHtmlFormatSupport.ID: // fall-through case XmlFormatSupport.ID: return PREFERRED; default: diff --git a/inception/inception-io-html/pom.xml b/inception/inception-io-html/pom.xml index 28b002b9943..ad77381faef 100644 --- a/inception/inception-io-html/pom.xml +++ b/inception/inception-io-html/pom.xml @@ -15,7 +15,9 @@ See the License for the specific language governing permissions and limitations under the License. --> - + 4.0.0 de.tudarmstadt.ukp.inception.app @@ -45,16 +47,12 @@ de.tudarmstadt.ukp.inception.app inception-support - + commons-io commons-io - - org.apache.commons - commons-lang3 - - + org.apache.uima uimaj-core @@ -63,7 +61,7 @@ org.apache.uima uimafit-core - + org.dkpro.core dkpro-core-io-html-asl @@ -88,7 +86,7 @@ eu.openminted.share.annotations omtd-share-annotations-api - + org.jsoup @@ -99,6 +97,15 @@ icu4j + + org.apache.james + apache-mime4j-core + + + org.apache.james + apache-mime4j-dom + + org.springframework spring-context @@ -107,7 +114,7 @@ org.springframework.boot spring-boot-autoconfigure - + junit diff --git a/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/MHtmlDocumentReader.java b/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/MHtmlDocumentReader.java new file mode 100644 index 00000000000..e4b5c05ee4c --- /dev/null +++ b/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/MHtmlDocumentReader.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or 
more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.tudarmstadt.ukp.inception.io.html; + +import java.io.IOException; + +import org.apache.james.mime4j.dom.Message; +import org.apache.james.mime4j.dom.Multipart; +import org.apache.james.mime4j.dom.TextBody; +import org.apache.james.mime4j.message.DefaultMessageBuilder; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.fit.descriptor.ConfigurationParameter; +import org.apache.uima.fit.descriptor.MimeTypeCapability; +import org.apache.uima.fit.descriptor.ResourceMetaData; +import org.apache.uima.fit.descriptor.TypeCapability; +import org.apache.uima.jcas.JCas; +import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; +import org.dkpro.core.api.parameter.MimeTypes; +import org.jsoup.Jsoup; +import org.jsoup.select.NodeTraversor; + +import de.tudarmstadt.ukp.inception.io.html.dkprocore.CasXmlNodeVisitor; + +/** + * Reads the main HTML part of an MHTML file and strips the HTML. Returns the textual contents. Also
+ */ +@ResourceMetaData(name = "MHTML Reader") +@MimeTypeCapability({ MimeTypes.APPLICATION_XHTML, MimeTypes.TEXT_HTML }) +@TypeCapability(outputs = { // + "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading", + "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph", + "org.dkpro.core.api.xml.type.XmlAttribute", "org.dkpro.core.api.xml.type.XmlDocument", + "org.dkpro.core.api.xml.type.XmlElement", "org.dkpro.core.api.xml.type.XmlNode", + "org.dkpro.core.api.xml.type.XmlTextNode" }) +public class MHtmlDocumentReader + extends JCasResourceCollectionReader_ImplBase +{ + /** + * Normalize whitespace. + */ + public static final String PARAM_NORMALIZE_WHITESPACE = "normalizeWhitespace"; + @ConfigurationParameter(name = PARAM_NORMALIZE_WHITESPACE, defaultValue = "true") + private boolean normalizeWhitespace; + + @Override + public void getNext(JCas aJCas) throws IOException, CollectionException + { + Resource res = nextFile(); + initCas(aJCas, res); + + try (var is = res.getInputStream()) { + var builder = new DefaultMessageBuilder(); + var message = builder.parseMessage(is); + var htmlDocument = getDocument(message); + try (var docIs = htmlDocument.getInputStream()) { + var charset = htmlDocument.getMimeCharset(); + if ("US-ASCII".equals(charset)) { + // mime4j uses US_ASCII as default and we cannot override it. While it may be + // technically correct, e.g. Chrome seems to use UTF-8 by default but does not + // provide an encoding in the MHTML files... 
*sigh* + charset = "UTF-8"; + } + var doc = Jsoup.parse(docIs, charset, ""); + + var visitor = new CasXmlNodeVisitor(aJCas, normalizeWhitespace); + + NodeTraversor.traverse(visitor, doc); + } + } + } + + private static TextBody getDocument(Message message) throws IOException + { + var documentUrl = message.getHeader().getField("Snapshot-Content-Location").getBody(); + + if (message.getBody() instanceof Multipart body) { + var documentPart = body.getBodyParts().stream() // + .filter(e -> documentUrl + .equals(e.getHeader().getField("Content-Location").getBody())) + .findFirst().get(); + + if (documentPart.getBody() instanceof TextBody documentBody) { + return documentBody; + } + } + + throw new IOException("Unable to locate embedded HTML document"); + } +} diff --git a/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/MHtmlFormatSupport.java b/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/MHtmlFormatSupport.java new file mode 100644 index 00000000000..56614bb6e17 --- /dev/null +++ b/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/MHtmlFormatSupport.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package de.tudarmstadt.ukp.inception.io.html; + +import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.util.Optional; + +import org.apache.james.mime4j.dom.Message; +import org.apache.james.mime4j.dom.Multipart; +import org.apache.james.mime4j.dom.SingleBody; +import org.apache.james.mime4j.message.DefaultMessageBuilder; +import org.apache.uima.collection.CollectionReaderDescription; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.metadata.TypeSystemDescription; + +import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport; +import de.tudarmstadt.ukp.clarin.webanno.model.Project; +import de.tudarmstadt.ukp.inception.externaleditor.policy.DefaultHtmlDocumentPolicy; +import de.tudarmstadt.ukp.inception.io.html.config.HtmlSupportAutoConfiguration; +import de.tudarmstadt.ukp.inception.support.xml.sanitizer.PolicyCollection; + +/** + * Support for MHTML format. + *

+ * This class is exposed as a Spring Component via + * {@link HtmlSupportAutoConfiguration#mhtmlFormatSupport}. + *

+ */ +public class MHtmlFormatSupport + implements FormatSupport +{ + public static final String ID = "mhtml"; + public static final String NAME = "MHTML (Web archive)"; + + private final DefaultHtmlDocumentPolicy defaultPolicy; + + public MHtmlFormatSupport(DefaultHtmlDocumentPolicy aDefaultPolicy) + { + defaultPolicy = aDefaultPolicy; + } + + @Override + public String getId() + { + return ID; + } + + @Override + public String getName() + { + return NAME; + } + + @Override + public boolean isReadable() + { + return true; + } + + @Override + public CollectionReaderDescription getReaderDescription(Project aProject, + TypeSystemDescription aTSD) + throws ResourceInitializationException + { + return createReaderDescription(MHtmlDocumentReader.class, aTSD); + } + + @Override + public Optional getPolicy() throws IOException + { + return Optional.of(defaultPolicy.getPolicy()); + } + + @Override + public boolean hasResources() + { + return true; + } + + @Override + public InputStream openResourceStream(File aDocFile, String aResourcePath) throws IOException + { + try (var is = new FileInputStream(aDocFile)) { + var builder = new DefaultMessageBuilder(); + var message = builder.parseMessage(is); + var resourceBody = getResourcePartBody(message, aResourcePath); + return resourceBody.getInputStream(); + } + } + + private static SingleBody getResourcePartBody(Message message, String aResourcePath) throws IOException + { + if (message.getBody() instanceof Multipart body) { + var documentPart = body.getBodyParts().stream() // + .filter(e -> { + var field = e.getHeader().getField("Content-Location"); + return field != null && aResourcePath.equals(field.getBody()); + }) // + .findFirst(); + + if (documentPart.isPresent()) { + if (documentPart.get().getBody() instanceof SingleBody resourceBody) { + return resourceBody; + } + } + } + + throw new FileNotFoundException("Resource not found [" + aResourcePath + "]"); + } +} diff --git 
a/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/config/HtmlSupportAutoConfiguration.java b/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/config/HtmlSupportAutoConfiguration.java index 67ca2380404..7900f8ca8d6 100644 --- a/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/config/HtmlSupportAutoConfiguration.java +++ b/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/config/HtmlSupportAutoConfiguration.java @@ -24,6 +24,7 @@ import de.tudarmstadt.ukp.inception.externaleditor.policy.DefaultHtmlDocumentPolicy; import de.tudarmstadt.ukp.inception.io.html.HtmlFormatSupport; import de.tudarmstadt.ukp.inception.io.html.LegacyHtmlFormatSupport; +import de.tudarmstadt.ukp.inception.io.html.MHtmlFormatSupport; @Configuration public class HtmlSupportAutoConfiguration @@ -36,6 +37,14 @@ public HtmlFormatSupport htmlFormatSupport(DefaultHtmlDocumentPolicy aDefaultPol return new HtmlFormatSupport(aDefaultPolicy); } + @Bean + @ConditionalOnProperty(prefix = "format.mhtml", name = "enabled", // + havingValue = "true", matchIfMissing = false) + public MHtmlFormatSupport mhtmlFormatSupport(DefaultHtmlDocumentPolicy aDefaultPolicy) + { + return new MHtmlFormatSupport(aDefaultPolicy); + } + @Bean @ConditionalOnProperty(prefix = "format.html-legacy", name = "enabled", // havingValue = "true", matchIfMissing = false) diff --git a/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/CasXmlHandler.java b/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/CasXmlHandler.java deleted file mode 100644 index 8aad157082d..00000000000 --- a/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/CasXmlHandler.java +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Licensed to the Technische Universität Darmstadt under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt - * licenses this file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.inception.io.html.dkprocore; - -import static org.apache.commons.lang3.StringUtils.trimToNull; - -import java.util.ArrayDeque; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Deque; -import java.util.List; - -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.cas.FSArray; -import org.dkpro.core.api.xml.type.XmlAttribute; -import org.dkpro.core.api.xml.type.XmlDocument; -import org.dkpro.core.api.xml.type.XmlElement; -import org.dkpro.core.api.xml.type.XmlNode; -import org.dkpro.core.api.xml.type.XmlTextNode; -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -public class CasXmlHandler - extends DefaultHandler -{ - private final JCas jcas; - private final StringBuilder text; - private final Deque stack; - - private XmlDocument docNode; - private boolean captureText = true; - - public CasXmlHandler(JCas aJCas) - { - jcas = aJCas; - text = new StringBuilder(); - stack = new ArrayDeque<>(); - } - - @Override - public void startDocument() throws SAXException - { - if (docNode != null || !stack.isEmpty() || text.length() != 0) { - throw new SAXException("Illegal document start event when data has already been seen."); - } - - 
docNode = new XmlDocument(jcas); - docNode.setBegin(text.length()); - } - - @Override - public void endDocument() throws SAXException - { - docNode.setEnd(text.length()); - docNode.addToIndexes(); - - jcas.setDocumentText(text.toString()); - }; - - @Override - public void startElement(String aUri, String aLocalName, String aQName, Attributes aAttributes) - throws SAXException - { - if (docNode == null) { - throw new SAXException( - "Illegal element start event when document start has not been seen."); - } - - XmlElement element = new XmlElement(jcas); - element.setBegin(text.length()); - element.setUri(trimToNull(aUri)); - element.setLocalName(trimToNull(aLocalName)); - element.setQName(trimToNull(aQName)); - - if (aAttributes.getLength() > 0) { - var attributes = new FSArray(jcas, aAttributes.getLength()); - for (int i = 0; i < aAttributes.getLength(); i++) { - XmlAttribute attribute = new XmlAttribute(jcas); - attribute.setUri(trimToNull(aAttributes.getURI(i))); - attribute.setLocalName(trimToNull(aAttributes.getLocalName(i))); - attribute.setQName(trimToNull(aAttributes.getQName(i))); - attribute.setValueType(trimToNull(aAttributes.getType(i))); - attribute.setValue(aAttributes.getValue(i)); - attributes.set(i, attribute); - } - element.setAttributes(attributes); - } - - attachToParent(element); - - boolean capture; - StackFrame parentFrame = stack.peek(); - if (parentFrame != null) { - capture = parentFrame.isCaptureText(); - } - else { - capture = captureText; - } - - stack.push(new StackFrame(element, capture)); - } - - @Override - public void endElement(String aUri, String aLocalName, String aQName) throws SAXException - { - StackFrame frame = stack.pop(); - - XmlElement element = frame.getElement(); - element.setEnd(text.length()); - - // Fill in children - if (!frame.getChildren().isEmpty()) { - var children = new FSArray(jcas, frame.getChildren().size()); - for (int i = 0; i < frame.getChildren().size(); i++) { - children.set(i, 
frame.getChildren().get(i)); - } - element.setChildren(children); - } - - element.addToIndexes(); - } - - @Override - public void characters(char[] aCh, int aStart, int aLength) throws SAXException - { - if (stack.isEmpty()) { - // We ignore any characters outside the root elements. These could include e.g. - // whitespace in the context of a doctype before the root element or trailing whitespace - // after the root element. - return; - } - - XmlTextNode textNode = new XmlTextNode(jcas); - textNode.setBegin(text.length()); - - if (stack.peek().isCaptureText()) { - text.append(aCh, aStart, aLength); - textNode.setCaptured(true); - } - else { - textNode.setText(new String(aCh, aStart, aLength)); - textNode.setCaptured(false); - } - - textNode.setEnd(text.length()); - textNode.addToIndexes(); - - attachToParent(textNode); - } - - @Override - public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException - { - characters(aCh, aStart, aLength); - } - - private void attachToParent(XmlNode aNode) - { - StackFrame parentFrame = stack.peek(); - if (parentFrame != null) { - aNode.setParent(parentFrame.getElement()); - parentFrame.addChild(aNode); - } - else { - docNode.setRoot((XmlElement) aNode); - } - } - - public CharSequence getText() - { - return text; - } - - public Collection getStack() - { - return Collections.unmodifiableCollection(stack); - } - - public XmlElement getCurrentElement() - { - return stack.peek().getElement(); - } - - public void captureText(boolean aCapture) - { - if (stack.isEmpty()) { - captureText = aCapture; - } - else { - stack.peek().setCaptureText(aCapture); - } - } - - private static class StackFrame - { - private final XmlElement element; - private final List children = new ArrayList<>(); - private boolean captureText; - - public StackFrame(XmlElement aElement, boolean aCaptureText) - { - element = aElement; - captureText = aCaptureText; - } - - public XmlElement getElement() - { - return element; - } - - public void 
addChild(XmlNode aChild) - { - children.add(aChild); - } - - public List getChildren() - { - return children; - } - - public boolean isCaptureText() - { - return captureText; - } - - public void setCaptureText(boolean aCaptureText) - { - captureText = aCaptureText; - } - } -} diff --git a/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/CasXmlNodeVisitor.java b/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/CasXmlNodeVisitor.java new file mode 100644 index 00000000000..b9752edb6b9 --- /dev/null +++ b/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/CasXmlNodeVisitor.java @@ -0,0 +1,140 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package de.tudarmstadt.ukp.inception.io.html.dkprocore; + +import static de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils.trim; +import static de.tudarmstadt.ukp.inception.io.html.dkprocore.internal.JSoupUtil.appendNormalisedText; +import static de.tudarmstadt.ukp.inception.io.html.dkprocore.internal.JSoupUtil.lastCharIsWhitespace; + +import java.util.Map; + +import org.apache.uima.jcas.JCas; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeVisitor; +import org.xml.sax.helpers.AttributesImpl; + +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div; +import de.tudarmstadt.ukp.inception.io.xml.dkprocore.CasXmlHandler; + +public class CasXmlNodeVisitor + implements NodeVisitor +{ + private final CasXmlHandler handler; + private final JCas jcas; + private final boolean normalizeWhitespace; + private Map mappings; + + public CasXmlNodeVisitor(JCas aJCas, boolean aNormalizeWhitespace) + { + jcas = aJCas; + handler = new CasXmlHandler(aJCas); + normalizeWhitespace = aNormalizeWhitespace; + } + + public void setMappings(Map aMappings) + { + mappings = aMappings; + } + + @Override + public void head(Node node, int depth) + { + try { + if (node instanceof Document) { + handler.startDocument(); + handler.captureText(false); + } + else if (node instanceof TextNode textNode) { + var buffer = new StringBuilder(); + if (normalizeWhitespace) { + appendNormalisedText(buffer, textNode); + } + else { + buffer.append(textNode.getWholeText()); + } + char[] text = buffer.toString().toCharArray(); + handler.characters(text, 0, text.length); + } + else if (node instanceof Element element) { + if (!handler.getText().isEmpty() + && (element.isBlock() || "br".equalsIgnoreCase(element.nodeName())) + && !lastCharIsWhitespace(handler.getText())) { + char[] text = " ".toCharArray(); + handler.characters(text, 0, text.length); + 
} + + var attributes = new AttributesImpl(); + + if (element.attributes() != null) { + for (Attribute attr : element.attributes()) { + attributes.addAttribute("", "", attr.getKey(), "CDATA", attr.getValue()); + } + } + + if ("body".equalsIgnoreCase(element.tagName())) { + handler.captureText(true); + } + + handler.startElement("", "", element.tagName(), attributes); + } + } + catch (Exception e) { + throw new RuntimeException(e); + } + } + + @Override + public void tail(Node node, int depth) + { + try { + if (node instanceof Document) { + handler.endDocument(); + } + else if (node instanceof Element element) { + // Fetch the current element + var elementFS = handler.getCurrentElement(); + + // Close the current element so that it gets its end offset + handler.endElement("", "", element.tagName()); + + if ("body".equalsIgnoreCase(element.tagName())) { + handler.captureText(false); + } + + if (mappings != null) { + var type = mappings.get(node.nodeName()); + if (type != null) { + int[] span = { elementFS.getBegin(), elementFS.getEnd() }; + trim(handler.getText(), span); + Div div = (Div) jcas.getCas().createAnnotation(jcas.getCasType(type), + span[0], span[1]); + div.setDivType(node.nodeName()); + div.addToIndexes(); + } + } + } + } + catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/HtmlDocumentReader.java b/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/HtmlDocumentReader.java index 61b7134e700..e924accbcf5 100644 --- a/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/HtmlDocumentReader.java +++ b/inception/inception-io-html/src/main/java/de/tudarmstadt/ukp/inception/io/html/dkprocore/HtmlDocumentReader.java @@ -17,20 +17,15 @@ */ package de.tudarmstadt.ukp.inception.io.html.dkprocore; -import static de.tudarmstadt.ukp.dkpro.core.api.segmentation.TrimUtils.trim; 
-import static de.tudarmstadt.ukp.inception.io.html.dkprocore.internal.JSoupUtil.appendNormalisedText; -import static de.tudarmstadt.ukp.inception.io.html.dkprocore.internal.JSoupUtil.lastCharIsWhitespace; import static org.dkpro.core.api.parameter.ComponentParameters.DEFAULT_ENCODING; import java.io.BufferedInputStream; import java.io.IOException; -import java.io.InputStream; import java.util.HashMap; import java.util.Map; import org.apache.commons.io.IOUtils; import org.apache.uima.UimaContext; -import org.apache.uima.cas.CAS; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.MimeTypeCapability; @@ -42,20 +37,11 @@ import org.dkpro.core.api.parameter.ComponentParameters; import org.dkpro.core.api.parameter.MimeTypes; import org.dkpro.core.api.resources.CompressionUtils; -import org.dkpro.core.api.xml.type.XmlElement; import org.jsoup.Jsoup; -import org.jsoup.nodes.Attribute; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; -import org.jsoup.nodes.TextNode; import org.jsoup.select.NodeTraversor; -import org.jsoup.select.NodeVisitor; -import org.xml.sax.helpers.AttributesImpl; import com.ibm.icu.text.CharsetDetector; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph; import eu.openminted.share.annotations.api.DocumentationResource; @@ -120,10 +106,8 @@ public void getNext(JCas aJCas) throws IOException, CollectionException Resource res = nextFile(); initCas(aJCas, res); - CAS cas = aJCas.getCas(); - String html; - try (InputStream is = new BufferedInputStream( + try (var is = new BufferedInputStream( CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()))) { if (ENCODING_AUTO.equals(sourceEncoding)) { @@ -135,98 +119,10 @@ public void getNext(JCas 
aJCas) throws IOException, CollectionException } } - Document doc = Jsoup.parse(html); - - CasXmlHandler handler = new CasXmlHandler(aJCas); - - NodeVisitor visitor = new NodeVisitor() - { - @Override - public void head(Node node, int depth) - { - try { - if (node instanceof Document) { - handler.startDocument(); - handler.captureText(false); - } - else if (node instanceof TextNode) { - TextNode textNode = (TextNode) node; - StringBuilder buffer = new StringBuilder(); - if (normalizeWhitespace) { - appendNormalisedText(buffer, textNode); - } - else { - buffer.append(textNode.getWholeText()); - } - char[] text = buffer.toString().toCharArray(); - handler.characters(text, 0, text.length); - } - else if (node instanceof Element) { - Element element = (Element) node; - if (handler.getText().length() > 0 - && (element.isBlock() || element.nodeName().equals("br")) - && !lastCharIsWhitespace(handler.getText())) { - char[] text = " ".toCharArray(); - handler.characters(text, 0, text.length); - } - - AttributesImpl attributes = new AttributesImpl(); - - if (element.attributes() != null) { - for (Attribute attr : element.attributes()) { - attributes.addAttribute("", "", attr.getKey(), "CDATA", - attr.getValue()); - } - } - - if ("body".equals(element.tagName())) { - handler.captureText(true); - } - - handler.startElement("", "", element.tagName(), attributes); - } - } - catch (Exception e) { - throw new RuntimeException(e); - } - } + var doc = Jsoup.parse(html); - @Override - public void tail(Node node, int depth) - { - try { - if (node instanceof Document) { - handler.endDocument(); - } - else if (node instanceof Element) { - Element element = (Element) node; - - // Fetch the current element - XmlElement elementFS = handler.getCurrentElement(); - - // Close the current element so that it gets its end offset - handler.endElement("", "", element.tagName()); - - if ("body".equals(element.tagName())) { - handler.captureText(false); - } - - Integer type = 
mappings.get(node.nodeName()); - if (type != null) { - int[] span = { elementFS.getBegin(), elementFS.getEnd() }; - trim(handler.getText(), span); - Div div = (Div) cas.createAnnotation(aJCas.getCasType(type), span[0], - span[1]); - div.setDivType(node.nodeName()); - div.addToIndexes(); - } - } - } - catch (Exception e) { - throw new RuntimeException(e); - } - } - }; + var visitor = new CasXmlNodeVisitor(aJCas, normalizeWhitespace); + visitor.setMappings(mappings); NodeTraversor.traverse(visitor, doc); } diff --git a/inception/inception-io-html/src/main/resources/META-INF/asciidoc/user-guide/formats-mhtml.adoc b/inception/inception-io-html/src/main/resources/META-INF/asciidoc/user-guide/formats-mhtml.adoc new file mode 100644 index 00000000000..2d8c1853d6f --- /dev/null +++ b/inception/inception-io-html/src/main/resources/META-INF/asciidoc/user-guide/formats-mhtml.adoc @@ -0,0 +1,42 @@ +// Licensed to the Technische Universität Darmstadt under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The Technische Universität Darmstadt +// licenses this file to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +[[sect_formats_mhtml]] += MHTML (Web archive) + +==== +CAUTION: Experimental feature. To use this functionality, you need to enable it first by adding `format.mhtml.enabled=true` to the `settings.properties` file. 
In order to load images from MHTML files, it is currently also necessary to disable image blocking in the safety net using `ui.external.block-img=false` - use with care as this will also enable loading images from external sources. +==== + +link:https://en.wikipedia.org/wiki/MHTML[MHTML] is a format supported by many browsers which stores the website currently shown in the browser along with most resources required to display the page - including but not limited to images. + +E.g. in Chrome, you may save a web page in this format using **Save as...** and then selecting the +format **Web page, Single File**. + +{product-name} will load the web page saved in this format, but it will not look like the original. You will notice that most of the styling will be gone. This usually leads to a lot of boilerplate being visible, in particular at the start and end of the document, e.g. page navigation sections, sidebars, etc. which have been inlined into the document structure because they are missing their usual styles. However, other essential styling like paragraphs, headings, figures, tables, etc. should mostly be preserved. + +A special feature of the MHTML format is that it also allows images that were part of the original page to be displayed in {product-name}. Note that when saving a page, it is possible that the browser does not capture all the images into the MHTML file. {product-name} will only be able to display those images that are actually included. 
+ +[cols="2,1,1,1,3"] +|==== +| Format | Read | Write | Custom Layers | Description + +| MHTML (`mhtml`) +| yes +| no +| no +| +|==== diff --git a/inception/inception-io-xml/src/main/java/de/tudarmstadt/ukp/inception/io/xml/dkprocore/CasXmlHandler.java b/inception/inception-io-xml/src/main/java/de/tudarmstadt/ukp/inception/io/xml/dkprocore/CasXmlHandler.java index 8683ff5868d..8408b89d040 100644 --- a/inception/inception-io-xml/src/main/java/de/tudarmstadt/ukp/inception/io/xml/dkprocore/CasXmlHandler.java +++ b/inception/inception-io-xml/src/main/java/de/tudarmstadt/ukp/inception/io/xml/dkprocore/CasXmlHandler.java @@ -175,6 +175,13 @@ public void endElement(String aUri, String aLocalName, String aQName) throws SAX @Override public void characters(char[] aCh, int aStart, int aLength) throws SAXException { + if (stack.isEmpty()) { + // We ignore any characters outside the root elements. These could include e.g. + // whitespace in the context of a doctype before the root element or trailing whitespace + // after the root element.
+ return; + } + XmlTextNode textNode = new XmlTextNode(jcas); textNode.setBegin(text.length()); @@ -296,7 +303,7 @@ public XmlElement getElement() return element; } - void addChild(XmlNode aChild) + public void addChild(XmlNode aChild) { children.add(aChild); } diff --git a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/io/ZipUtils.java b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/io/ZipUtils.java index 82adeca7ae0..36aab5320f9 100644 --- a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/io/ZipUtils.java +++ b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/io/ZipUtils.java @@ -19,11 +19,14 @@ import java.io.File; import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.FileOutputStream; +import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; import java.util.zip.ZipOutputStream; import org.apache.commons.io.IOUtils; @@ -117,4 +120,40 @@ public static String normalizeEntryName(ZipEntry aEntry) return entryName; } + + public static InputStream openResourceStream(File aZipFile, String aEntryName) + throws IOException + { + if (aEntryName.contains("..") || aEntryName.contains("//")) { + throw new FileNotFoundException("Resource not found [" + aEntryName + "]"); + } + + ZipFile zipFile = null; + var success = false; + try { + zipFile = new ZipFile(aZipFile); + var entry = zipFile.getEntry(aEntryName); + if (entry == null) { + throw new FileNotFoundException("Resource not found [" + aEntryName + "]"); + } + + var finalZipFile = zipFile; + var is = new FilterInputStream(zipFile.getInputStream(entry)) + { + @Override + public void close() throws IOException + { + super.close(); + finalZipFile.close(); + } + }; + success = true; + return is; + } + finally { + if (!success && zipFile != null) { + 
zipFile.close(); + } + } + } } diff --git a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/NamspaceDecodingContentHandlerAdapter.java b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/NamspaceDecodingContentHandlerAdapter.java new file mode 100644 index 00000000000..6ba27760af1 --- /dev/null +++ b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/NamspaceDecodingContentHandlerAdapter.java @@ -0,0 +1,179 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package de.tudarmstadt.ukp.inception.support.xml; + +import static org.apache.commons.lang3.StringUtils.startsWith; + +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Stack; + +import javax.xml.XMLConstants; +import javax.xml.namespace.QName; + +import org.apache.commons.lang3.StringUtils; +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public abstract class NamspaceDecodingContentHandlerAdapter + extends ContentHandlerAdapter +{ + private static final String XMLNS = "xmlns"; + private static final String XMLNS_PREFIX = "xmlns:"; + + private final Stack stack; + + private final Map namespaceMappings = new LinkedHashMap<>(); + + public NamspaceDecodingContentHandlerAdapter(ContentHandler aDelegate) + { + super(aDelegate); + stack = new Stack<>(); + namespaceMappings.put(XMLConstants.XML_NS_PREFIX, XMLConstants.XML_NS_URI); + } + + @Override + public void startDocument() throws SAXException + { + stack.clear(); + namespaceMappings.clear(); + namespaceMappings.put(XMLConstants.XML_NS_PREFIX, XMLConstants.XML_NS_URI); + super.startDocument(); + } + + @Override + public void startPrefixMapping(String aPrefix, String aUri) throws SAXException + { + namespaceMappings.put(aPrefix, aUri); + + super.startPrefixMapping(aPrefix, aUri); + } + + @Override + public void endPrefixMapping(String aPrefix) throws SAXException + { + namespaceMappings.remove(aPrefix); + + super.endPrefixMapping(aPrefix); + } + + @Override + public void startElement(String aUri, String aLocalName, String aQName, Attributes aAtts) + throws SAXException + { + var localNamespaces = new LinkedHashMap(); + for (var nsDecl : prefixMappings(aAtts).entrySet()) { + var oldValue = namespaceMappings.put(nsDecl.getKey(), nsDecl.getValue()); + if (oldValue == null) { + localNamespaces.put(nsDecl.getKey(), nsDecl.getValue()); + } + } + + var element = toQName(aUri, aLocalName, aQName); + + 
stack.push(new Frame(element, localNamespaces)); + + super.startElement(aUri, aLocalName, aQName, aAtts); + } + + private Map prefixMappings(Attributes aAtts) + { + var mappings = new LinkedHashMap(); + if (aAtts != null) { + for (int i = 0; i < aAtts.getLength(); i++) { + String qName = aAtts.getQName(i); + if (XMLNS.equals(qName)) { + mappings.put("", aAtts.getValue(i)); + } + if (startsWith(qName, XMLNS_PREFIX)) { + mappings.put(qName.substring(XMLNS_PREFIX.length()), aAtts.getValue(i)); + } + } + } + return mappings; + } + + @Override + public void endElement(String aUri, String aLocalName, String aQName) throws SAXException + { + var frame = stack.pop(); + super.endElement(frame.element.getNamespaceURI(), frame.element.getLocalPart(), aQName); + frame.namespaces.keySet().forEach(namespaceMappings::remove); + } + + private static final class Frame + { + final QName element; + final Map namespaces; + + public Frame(QName aElement, Map aLocalNamespaces) + { + element = aElement; + + if (aLocalNamespaces != null && !aLocalNamespaces.isEmpty()) { + namespaces = aLocalNamespaces; + } + else { + namespaces = Collections.emptyMap(); + } + } + + @Override + public String toString() + { + return "[" + element.getLocalPart() + "]"; + } + } + + protected QName toQName(String aUri, String aLocalName, String aQName) + { + String prefix = XMLConstants.DEFAULT_NS_PREFIX; + String localName = aLocalName; + + // Workaround bug: localname may contain prefix + if (localName != null) { + var li = localName.indexOf(':'); + if (li >= 0) { + localName = localName.substring(li + 1); + } + } + + var qi = aQName.indexOf(':'); + if (qi >= 0) { + prefix = aQName.substring(0, qi); + } + + if (StringUtils.isEmpty(localName)) { + if (qi >= 0) { + localName = aQName.substring(qi + 1, aQName.length()); + } + else { + localName = aQName; + } + } + + String uri = aUri; + if (StringUtils.isEmpty(uri)) { + uri = namespaceMappings.getOrDefault(prefix, XMLConstants.NULL_NS_URI); + } + + return new 
QName(uri, localName, prefix); + } +} diff --git a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/sanitizer/ElementAction.java b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/sanitizer/ElementAction.java index 57f76b8e72c..7fc4ac25777 100644 --- a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/sanitizer/ElementAction.java +++ b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/sanitizer/ElementAction.java @@ -39,5 +39,4 @@ public enum ElementAction * marked to pass. */ PRUNE; - }