Skip to content

Commit

Permalink
#4469 - Support for MHTML web page archives
Browse files Browse the repository at this point in the history
- Added experimental support for MHTML files
- Added documentation
- Remove duplicate CasXmlHandler implementation
- Allow embedded images to pass through the default HTML policy - they are still blocked by the safety net though, unless explicitly enabled in the settings file
  • Loading branch information
reckart committed Jan 27, 2024
1 parent 7e55b1b commit b83ca16
Show file tree
Hide file tree
Showing 21 changed files with 796 additions and 394 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -110,23 +110,17 @@ default boolean hasResources()
return false;
}

default boolean isAccessibelResource(File aDocFile, String aResourcePath)
default boolean isAccessibleResource(File aDocFile, String aResourcePath)
{
return DEFAULT_PERMITTED_RESOURCE_EXTENSIONS.contains(getExtension(aResourcePath));
}

default InputStream openResourceStream(File aDocFile, String aResourcePath) throws IOException
{
if (!hasResources() || !isAccessibelResource(aDocFile, aResourcePath)) {
throw new FileNotFoundException("Resource not found [" + aResourcePath + "]");
}

if (aResourcePath.contains("..") || aResourcePath.contains("//")) {
throw new FileNotFoundException("Resource not found [" + aResourcePath + "]");
}

// var path = prependIfMissing(normalize(aResourcePath, true), "/");

ZipFile zipFile = null;
var success = false;
try {
Expand Down
13 changes: 13 additions & 0 deletions inception/inception-dependencies/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,19 @@
<version>1.16.1</version>
</dependency>

<!-- MIME4J -->

<dependency>
<groupId>org.apache.james</groupId>
<artifactId>apache-mime4j-core</artifactId>
<version>0.8.9</version>
</dependency>
<dependency>
<groupId>org.apache.james</groupId>
<artifactId>apache-mime4j-dom</artifactId>
<version>0.8.9</version>
</dependency>

<!-- LUCENE MTAS -->

<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,8 @@ include::{include-dir}formats-html.adoc[leveloffset=+2]

include::{include-dir}formats-htmldoc.adoc[leveloffset=+2]

include::{include-dir}formats-mhtml.adoc[leveloffset=+2]

include::{include-dir}formats-imscwb.adoc[leveloffset=+2]

// include::{include-dir}formats-lif.adoc[leveloffset=+2]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ data in a particular format. The **feature flag** column shows which flags you c
| `htmldoc`
| `format.html.enabled`

| <<sect_formats_mhtml>>
| `mhtml`
| `format.mhtml.enabled`

| <<sect_formats_html>>
| `html`
| `format.html-legacy.enabled`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,24 @@
*/
package de.tudarmstadt.ukp.inception.externaleditor;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.Optional;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import de.tudarmstadt.ukp.clarin.webanno.api.export.DocumentImportExportService;
import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport;
import de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument;
import de.tudarmstadt.ukp.inception.editor.AnnotationEditorRegistry;
import de.tudarmstadt.ukp.inception.externaleditor.policy.DefaultHtmlDocumentPolicy;
import de.tudarmstadt.ukp.inception.externaleditor.policy.SafetyNetDocumentPolicy;
import de.tudarmstadt.ukp.inception.support.xml.NamspaceDecodingContentHandlerAdapter;
import de.tudarmstadt.ukp.inception.support.xml.sanitizer.PolicyCollection;
import de.tudarmstadt.ukp.inception.support.xml.sanitizer.SanitizingContentHandler;

Expand All @@ -47,15 +55,15 @@ public XmlDocumentViewControllerImplBase(DefaultHtmlDocumentPolicy aDefaultPolic
annotationEditorRegistry = aAnnotationEditorRegistry;
}

protected ContentHandler applySanitizers(Optional<String> aEditor, SourceDocument doc,
protected ContentHandler applySanitizers(Optional<String> aEditor, SourceDocument aDoc,
ContentHandler aCh)
throws IOException
{
// Apply safety net
var ch = new SanitizingContentHandler(aCh, safetyNetPolicy.getPolicy());

// Apply format policy if it exists
var formatPolicy = formatRegistry.getFormatPolicy(doc);
var formatPolicy = formatRegistry.getFormatPolicy(aDoc);
if (formatPolicy.isPresent()) {
ch = new SanitizingContentHandler(ch, formatPolicy.get());
}
Expand All @@ -73,6 +81,52 @@ protected ContentHandler applySanitizers(Optional<String> aEditor, SourceDocumen
return ch;
}

/**
 * Optionally wraps the given handler in a filter that redirects resource URLs in the document
 * to the {@code res?resId=...} endpoint.
 * <p>
 * If the format of the document cannot be found or does not report having resources, the
 * delegate is returned unchanged. Otherwise, the returned handler rewrites the {@code href}
 * attribute of {@code a} and {@code link} elements and the {@code src} attribute of
 * {@code img} elements (element names are compared case-insensitively) to
 * {@code res?resId=} followed by the URL-encoded (UTF-8) original attribute value. All other
 * events are passed on to the delegate via the adapter base class.
 *
 * @param aDoc
 *            the document being rendered; its format decides whether filtering is applied.
 * @param aDelegate
 *            the handler receiving the (possibly rewritten) events.
 * @return the delegate itself or a filtering wrapper around it.
 */
protected ContentHandler applyHtmlResourceUrlFilter(SourceDocument aDoc,
        ContentHandler aDelegate)
{
    var maybeFormat = formatRegistry.getFormatById(aDoc.getFormat());
    var formatHasResources = maybeFormat.map(FormatSupport::hasResources).orElse(false);

    if (formatHasResources) {
        return new NamspaceDecodingContentHandlerAdapter(aDelegate)
        {
            @Override
            public void startElement(String aUri, String aLocalName, String aQName,
                    Attributes aAtts)
                throws SAXException
            {
                var localPart = toQName(aUri, aLocalName, aQName).getLocalPart();

                Attributes effectiveAtts;
                if ("img".equalsIgnoreCase(localPart)) {
                    // Images reference their resource via "src"
                    effectiveAtts = rewriteToResourceUrl(aAtts, "src");
                }
                else if ("a".equalsIgnoreCase(localPart)
                        || "link".equalsIgnoreCase(localPart)) {
                    // Anchors and links reference their target via "href"
                    effectiveAtts = rewriteToResourceUrl(aAtts, "href");
                }
                else {
                    effectiveAtts = aAtts;
                }

                super.startElement(aUri, aLocalName, aQName, effectiveAtts);
            }

            /**
             * Returns a copy of the attributes in which the value of the named attribute (if
             * present) has been replaced by a resource endpoint URL.
             */
            private Attributes rewriteToResourceUrl(Attributes aSource, String aName)
            {
                var copy = new AttributesImpl(aSource);
                var position = copy.getIndex(aName);
                if (position == -1) {
                    // Attribute not present - nothing to rewrite
                    return copy;
                }

                var original = copy.getValue(position);
                copy.setValue(position, "res?resId=" + URLEncoder.encode(original, UTF_8));
                return copy;
            }
        };
    }

    return aDelegate;
}

private Optional<PolicyCollection> getEditorPolicy(Optional<String> aEditor) throws IOException
{
if (!aEditor.isPresent()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,33 +37,63 @@ public boolean isBlockStyle()
return blockStyle;
}

/**
 * Sets the {@code blockStyle} flag.
 *
 * @param aBlockStyle
 *            the new value of the flag.
 */
public void setBlockStyle(boolean aBlockStyle)
{
    this.blockStyle = aBlockStyle;
}

/**
 * @return the current value of the {@code blockImg} flag.
 */
@Override
public boolean isBlockImg()
{
    return this.blockImg;
}

/**
 * Sets the {@code blockImg} flag.
 *
 * @param aBlockImg
 *            the new value of the flag.
 */
public void setBlockImg(boolean aBlockImg)
{
    this.blockImg = aBlockImg;
}

/**
 * @return the current value of the {@code blockEmbed} flag.
 */
@Override
public boolean isBlockEmbed()
{
    return this.blockEmbed;
}

/**
 * Sets the {@code blockEmbed} flag.
 *
 * @param aBlockEmbed
 *            the new value of the flag.
 */
public void setBlockEmbed(boolean aBlockEmbed)
{
    this.blockEmbed = aBlockEmbed;
}

/**
 * @return the current value of the {@code blockAudio} flag.
 */
@Override
public boolean isBlockAudio()
{
    return this.blockAudio;
}

/**
 * Sets the {@code blockAudio} flag.
 *
 * @param aBlockAudio
 *            the new value of the flag.
 */
public void setBlockAudio(boolean aBlockAudio)
{
    this.blockAudio = aBlockAudio;
}

/**
 * @return the current value of the {@code blockObject} flag.
 */
@Override
public boolean isBlockObject()
{
    return this.blockObject;
}

/**
 * Sets the {@code blockObject} flag.
 *
 * @param aBlockObject
 *            the new value of the flag.
 */
public void setBlockObject(boolean aBlockObject)
{
    this.blockObject = aBlockObject;
}

/**
 * @return the current value of the {@code blockVideo} flag.
 */
@Override
public boolean isBlockVideo()
{
    return this.blockVideo;
}

/**
 * Sets the {@code blockVideo} flag.
 *
 * @param aBlockVideo
 *            the new value of the flag.
 */
public void setBlockVideo(boolean aBlockVideo)
{
    this.blockVideo = aBlockVideo;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,22 @@ match_without_namespace: true
debug: true
policies:
- action: PASS
elements: ["html", "head", "body", "title"]
elements: ["html", "head", "body", "title"]
# Content sectioning
- action: PASS
elements: ["address", "article", "aside", "footer", "header",
"h1", "h2", "h3", "h4", "h5", "h6", "main", "section"]
# Navigation sectioning (probably not relevant for annotation)
- action: PRUNE
elements: ["nav"]
# Text content
- action: PASS
elements: ["blockquote", "dd", "div", "dl", "dt", "figcaption", "figure", "hr", "li",
"menu", "ol", "p", "pre", "ul"]
# Inline text semantics
# Links / anchors
- action: SKIP
elements: ["a"]
# Inline text semantics
- action: PASS
elements: ["abbr", "b", "bdi", "bdo", "br", "cite", "code", "data", "dfn",
"em", "i", "kbd", "mark", "q", "rp", "rt", "ruby", "s", "samp", "small",
Expand Down Expand Up @@ -63,5 +67,15 @@ policies:
- action: PASS
attributes: ["class"]
matching: "[a-zA-Z0-9\\s,\\-_]+"

# Images - depending on the security configurations, the safety net may block these
- action: PASS
elements: ["img"]
- on_elements: ["img"]
action: PASS
attributes: ["decoding", "alt", "width", "height"]
- on_elements: ["img"]
action: PASS
attributes: ["src"]
matching: "res[?]resId=.*" # Allow only access to embedded resources


Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ public class SafetyNetDocumentPolicy
{
static final String SAFETY_NET_POLICY_OVERRIDE_YAML = "safety-net.yaml";

/**
 * Names of the attributes that the default policy checks for a {@code javascript:} value.
 * The policy builder is told to disallow these attributes globally when their value matches
 * the pattern {@code \s*javascript:.*}.
 */
private static final String[] JAVASCRIPT_ACTIVE_ATTRIBUTES = { "href", "src", "codebase",
"cite", "background", "action", "longdesc", "profile", "classid", "data", "usemap",
"formaction", "icon", "manifest", "poster", "srcset", "archive" };

private static final String[] JAVASCRIPT_EVENT_ATTRIBUTES = { "onafterprint", "onbeforeprint",
"onbeforeunload", "onerror", "onhashchange", "onload", "onmessage", "onoffline",
"ononline", "onpagehide", "onpageshow", "onpopstate", "onresize", "onstorage",
Expand Down Expand Up @@ -95,9 +99,8 @@ private PolicyCollection makeDefaultPolicy()
builder.disallowElements("video");
}

builder.disallowAttributes("href", "src", "codebase", "cite", "background", "action",
"longdesc", "profile", "classid", "data", "usemap", "formaction", "icon",
"manifest", "poster", "srcset", "archive").matching(compile("\\s*javascript:.*"))
builder.disallowAttributes(JAVASCRIPT_ACTIVE_ATTRIBUTES) //
.matching(compile("\\s*javascript:.*")) //
.globally();

builder.disallowAttributes(JAVASCRIPT_EVENT_ATTRIBUTES).globally();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,11 @@ public ResponseEntity<String> getDocument(@PathVariable("projectId") long aProje
@RequestParam("editor") Optional<String> aEditor, Principal principal)
throws Exception
{
SourceDocument doc = documentService.getSourceDocument(aProjectId, aDocumentId);
var doc = documentService.getSourceDocument(aProjectId, aDocumentId);

CAS cas = documentService.createOrReadInitialCas(doc);
var cas = documentService.createOrReadInitialCas(doc);

try (StringWriter out = new StringWriter()) {
try (var out = new StringWriter()) {
Optional<XmlDocument> maybeXmlDocument;
if (cas.getTypeSystem().getType(XmlDocument._TypeName) != null) {
maybeXmlDocument = cas.select(XmlDocument.class).findFirst();
Expand All @@ -144,14 +144,16 @@ public ResponseEntity<String> getDocument(@PathVariable("projectId") long aProje

var rawHandler = XmlCas2SaxEvents.makeSerializer(out);
var sanitizingHandler = applySanitizers(aEditor, doc, rawHandler);
var resourceFilteringHandler = applyHtmlResourceUrlFilter(doc, sanitizingHandler);
var finalHandler = resourceFilteringHandler;

// If the CAS contains an actual HTML structure, then we send that. Mind that we do
// not inject format-specific CSS then!
if (casContainsHtml) {
XmlDocument xml = maybeXmlDocument.get();
var xml = maybeXmlDocument.get();
startXHtmlDocument(rawHandler);

var serializer = new XmlCas2SaxEvents(xml, sanitizingHandler);
var serializer = new XmlCas2SaxEvents(xml, finalHandler);
serializer.process(xml.getRoot());

endXHtmlDocument(rawHandler);
Expand All @@ -168,21 +170,21 @@ public ResponseEntity<String> getDocument(@PathVariable("projectId") long aProje
if (maybeXmlDocument.isEmpty()) {
// Gracefully handle the case that the CAS does not contain any XML structure at all
// and show only the document text in this case.
renderTextContent(cas, sanitizingHandler);
renderTextContent(cas, finalHandler);
}
else {
var formatPolicy = formatRegistry.getFormatPolicy(doc);
var defaultNamespace = formatPolicy.flatMap(policy -> policy.getDefaultNamespace());

if (defaultNamespace.isPresent()) {
sanitizingHandler.startPrefixMapping(XMLConstants.DEFAULT_NS_PREFIX,
finalHandler.startPrefixMapping(XMLConstants.DEFAULT_NS_PREFIX,
defaultNamespace.get());
}

renderXmlContent(doc, sanitizingHandler, aEditor, maybeXmlDocument.get());
renderXmlContent(doc, finalHandler, aEditor, maybeXmlDocument.get());

if (defaultNamespace.isPresent()) {
sanitizingHandler.endPrefixMapping(XMLConstants.DEFAULT_NS_PREFIX);
finalHandler.endPrefixMapping(XMLConstants.DEFAULT_NS_PREFIX);
}
}
rawHandler.endElement(null, null, BODY);
Expand Down Expand Up @@ -261,25 +263,31 @@ public ResponseEntity<InputStreamResource> getResource(
var srcDoc = documentService.getSourceDocument(aProjectId, aDocumentId);

var maybeFormatSupport = formatRegistry.getFormatById(srcDoc.getFormat());
if (!maybeFormatSupport.isPresent()) {
if (maybeFormatSupport.isEmpty()) {
return ResponseEntity.notFound().build();
}

var srcDocFile = documentService.getSourceDocumentFile(srcDoc);

var formatSupport = maybeFormatSupport.get();

if (!formatSupport.hasResources()
|| !formatSupport.isAccessibleResource(srcDocFile, aResourceId)) {
LOG.debug("Resource [{}] for document {} not found", aResourceId, srcDoc);
return ResponseEntity.notFound().build();
}

try {
var inputStream = formatSupport.openResourceStream(srcDocFile, aResourceId);
HttpHeaders httpHeaders = new HttpHeaders();
var httpHeaders = new HttpHeaders();
return new ResponseEntity<>(new InputStreamResource(inputStream), httpHeaders, OK);
}
catch (FileNotFoundException e) {
LOG.error("Resource [{}] for document {} not found", aResourceId, srcDoc);
LOG.debug("Resource [{}] for document {} not found", aResourceId, srcDoc);
return ResponseEntity.notFound().build();
}
catch (Exception e) {
LOG.error("Unable to load resource [{}] for document {}", aResourceId, srcDoc, e);
LOG.debug("Unable to load resource [{}] for document {}", aResourceId, srcDoc, e);
return ResponseEntity.notFound().build();
}
}
Expand Down
Loading

0 comments on commit b83ca16

Please sign in to comment.