Skip to content

Commit

Permalink
Merge pull request #4470 from inception-project/feature/4469-Support-…
Browse files Browse the repository at this point in the history
…for-MHTML-web-page-archives

#4469 - Support for MHTML web page archives
  • Loading branch information
reckart authored Jan 27, 2024
2 parents 7e55b1b + cfa58e3 commit ed7552c
Show file tree
Hide file tree
Showing 24 changed files with 841 additions and 433 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,12 @@

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.zip.ZipFile;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
Expand All @@ -57,6 +55,7 @@

import de.tudarmstadt.ukp.clarin.webanno.model.Project;
import de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument;
import de.tudarmstadt.ukp.inception.support.io.ZipUtils;
import de.tudarmstadt.ukp.inception.support.xml.sanitizer.PolicyCollection;

public interface FormatSupport
Expand Down Expand Up @@ -110,50 +109,14 @@ default boolean hasResources()
return false;
}

default boolean isAccessibelResource(File aDocFile, String aResourcePath)
default boolean isAccessibleResource(File aDocFile, String aResourcePath)
{
return DEFAULT_PERMITTED_RESOURCE_EXTENSIONS.contains(getExtension(aResourcePath));
}

default InputStream openResourceStream(File aDocFile, String aResourcePath) throws IOException
{
if (!hasResources() || !isAccessibelResource(aDocFile, aResourcePath)) {
throw new FileNotFoundException("Resource not found [" + aResourcePath + "]");
}

if (aResourcePath.contains("..") || aResourcePath.contains("//")) {
throw new FileNotFoundException("Resource not found [" + aResourcePath + "]");
}

// var path = prependIfMissing(normalize(aResourcePath, true), "/");

ZipFile zipFile = null;
var success = false;
try {
zipFile = new ZipFile(aDocFile);
var entry = zipFile.getEntry(aResourcePath);
if (entry == null) {
throw new FileNotFoundException("Resource not found [" + aResourcePath + "]");
}

var finalZipFile = zipFile;
var is = new FilterInputStream(zipFile.getInputStream(entry))
{
@Override
public void close() throws IOException
{
super.close();
finalZipFile.close();
}
};
success = true;
return is;
}
finally {
if (!success && zipFile != null) {
zipFile.close();
}
}
return ZipUtils.openResourceStream(aDocFile, aResourcePath);
}

/**
Expand Down
13 changes: 13 additions & 0 deletions inception/inception-dependencies/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -877,6 +877,19 @@
<version>1.16.1</version>
</dependency>

<!-- MIME4J -->

<dependency>
<groupId>org.apache.james</groupId>
<artifactId>apache-mime4j-core</artifactId>
<version>0.8.9</version>
</dependency>
<dependency>
<groupId>org.apache.james</groupId>
<artifactId>apache-mime4j-dom</artifactId>
<version>0.8.9</version>
</dependency>

<!-- LUCENE MTAS -->

<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,8 @@ include::{include-dir}formats-html.adoc[leveloffset=+2]

include::{include-dir}formats-htmldoc.adoc[leveloffset=+2]

include::{include-dir}formats-mhtml.adoc[leveloffset=+2]

include::{include-dir}formats-imscwb.adoc[leveloffset=+2]

// include::{include-dir}formats-lif.adoc[leveloffset=+2]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ data in a particular format. The **feature flag** column shows which flags you c
| `htmldoc`
| `format.html.enabled`

| <<sect_formats_mhtml>>
| `mhtml`
| `format.mhtml.enabled`

| <<sect_formats_html>>
| `html`
| `format.html-legacy.enabled`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,24 @@
*/
package de.tudarmstadt.ukp.inception.externaleditor;

import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.Optional;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import de.tudarmstadt.ukp.clarin.webanno.api.export.DocumentImportExportService;
import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport;
import de.tudarmstadt.ukp.clarin.webanno.model.SourceDocument;
import de.tudarmstadt.ukp.inception.editor.AnnotationEditorRegistry;
import de.tudarmstadt.ukp.inception.externaleditor.policy.DefaultHtmlDocumentPolicy;
import de.tudarmstadt.ukp.inception.externaleditor.policy.SafetyNetDocumentPolicy;
import de.tudarmstadt.ukp.inception.support.xml.NamspaceDecodingContentHandlerAdapter;
import de.tudarmstadt.ukp.inception.support.xml.sanitizer.PolicyCollection;
import de.tudarmstadt.ukp.inception.support.xml.sanitizer.SanitizingContentHandler;

Expand All @@ -47,15 +55,15 @@ public XmlDocumentViewControllerImplBase(DefaultHtmlDocumentPolicy aDefaultPolic
annotationEditorRegistry = aAnnotationEditorRegistry;
}

protected ContentHandler applySanitizers(Optional<String> aEditor, SourceDocument doc,
protected ContentHandler applySanitizers(Optional<String> aEditor, SourceDocument aDoc,
ContentHandler aCh)
throws IOException
{
// Apply safety net
var ch = new SanitizingContentHandler(aCh, safetyNetPolicy.getPolicy());

// Apply format policy if it exists
var formatPolicy = formatRegistry.getFormatPolicy(doc);
var formatPolicy = formatRegistry.getFormatPolicy(aDoc);
if (formatPolicy.isPresent()) {
ch = new SanitizingContentHandler(ch, formatPolicy.get());
}
Expand All @@ -73,6 +81,52 @@ protected ContentHandler applySanitizers(Optional<String> aEditor, SourceDocumen
return ch;
}

/**
 * Optionally wraps the given content handler so that URL-carrying attributes of selected
 * elements are rewritten into resource requests.
 * <p>
 * If the format registry does not know the format of the document, or the format reports that
 * it has no resources, the delegate is returned as-is. Otherwise the returned handler replaces
 * the {@code href} attribute of {@code a} and {@code link} elements and the {@code src}
 * attribute of {@code img} elements with {@code res?resId=} followed by the URL-encoded
 * (UTF-8) original attribute value. Element local names are compared case-insensitively.
 *
 * @param aDoc
 *            the document whose format decides whether the filter is installed.
 * @param aDelegate
 *            the handler that receives the (possibly rewritten) events.
 * @return the delegate itself or a filtering wrapper around it.
 */
protected ContentHandler applyHtmlResourceUrlFilter(SourceDocument aDoc,
        ContentHandler aDelegate)
{
    var format = formatRegistry.getFormatById(aDoc.getFormat());
    if (!format.map(FormatSupport::hasResources).orElse(false)) {
        return aDelegate;
    }

    return new NamspaceDecodingContentHandlerAdapter(aDelegate)
    {
        @Override
        public void startElement(String aUri, String aLocalName, String aQName,
                Attributes aAtts)
            throws SAXException
        {
            var localName = toQName(aUri, aLocalName, aQName).getLocalPart();

            // Decide which attribute (if any) carries the URL for this element
            String urlAttribute = null;
            if ("a".equalsIgnoreCase(localName) || "link".equalsIgnoreCase(localName)) {
                urlAttribute = "href";
            }
            else if ("img".equalsIgnoreCase(localName)) {
                urlAttribute = "src";
            }

            // Elements without a URL attribute of interest are forwarded untouched
            Attributes effectiveAtts = aAtts;
            if (urlAttribute != null) {
                effectiveAtts = toResourceUrl(aAtts, urlAttribute);
            }

            super.startElement(aUri, aLocalName, aQName, effectiveAtts);
        }

        /**
         * Returns a copy of the given attributes in which the value of the named attribute
         * (if present) has been replaced by a {@code res?resId=...} reference.
         */
        private Attributes toResourceUrl(Attributes aSource, String aName)
        {
            var copy = new AttributesImpl(aSource);
            var position = copy.getIndex(aName);
            if (position != -1) {
                var encoded = URLEncoder.encode(copy.getValue(position), UTF_8);
                copy.setValue(position, "res?resId=" + encoded);
            }
            return copy;
        }
    };
}

private Optional<PolicyCollection> getEditorPolicy(Optional<String> aEditor) throws IOException
{
if (!aEditor.isPresent()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,33 +37,63 @@ public boolean isBlockStyle()
return blockStyle;
}

/**
 * Sets the {@code blockStyle} flag.
 *
 * @param aBlockStyle
 *            the new value of the flag.
 */
public void setBlockStyle(boolean aBlockStyle)
{
    this.blockStyle = aBlockStyle;
}

/**
 * @return the current value of the {@code blockImg} flag.
 */
@Override
public boolean isBlockImg()
{
    return this.blockImg;
}

/**
 * Sets the {@code blockImg} flag.
 *
 * @param aBlockImg
 *            the new value of the flag.
 */
public void setBlockImg(boolean aBlockImg)
{
    this.blockImg = aBlockImg;
}

/**
 * @return the current value of the {@code blockEmbed} flag.
 */
@Override
public boolean isBlockEmbed()
{
    return this.blockEmbed;
}

/**
 * Sets the {@code blockEmbed} flag.
 *
 * @param aBlockEmbed
 *            the new value of the flag.
 */
public void setBlockEmbed(boolean aBlockEmbed)
{
    this.blockEmbed = aBlockEmbed;
}

/**
 * @return the current value of the {@code blockAudio} flag.
 */
@Override
public boolean isBlockAudio()
{
    return this.blockAudio;
}

/**
 * Sets the {@code blockAudio} flag.
 *
 * @param aBlockAudio
 *            the new value of the flag.
 */
public void setBlockAudio(boolean aBlockAudio)
{
    this.blockAudio = aBlockAudio;
}

/**
 * @return the current value of the {@code blockObject} flag.
 */
@Override
public boolean isBlockObject()
{
    return this.blockObject;
}

/**
 * Sets the {@code blockObject} flag.
 *
 * @param aBlockObject
 *            the new value of the flag.
 */
public void setBlockObject(boolean aBlockObject)
{
    this.blockObject = aBlockObject;
}

/**
 * @return the current value of the {@code blockVideo} flag.
 */
@Override
public boolean isBlockVideo()
{
    return this.blockVideo;
}

/**
 * Sets the {@code blockVideo} flag.
 *
 * @param aBlockVideo
 *            the new value of the flag.
 */
public void setBlockVideo(boolean aBlockVideo)
{
    this.blockVideo = aBlockVideo;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,22 @@ match_without_namespace: true
debug: true
policies:
- action: PASS
elements: ["html", "head", "body", "title"]
elements: ["html", "head", "body", "title"]
# Content sectioning
- action: PASS
elements: ["address", "article", "aside", "footer", "header",
"h1", "h2", "h3", "h4", "h5", "h6", "main", "section"]
# Navigation sectioning (probably not relevant for annotation)
- action: PRUNE
elements: ["nav"]
# Text content
- action: PASS
elements: ["blockquote", "dd", "div", "dl", "dt", "figcaption", "figure", "hr", "li",
"menu", "ol", "p", "pre", "ul"]
# Inline text semantics
# Links / anchors
- action: SKIP
elements: ["a"]
# Inline text semantics
- action: PASS
elements: ["abbr", "b", "bdi", "bdo", "br", "cite", "code", "data", "dfn",
"em", "i", "kbd", "mark", "q", "rp", "rt", "ruby", "s", "samp", "small",
Expand Down Expand Up @@ -63,5 +67,15 @@ policies:
- action: PASS
attributes: ["class"]
matching: "[a-zA-Z0-9\\s,\\-_]+"

# Images - depending on the security configurations, the safety net may block these
- action: PASS
elements: ["img"]
- on_elements: ["img"]
action: PASS
attributes: ["decoding", "alt", "width", "height"]
- on_elements: ["img"]
action: PASS
attributes: ["src"]
matching: "res[?]resId=.*" # Allow only access to embedded resources


Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ public class SafetyNetDocumentPolicy
{
static final String SAFETY_NET_POLICY_OVERRIDE_YAML = "safety-net.yaml";

/**
 * Names of attributes that can carry a URL or other reference. The default policy passes
 * this list to {@code disallowAttributes(...)} together with a {@code javascript:} value
 * pattern, applied globally.
 */
private static final String[] JAVASCRIPT_ACTIVE_ATTRIBUTES = { "href", "src", "codebase",
"cite", "background", "action", "longdesc", "profile", "classid", "data", "usemap",
"formaction", "icon", "manifest", "poster", "srcset", "archive" };

private static final String[] JAVASCRIPT_EVENT_ATTRIBUTES = { "onafterprint", "onbeforeprint",
"onbeforeunload", "onerror", "onhashchange", "onload", "onmessage", "onoffline",
"ononline", "onpagehide", "onpageshow", "onpopstate", "onresize", "onstorage",
Expand Down Expand Up @@ -95,9 +99,8 @@ private PolicyCollection makeDefaultPolicy()
builder.disallowElements("video");
}

builder.disallowAttributes("href", "src", "codebase", "cite", "background", "action",
"longdesc", "profile", "classid", "data", "usemap", "formaction", "icon",
"manifest", "poster", "srcset", "archive").matching(compile("\\s*javascript:.*"))
builder.disallowAttributes(JAVASCRIPT_ACTIVE_ATTRIBUTES) //
.matching(compile("\\s*javascript:.*")) //
.globally();

builder.disallowAttributes(JAVASCRIPT_EVENT_ATTRIBUTES).globally();
Expand Down
Loading

0 comments on commit ed7552c

Please sign in to comment.