Skip to content

Commit

Permalink
TIKA-3666 -- add detection of OLE2 drm encrypted files (#1204)
Browse files Browse the repository at this point in the history
Thanks to Ross Spencer for reopening this issue and linking it to an existing POI issue.
  • Loading branch information
tballison committed Jun 20, 2023
1 parent ba84b84 commit 3471d51
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,40 @@ public static MediaType detect(Set<String> names, DirectoryEntry root) {
if (names == null || names.size() == 0) {
return OLE;
}
//figure out if encrypted/pw protected first
if (names.contains("\u0006DataSpaces")) {
//OLE2 drm encrypted -- TIKA-3666
if (findRecursively(root, "\tDRMDataSpace", 0, 10)) {
return DRM_ENCRYPTED;
}
}

if (names.contains("EncryptedPackage")) {
if (names.contains("EncryptionInfo")) {
// This is a protected OOXML document, which is an OLE2 file
// with an Encrypted Stream which holds the OOXML data
// Without decrypting the stream, we can't tell what kind of
// OOXML file we have. Return a general OOXML Protected type,
// and hope the name based detection can guess the rest!

// This is the standard issue method of encryption for ooxml and
// is supported by POI

//Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
//See TIKA-2982
return OOXML_PROTECTED;
} else if (names.contains("\u0006DataSpaces")) {
//Try to look for the DRMEncrypted type (TIKA-3666); as of 5.2.0, this is not
// supported by POI, but we should still detect it.

//Do we also want to look for "DRMEncryptedTransform"?
if (findRecursively(root, "DRMEncryptedDataSpace", 0, 10)) {
return DRM_ENCRYPTED;
}
}
}


for (String workbookEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
if (names.contains(workbookEntryName)) {
MediaType tmp = processCompObjFormatType(root);
Expand Down Expand Up @@ -247,33 +281,6 @@ public static MediaType detect(Set<String> names, DirectoryEntry root) {
} else if (names.contains("Book")) {
// Excel 95 or older, we won't be able to parse this....
return XLS;
} else if (names.contains("EncryptedPackage")) {
if (names.contains("EncryptionInfo")) {
// This is a protected OOXML document, which is an OLE2 file
// with an Encrypted Stream which holds the OOXML data
// Without decrypting the stream, we can't tell what kind of
// OOXML file we have. Return a general OOXML Protected type,
// and hope the name based detection can guess the rest!

// This is the standard issue method of encryption for ooxml and
// is supported by POI

//Until Tika 1.23, we also required: && names.contains("\u0006DataSpaces")
//See TIKA-2982
return OOXML_PROTECTED;
} else if (names.contains("\u0006DataSpaces")) {
//Try to look for the DRMEncrypted type (TIKA-3666); as of 5.2.0, this is not
// supported by POI, but we should still detect it.

//Do we also want to look for "DRMEncryptedTransform"?
if (findRecursively(root, "DRMEncryptedDataSpace", 0, 10)) {
return DRM_ENCRYPTED;
} else {
return OLE;
}
} else {
return OLE;
}
} else if (names.contains("WordDocument")) {
return DOC;
} else if (names.contains("Quill")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;

Expand All @@ -36,6 +37,7 @@

import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
Expand Down Expand Up @@ -666,4 +668,12 @@ public void testSpecialControlCharacter() throws Exception {
assertContains("Paragraph one", getXML(
"testWORD_specialControlCharacter1415.doc").xml);
}

/**
 * Verifies that extracting recursive metadata from the DRM-protected
 * Word test document fails with an {@link EncryptedDocumentException}.
 */
@Test
public void testEncryptedDRM() throws Exception {
    //test file from: https://bz.apache.org/bugzilla/show_bug.cgi?id=62848
    assertThrows(EncryptedDocumentException.class,
            () -> getRecursiveMetadata("testWORD_protected_drm.doc"));
}
}
Binary file not shown.

0 comments on commit 3471d51

Please sign in to comment.