Skip to content

Commit

Permalink
[TIKA-4309] Support MachO
Browse files Browse the repository at this point in the history
  • Loading branch information
alexey-pelykh committed Sep 20, 2024
1 parent ec45a2e commit 59bc2d1
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 27 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -424,18 +424,6 @@
<glob pattern="*.class"/>
</mime-type>

<!-- Java native library for OSX: the magic requires 0xcafebabe at offset 0 and,
     nested under it, one of four Mach-O magic values at offset 4096.
     Files are also matched by the .jnilib extension. -->
<mime-type type="application/x-java-jnilib">
<_comment>Java Native Library for OSX</_comment>
<magic priority="50">
<match value="0xcafebabe" type="string" offset="0">
<match value="0xfeedface" type="string" offset="4096"/>
<match value="0xfeedfacf" type="string" offset="4096"/>
<match value="0xcefaedfe" type="string" offset="4096"/>
<match value="0xcffaedfe" type="string" offset="4096"/>
</match>
</magic>
<glob pattern="*.jnilib"/>
</mime-type>
<mime-type type="application/vnd.java.hprof ">
<_comment>Java hprof text file</_comment>
<magic priority="50">
Expand Down Expand Up @@ -4306,14 +4294,6 @@
<_comment>LZMA compressed archive</_comment>
<glob pattern="*.lzma"/>
</mime-type>
<!-- Mach-O detection by magic number at offset 0.
     NOTE(review): only 0xFEEDFACF and 0xCFFAEDFE are listed here; the values
     0xFEEDFACE and 0xCEFAEDFE that appear in the jnilib rule are absent.
     Confirm whether that omission is intentional. -->
<mime-type type="application/x-mach-o">
<_comment>Mach-O</_comment>
<tika:link>https://www.nationalarchives.gov.uk/PRONOM/fmt/693</tika:link>
<magic priority="50">
<match value="0xFEEDFACF" offset="0"/>
<match value="0xCFFAEDFE" offset="0"/>
</magic>
</mime-type>
<mime-type type="application/x-memgraph">
<_comment>Apple Xcode Memgraph</_comment>
<sub-class-of type="application/x-bplist"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ public class ExecutableParser implements Parser, MachineMetadata {
// Media types for the specific ELF kinds named by each constant
// (executable, shared library, core dump).
private static final MediaType ELF_EXECUTABLE = MediaType.application("x-executable");
private static final MediaType ELF_SHAREDLIB = MediaType.application("x-sharedlib");
private static final MediaType ELF_COREDUMP = MediaType.application("x-coredump");
// Content type that parseMachO and parseFatMachO write into the metadata.
private static final MediaType MACH_O_BINARY = MediaType.application("x-mach-binary");
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(
Arrays.asList(PE_EXE, ELF_GENERAL, ELF_OBJECT, ELF_EXECUTABLE, ELF_SHAREDLIB,
Expand All @@ -63,7 +64,7 @@ public Set<MediaType> getSupportedTypes(ParseContext context) {
}

public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
ParseContext context) throws IOException, SAXException, TikaException {
// We only do metadata, for now
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
Expand All @@ -76,9 +77,14 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
} else if (first4[0] == (byte) 0x7f && first4[1] == (byte) 'E' && first4[2] == (byte) 'L' &&
first4[3] == (byte) 'F') {
parseELF(xhtml, metadata, stream, first4);
} else if ((first4[0] == (byte) 0xCF || first4[0] == (byte) 0xCE) &&
first4[1] == (byte) 0xFA && first4[2] == (byte) 0xED && first4[3] == (byte) 0xFE) {
parseMachO(xhtml, metadata, stream, first4);
} else if (first4[0] == (byte) 0xCA && first4[1] == (byte) 0xFE &&
first4[2] == (byte) 0xBA && first4[3] == (byte) 0xBE) {
parseFatMachO(xhtml, metadata, stream, first4);
}


// Finish everything
xhtml.endDocument();
}
Expand All @@ -87,7 +93,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
* Parses a DOS or Windows PE file
*/
public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream,
byte[] first4) throws TikaException, IOException {
byte[] first4) throws TikaException, IOException {
metadata.set(Metadata.CONTENT_TYPE, PE_EXE.toString());
metadata.set(PLATFORM, PLATFORM_WINDOWS);

Expand All @@ -98,13 +104,14 @@ public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream st
// Grab the PE header offset
int peOffset = EndianUtils.readIntLE(stream);

// Reasonability check - while it may go anywhere, it's normally in the first few kb
// Reasonability check - while it may go anywhere, it's normally in the first
// few kb
if (peOffset > 4096 || peOffset < 0x3f) {
return;
}

// Skip the rest of the MS-DOS stub (if PE), until we reach what should
// be the PE header (if this is a PE executable)
// be the PE header (if this is a PE executable)
stream.skip(peOffset - 0x40);

// Read the PE header
Expand Down Expand Up @@ -233,7 +240,7 @@ public void parsePE(XHTMLContentHandler xhtml, Metadata metadata, InputStream st
* Parses a Unix ELF file
*/
public void parseELF(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream,
byte[] first4) throws TikaException, IOException {
byte[] first4) throws TikaException, IOException {
// Byte 5 is the architecture
int architecture = stream.read();
if (architecture == 1) {
Expand Down Expand Up @@ -400,8 +407,62 @@ public void parseELF(XHTMLContentHandler xhtml, Metadata metadata, InputStream s
break;
}


// Bytes 20-23 are the version
// TODO
}

/**
 * Parses a thin Mach-O file whose header is stored little-endian.
 * <p>
 * Reads the 4-byte CPU type that follows the magic number, records the
 * architecture width and the machine type when they are recognised, and
 * finally sets the content type to {@code application/x-mach-binary}.
 *
 * @param xhtml    the XHTML output handler; not used by this method
 * @param metadata receives {@code ARCHITECTURE_BITS}, {@code MACHINE_TYPE}
 *                 and the content type
 * @param stream   the input; the CPU type is read from its current position
 *                 (expected to be directly after the 4 magic bytes)
 * @param first4   the magic bytes already consumed by the caller; not used here
 * @throws TikaException declared by the signature; NOTE(review): presumably
 *                       raised by EndianUtils on truncated input - confirm
 * @throws IOException   if the CPU type cannot be read from the stream
 */
public void parseMachO(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream,
        byte[] first4) throws TikaException, IOException {
    // Bytes 5-8 are the CPU type and architecture bits
    int cpuType = EndianUtils.readIntLE(stream);

    // The top byte carries the ABI flags; a value of 1 marks a 64-bit CPU type
    if ((cpuType >> 24) == 1) {
        metadata.set(ARCHITECTURE_BITS, "64");
    }

    switch (cpuType) {
        case 1:
            metadata.set(MACHINE_TYPE, MACHINE_VAX);
            break;
        case 6:
            metadata.set(MACHINE_TYPE, MACHINE_M68K);
            break;
        case 7:
            metadata.set(MACHINE_TYPE, MACHINE_x86_32);
            break;
        // The 64-bit flag must be OR-ed into the CPU family. Using '&' here
        // collapses the label to 0, so x86_64 binaries were never recognised.
        case (7 | 0x01000000):
            metadata.set(MACHINE_TYPE, MACHINE_x86_64);
            break;
        case 8:
            metadata.set(MACHINE_TYPE, MACHINE_MIPS);
            break;
        case 12:
        case (12 | 0x01000000):
            // 32-bit and 64-bit ARM share a single machine type
            metadata.set(MACHINE_TYPE, MACHINE_ARM);
            break;
        case 13:
            metadata.set(MACHINE_TYPE, MACHINE_M88K);
            break;
        case 14:
            metadata.set(MACHINE_TYPE, MACHINE_SPARC);
            break;
        case 18:
            metadata.set(MACHINE_TYPE, MACHINE_PPC);
            break;
        default:
            // Unrecognised CPU type: leave MACHINE_TYPE unset
            break;
    }

    metadata.set(Metadata.CONTENT_TYPE, MACH_O_BINARY.toString());
}


/**
 * Handles a fat (universal) Mach-O file.
 * <p>
 * Nothing is read from the stream; only the content type is recorded.
 *
 * @param xhtml    the XHTML output handler; not used by this method
 * @param metadata receives the content type
 * @param stream   the input stream; not read by this method
 * @param first4   the magic bytes already consumed by the caller; not used here
 * @throws TikaException declared by the signature; not thrown by the current body
 * @throws IOException   declared by the signature; not thrown by the current body
 */
public void parseFatMachO(XHTMLContentHandler xhtml, Metadata metadata, InputStream stream,
        byte[] first4) throws TikaException, IOException {
    // TODO: There's no way to return multiple architectures and/or machine types
    final String machOContentType = MACH_O_BINARY.toString();
    metadata.set(Metadata.CONTENT_TYPE, machOContentType);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,37 @@ public void testElfParser_x86_32() throws Exception {

}

/**
 * A thin x86_64 Mach-O binary must report the Mach-O content type,
 * the x86_64 machine type, 64 architecture bits and an empty body.
 */
@Test
public void testMachOParser_x86_64() throws Exception {
    XMLResult parsed = getXML("testMacOS-x86_64");

    // Content type
    assertEquals("application/x-mach-binary", parsed.metadata.get(Metadata.CONTENT_TYPE));

    // Machine details taken from the CPU type field
    assertEquals(ExecutableParser.MACHINE_x86_64,
            parsed.metadata.get(ExecutableParser.MACHINE_TYPE));
    assertEquals("64", parsed.metadata.get(ExecutableParser.ARCHITECTURE_BITS));

    // The parser emits metadata only
    assertContains("<body />", parsed.xml);
}

/**
 * A thin arm64 Mach-O binary must report the Mach-O content type,
 * the ARM machine type, 64 architecture bits and an empty body.
 */
@Test
public void testMachOParser_arm64() throws Exception {
    XMLResult parsed = getXML("testMacOS-arm64");

    // Content type
    assertEquals("application/x-mach-binary", parsed.metadata.get(Metadata.CONTENT_TYPE));

    // Machine details taken from the CPU type field
    assertEquals(ExecutableParser.MACHINE_ARM,
            parsed.metadata.get(ExecutableParser.MACHINE_TYPE));
    assertEquals("64", parsed.metadata.get(ExecutableParser.ARCHITECTURE_BITS));

    // The parser emits metadata only
    assertContains("<body />", parsed.xml);
}

/**
 * A fat (universal) x86_64 + arm64 binary must report the Mach-O content
 * type and an empty body; no machine details are asserted.
 */
@Test
public void testMachOParser_x86_64_arm64() throws Exception {
    XMLResult parsed = getXML("testMacOS-x86_64-arm64");

    // Content type
    assertEquals("application/x-mach-binary", parsed.metadata.get(Metadata.CONTENT_TYPE));

    // The parser emits metadata only
    assertContains("<body />", parsed.xml);
}

}
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 59bc2d1

Please sign in to comment.