Skip to content

Commit

Permalink
TIKA-4288 -- users should be able to configure the metadata filter per parse via the ParseContext on the ParseEmitTuple with PipesServer (#1867)
Browse files Browse the repository at this point in the history
  • Loading branch information
tballison authored Jul 30, 2024
1 parent a53af59 commit 137c0d1
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 6 deletions.
13 changes: 9 additions & 4 deletions tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.DigestingParser;
Expand Down Expand Up @@ -398,8 +399,12 @@ private void actuallyParse(FetchEmitTuple t) {
private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData) {
long start = System.currentTimeMillis();
String stack = getContainerStacktrace(t, parseData.getMetadataList());
//we need to apply this after we pull out the stacktrace
filterMetadata(parseData.getMetadataList());
//we need to apply the metadata filter after we pull out the stacktrace
MetadataFilter filter = t.getParseContext().get(MetadataFilter.class);
if (filter == null) {
filter = tikaConfig.getMetadataFilter();
}
filterMetadata(filter, parseData.getMetadataList());
ParseContext parseContext = t.getParseContext();
FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = t.getOnParseException();
EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig = parseContext.get(EmbeddedDocumentBytesConfig.class);
Expand Down Expand Up @@ -432,10 +437,10 @@ private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseD
}
}

private void filterMetadata(List<Metadata> metadataList) {
private void filterMetadata(MetadataFilter metadataFilter, List<Metadata> metadataList) {
for (Metadata m : metadataList) {
try {
tikaConfig.getMetadataFilter().filter(m);
metadataFilter.filter(m);
} catch (TikaException e) {
LOG.warn("failed to filter metadata", e);
}
Expand Down
22 changes: 20 additions & 2 deletions tika-core/src/test/java/org/apache/tika/pipes/PipesClientTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;

import org.junit.jupiter.api.Assertions;
Expand All @@ -28,11 +29,14 @@

import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.filter.CompositeMetadataFilter;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.MockUpperCaseFilter;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.emitter.EmitKey;
import org.apache.tika.pipes.fetcher.FetchKey;

class PipesClientTest {
public class PipesClientTest {
String fetcherName = "fs";
String testPdfFile = "testOverlappingText.pdf";

Expand All @@ -49,7 +53,7 @@ public void init()
}

@Test
void process() throws IOException, InterruptedException {
public void testBasic() throws IOException, InterruptedException {
PipesResult pipesResult = pipesClient.process(
new FetchEmitTuple(testPdfFile, new FetchKey(fetcherName, testPdfFile),
new EmitKey(), new Metadata(), new ParseContext(), FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP));
Expand All @@ -58,4 +62,18 @@ void process() throws IOException, InterruptedException {
Metadata metadata = pipesResult.getEmitData().getMetadataList().get(0);
Assertions.assertEquals("testOverlappingText.pdf", metadata.get("resourceName"));
}

@Test
public void testMetadataFilter() throws IOException, InterruptedException {
    // Register a per-parse MetadataFilter on the ParseContext carried by the tuple.
    MetadataFilter upperCaseFilter = new CompositeMetadataFilter(List.of(new MockUpperCaseFilter()));
    ParseContext context = new ParseContext();
    context.set(MetadataFilter.class, upperCaseFilter);

    FetchEmitTuple tuple = new FetchEmitTuple(testPdfFile, new FetchKey(fetcherName, testPdfFile),
            new EmitKey(), new Metadata(), context, FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);
    PipesResult result = pipesClient.process(tuple);

    List<Metadata> emitted = result.getEmitData().getMetadataList();
    Assertions.assertNotNull(emitted);
    Assertions.assertEquals(1, emitted.size());

    // With the filter supplied via the ParseContext, the resource name is
    // expected to come back upper-cased.
    Assertions.assertEquals("TESTOVERLAPPINGTEXT.PDF", emitted.get(0).get("resourceName"));
}
}

0 comments on commit 137c0d1

Please sign in to comment.