Skip to content

Commit

Permalink
TIKA-4247 HttpFetcher - add ability to send request headers (#1737)
Browse files Browse the repository at this point in the history
* TIKA-4247 HttpFetcher - add ability to send request headers

Set headers in a metadata value for "httpRequestHeaders";
those will be sent along with the HTTP request.

* TIKA-4252: fix wrong char

* TIKA-4252: cleaner fix

* TIKA-4252: add http request headers at fetcher config level

* conflict resolution

---------

Co-authored-by: Nicholas DiPiazza <[email protected]>
Co-authored-by: Tilman Hausherr <[email protected]>
  • Loading branch information
3 people authored Aug 7, 2024
1 parent 330dc58 commit 3a0ddc7
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,12 @@
import java.nio.file.StandardCopyOption;
import java.security.PrivateKey;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.atomic.AtomicBoolean;
Expand Down Expand Up @@ -131,6 +134,25 @@ public HttpFetcher(HttpFetcherConfig httpFetcherConfig) {
//primary client
//NOTE(review): presumably the compression-enabled counterpart of noCompressHttpClient -- confirm
private HttpClient httpClient;
//back-off client that disables compression
private HttpClient noCompressHttpClient;
//maximum number of redirects; initialized to 10
private int maxRedirects = 10;
//overall timeout in milliseconds
//NOTE(review): -1 presumably means "no overall timeout" -- confirm
private long overallTimeout = -1;

//NOTE(review): presumably the max number of bytes to spool, with -1 meaning
//"no limit" -- confirm against the code that reads this value
private long maxSpoolSize = -1;

//max string length to read from a result if the
//status code was not in the 200 range
private int maxErrMsgSize = 10000;

//httpHeaders to capture in the metadata
private Set<String> httpHeaders = new HashSet<>();

//httpRequestHeaders to add to all outgoing http requests;
//each entry is a single "name: value" string
private Set<String> httpRequestHeaders = new HashSet<>();

//When making the request, what User-Agent is sent.
//By default httpclient adds e.g. "Apache-HttpClient/4.5.13 (Java/x.y.z)"
//null (the initial value) or blank means no User-Agent header is set explicitly.
private String userAgent = null;

//NOTE(review): package-private and not initialized here; presumably used to
//generate JWTs for authenticated requests -- confirm
JwtGenerator jwtGenerator;


Expand All @@ -144,10 +166,40 @@ public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseC
.setRedirectsEnabled(httpFetcherConfig.getMaxRedirects() > 0)
.build();
get.setConfig(requestConfig);
setHttpRequestHeaders(metadata, get);
putAdditionalHeadersOnRequest(additionalHttpFetcherConfig, get);
return execute(get, metadata, httpClient, true);
}

/**
 * Populates the outgoing GET request with request headers.
 * <p>
 * Headers are applied in this order: the configured User-Agent (when it is not blank),
 * the headers configured on this fetcher, and finally any per-fetch headers stored under
 * the "httpRequestHeaders" key of the supplied metadata.
 *
 * @param metadata metadata of the current fetch; its "httpRequestHeaders" values are
 *                 handed to {@code parseHeaderAndPutOnRequest} one by one
 * @param get      the request to put the headers on
 */
private void setHttpRequestHeaders(Metadata metadata, HttpGet get) {
    boolean userAgentConfigured = !StringUtils.isBlank(userAgent);
    if (userAgentConfigured) {
        get.setHeader(USER_AGENT, userAgent);
    }

    // Headers that come from the fetcher configuration.
    if (this.httpRequestHeaders != null) {
        for (String configuredHeader : this.httpRequestHeaders) {
            parseHeaderAndPutOnRequest(get, configuredHeader);
        }
    }

    // Headers supplied for this particular fetch via the metadata.
    String[] perFetchHeaders = metadata.getValues("httpRequestHeaders");
    if (perFetchHeaders == null) {
        return;
    }
    for (int i = 0; i < perFetchHeaders.length; i++) {
        parseHeaderAndPutOnRequest(get, perFetchHeaders[i]);
    }
}

/**
 * Parses a single {@code "name: value"} string and sets it as a header on the request.
 * <p>
 * The string is split at the first colon; name and value are trimmed. Entries that
 * cannot produce a usable header are skipped silently: {@code null}, entries without
 * a colon, and entries whose name is empty after trimming (e.g. {@code ": value"}).
 * An empty value (e.g. {@code "name:"}) is still set.
 *
 * @param get               the request to set the header on
 * @param httpRequestHeader the raw header string, may be {@code null}
 */
private static void parseHeaderAndPutOnRequest(HttpGet get, String httpRequestHeader) {
    if (httpRequestHeader == null) {
        return;
    }
    int separator = httpRequestHeader.indexOf(':');
    if (separator < 0) {
        // no colon, so there is no name/value pair to extract
        return;
    }
    String key = httpRequestHeader.substring(0, separator).trim();
    if (key.isEmpty()) {
        // a header without a name would produce a malformed header line
        return;
    }
    String value = httpRequestHeader.substring(separator + 1).trim();
    get.setHeader(key, value);
}

private HttpFetcherConfig getAdditionalHttpFetcherConfig(ParseContext parseContext) throws JsonProcessingException {
HttpFetcherConfig additionalHttpFetcherConfig = null;
FetcherConfigContainer fetcherConfigContainer = parseContext.get(FetcherConfigContainer.class);
Expand Down Expand Up @@ -466,20 +518,24 @@ public void setMaxRedirects(int maxRedirects) {
*/
@Field
public void setHttpRequestHeaders(List<String> headers) {
    // Always start from a clean slate so repeated calls do not accumulate headers.
    this.httpRequestHeaders.clear();
    httpFetcherConfig.setHttpRequestHeaders(new HttpHeaders());
    if (headers == null) {
        // null means "no request headers"; nothing further to register
        return;
    }
    this.httpRequestHeaders.addAll(headers);

    // Collect everything into a local map and hand it over in one go.
    // httpFetcherConfig.getHttpRequestHeaders().getMap() doesn't work:
    // "The map does not support put or putAll, nor do its entries support setValue."
    Map<String, Collection<String>> allParsedHeaders = new HashMap<>();
    for (String header : headers) {
        allParsedHeaders.putAll(parseHeaders(header));
    }
    httpFetcherConfig.getHttpRequestHeaders().setMap(allParsedHeaders);
}

public static Map<String, List<String>> parseHeaders(String headersString) {
Map<String, List<String>> headersMap = new HashMap<>();
public static Map<String, Collection<String>> parseHeaders(String headersString) {
Map<String, Collection<String>> headersMap = new HashMap<>();
String[] headers = headersString.split("\n");
for (String header : headers) {
String[] keyValue = header.split(":", 2);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.fetcher.FetcherManager;
Expand Down Expand Up @@ -97,6 +98,7 @@ public void before() throws Exception {
httpFetcherConfig.setOverallTimeout(400_000L);
httpFetcherConfig.setMaxSpoolSize(-1L);

httpFetcher = new HttpFetcher();
final HttpResponse mockResponse = buildMockResponse(HttpStatus.SC_OK, IOUtils.toInputStream(CONTENT, Charset.defaultCharset()));

mockClientResponse(mockResponse);
Expand Down Expand Up @@ -224,6 +226,13 @@ public String getReasonPhrase() {
Assertions.assertEquals("fromFetchConfigValue2", fromFetchConfig2s.get(0));
Assertions.assertEquals("fromFetchConfigValue3", fromFetchConfig2s.get(1));

metadata.set(Property.externalText("httpRequestHeaders"), new String[] {" nick1 : val1", "nick2: val2"});
httpFetcher.fetch("http://localhost", metadata, parseContext);
httpGet = httpGetArgumentCaptor.getValue();
Assertions.assertEquals("val1", httpGet.getHeaders("nick1")[0].getValue());
Assertions.assertEquals("val2", httpGet.getHeaders("nick2")[0].getValue());
// also make sure the headers from the fetcher config level are specified - see src/test/resources/tika-config-http.xml
Assertions.assertEquals("headerValueFromFetcherConfig", httpGet.getHeaders("headerNameFromFetcherConfig")[0].getValue());
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
<header>Expires</header>
<header>Content-Length</header>
</httpHeaders>
<httpRequestHeaders>
<header>headerNameFromFetcherConfig: headerValueFromFetcherConfig</header>
</httpRequestHeaders>
</fetcher>
</fetchers>
</properties>
</properties>

0 comments on commit 3a0ddc7

Please sign in to comment.