Skip to content

Commit

Permalink
TIKA-4310 -- add CloseShieldInputStream to JsoupParser to ensure that…
Browse files Browse the repository at this point in the history
… underlying stream is not closed
  • Loading branch information
tballison committed Sep 16, 2024
1 parent 10dc7f6 commit 9492c47
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import java.util.Set;
import javax.xml.XMLConstants;

import org.apache.commons.io.input.CloseShieldInputStream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.DataNode;
Expand Down Expand Up @@ -127,7 +128,7 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
HtmlMapper mapper = context.get(HtmlMapper.class, new DefaultHtmlMapper());

//do better with baseUri?
Document document = Jsoup.parse(stream, charset.name(), "");
Document document = Jsoup.parse(CloseShieldInputStream.wrap(stream), charset.name(), "");
document.quirksMode(Document.QuirksMode.quirks);
ContentHandler xhtml = new XHTMLDowngradeHandler(
new HtmlHandler(mapper, handler, metadata, context, extractScripts));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import java.io.Writer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
Expand Down Expand Up @@ -78,6 +79,7 @@
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;

public class HtmlParserTest extends TikaTest {

Expand Down Expand Up @@ -1243,6 +1245,20 @@ public void testPreferenceForTitleElement() throws Exception {
assertEquals("OldMetaTitle", m.get("title"));
}

/**
 * Regression test for TIKA-4310: parsing with {@link JSoupParser} must not
 * close the stream that the caller handed in.
 *
 * <p>The stream is first spooled to a temporary file; after the parse the
 * test checks, while the stream is still open, that the file is still a
 * regular file on disk.
 *
 * @throws Exception if the test document cannot be read or parsed
 */
@Test
public void testStreamNotClosed() throws Exception {
    try (TikaInputStream tis =
                 TikaInputStream.get(getResourceAsStream("/test-documents/testHTML.html"))) {
        // asking for the path spools the tika stream to disk
        Path spooledFile = tis.getPath();

        JSoupParser parser = new JSoupParser();
        parser.parse(tis, new WriteOutContentHandler(), new Metadata(), new ParseContext());

        // the tmp file must still be there once the parser is done
        // NOTE(review): presumably the file would be gone if the parser had
        // closed the stream -- confirm against TikaInputStream.close()
        boolean stillOnDisk = Files.isRegularFile(spooledFile);
        assertTrue(stillOnDisk);
    }
}

private class EncodingDetectorRunner implements Callable<String> {

final static String DONE = "done";
Expand Down

0 comments on commit 9492c47

Please sign in to comment.