From e5ba94c5908b47fdcc673f4c12ee973fe05789d4 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Fri, 7 Oct 2022 16:27:41 -0400 Subject: [PATCH] wip on https://github.com/samtools/htsjdk/issues/1614 This is more complicated than originally described because the file encoding isn't really described and java bytes include negative values. EOF is triggered correctly, but it's also triggered on various potential characters which become negative values when they are truncated to byte. It's unclear if this is meant to only read ASCII, ISO 8859-1, or UTF-8 but nothing outside of the ascii space works correctly. --- .../htsjdk/tribble/index/AbstractIndex.java | 11 +++-- .../index/interval/IntervalTreeIndex.java | 3 +- .../tribble/util/LittleEndianInputStream.java | 33 +++++++++---- src/test/java/htsjdk/testutil/Expected.java | 48 +++++++++++++++++++ .../util/LittleEndianInputStreamTest.java | 45 +++++++++++++++++ 5 files changed, 124 insertions(+), 16 deletions(-) create mode 100644 src/test/java/htsjdk/testutil/Expected.java create mode 100644 src/test/java/htsjdk/tribble/util/LittleEndianInputStreamTest.java diff --git a/src/main/java/htsjdk/tribble/index/AbstractIndex.java b/src/main/java/htsjdk/tribble/index/AbstractIndex.java index 256e6716a6..718a81f0fa 100644 --- a/src/main/java/htsjdk/tribble/index/AbstractIndex.java +++ b/src/main/java/htsjdk/tribble/index/AbstractIndex.java @@ -29,6 +29,7 @@ import java.io.BufferedOutputStream; import java.io.File; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -302,10 +303,10 @@ private void writeHeader(final LittleEndianOutputStream dos) throws IOException private void readHeader(final LittleEndianInputStream dis) throws IOException { version = dis.readInt(); - indexedPath = IOUtil.getPath(dis.readString()); + indexedPath = IOUtil.getPath(dis.readString(StandardCharsets.US_ASCII)); indexedFileSize = dis.readLong(); indexedFileTS = dis.readLong(); - indexedFileMD5 = dis.readString(); + indexedFileMD5 = dis.readString(StandardCharsets.US_ASCII); flags = dis.readInt(); if (version < 3 && (flags & SEQUENCE_DICTIONARY_FLAG) == SEQUENCE_DICTIONARY_FLAG) { readSequenceDictionary(dis); @@ -314,8 +315,8 @@ private void readHeader(final LittleEndianInputStream dis) throws IOException { if (version >= 3) { int nProperties = dis.readInt(); while (nProperties-- > 0) { - final String key = dis.readString(); - final String value = dis.readString(); + final String key = dis.readString(StandardCharsets.US_ASCII); + final String value = dis.readString(StandardCharsets.US_ASCII); properties.put(key, value); } } @@ -332,7 +333,7 @@ private void readSequenceDictionary(final LittleEndianInputStream dis) throws IO final int size = dis.readInt(); if (size < 0) throw new IllegalStateException("Size of the sequence dictionary entries is negative"); for (int x = 0; x < size; x++) { - dis.readString(); + dis.readString(StandardCharsets.US_ASCII); dis.readInt(); } } diff --git a/src/main/java/htsjdk/tribble/index/interval/IntervalTreeIndex.java b/src/main/java/htsjdk/tribble/index/interval/IntervalTreeIndex.java index c4b2865dca..011ee26c23 100644 --- a/src/main/java/htsjdk/tribble/index/interval/IntervalTreeIndex.java +++ b/src/main/java/htsjdk/tribble/index/interval/IntervalTreeIndex.java @@ -25,6 +25,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; @@ -210,7 +211,7 @@ public void read(final LittleEndianInputStream dis) throws IOException { tree = new IntervalTree(); - name = dis.readString(); + name = dis.readString(StandardCharsets.US_ASCII); int nIntervals = dis.readInt(); while (nIntervals-- > 0) { diff --git a/src/main/java/htsjdk/tribble/util/LittleEndianInputStream.java b/src/main/java/htsjdk/tribble/util/LittleEndianInputStream.java index 350f47c830..e9d959d197 100644 --- a/src/main/java/htsjdk/tribble/util/LittleEndianInputStream.java +++ b/src/main/java/htsjdk/tribble/util/LittleEndianInputStream.java @@ -16,6 +16,8 @@ import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; /** @@ -105,22 +107,33 @@ public final float readFloat() throws IOException { } /** - * Read a null terminated byte array and return result as a string - * - * @return - * @throws IOException + * Read a null terminated byte array and return result as a String + * This method decodes theh bytes as UTF-8 string + * @throws IOException if reading from the stream fails for some reason + * @throws EOFException if the stream ends without encountering a null terminator. + * @deprecated Prefer the {@link #readString(Charset)} which allows specifying a charset explicitly */ - + @Deprecated public String readString() throws IOException { - ByteArrayOutputStream bis = new ByteArrayOutputStream(100); - byte b; - while ((b = (byte) in.read()) != 0) { + return readString(StandardCharsets.UTF_8); + } + + /** + * Read a null terminated byte array and return result as a String + * @param charset the Charset to use when decoding the bytes to a String + * @throws IOException if reading from the stream fails for some reason + * @throws EOFException if the stream ends without encountering a null terminator. + */ + public String readString(final Charset charset) throws IOException { + final ByteArrayOutputStream bis = new ByteArrayOutputStream(100); + int b; + while ((b = in.read()) != 0) { if(b < 0) { throw new EOFException(); } - bis.write(b); + bis.write((byte)b); } - return new String(bis.toByteArray()); + return bis.toString(charset.name()); } diff --git a/src/test/java/htsjdk/testutil/Expected.java b/src/test/java/htsjdk/testutil/Expected.java new file mode 100644 index 0000000000..2ab640bc19 --- /dev/null +++ b/src/test/java/htsjdk/testutil/Expected.java @@ -0,0 +1,48 @@ +package htsjdk.testutil; + +import org.testng.Assert; + +public interface Expected { + void test(ThrowingSupplier functionToTest); + + + interface ThrowingConsumer { + void test(T a) throws Exception; + } + + static Expected match(final T expected) { + return new ComparisonExpected<>((T actual) -> Assert.assertEquals(actual, expected)); + } + + static Expected mismatch(final T expected) { + return new ComparisonExpected<>((T actual) -> Assert.assertNotEquals(actual, expected)); + } + + static Expected exception(final Class exceptionClass) { + return functionToTest -> Assert.assertThrows(exceptionClass, functionToTest::produce); + } + + interface ThrowingSupplier { + T produce() throws Exception; + } +} + +final class ComparisonExpected implements Expected { + private final ThrowingConsumer test; + + @Override + public void test(ThrowingSupplier supplier) { + try { + test.test(supplier.produce()); + } catch (AssertionError e) { + throw e; + } catch (Exception e) { + throw new AssertionError(e); + } + } + + ComparisonExpected(ThrowingConsumer test) { + this.test = test; + } + +} diff --git a/src/test/java/htsjdk/tribble/util/LittleEndianInputStreamTest.java b/src/test/java/htsjdk/tribble/util/LittleEndianInputStreamTest.java new file mode 100644 index 0000000000..a25d3eb3eb --- /dev/null +++ b/src/test/java/htsjdk/tribble/util/LittleEndianInputStreamTest.java @@ -0,0 +1,45 @@ +package htsjdk.tribble.util; + +import htsjdk.HtsjdkTest; +import htsjdk.testutil.Expected; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.BufferedInputStream; +import java.io.EOFException; +import java.io.FileInputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; + +public class LittleEndianInputStreamTest extends HtsjdkTest { + + + @DataProvider + public Object[][] testCases() { + final String missingTerminator = "src/test/resources/htsjdk/tribble/util/string_with_extended_ascii_no_terminator.bin"; + final String extendedAsciiFile = "src/test/resources/htsjdk/tribble/util/string_with_extended_ascii_and_null_terminator.bin"; + final Object utf8File = "src/test/resources/htsjdk/tribble/util/string_with_utf8_emoji_and_null_terminator.txt"; + return new Object[][]{ + {missingTerminator, StandardCharsets.ISO_8859_1, Expected.exception(EOFException.class)}, + {missingTerminator, StandardCharsets.US_ASCII, Expected.exception(EOFException.class)}, + {missingTerminator, StandardCharsets.UTF_8, Expected.exception(EOFException.class)}, + {extendedAsciiFile, StandardCharsets.ISO_8859_1, Expected.match("very dràààààmatic and null terminated")}, + {extendedAsciiFile, StandardCharsets.US_ASCII, Expected.mismatch("very dràààààmatic and null terminated")}, + {extendedAsciiFile, StandardCharsets.UTF_8, Expected.mismatch("very dràààààmatic and null terminated")}, + {utf8File, StandardCharsets.UTF_8, Expected.match("🐋 UTF8 is Great 🐋")}, + {utf8File, StandardCharsets.ISO_8859_1, Expected.mismatch("🐋 UTF8 is Great 🐋")} + }; + } + + @Test(dataProvider = "testCases") + public void testAllCases(String filename, Charset charset, Expected expected) { + expected.test(() -> { + try(final LittleEndianInputStream in = new LittleEndianInputStream(new BufferedInputStream(Files.newInputStream(Paths.get(filename))))){ + return in.readString(charset); + } + }); + } + +} \ No newline at end of file