diff --git a/src/main/java/htsjdk/tribble/index/AbstractIndex.java b/src/main/java/htsjdk/tribble/index/AbstractIndex.java index 256e6716a6..718a81f0fa 100644 --- a/src/main/java/htsjdk/tribble/index/AbstractIndex.java +++ b/src/main/java/htsjdk/tribble/index/AbstractIndex.java @@ -29,6 +29,7 @@ import java.io.BufferedOutputStream; import java.io.File; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -302,10 +303,10 @@ private void writeHeader(final LittleEndianOutputStream dos) throws IOException private void readHeader(final LittleEndianInputStream dis) throws IOException { version = dis.readInt(); - indexedPath = IOUtil.getPath(dis.readString()); + indexedPath = IOUtil.getPath(dis.readString(StandardCharsets.US_ASCII)); indexedFileSize = dis.readLong(); indexedFileTS = dis.readLong(); - indexedFileMD5 = dis.readString(); + indexedFileMD5 = dis.readString(StandardCharsets.US_ASCII); flags = dis.readInt(); if (version < 3 && (flags & SEQUENCE_DICTIONARY_FLAG) == SEQUENCE_DICTIONARY_FLAG) { readSequenceDictionary(dis); @@ -314,8 +315,8 @@ private void readHeader(final LittleEndianInputStream dis) throws IOException { if (version >= 3) { int nProperties = dis.readInt(); while (nProperties-- > 0) { - final String key = dis.readString(); - final String value = dis.readString(); + final String key = dis.readString(StandardCharsets.US_ASCII); + final String value = dis.readString(StandardCharsets.US_ASCII); properties.put(key, value); } } @@ -332,7 +333,7 @@ private void readSequenceDictionary(final LittleEndianInputStream dis) throws IO final int size = dis.readInt(); if (size < 0) throw new IllegalStateException("Size of the sequence dictionary entries is negative"); for (int x = 0; x < size; x++) { - dis.readString(); + dis.readString(StandardCharsets.US_ASCII); dis.readInt(); } } diff --git a/src/main/java/htsjdk/tribble/index/interval/IntervalTreeIndex.java b/src/main/java/htsjdk/tribble/index/interval/IntervalTreeIndex.java index c4b2865dca..011ee26c23 100644 --- a/src/main/java/htsjdk/tribble/index/interval/IntervalTreeIndex.java +++ b/src/main/java/htsjdk/tribble/index/interval/IntervalTreeIndex.java @@ -25,6 +25,7 @@ import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; @@ -210,7 +211,7 @@ public void read(final LittleEndianInputStream dis) throws IOException { tree = new IntervalTree(); - name = dis.readString(); + name = dis.readString(StandardCharsets.US_ASCII); int nIntervals = dis.readInt(); while (nIntervals-- > 0) { diff --git a/src/main/java/htsjdk/tribble/util/LittleEndianInputStream.java b/src/main/java/htsjdk/tribble/util/LittleEndianInputStream.java index 350f47c830..e9d959d197 100644 --- a/src/main/java/htsjdk/tribble/util/LittleEndianInputStream.java +++ b/src/main/java/htsjdk/tribble/util/LittleEndianInputStream.java @@ -16,6 +16,8 @@ import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; /** @@ -105,22 +107,33 @@ public final float readFloat() throws IOException { } /** - * Read a null terminated byte array and return result as a string - * - * @return - * @throws IOException + * Read a null terminated byte array and return result as a String + * This method decodes theh bytes as UTF-8 string + * @throws IOException if reading from the stream fails for some reason + * @throws EOFException if the stream ends without encountering a null terminator. + * @deprecated Prefer the {@link #readString(Charset)} which allows specifying a charset explicitly */ - + @Deprecated public String readString() throws IOException { - ByteArrayOutputStream bis = new ByteArrayOutputStream(100); - byte b; - while ((b = (byte) in.read()) != 0) { + return readString(StandardCharsets.UTF_8); + } + + /** + * Read a null terminated byte array and return result as a String + * @param charset the Charset to use when decoding the bytes to a String + * @throws IOException if reading from the stream fails for some reason + * @throws EOFException if the stream ends without encountering a null terminator. + */ + public String readString(final Charset charset) throws IOException { + final ByteArrayOutputStream bis = new ByteArrayOutputStream(100); + int b; + while ((b = in.read()) != 0) { if(b < 0) { throw new EOFException(); } - bis.write(b); + bis.write((byte)b); } - return new String(bis.toByteArray()); + return bis.toString(charset.name()); } diff --git a/src/test/java/htsjdk/testutil/Expected.java b/src/test/java/htsjdk/testutil/Expected.java new file mode 100644 index 0000000000..2ab640bc19 --- /dev/null +++ b/src/test/java/htsjdk/testutil/Expected.java @@ -0,0 +1,48 @@ +package htsjdk.testutil; + +import org.testng.Assert; + +public interface Expected { + void test(ThrowingSupplier functionToTest); + + + interface ThrowingConsumer { + void test(T a) throws Exception; + } + + static Expected match(final T expected) { + return new ComparisonExpected<>((T actual) -> Assert.assertEquals(actual, expected)); + } + + static Expected mismatch(final T expected) { + return new ComparisonExpected<>((T actual) -> Assert.assertNotEquals(actual, expected)); + } + + static Expected exception(final Class exceptionClass) { + return functionToTest -> Assert.assertThrows(exceptionClass, functionToTest::produce); + } + + interface ThrowingSupplier { + T produce() throws Exception; + } +} + +final class ComparisonExpected implements Expected { + private final ThrowingConsumer test; + + @Override + public void test(ThrowingSupplier supplier) { + try { + test.test(supplier.produce()); + } catch (AssertionError e) { + throw e; + } catch (Exception e) { + throw new AssertionError(e); + } + } + + ComparisonExpected(ThrowingConsumer test) { + this.test = test; + } + +} diff --git a/src/test/java/htsjdk/tribble/util/LittleEndianInputStreamTest.java b/src/test/java/htsjdk/tribble/util/LittleEndianInputStreamTest.java new file mode 100644 index 0000000000..a25d3eb3eb --- /dev/null +++ b/src/test/java/htsjdk/tribble/util/LittleEndianInputStreamTest.java @@ -0,0 +1,45 @@ +package htsjdk.tribble.util; + +import htsjdk.HtsjdkTest; +import htsjdk.testutil.Expected; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.BufferedInputStream; +import java.io.EOFException; +import java.io.FileInputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; + +public class LittleEndianInputStreamTest extends HtsjdkTest { + + + @DataProvider + public Object[][] testCases() { + final String missingTerminator = "src/test/resources/htsjdk/tribble/util/string_with_extended_ascii_no_terminator.bin"; + final String extendedAsciiFile = "src/test/resources/htsjdk/tribble/util/string_with_extended_ascii_and_null_terminator.bin"; + final Object utf8File = "src/test/resources/htsjdk/tribble/util/string_with_utf8_emoji_and_null_terminator.txt"; + return new Object[][]{ + {missingTerminator, StandardCharsets.ISO_8859_1, Expected.exception(EOFException.class)}, + {missingTerminator, StandardCharsets.US_ASCII, Expected.exception(EOFException.class)}, + {missingTerminator, StandardCharsets.UTF_8, Expected.exception(EOFException.class)}, + {extendedAsciiFile, StandardCharsets.ISO_8859_1, Expected.match("very dràààààmatic and null terminated")}, + {extendedAsciiFile, StandardCharsets.US_ASCII, Expected.mismatch("very dràààààmatic and null terminated")}, + {extendedAsciiFile, StandardCharsets.UTF_8, Expected.mismatch("very dràààààmatic and null terminated")}, + {utf8File, StandardCharsets.UTF_8, Expected.match("🐋 UTF8 is Great 🐋")}, + {utf8File, StandardCharsets.ISO_8859_1, Expected.mismatch("🐋 UTF8 is Great 🐋")} + }; + } + + @Test(dataProvider = "testCases") + public void testAllCases(String filename, Charset charset, Expected expected) { + expected.test(() -> { + try(final LittleEndianInputStream in = new LittleEndianInputStream(new BufferedInputStream(Files.newInputStream(Paths.get(filename))))){ + return in.readString(charset); + } + }); + } + +} \ No newline at end of file