diff --git a/scripts/install-samtools.sh b/scripts/install-samtools.sh index e847b9af79..97238f6d2f 100755 --- a/scripts/install-samtools.sh +++ b/scripts/install-samtools.sh @@ -1,6 +1,6 @@ #!/bin/sh set -ex wget https://github.com/samtools/samtools/releases/download/1.14/samtools-1.14.tar.bz2 -# CRAM Interop Tests are dependent on the test files in samtools-1.14/htslib-1.14/htscodes/tests/dat +# Note that the CRAM Interop Tests are dependent on the test files in samtools-1.14/htslib-1.14/htscodecs/tests/dat tar -xjvf samtools-1.14.tar.bz2 cd samtools-1.14 && ./configure --prefix=/usr && make && sudo make install \ No newline at end of file diff --git a/src/main/java/htsjdk/samtools/cram/compression/rans/ransnx16/RANSNx16Decode.java b/src/main/java/htsjdk/samtools/cram/compression/rans/ransnx16/RANSNx16Decode.java index 9c1c0c0cae..21326af78e 100644 --- a/src/main/java/htsjdk/samtools/cram/compression/rans/ransnx16/RANSNx16Decode.java +++ b/src/main/java/htsjdk/samtools/cram/compression/rans/ransnx16/RANSNx16Decode.java @@ -5,7 +5,6 @@ import htsjdk.samtools.cram.compression.rans.Constants; import htsjdk.samtools.cram.compression.rans.RANSDecode; import htsjdk.samtools.cram.compression.rans.RANSDecodingSymbol; -import htsjdk.samtools.cram.compression.rans.RANSParams; import htsjdk.samtools.cram.compression.rans.Utils; import java.nio.ByteBuffer; @@ -17,17 +16,17 @@ public class RANSNx16Decode extends RANSDecode { private static final int FREQ_TABLE_OPTIONALLY_COMPRESSED_MASK = 0x01; public ByteBuffer uncompress(final ByteBuffer inBuffer) { + + // For RANS decoding, the bytes are read in little endian from the input stream + inBuffer.order(ByteOrder.LITTLE_ENDIAN); return uncompress(inBuffer, 0); } - public ByteBuffer uncompress(final ByteBuffer inBuffer, int outSize) { + private ByteBuffer uncompress(final ByteBuffer inBuffer, int outSize) { if (inBuffer.remaining() == 0) { return EMPTY_BUFFER; } - // For RANS decoding, the bytes are read in little endian from the input stream - inBuffer.order(ByteOrder.LITTLE_ENDIAN); - // the first byte of compressed stream gives the formatFlags final int formatFlags = inBuffer.get() & 0xFF; final RANSNx16Params ransNx16Params = new RANSNx16Params(formatFlags); @@ -70,7 +69,7 @@ public ByteBuffer uncompress(final ByteBuffer inBuffer, int outSize) { uncompressedRLEOutputLength = outSize; outSize = Utils.readUint7(inBuffer); // TODO: maybe move decodeRLEMeta in-line - uncompressedRLEMetaData = decodeRLEMeta(inBuffer, ransNx16Params, uncompressedRLEMetaDataLength, rleSymbols); + uncompressedRLEMetaData = decodeRLEMeta(inBuffer, uncompressedRLEMetaDataLength, rleSymbols); } ByteBuffer outBuffer = ByteBuffer.allocate(outSize); @@ -86,7 +85,7 @@ public ByteBuffer uncompress(final ByteBuffer inBuffer, int outSize) { uncompressOrder0WayN(inBuffer, outBuffer, outSize, ransNx16Params); break; case ONE: - uncompressOrder1WayN(inBuffer, outBuffer, outSize, ransNx16Params); + uncompressOrder1WayN(inBuffer, outBuffer, ransNx16Params); break; default: throw new RuntimeException("Unknown rANS order: " + ransNx16Params.getOrder()); @@ -167,7 +166,6 @@ private ByteBuffer uncompressOrder0WayN( private ByteBuffer uncompressOrder1WayN( final ByteBuffer inBuffer, final ByteBuffer outBuffer, - final int outSize, final RANSNx16Params ransNx16Params) { initializeRANSDecoder(); @@ -286,7 +284,7 @@ private void readFrequencyTableOrder0( private void readFrequencyTableOrder1( final ByteBuffer cp, - int shift) { + final int shift) { final int[][] frequencies = new int[Constants.NUMBER_OF_SYMBOLS][Constants.NUMBER_OF_SYMBOLS]; final ArithmeticDecoder[] D = getD(); final RANSDecodingSymbol[][] decodingSymbols = getDecodingSymbols(); @@ -349,7 +347,10 @@ private static int[] readAlphabet(final ByteBuffer cp){ return alphabet; } - private ByteBuffer decodeRLEMeta(final ByteBuffer inBuffer , final RANSParams ransParams, final int uncompressedRLEMetaDataLength, final int[] rleSymbols) { + private ByteBuffer decodeRLEMeta( + final ByteBuffer inBuffer, + final int uncompressedRLEMetaDataLength, + final int[] rleSymbols) { ByteBuffer uncompressedRLEMetaData; final int compressedRLEMetaDataLength; if ((uncompressedRLEMetaDataLength & 0x01)!=0) { @@ -370,7 +371,7 @@ private ByteBuffer decodeRLEMeta(final ByteBuffer inBuffer , final RANSParams ra int numRLESymbols = uncompressedRLEMetaData.get() & 0xFF; if (numRLESymbols == 0) { - numRLESymbols = 256; + numRLESymbols = Constants.NUMBER_OF_SYMBOLS; } for (int i = 0; i< numRLESymbols; i++) { rleSymbols[uncompressedRLEMetaData.get() & 0xFF] = 1; @@ -378,7 +379,11 @@ private ByteBuffer decodeRLEMeta(final ByteBuffer inBuffer , final RANSParams ra return uncompressedRLEMetaData; } - private ByteBuffer decodeRLE(ByteBuffer inBuffer , final int[] rleSymbols, final ByteBuffer uncompressedRLEMetaData, int uncompressedRLEOutputLength) { + private ByteBuffer decodeRLE( + ByteBuffer inBuffer, + final int[] rleSymbols, + final ByteBuffer uncompressedRLEMetaData, + final int uncompressedRLEOutputLength) { ByteBuffer rleOutBuffer = ByteBuffer.allocate(uncompressedRLEOutputLength); int j = 0; for(int i = 0; j< uncompressedRLEOutputLength; i++){ @@ -396,7 +401,11 @@ private ByteBuffer decodeRLE(ByteBuffer inBuffer , final int[] rleSymbols, final return inBuffer; } - private ByteBuffer decodePack(ByteBuffer inBuffer, final int[] packMappingTable, int numSymbols, int uncompressedPackOutputLength) { + private ByteBuffer decodePack( + ByteBuffer inBuffer, + final int[] packMappingTable, + final int numSymbols, + final int uncompressedPackOutputLength) { ByteBuffer outBufferPack = ByteBuffer.allocate(uncompressedPackOutputLength); int j = 0; @@ -445,38 +454,35 @@ else if (numSymbols <= 16){ return inBuffer; } - private ByteBuffer decodeStripe(ByteBuffer inBuffer, final int outSize){ - + private ByteBuffer decodeStripe(final ByteBuffer inBuffer, final int outSize){ final int numInterleaveStreams = inBuffer.get() & 0xFF; // retrieve lengths of compressed interleaved streams - int[] clen = new int[numInterleaveStreams]; + final int[] compressedLengths = new int[numInterleaveStreams]; for ( int j=0; j j){ - ulen[j]++; + uncompressedLengths[j]++; } - T[j] = uncompress(inBuffer, ulen[j]); + TransposedData[j] = uncompress(inBuffer, uncompressedLengths[j]); } // Transpose - ByteBuffer out = ByteBuffer.allocate(outSize); + final ByteBuffer outBuffer = ByteBuffer.allocate(outSize); for (int j = 0; j 0) { + if (runCounts[i]>0) { numRLESymbols++; } } @@ -540,7 +536,7 @@ private ByteBuffer encodeRLE(final ByteBuffer inBuffer ,final RANSParams ransPar if (numRLESymbols==0) { // Format cannot cope with zero RLE symbols, so pick one! numRLESymbols = 1; - rleSymbols[0] = 1; + runCounts[0] = 1; } // create rleMetaData buffer to store rle metadata. @@ -548,11 +544,11 @@ private ByteBuffer encodeRLE(final ByteBuffer inBuffer ,final RANSParams ransPar // TODO: How did we come up with this calculation for Buffer size? numRLESymbols+1+inputSize ByteBuffer rleMetaData = ByteBuffer.allocate(numRLESymbols+1+inputSize); // rleMetaData - // write number of symbols that are run length encoded to the outBuffer + // write number of symbols that are run length encoded rleMetaData.put((byte) numRLESymbols); - for (int i=0; i<256; i++){ - if (rleSymbols[i] >0){ + for (int i=0; i0){ // write the symbols that are run length encoded rleMetaData.put((byte) i); } @@ -566,7 +562,7 @@ private ByteBuffer encodeRLE(final ByteBuffer inBuffer ,final RANSParams ransPar for (int i = 0; i < inputSize; i++) { encodedData.put(encodedDataIdx++,inBuffer.get(i)); - if (rleSymbols[inBuffer.get(i)&0xFF]>0) { + if (runCounts[inBuffer.get(i)&0xFF]>0) { lastSymbol = inBuffer.get(i) & 0xFF; int run = 0; @@ -585,7 +581,6 @@ private ByteBuffer encodeRLE(final ByteBuffer inBuffer ,final RANSParams ransPar encodedData.limit(encodedDataIdx); // limit and rewind - // TODO: check if position of rleMetadata is at the end of the buffer as expected rleMetaData.limit(rleMetaData.position()); rleMetaData.rewind(); diff --git a/src/test/java/htsjdk/samtools/cram/RANSInteropTest.java b/src/test/java/htsjdk/samtools/cram/RANSInteropTest.java index 516464d7d8..906d8dd45b 100644 --- a/src/test/java/htsjdk/samtools/cram/RANSInteropTest.java +++ b/src/test/java/htsjdk/samtools/cram/RANSInteropTest.java @@ -33,7 +33,7 @@ * with the htslib implementations. The test files for Interop tests is kept in a separate repository, * currently at https://github.com/samtools/htscodecs so it can be shared across htslib/samtools/htsjdk. * - * For native development env, the Interop test files are downloaded locally and made available at "../htscodecs/tests" + * For local development env, the Interop test files must be downloaded locally and made available at "../htscodecs/tests" * For CI env, the Interop test files are made available from the existing samtools installation * at "/samtools-1.14/htslib-1.14/htscodecs/tests" */ @@ -42,105 +42,57 @@ public class RANSInteropTest extends HtsjdkTest { public static final String COMPRESSED_RANSNX16_DIR = "r4x16"; // RANS4x8 codecs and testdata - public Object[][] getRANS4x8TestData() throws IOException { + public Object[][] get4x8TestCases() throws IOException { // params: - // uncompressed testfile path, RANS encoder, RANS decoder, - // RANS params, compressed testfile directory name - final List rans4x8ParamsOrderList = Arrays.asList( - RANSParams.ORDER.ZERO, - RANSParams.ORDER.ONE); + // compressed testfile path, uncompressed testfile path, + // RANS encoder, RANS decoder, RANS params final List testCases = new ArrayList<>(); - getInteropRANSTestFiles() - .forEach(path -> - rans4x8ParamsOrderList.stream().map(rans4x8ParamsOrder -> new Object[]{ - path, - new RANS4x8Encode(), - new RANS4x8Decode(), - new RANS4x8Params(rans4x8ParamsOrder), - COMPRESSED_RANS4X8_DIR - }).forEach(testCases::add)); + for (Path path : getInteropRansCompressedFilePaths(COMPRESSED_RANS4X8_DIR)) { + Object[] objects = new Object[]{ + path, + getRansUnCompressedFilePath(path), + new RANS4x8Encode(), + new RANS4x8Decode(), + getRans4x8Params(path) + }; + testCases.add(objects); + } return testCases.toArray(new Object[][]{}); } // RANSNx16 codecs and testdata - public Object[][] getRANSNx16TestData() throws IOException { + public Object[][] getNx16TestCases() throws IOException { // params: - // uncompressed testfile path, RANS encoder, RANS decoder, - // RANS params, compressed testfile directory name - final List ransNx16ParamsFormatFlagList = Arrays.asList( - 0x00, - RANSNx16Params.ORDER_FLAG_MASK, - RANSNx16Params.RLE_FLAG_MASK, - RANSNx16Params.RLE_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK, - RANSNx16Params.N32_FLAG_MASK, - RANSNx16Params.N32_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK, - RANSNx16Params.PACK_FLAG_MASK, - RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK, - RANSNx16Params.RLE_FLAG_MASK | RANSNx16Params.PACK_FLAG_MASK, - RANSNx16Params.RLE_FLAG_MASK | RANSNx16Params.PACK_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK); + // compressed testfile path, uncompressed testfile path, + // RANS encoder, RANS decoder, RANS params final List testCases = new ArrayList<>(); - getInteropRANSTestFiles() - .forEach(path -> - ransNx16ParamsFormatFlagList.stream().map(ransNx16ParamsFormatFlag -> new Object[]{ - path, - new RANSNx16Encode(), - new RANSNx16Decode(), - new RANSNx16Params(ransNx16ParamsFormatFlag), - COMPRESSED_RANSNX16_DIR - }).forEach(testCases::add)); - return testCases.toArray(new Object[][]{}); - } - - public Object[][] getRansNx16DecodeOnlyTestData() throws IOException { - - // params: - // uncompressed testfile path, RANS encoder, RANS decoder, - // RANS params, compressed testfile directory name - - // Stripe is implemented in the Decoder. It is not implemented in the Encoder. - final List ransNx16ParamsFormatFlagList = Arrays.asList( - RANSNx16Params.STRIPE_FLAG_MASK, - RANSNx16Params.STRIPE_FLAG_MASK | RANSNx16Params.ORDER_FLAG_MASK); - final List testCases = new ArrayList<>(); - getInteropRANSTestFiles() - .forEach(path -> - ransNx16ParamsFormatFlagList.stream().map(ransNx16ParamsFormatFlag -> new Object[]{ - path, - new RANSNx16Encode(), - new RANSNx16Decode(), - new RANSNx16Params(ransNx16ParamsFormatFlag), - COMPRESSED_RANSNX16_DIR - }).forEach(testCases::add)); + for (Path path : getInteropRansCompressedFilePaths(COMPRESSED_RANSNX16_DIR)) { + Object[] objects = new Object[]{ + path, + getRansUnCompressedFilePath(path), + new RANSNx16Encode(), + new RANSNx16Decode(), + getRansNx16Params(path) + }; + testCases.add(objects); + } return testCases.toArray(new Object[][]{}); } - @DataProvider(name = "allRansCodecsAndDataForRoundtrip") - public Object[][] getAllRansCodecsForRoundTrip() throws IOException { - - // params: - // uncompressed testfile path, RANS encoder, RANS decoder, - // RANS params, compressed testfile directory name - - // Since, Stripe is not implemented in the Encoder, - // we don't test round tripping for the cases where Stripe Flag = 1 - return Stream.concat(Arrays.stream(getRANS4x8TestData()), Arrays.stream(getRANSNx16TestData())) - .toArray(Object[][]::new); - } - - @DataProvider(name = "allRansCodecsAndData") - public Object[][] getAllRansCodecs() throws IOException { + @DataProvider(name = "roundTripTestCases") + public Object[][] getRoundTripTestCases() throws IOException { // params: - // uncompressed testfile path, RANS encoder, RANS decoder, - // RANS params, compressed testfile directory name - return Stream.concat(Arrays.stream(getAllRansCodecsForRoundTrip()), Arrays.stream(getRansNx16DecodeOnlyTestData())) + // compressed testfile path, uncompressed testfile path, + // RANS encoder, RANS decoder, RANS params + return Stream.concat(Arrays.stream(get4x8TestCases()), Arrays.stream(getNx16TestCases())) .toArray(Object[][]::new); } @Test(description = "Test if CRAM Interop Test Data is available") - public void testGetHTSCodecsCorpus() { + public void testHtsCodecsCorpusIsAvailable() { if (!CRAMInteropTestUtils.isInteropTestDataAvailable()) { throw new SkipException(String.format("RANS Interop Test Data is not available at %s", CRAMInteropTestUtils.INTEROP_TEST_FILES_PATH)); @@ -148,49 +100,55 @@ public void testGetHTSCodecsCorpus() { } @Test ( - dependsOnMethods = "testGetHTSCodecsCorpus", - dataProvider = "allRansCodecsAndDataForRoundtrip", + dependsOnMethods = "testHtsCodecsCorpusIsAvailable", + dataProvider = "roundTripTestCases", description = "Roundtrip using htsjdk RANS. Compare the output with the original file" ) public void testRANSRoundTrip( - final Path uncompressedInteropPath, + final Path unusedcompressedFilePath, + final Path uncompressedFilePath, final RANSEncode ransEncode, final RANSDecode ransDecode, - final RANSParams params, - final String unusedCompressedDirname) throws IOException { - try (final InputStream uncompressedInteropStream = Files.newInputStream(uncompressedInteropPath)) { + final RANSParams params) throws IOException { + try (final InputStream uncompressedInteropStream = Files.newInputStream(uncompressedFilePath)) { // preprocess the uncompressed data (to match what the htscodecs-library test harness does) // by filtering out the embedded newlines, and then round trip through RANS and compare the // results final ByteBuffer uncompressedInteropBytes = ByteBuffer.wrap(filterEmbeddedNewlines(IOUtils.toByteArray(uncompressedInteropStream))); - final ByteBuffer compressedHtsjdkBytes = ransEncode.compress(uncompressedInteropBytes, params); - uncompressedInteropBytes.rewind(); - Assert.assertEquals(ransDecode.uncompress(compressedHtsjdkBytes), uncompressedInteropBytes); + + // Stripe Flag is not implemented in RANSNx16 Encoder. + // The encoder throws CRAMException if Stripe Flag is used. + if (params instanceof RANSNx16Params){ + RANSNx16Params ransNx16Params = (RANSNx16Params) params; + if (ransNx16Params.isStripe()) { + Assert.assertThrows(CRAMException.class, () -> ransEncode.compress(uncompressedInteropBytes, params)); + } + } else { + final ByteBuffer compressedHtsjdkBytes = ransEncode.compress(uncompressedInteropBytes, params); + uncompressedInteropBytes.rewind(); + Assert.assertEquals(ransDecode.uncompress(compressedHtsjdkBytes), uncompressedInteropBytes); + } } } @Test ( - dependsOnMethods = "testGetHTSCodecsCorpus", - dataProvider = "allRansCodecsAndData", - description = "Compress the original file using htsjdk RANS and compare it with the existing compressed file. " + - "Uncompress the existing compressed file using htsjdk RANS and compare it with the original file.") - public void testRANSPreCompressed( + dependsOnMethods = "testHtsCodecsCorpusIsAvailable", + dataProvider = "roundTripTestCases", + description = "Uncompress the existing compressed file using htsjdk RANS and compare it with the original file.") + public void testDecodeOnly( + final Path compressedFilePath, final Path uncompressedInteropPath, - final RANSEncode unused, + final RANSEncode unusedRansEncode, final RANSDecode ransDecode, - final RANSParams params, - final String compressedInteropDirName) throws IOException { - - final Path preCompressedInteropPath = getCompressedRANSPath(compressedInteropDirName,uncompressedInteropPath, params); - + final RANSParams unusedRansParams) throws IOException { try (final InputStream uncompressedInteropStream = Files.newInputStream(uncompressedInteropPath); - final InputStream preCompressedInteropStream = Files.newInputStream(preCompressedInteropPath) + final InputStream preCompressedInteropStream = Files.newInputStream(compressedFilePath) ) { + // preprocess the uncompressed data (to match what the htscodecs-library test harness does) // by filtering out the embedded newlines, and then round trip through RANS and compare the // results final ByteBuffer uncompressedInteropBytes = ByteBuffer.wrap(filterEmbeddedNewlines(IOUtils.toByteArray(uncompressedInteropStream))); - final ByteBuffer preCompressedInteropBytes = ByteBuffer.wrap(IOUtils.toByteArray(preCompressedInteropStream)); // Use htsjdk to uncompress the precompressed file from htscodecs repo @@ -204,19 +162,6 @@ public void testRANSPreCompressed( } } - // return a list of all RANS test data files in the htscodecs/tests directory - private List getInteropRANSTestFiles() throws IOException { - final List paths = new ArrayList<>(); - Files.newDirectoryStream( - CRAMInteropTestUtils.getInteropTestDataLocation().resolve("dat"), - path -> path.getFileName().startsWith("q4") || - path.getFileName().startsWith("q8") || - path.getFileName().startsWith("qvar") || - path.getFileName().startsWith("q40+dir")) - .forEach(path -> paths.add(path)); - return paths; - } - // the input files have embedded newlines that the test remove before round-tripping... private final byte[] filterEmbeddedNewlines(final byte[] rawBytes) throws IOException { // 1. filters new lines if any. @@ -239,13 +184,60 @@ private final byte[] filterEmbeddedNewlines(final byte[] rawBytes) throws IOExce } } - // Given a test file name, map it to the corresponding rans compressed path - private final Path getCompressedRANSPath(final String ransType,final Path uncompressedInteropPath, RANSParams params) { + // return a list of all encoded test data files in the htscodecs/tests/dat/ directory + private List getInteropRansCompressedFilePaths(final String compressedDir) throws IOException { + final List paths = new ArrayList<>(); + Files.newDirectoryStream( + CRAMInteropTestUtils.getInteropTestDataLocation().resolve("dat/"+compressedDir), + path -> Files.isRegularFile(path)) + .forEach(path -> paths.add(path)); + return paths; + } + + // Given a compressed test file path, return the corresponding uncompressed file path + public static final Path getRansUnCompressedFilePath(final Path compressedInteropPath) { + String uncompressedFileName = getUncompressedFileName(compressedInteropPath.getFileName().toString()); + // Example compressedInteropPath: ../dat/r4x8/q4.1 => unCompressedFilePath: ../dat/q4 + return compressedInteropPath.getParent().getParent().resolve(uncompressedFileName); + } + + public static final String getUncompressedFileName(final String compressedFileName) { + // Returns original filename from compressed file name + int lastDotIndex = compressedFileName.lastIndexOf("."); + if (lastDotIndex >= 0) { + String fileName = compressedFileName.substring(0, lastDotIndex); + return fileName; + } else { + throw new CRAMException("The format of the compressed File Name is not as expected. " + + "The name of the compressed file should contain a period followed by a number that" + + "indicates the order of compression. Actual compressed file name = "+ compressedFileName); + } + } - // Example compressedFileName: r4x16/q4.193 - // the substring after "." in the compressedFileName is the formatFlags (aka. the first byte of the compressed stream) - final String compressedFileName = String.format("%s/%s.%s", ransType, uncompressedInteropPath.getFileName(), params.getFormatFlags()); - return uncompressedInteropPath.getParent().resolve(compressedFileName); + public static final RANSParams getRans4x8Params(final Path compressedInteropPath){ + // Returns RANSParams from compressed file path + final String compressedFileName = compressedInteropPath.getFileName().toString(); + final int lastDotIndex = compressedFileName.lastIndexOf("."); + if (lastDotIndex >= 0 && lastDotIndex < compressedFileName.length() - 1) { + return new RANS4x8Params(RANSParams.ORDER.fromInt(Integer.parseInt(compressedFileName.substring(lastDotIndex + 1)))); + } else { + throw new CRAMException("The format of the compressed File Name is not as expected. " + + "The name of the compressed file should contain a period followed by a number that" + + "indicates the order of compression. Actual compressed file name = "+ compressedFileName); + } + } + + public static final RANSParams getRansNx16Params(final Path compressedInteropPath){ + // Returns RANSParams from compressed file path + final String compressedFileName = compressedInteropPath.getFileName().toString(); + final int lastDotIndex = compressedFileName.lastIndexOf("."); + if (lastDotIndex >= 0 && lastDotIndex < compressedFileName.length() - 1) { + return new RANSNx16Params(Integer.parseInt(compressedFileName.substring(lastDotIndex + 1))); + } else { + throw new CRAMException("The format of the compressed File Name is not as expected. " + + "The name of the compressed file should contain a period followed by a number that" + + "indicates the order of compression. Actual compressed file name = "+ compressedFileName); + } } } \ No newline at end of file