diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f9256b..aced3c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # NAF Changelog +## Current + +## 1.3.0 - 2021-05-17 +- Added `--long` option to _ennaf_ for setting sequence window size. +- Added `--binary` shortcut option to _unnaf_. +- Added support for empty sequences. +- Updated zstd to v1.5.0. + ## 1.2.0 - 2020-09-01 - Added `--sequences` option to _unnaf_. - Added `--binary-stdout` option to _unnaf_. diff --git a/Compress.md b/Compress.md index dac6464..30f7a91 100644 --- a/Compress.md +++ b/Compress.md @@ -10,7 +10,7 @@ `ennaf file.fq -o file.naf` - Compress a FASTQ file (format is detected automatically). -`ennaf -22 file.fa -o file.naf` - Use maximum compression level. +`ennaf -22 --long 31 file.fa -o file.naf` - Use maximum compression level. `gzip -dc file.gz | ennaf -o file.naf` - Recompress from gzip to NAF on the fly. @@ -27,6 +27,13 @@ Maximum level is 22, however take care as levels above 19 are slow and use signi **--level #** - Use compression level #. Same with `-#`, but also supports even faster negative levels, down to -131072. +**--long N** - Use window of size 2^N for sequence stream. +The range is currently from 10 to 31. +If not specified, the default window size depends on compression level. +`--long 31` can improve compression of large repetitive data. +Using large window increases memory consumption of both compression and decompression, +so please be careful with this option if you plan to share compressed files with others. + **--temp-dir DIR** - Use DIR for temporary files. If omitted, uses directory specified in enviroment variable `TMPDIR`. If there's no such variable, tries enviroment variable `TMP`. @@ -110,6 +117,10 @@ while network transfer and decompression may be performed thousands of times by Optimizing user experience is more important in such cases. So, `ennaf -22` is the best option for sequence databases. 
+On some data `ennaf -22 --text` can be better than the default dna mode. +For maximum compression of large datasets you can add `--long 31`, +but use it carefully as it increases memory consumption of both compression and decompression. + ## Specifying input format Input format (FASTA of FASTQ) is automatically detected from the actual input data, so there's not need to specify it. @@ -196,8 +207,11 @@ you have to switch to text mode (`--text`). ## Using text mode for DNA data Since both `--dna` and `--text` modes can be used for DNA data, which is better? -Short answer: `--dna` is faster and has stronger compression. -For details, see [this benchmark page](http://kirill-kryukov.com/study/naf/benchmark-text-vs-dna-Spur.html). +Normally `--dna` should be preferred, as it's much faster than `--text`, and compression strength is similar. +For strongest possible compression, the choice depends on data. +With less repetitive data such as assembled genomes, `--dna` seems to give stronger compression +([example benchmark](http://kirill-kryukov.com/study/naf/benchmark-text-vs-dna-Spur.html)). +With repetitive data, `--text` is often better. ## Can it compress multiple files into single archive? @@ -207,7 +221,7 @@ First you combine individual FASTA files into a single Multi-Multi-FASTA stream, Example commands: Compressing:
-`mumu.pl --dir 'Helicobacter' 'Helicobacter pylori*' | ennaf -22 --text -o Hp.nafnaf` +`mumu.pl --dir 'Helicobacter' 'Helicobacter pylori*' | ennaf -22 --long 31 --text -o Hp.nafnaf` Decompressing and unpacking:
`unnaf Hp.nafnaf | mumu.pl --unpack --dir 'Helicobacter'` diff --git a/Decompress.md b/Decompress.md index 041a925..148a3ce 100644 --- a/Decompress.md +++ b/Decompress.md @@ -68,6 +68,8 @@ Supported only for DNA and RNA sequences. **--binary-stdout** - Set stdout stream to binary mode. Useful for piping decompressed sequences to md5sum on Windows. +**--binary** - Shortcut for `--binary-stdout --binary-stderr`. + **-h**, **--help** - Show usage help. **-V**, **--version** - Show version. diff --git a/LICENSE b/LICENSE index 22ac5c2..d93e6e5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (c) 2018-2020 Kirill Kryukov +Copyright (c) 2018-2021 Kirill Kryukov This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages diff --git a/Makefile b/Makefile index 3065d97..c806529 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ export prefix = /usr/local -.PHONY: default all test clean install uninstall +.PHONY: default all test test-large clean install uninstall default: $(MAKE) -C zstd/lib ZSTD_LEGACY_SUPPORT=0 ZSTD_LIB_DEPRECATED=0 ZSTD_LIB_DICTBUILDER=0 libzstd.a @@ -15,6 +15,9 @@ all: default test: $(MAKE) -C tests +test-large: + $(MAKE) -C tests large + clean: $(MAKE) -C ennaf clean $(MAKE) -C unnaf clean diff --git a/ennaf/src/compressor.c b/ennaf/src/compressor.c index c9fcaca..c83c90b 100644 --- a/ennaf/src/compressor.c +++ b/ennaf/src/compressor.c @@ -1,21 +1,27 @@ /* * NAF compressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ - -static ZSTD_CStream* create_zstd_cstream(int level) +static ZSTD_CStream* create_zstd_cstream(int level, int window_size_log) { ZSTD_CStream *s = ZSTD_createCStream(); if (s == NULL) { die("ZSTD_createCStream() error\n"); } + + if (window_size_log != 0) + { + ZSTD_TRY(ZSTD_CCtx_setParameter(s, ZSTD_c_enableLongDistanceMatching, 1)); + 
ZSTD_TRY(ZSTD_CCtx_setParameter(s, ZSTD_c_windowLog, window_size_log)); + } + size_t const initResult = ZSTD_initCStream(s, level); if (ZSTD_isError(initResult)) { die("ZSTD_initCStream() error: %s\n", ZSTD_getErrorName(initResult)); } return s; } -static void compressor_init(compressor_t *w, const char *name) +static void compressor_init(compressor_t *w, const char *name, int window_size_log) { assert(w != NULL); assert(w->allocated == 0); @@ -35,7 +41,7 @@ static void compressor_init(compressor_t *w, const char *name) w->allocated = COMPRESSED_BUFFER_SIZE; w->buf = (unsigned char *) malloc_or_die(w->allocated); - w->cstream = create_zstd_cstream(compression_level); + w->cstream = create_zstd_cstream(compression_level, window_size_log); w->path = (char *) malloc_or_die(temp_path_length + 1); snprintf(w->path, temp_path_length, "%s/%s.%s", temp_dir, temp_prefix, name); if (verbose) { msg("Temp %s file: \"%s\"\n", name, w->path); } diff --git a/ennaf/src/encoders.c b/ennaf/src/encoders.c index a69632f..74009b5 100644 --- a/ennaf/src/encoders.c +++ b/ennaf/src/encoders.c @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/ennaf/src/encoders.h b/ennaf/src/encoders.h index 3db35df..25215c2 100644 --- a/ennaf/src/encoders.h +++ b/ennaf/src/encoders.h @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/ennaf/src/ennaf.c b/ennaf/src/ennaf.c index e498b3d..79a7040 100644 --- a/ennaf/src/ennaf.c +++ b/ennaf/src/ennaf.c @@ -1,12 +1,12 @@ /* * NAF compressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ -#define VERSION "1.2.0" -#define DATE "2020-09-01" -#define COPYRIGHT_YEARS "2018-2020" +#define VERSION 
"1.3.0" +#define DATE "2021-05-17" +#define COPYRIGHT_YEARS "2018-2021" #include "platform.h" #include "encoders.h" @@ -34,6 +34,7 @@ static bool force_stdout = false; static bool created_output_file = false; static int compression_level = 1; +static int sequence_window_size_log = 0; static char *temp_dir = NULL; static char *dataset_name = NULL; @@ -243,6 +244,35 @@ static void set_line_length(char *str) } +static void set_sequence_window_size_log(char *str) /* Handler for the --long option: validates str and stores the window log in sequence_window_size_log. */ +{ + assert(str != NULL); + /* Parse as base 10; any trailing characters make the argument invalid. */ + char *end; + long long a = strtoll(str, &end, 10); + if (*end != '\0') { die("can't parse the value of --long argument\n"); } + /* Round-trip check: printing the parsed number must reproduce the input text exactly. */ + char test_str[21]; + int nc = snprintf(test_str, 21, "%lld", a); + if (nc < 1 || nc > 20 || strcmp(test_str, str) != 0) { die("can't parse the value of --long argument\n"); } + /* Values outside zstd's window log limits are clamped with a warning instead of being rejected. */ + if (a < ZSTD_WINDOWLOG_MIN) + { + warn("--long value of %lld is smaller than the lowest supported value %d, using %d instead\n", a, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MIN); + sequence_window_size_log = ZSTD_WINDOWLOG_MIN; + } + else if (a > ZSTD_WINDOWLOG_MAX) + { + warn("--long value of %lld is larger than the largest supported value %d, using %d instead\n", a, ZSTD_WINDOWLOG_MAX, ZSTD_WINDOWLOG_MAX); + sequence_window_size_log = ZSTD_WINDOWLOG_MAX; + } + else + { + sequence_window_size_log = (int) a; + } +} + + static int parse_input_format(const char *str) { assert(str != NULL); @@ -306,6 +336,7 @@ static void show_help(void) " -o FILE - Write compressed output to FILE\n" " -c - Write to standard output\n" " -#, --level # - Use compression level # (from %d to %d, default: 1)\n" + " --long N - Use window of size 2^N for sequence stream (from %d to %d)\n" " --temp-dir DIR - Use DIR as temporary directory\n" " --name NAME - Use NAME as prefix for temporary files\n" " --title TITLE - Store TITLE as dataset title\n" @@ -322,7 +353,7 @@ static void show_help(void) " --no-mask - Don't store mask\n" " -h, --help - Show help\n" " -V, --version - Show version\n", - min_level, 
max_level); + min_level, max_level, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX); } @@ -343,6 +374,7 @@ static void parse_command_line(int argc, char **argv) if (!strcmp(argv[i], "--title")) { i++; set_dataset_title(argv[i]); continue; } if (!strcmp(argv[i], "--level")) { i++; set_compression_level(argv[i]); continue; } if (!strcmp(argv[i], "--line-length")) { i++; set_line_length(argv[i]); continue; } + if (!strcmp(argv[i], "--long")) { i++; set_sequence_window_size_log(argv[i]); continue; } // Deprecated, undocumented. if (!strcmp(argv[i], "--out")) { i++; set_output_file_path(argv[i]); continue; } @@ -465,12 +497,13 @@ int main(int argc, char **argv) } make_temp_prefix(); - compressor_init(&IDS, "ids"); - compressor_init(&COMM, "comments"); - compressor_init(&LEN, "lengths"); - if (store_mask) { compressor_init(&MASK, "mask"); } - compressor_init(&SEQ, "sequence"); - if (store_qual) { compressor_init(&QUAL, "quality"); } + + compressor_init(&IDS, "ids", 0); + compressor_init(&COMM, "comments", 0); + compressor_init(&LEN, "lengths", 0); + if (store_mask) { compressor_init(&MASK, "mask", 0); } + compressor_init(&SEQ, "sequence", sequence_window_size_log); + if (store_qual) { compressor_init(&QUAL, "quality", 0); } process(); close_input_file(); diff --git a/ennaf/src/files.c b/ennaf/src/files.c index 2230715..5cf3521 100644 --- a/ennaf/src/files.c +++ b/ennaf/src/files.c @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/ennaf/src/platform.h b/ennaf/src/platform.h index 7f9d8e6..54ca401 100644 --- a/ennaf/src/platform.h +++ b/ennaf/src/platform.h @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/ennaf/src/process.c b/ennaf/src/process.c index c1aeab7..3ab63cc 100644 --- a/ennaf/src/process.c 
+++ b/ennaf/src/process.c @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository * * The FASTA/Q parser was originally based on Heng Li's kseq.h. @@ -150,6 +150,18 @@ static inline void refill_in_buffer(void) } +__attribute__((always_inline)) +static inline unsigned in_peek_char(void) +{ + if (in_begin >= in_end) + { + refill_in_buffer(); + if (in_end == 0) { return INEOF; } + } + return in_buffer[in_begin]; +} + + __attribute__((always_inline)) static inline unsigned in_get_char(void) { @@ -312,23 +324,27 @@ static void process_well_formed_fasta(void) unsigned long long old_total_seq_size = seq_size_original + seq.length; if (c != INEOF) { - unsigned long long old_len = old_total_seq_size; - while ( (c = in_get_until_specific_char('\n', &seq)) != INEOF) + if (in_peek_char() == '>') { in_begin++; } // Empty sequence. + else { - unsigned long long new_len = seq_size_original + seq.length; - if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; } - old_len = new_len; + unsigned long long old_len = old_total_seq_size; + while ( (c = in_get_until_specific_char('\n', &seq)) != INEOF) + { + unsigned long long new_len = seq_size_original + seq.length; + if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; } + old_len = new_len; - c = in_get_char(); - if (c == '>' || c == INEOF) { break; } - else { in_begin--; } - } + c = in_get_char(); + if (c == '>' || c == INEOF) { break; } + else { in_begin--; } + } - // If the last line is the longest, and has no end-of-line character, handle it correctly. - if (c == INEOF) - { - unsigned long long new_len = seq_size_original + seq.length; - if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; } + // If the last line is the longest, and has no end-of-line character, handle it correctly. 
+ if (c == INEOF) + { + unsigned long long new_len = seq_size_original + seq.length; + if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; } + } } } @@ -364,39 +380,43 @@ static void process_non_well_formed_fasta(void) unsigned long long old_total_seq_size = seq_size_original + seq.length; if (c != INEOF) { - unsigned long long old_len = old_total_seq_size; - while ( (c = in_get_until(is_unexpected_arr, &seq)) != INEOF) + if (in_peek_char() == '>') { in_begin++; } // Empty sequence. + else { - if (is_eol_arr[c]) + unsigned long long old_len = old_total_seq_size; + while ( (c = in_get_until(is_unexpected_arr, &seq)) != INEOF) { - unsigned long long new_len = seq_size_original + seq.length; - if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; } - old_len = new_len; - - c = in_get_char(); - if (!is_unexpected_arr[c]) { str_append_char(&seq, (unsigned char)c); continue; } - else if (c == '>' || c == INEOF) { break; } - else if (is_eol_arr[c]) + if (is_eol_arr[c]) { - while (c != INEOF && is_eol_arr[c]) { c = in_get_char(); } - if (c == '>' || c == INEOF) { break; } - else if (!is_unexpected_arr[c]) { str_append_char(&seq, (unsigned char)c); continue; } + unsigned long long new_len = seq_size_original + seq.length; + if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; } + old_len = new_len; + + c = in_get_char(); + if (!is_unexpected_arr[c]) { str_append_char(&seq, (unsigned char)c); continue; } + else if (c == '>' || c == INEOF) { break; } + else if (is_eol_arr[c]) + { + while (c != INEOF && is_eol_arr[c]) { c = in_get_char(); } + if (c == '>' || c == INEOF) { break; } + else if (!is_unexpected_arr[c]) { str_append_char(&seq, (unsigned char)c); continue; } + else if (is_space_arr[c]) {} + else { unexpected_input_char(c); str_append_char(&seq, unexpected_seq_char_replacement); } + } else if (is_space_arr[c]) {} else { unexpected_input_char(c); 
str_append_char(&seq, unexpected_seq_char_replacement); } } else if (is_space_arr[c]) {} + else if (c == '>' && in_seq_type == seq_type_text) { str_append_char(&seq, (unsigned char)c); } else { unexpected_input_char(c); str_append_char(&seq, unexpected_seq_char_replacement); } } - else if (is_space_arr[c]) {} - else if (c == '>' && in_seq_type == seq_type_text) { str_append_char(&seq, (unsigned char)c); } - else { unexpected_input_char(c); str_append_char(&seq, unexpected_seq_char_replacement); } - } - // If the last line is the longest, and has no end-of-line character, handle it correctly. - if (c == INEOF) - { - unsigned long long new_len = seq_size_original + seq.length; - if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; } + // If the last line is the longest, and has no end-of-line character, handle it correctly. + if (c == INEOF) + { + unsigned long long new_len = seq_size_original + seq.length; + if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; } + } } } diff --git a/ennaf/src/tables.c b/ennaf/src/tables.c index 324f6d9..a58aea9 100644 --- a/ennaf/src/tables.c +++ b/ennaf/src/tables.c @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/ennaf/src/utils.c b/ennaf/src/utils.c index 23371a7..0f15731 100644 --- a/ennaf/src/utils.c +++ b/ennaf/src/utils.c @@ -1,6 +1,6 @@ /* * NAF compressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ @@ -53,6 +53,13 @@ static void die(const char *format, ...) 
} +#define ZSTD_TRY(f) /* Evaluate zstd call f once; if the result is a zstd error code, die() with its error name. */ \ +do { \ + size_t zstd_try_err_ = (f); \ + if (ZSTD_isError(zstd_try_err_)) { die("zstd error: %s\n", ZSTD_getErrorName(zstd_try_err_)); } \ +} while (0) + + __attribute__ ((cold)) __attribute__ ((noreturn)) static void out_of_memory(const size_t size) diff --git a/tests/Makefile b/tests/Makefile index a407ccd..3c60f51 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,5 +1,5 @@ -.PHONY: default all clean +.PHONY: default all large clean export TMPDIR=temp @@ -11,6 +11,12 @@ all: @./test-runner.pl interface alphabet charcount small @echo "Success!" +large: + @diff -q Makefile Makefile + @mkdir -p temp + @./test-runner.pl large + @echo "Success!" + clean: @rm -f */*.out @rm -f */*.err diff --git a/tests/interface/ennaf-version.err-ref b/tests/interface/ennaf-version.err-ref index 3073b16..df71fb8 100644 --- a/tests/interface/ennaf-version.err-ref +++ b/tests/interface/ennaf-version.err-ref @@ -1,2 +1,2 @@ -ennaf - NAF compressor, version 1.2.0, 2020-09-01 -Copyright (c) 2018-2020 Kirill Kryukov +ennaf - NAF compressor, version 1.3.0, 2021-05-17 +Copyright (c) 2018-2021 Kirill Kryukov diff --git a/tests/interface/unnaf-version.err-ref b/tests/interface/unnaf-version.err-ref index 628c12d..e2c76a7 100644 --- a/tests/interface/unnaf-version.err-ref +++ b/tests/interface/unnaf-version.err-ref @@ -1,2 +1,2 @@ -unnaf - NAF decompressor, version 1.2.0, 2020-09-01 -Copyright (c) 2018-2020 Kirill Kryukov +unnaf - NAF decompressor, version 1.3.0, 2021-05-17 +Copyright (c) 2018-2021 Kirill Kryukov diff --git a/tests/large/1-default-22-31.e.err-ref b/tests/large/1-default-22-31.e.err-ref new file mode 100644 index 0000000..7009c6c --- /dev/null +++ b/tests/large/1-default-22-31.e.err-ref @@ -0,0 +1,2 @@ +input has 1 unexpected DNA characters: + 'Z': 1 diff --git a/tests/large/1-default-22-31.out-ref b/tests/large/1-default-22-31.out-ref new file mode 100644 index 0000000..5d2ae76 --- /dev/null +++ b/tests/large/1-default-22-31.out-ref @@ -0,0 +1,4 @@ +>1 +actgACGTnN +>2 seq2 +a-tN-MY diff 
--git a/tests/large/1-default-22-31.test b/tests/large/1-default-22-31.test new file mode 100644 index 0000000..b45bb15 --- /dev/null +++ b/tests/large/1-default-22-31.test @@ -0,0 +1 @@ +ennaf -22 --long 31 {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err diff --git a/tests/large/1-default-22-31.u.err-ref b/tests/large/1-default-22-31.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/large/1.fa b/tests/large/1.fa new file mode 100644 index 0000000..cf76544 --- /dev/null +++ b/tests/large/1.fa @@ -0,0 +1,4 @@ +>1 +actgACGTnN +>2 seq2 +a-tZ-MY diff --git a/tests/small/nodata1-default.e.err-ref b/tests/small/nodata1-default.e.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/small/nodata1-default.out-ref b/tests/small/nodata1-default.out-ref new file mode 100644 index 0000000..c6f41b0 --- /dev/null +++ b/tests/small/nodata1-default.out-ref @@ -0,0 +1,2 @@ +>1 +>2 diff --git a/tests/small/nodata1-default.test b/tests/small/nodata1-default.test new file mode 100644 index 0000000..361ff7f --- /dev/null +++ b/tests/small/nodata1-default.test @@ -0,0 +1 @@ +ennaf {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err diff --git a/tests/small/nodata1-default.u.err-ref b/tests/small/nodata1-default.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/small/nodata1.fa b/tests/small/nodata1.fa new file mode 100644 index 0000000..c6f41b0 --- /dev/null +++ b/tests/small/nodata1.fa @@ -0,0 +1,2 @@ +>1 +>2 diff --git a/tests/small/nodata2-default.e.err-ref b/tests/small/nodata2-default.e.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/small/nodata2-default.out-ref b/tests/small/nodata2-default.out-ref new file mode 100644 index 0000000..ccc6ba7 --- /dev/null +++ b/tests/small/nodata2-default.out-ref @@ -0,0 +1,3 @@ +>1 +>2 +A diff --git a/tests/small/nodata2-default.test b/tests/small/nodata2-default.test new file mode 100644 index 0000000..361ff7f --- /dev/null +++ 
b/tests/small/nodata2-default.test @@ -0,0 +1 @@ +ennaf {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err diff --git a/tests/small/nodata2-default.u.err-ref b/tests/small/nodata2-default.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/small/nodata2.fa b/tests/small/nodata2.fa new file mode 100644 index 0000000..ccc6ba7 --- /dev/null +++ b/tests/small/nodata2.fa @@ -0,0 +1,3 @@ +>1 +>2 +A diff --git a/tests/small/nodata3-default.e.err-ref b/tests/small/nodata3-default.e.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/small/nodata3-default.out-ref b/tests/small/nodata3-default.out-ref new file mode 100644 index 0000000..4a879d0 --- /dev/null +++ b/tests/small/nodata3-default.out-ref @@ -0,0 +1,5 @@ +>1 +A +>2 +>3 +C diff --git a/tests/small/nodata3-default.test b/tests/small/nodata3-default.test new file mode 100644 index 0000000..361ff7f --- /dev/null +++ b/tests/small/nodata3-default.test @@ -0,0 +1 @@ +ennaf {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err diff --git a/tests/small/nodata3-default.u.err-ref b/tests/small/nodata3-default.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/small/nodata3.fa b/tests/small/nodata3.fa new file mode 100644 index 0000000..4a879d0 --- /dev/null +++ b/tests/small/nodata3.fa @@ -0,0 +1,5 @@ +>1 +A +>2 +>3 +C diff --git a/tests/small/nodata4-default.e.err-ref b/tests/small/nodata4-default.e.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/small/nodata4-default.out-ref b/tests/small/nodata4-default.out-ref new file mode 100644 index 0000000..b74e559 --- /dev/null +++ b/tests/small/nodata4-default.out-ref @@ -0,0 +1,3 @@ +>1 +A +>2 diff --git a/tests/small/nodata4-default.test b/tests/small/nodata4-default.test new file mode 100644 index 0000000..361ff7f --- /dev/null +++ b/tests/small/nodata4-default.test @@ -0,0 +1 @@ +ennaf {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err diff --git 
a/tests/small/nodata4-default.u.err-ref b/tests/small/nodata4-default.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/small/nodata4.fa b/tests/small/nodata4.fa new file mode 100644 index 0000000..b74e559 --- /dev/null +++ b/tests/small/nodata4.fa @@ -0,0 +1,3 @@ +>1 +A +>2 diff --git a/tests/small/noname-default.e.err-ref b/tests/small/noname-default.e.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/small/noname-default.out-ref b/tests/small/noname-default.out-ref new file mode 100644 index 0000000..89417fa --- /dev/null +++ b/tests/small/noname-default.out-ref @@ -0,0 +1,2 @@ +> +AAGA diff --git a/tests/small/noname-default.test b/tests/small/noname-default.test new file mode 100644 index 0000000..361ff7f --- /dev/null +++ b/tests/small/noname-default.test @@ -0,0 +1 @@ +ennaf {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err diff --git a/tests/small/noname-default.u.err-ref b/tests/small/noname-default.u.err-ref new file mode 100644 index 0000000..e69de29 diff --git a/tests/small/noname.fa b/tests/small/noname.fa new file mode 100644 index 0000000..89417fa --- /dev/null +++ b/tests/small/noname.fa @@ -0,0 +1,2 @@ +> +AAGA diff --git a/tests/test-runner.pl b/tests/test-runner.pl index f6f33e2..1d61d3b 100755 --- a/tests/test-runner.pl +++ b/tests/test-runner.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # # Test runner script -# Copyright (c) 2018-2019 Kirill Kryukov +# Copyright (c) 2018-2021 Kirill Kryukov # See README.md and LICENSE files of this repository # diff --git a/unnaf/src/files.c b/unnaf/src/files.c index a975153..e726e19 100644 --- a/unnaf/src/files.c +++ b/unnaf/src/files.c @@ -1,10 +1,9 @@ /* * NAF decompressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ - static void open_input_file(void) { assert(IN == NULL); diff --git a/unnaf/src/input.c b/unnaf/src/input.c index dd94368..dab6a2e 100644 --- 
a/unnaf/src/input.c +++ b/unnaf/src/input.c @@ -1,6 +1,6 @@ /* * NAF decompressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ @@ -268,6 +268,8 @@ static size_t initialize_input_decompression(void) input_decompression_stream = ZSTD_createDStream(); if (!input_decompression_stream) { die("can't create input decompression stream\n"); } + ZSTD_TRY(ZSTD_DCtx_setParameter(input_decompression_stream, ZSTD_d_windowLogMax, ZSTD_WINDOWLOG_MAX)); + size_t bytes_to_read = ZSTD_initDStream(input_decompression_stream); if (ZSTD_isError(bytes_to_read)) { die("can't initialize input decompression stream: %s\n", ZSTD_getErrorName(bytes_to_read)); } @@ -298,6 +300,8 @@ static void initialize_memory_decompression(void) memory_decompression_stream = ZSTD_createDStream(); if (!memory_decompression_stream) { die("can't create memory decompression stream\n"); } + ZSTD_TRY(ZSTD_DCtx_setParameter(memory_decompression_stream, ZSTD_d_windowLogMax, ZSTD_WINDOWLOG_MAX)); + memory_bytes_to_read = ZSTD_initDStream(memory_decompression_stream); if (ZSTD_isError(memory_bytes_to_read)) { die("can't initialize memory decompression stream: %s\n", ZSTD_getErrorName(memory_bytes_to_read)); } } diff --git a/unnaf/src/output-fastq.c b/unnaf/src/output-fastq.c index 50b924c..a7cbdcf 100644 --- a/unnaf/src/output-fastq.c +++ b/unnaf/src/output-fastq.c @@ -1,6 +1,6 @@ /* * NAF decompressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/unnaf/src/output-sequences.c b/unnaf/src/output-sequences.c index 9e632e9..fd035da 100644 --- a/unnaf/src/output-sequences.c +++ b/unnaf/src/output-sequences.c @@ -1,6 +1,6 @@ /* * NAF decompressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/unnaf/src/output.c 
b/unnaf/src/output.c index 5854586..4c716ee 100644 --- a/unnaf/src/output.c +++ b/unnaf/src/output.c @@ -1,10 +1,9 @@ /* * NAF decompressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ - static void print_list_of_parts(void) { int printed = 0; @@ -388,10 +387,24 @@ static inline void print_dna_buffer_as_fasta(int masking) total_seq_n_bp_remaining -= cur_seq_len_n_bp_remaining; } - if (lengths_buffer[cur_seq_len_index] != 4294967295u) + if (lengths_buffer[cur_seq_len_index] == 4294967295u) + { + cur_seq_len_index++; + } + else { fputc('\n', OUT); + cur_seq_len_index++; cur_seq_index++; + + // Print empty sequences without empty lines. + while (cur_seq_len_index < n_lengths && cur_seq_index < N && lengths_buffer[cur_seq_len_index] == 0) + { + print_fasta_name(cur_seq_index); + cur_seq_len_index++; + cur_seq_index++; + } + if (cur_seq_index < N) { print_fasta_name(cur_seq_index); @@ -399,7 +412,6 @@ static inline void print_dna_buffer_as_fasta(int masking) } } - cur_seq_len_index++; if (cur_seq_len_index >= n_lengths) { break; } cur_seq_len_n_bp_remaining = lengths_buffer[cur_seq_len_index]; @@ -607,10 +619,18 @@ static void print_fasta(int masking) total_seq_length = read_number(IN); compressed_seq_size = read_number(IN); total_seq_n_bp_remaining = total_seq_length; - cur_seq_len_n_bp_remaining = lengths_buffer[0]; - print_fasta_name(0); + while (cur_seq_len_index < n_lengths && cur_seq_index < N && lengths_buffer[cur_seq_len_index] == 0) + { + print_fasta_name(cur_seq_index); + cur_seq_len_index++; + cur_seq_index++; + } + if (cur_seq_index >= N) { return; } + + print_fasta_name(cur_seq_index); cur_line_n_bp_remaining = max_line_length; + cur_seq_len_n_bp_remaining = lengths_buffer[cur_seq_len_index]; size_t bytes_to_read = initialize_input_decompression(); size_t input_size; diff --git a/unnaf/src/platform.h b/unnaf/src/platform.h index d44e44d..5a6f792 100644 --- 
a/unnaf/src/platform.h +++ b/unnaf/src/platform.h @@ -1,6 +1,6 @@ /* * NAF decompressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ diff --git a/unnaf/src/unnaf.c b/unnaf/src/unnaf.c index eba28be..18b6ca9 100644 --- a/unnaf/src/unnaf.c +++ b/unnaf/src/unnaf.c @@ -1,12 +1,12 @@ /* * NAF decompressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ -#define VERSION "1.2.0" -#define DATE "2020-09-01" -#define COPYRIGHT_YEARS "2018-2020" +#define VERSION "1.3.0" +#define DATE "2021-05-17" +#define COPYRIGHT_YEARS "2018-2021" #include "platform.h" @@ -270,7 +270,9 @@ static void show_help(void) " -c - Write to standard output\n" " --line-length N - Use lines of width N for FASTA output\n" " --no-mask - Ignore mask\n" - " --binary - Binary output (no 0D 0A on Windows)\n" + " --binary-stdout - Set stdout stream to binary mode.\n" + " --binary-stderr - Set stderr stream to binary mode.\n" + " --binary - Shortcut for \"--binary-stdout --binary-stderr\"\n" " -h, --help - Show help\n" " -V, --version - Show version\n" ); @@ -311,6 +313,7 @@ static void parse_command_line(int argc, char **argv) if (!strcmp(argv[i], "--no-mask")) { use_mask = false; continue; } if (!strcmp(argv[i], "--binary-stdout")) { binary_stdout = true; continue; } if (!strcmp(argv[i], "--binary-stderr")) { if (!binary_stderr) { binary_stderr = true; change_stderr_to_binary(); } continue; } + if (!strcmp(argv[i], "--binary")) { binary_stdout = true; if (!binary_stderr) { binary_stderr = true; change_stderr_to_binary(); } continue; } if (!strcmp(argv[i], "--help")) { show_help(); exit(0); } if (!strcmp(argv[i], "--verbose")) { verbose = true; continue; } if (!strcmp(argv[i], "--version")) { print_version = true; continue; } diff --git a/unnaf/src/utils.c b/unnaf/src/utils.c index 0a7f6c5..42828df 100644 --- 
a/unnaf/src/utils.c +++ b/unnaf/src/utils.c @@ -1,10 +1,9 @@ /* * NAF decompressor - * Copyright (c) 2018-2020 Kirill Kryukov + * Copyright (c) 2018-2021 Kirill Kryukov * See README.md and LICENSE files of this repository */ - //__attribute__ ((format (printf, 1, 2))) static void msg(const char *format, ...) { @@ -41,6 +40,13 @@ static void die(const char *format, ...) } +#define ZSTD_TRY(f) /* Evaluate zstd call f once; if the result is a zstd error code, die() with its error name. */ \ +do { \ + size_t zstd_try_err_ = (f); \ + if (ZSTD_isError(zstd_try_err_)) { die("zstd error: %s\n", ZSTD_getErrorName(zstd_try_err_)); } \ +} while (0) + + __attribute__ ((cold)) __attribute__ ((noreturn)) static inline void incomplete(void) diff --git a/zstd b/zstd index b706286..a488ba1 160000 --- a/zstd +++ b/zstd @@ -1 +1 @@ -Subproject commit b706286adbba780006a47ef92df0ad7a785666b6 +Subproject commit a488ba114ec17ea1054b9057c26a046fc122b3b6