diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5f9256b..aced3c9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,13 @@
# NAF Changelog
+## Current
+
+## 1.3.0 - 2021-05-17
+- Added `--long` option to _ennaf_ for setting sequence window size.
+- Added `--binary` shortcut option to _unnaf_.
+- Added support for empty sequences.
+- Updated zstd to v1.5.0.
+
## 1.2.0 - 2020-09-01
- Added `--sequences` option to _unnaf_.
- Added `--binary-stdout` option to _unnaf_.
diff --git a/Compress.md b/Compress.md
index dac6464..30f7a91 100644
--- a/Compress.md
+++ b/Compress.md
@@ -10,7 +10,7 @@
`ennaf file.fq -o file.naf` - Compress a FASTQ file (format is detected automatically).
-`ennaf -22 file.fa -o file.naf` - Use maximum compression level.
+`ennaf -22 --long 31 file.fa -o file.naf` - Use maximum compression level.
`gzip -dc file.gz | ennaf -o file.naf` - Recompress from gzip to NAF on the fly.
@@ -27,6 +27,13 @@ Maximum level is 22, however take care as levels above 19 are slow and use signi
**--level #** - Use compression level #.
Same with `-#`, but also supports even faster negative levels, down to -131072.
+**--long N** - Use a window of size 2^N for the sequence stream.
+The range is currently from 10 to 31.
+If not specified, the default window size depends on the compression level.
+`--long 31` can improve compression of large repetitive data.
+Using a large window increases memory consumption of both compression and decompression,
+so please be careful with this option if you plan to share compressed files with others.
+
**--temp-dir DIR** - Use DIR for temporary files.
If omitted, uses directory specified in enviroment variable `TMPDIR`.
If there's no such variable, tries enviroment variable `TMP`.
@@ -110,6 +117,10 @@ while network transfer and decompression may be performed thousands of times by
Optimizing user experience is more important in such cases.
So, `ennaf -22` is the best option for sequence databases.
+On some data `ennaf -22 --text` can be better than the default dna mode.
+For maximum compression of large datasets you can add `--long 31`,
+but use it carefully as it increases memory consumption of both compression and decompression.
+
## Specifying input format
Input format (FASTA of FASTQ) is automatically detected from the actual input data, so there's not need to specify it.
@@ -196,8 +207,11 @@ you have to switch to text mode (`--text`).
## Using text mode for DNA data
Since both `--dna` and `--text` modes can be used for DNA data, which is better?
-Short answer: `--dna` is faster and has stronger compression.
-For details, see [this benchmark page](http://kirill-kryukov.com/study/naf/benchmark-text-vs-dna-Spur.html).
+Normally `--dna` should be preferred, as it's much faster than `--text`, and compression strength is similar.
+For the strongest possible compression, the choice depends on the data.
+With less repetitive data such as assembled genomes, `--dna` seems to give stronger compression
+([example benchmark](http://kirill-kryukov.com/study/naf/benchmark-text-vs-dna-Spur.html)).
+With repetitive data, `--text` is often better.
## Can it compress multiple files into single archive?
@@ -207,7 +221,7 @@ First you combine individual FASTA files into a single Multi-Multi-FASTA stream,
Example commands:
Compressing:
-`mumu.pl --dir 'Helicobacter' 'Helicobacter pylori*' | ennaf -22 --text -o Hp.nafnaf`
+`mumu.pl --dir 'Helicobacter' 'Helicobacter pylori*' | ennaf -22 --long 31 --text -o Hp.nafnaf`
Decompressing and unpacking:
`unnaf Hp.nafnaf | mumu.pl --unpack --dir 'Helicobacter'`
diff --git a/Decompress.md b/Decompress.md
index 041a925..148a3ce 100644
--- a/Decompress.md
+++ b/Decompress.md
@@ -68,6 +68,8 @@ Supported only for DNA and RNA sequences.
**--binary-stdout** - Set stdout stream to binary mode. Useful for piping decompressed sequences to md5sum on Windows.
+**--binary** - Shortcut for `--binary-stdout --binary-stderr`.
+
**-h**, **--help** - Show usage help.
**-V**, **--version** - Show version.
diff --git a/LICENSE b/LICENSE
index 22ac5c2..d93e6e5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-Copyright (c) 2018-2020 Kirill Kryukov
+Copyright (c) 2018-2021 Kirill Kryukov
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
diff --git a/Makefile b/Makefile
index 3065d97..c806529 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
export prefix = /usr/local
-.PHONY: default all test clean install uninstall
+.PHONY: default all test test-large clean install uninstall
default:
$(MAKE) -C zstd/lib ZSTD_LEGACY_SUPPORT=0 ZSTD_LIB_DEPRECATED=0 ZSTD_LIB_DICTBUILDER=0 libzstd.a
@@ -15,6 +15,9 @@ all: default
test:
$(MAKE) -C tests
+test-large:
+ $(MAKE) -C tests large
+
clean:
$(MAKE) -C ennaf clean
$(MAKE) -C unnaf clean
diff --git a/ennaf/src/compressor.c b/ennaf/src/compressor.c
index c9fcaca..c83c90b 100644
--- a/ennaf/src/compressor.c
+++ b/ennaf/src/compressor.c
@@ -1,21 +1,27 @@
/*
* NAF compressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
-
-static ZSTD_CStream* create_zstd_cstream(int level)
// Creates and initializes a zstd compression stream.
//   level           - compression level passed to ZSTD_initCStream().
//   window_size_log - when non-zero, long distance matching is enabled and the
//                     window log is set to this value; when 0, neither parameter is touched.
// Returns the initialized stream. Any zstd failure is reported through die().
+static ZSTD_CStream* create_zstd_cstream(int level, int window_size_log)
{
ZSTD_CStream *s = ZSTD_createCStream();
if (s == NULL) { die("ZSTD_createCStream() error\n"); }
+
+ if (window_size_log != 0)  // 0 means "no explicit window size requested"
+ {
+ ZSTD_TRY(ZSTD_CCtx_setParameter(s, ZSTD_c_enableLongDistanceMatching, 1));
+ ZSTD_TRY(ZSTD_CCtx_setParameter(s, ZSTD_c_windowLog, window_size_log));
+ }
+
// The parameters above are set before the stream is initialized with the level.
size_t const initResult = ZSTD_initCStream(s, level);
if (ZSTD_isError(initResult)) { die("ZSTD_initCStream() error: %s\n", ZSTD_getErrorName(initResult)); }
return s;
}
-static void compressor_init(compressor_t *w, const char *name)
+static void compressor_init(compressor_t *w, const char *name, int window_size_log)
{
assert(w != NULL);
assert(w->allocated == 0);
@@ -35,7 +41,7 @@ static void compressor_init(compressor_t *w, const char *name)
w->allocated = COMPRESSED_BUFFER_SIZE;
w->buf = (unsigned char *) malloc_or_die(w->allocated);
- w->cstream = create_zstd_cstream(compression_level);
+ w->cstream = create_zstd_cstream(compression_level, window_size_log);
w->path = (char *) malloc_or_die(temp_path_length + 1);
snprintf(w->path, temp_path_length, "%s/%s.%s", temp_dir, temp_prefix, name);
if (verbose) { msg("Temp %s file: \"%s\"\n", name, w->path); }
diff --git a/ennaf/src/encoders.c b/ennaf/src/encoders.c
index a69632f..74009b5 100644
--- a/ennaf/src/encoders.c
+++ b/ennaf/src/encoders.c
@@ -1,6 +1,6 @@
/*
* NAF compressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
diff --git a/ennaf/src/encoders.h b/ennaf/src/encoders.h
index 3db35df..25215c2 100644
--- a/ennaf/src/encoders.h
+++ b/ennaf/src/encoders.h
@@ -1,6 +1,6 @@
/*
* NAF compressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
diff --git a/ennaf/src/ennaf.c b/ennaf/src/ennaf.c
index e498b3d..79a7040 100644
--- a/ennaf/src/ennaf.c
+++ b/ennaf/src/ennaf.c
@@ -1,12 +1,12 @@
/*
* NAF compressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
-#define VERSION "1.2.0"
-#define DATE "2020-09-01"
-#define COPYRIGHT_YEARS "2018-2020"
+#define VERSION "1.3.0"
+#define DATE "2021-05-17"
+#define COPYRIGHT_YEARS "2018-2021"
#include "platform.h"
#include "encoders.h"
@@ -34,6 +34,7 @@ static bool force_stdout = false;
static bool created_output_file = false;
static int compression_level = 1;
+static int sequence_window_size_log = 0;
static char *temp_dir = NULL;
static char *dataset_name = NULL;
@@ -243,6 +244,35 @@ static void set_line_length(char *str)
}
+// Parses the value of the --long option and stores it in sequence_window_size_log.
+//   str - the option value as given on the command line; must not be NULL.
+// Dies if str is not a plain decimal integer. A value outside
+// [ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX] is clamped to the nearest bound,
+// and a warning is printed.
+static void set_sequence_window_size_log(char *str)
+{
+    assert(str != NULL);
+
+    char *end;
+    long long a = strtoll(str, &end, 10);
+    if (*end != '\0') { die("can't parse the value of --long argument\n"); }
+
+    // Round-trip check: printing the parsed number must reproduce the input exactly.
+    // This rejects any input that is not the canonical decimal form of the parsed value.
+    char test_str[21];
+    int nc = snprintf(test_str, sizeof(test_str), "%lld", a);
+    if (nc < 1 || nc > 20 || strcmp(test_str, str) != 0) { die("can't parse the value of --long argument\n"); }
+
+    if (a < ZSTD_WINDOWLOG_MIN)
+    {
+        warn("--long value of %lld is smaller than the lowest supported value %d, using %d instead\n", a, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MIN);
+        sequence_window_size_log = ZSTD_WINDOWLOG_MIN;
+    }
+    else if (a > ZSTD_WINDOWLOG_MAX)
+    {
+        warn("--long value of %lld is larger than the largest supported value %d, using %d instead\n", a, ZSTD_WINDOWLOG_MAX, ZSTD_WINDOWLOG_MAX);
+        sequence_window_size_log = ZSTD_WINDOWLOG_MAX;
+    }
+    else
+    {
+        // Within the supported range, so the narrowing cast is safe.
+        sequence_window_size_log = (int) a;
+    }
+}
+
+
static int parse_input_format(const char *str)
{
assert(str != NULL);
@@ -306,6 +336,7 @@ static void show_help(void)
" -o FILE - Write compressed output to FILE\n"
" -c - Write to standard output\n"
" -#, --level # - Use compression level # (from %d to %d, default: 1)\n"
+ " --long N - Use window of size 2^N for sequence stream (from %d to %d)\n"
" --temp-dir DIR - Use DIR as temporary directory\n"
" --name NAME - Use NAME as prefix for temporary files\n"
" --title TITLE - Store TITLE as dataset title\n"
@@ -322,7 +353,7 @@ static void show_help(void)
" --no-mask - Don't store mask\n"
" -h, --help - Show help\n"
" -V, --version - Show version\n",
- min_level, max_level);
+ min_level, max_level, ZSTD_WINDOWLOG_MIN, ZSTD_WINDOWLOG_MAX);
}
@@ -343,6 +374,7 @@ static void parse_command_line(int argc, char **argv)
if (!strcmp(argv[i], "--title")) { i++; set_dataset_title(argv[i]); continue; }
if (!strcmp(argv[i], "--level")) { i++; set_compression_level(argv[i]); continue; }
if (!strcmp(argv[i], "--line-length")) { i++; set_line_length(argv[i]); continue; }
+ if (!strcmp(argv[i], "--long")) { i++; set_sequence_window_size_log(argv[i]); continue; }
// Deprecated, undocumented.
if (!strcmp(argv[i], "--out")) { i++; set_output_file_path(argv[i]); continue; }
@@ -465,12 +497,13 @@ int main(int argc, char **argv)
}
make_temp_prefix();
- compressor_init(&IDS, "ids");
- compressor_init(&COMM, "comments");
- compressor_init(&LEN, "lengths");
- if (store_mask) { compressor_init(&MASK, "mask"); }
- compressor_init(&SEQ, "sequence");
- if (store_qual) { compressor_init(&QUAL, "quality"); }
+
+ compressor_init(&IDS, "ids", 0);
+ compressor_init(&COMM, "comments", 0);
+ compressor_init(&LEN, "lengths", 0);
+ if (store_mask) { compressor_init(&MASK, "mask", 0); }
+ compressor_init(&SEQ, "sequence", sequence_window_size_log);
+ if (store_qual) { compressor_init(&QUAL, "quality", 0); }
process();
close_input_file();
diff --git a/ennaf/src/files.c b/ennaf/src/files.c
index 2230715..5cf3521 100644
--- a/ennaf/src/files.c
+++ b/ennaf/src/files.c
@@ -1,6 +1,6 @@
/*
* NAF compressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
diff --git a/ennaf/src/platform.h b/ennaf/src/platform.h
index 7f9d8e6..54ca401 100644
--- a/ennaf/src/platform.h
+++ b/ennaf/src/platform.h
@@ -1,6 +1,6 @@
/*
* NAF compressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
diff --git a/ennaf/src/process.c b/ennaf/src/process.c
index c1aeab7..3ab63cc 100644
--- a/ennaf/src/process.c
+++ b/ennaf/src/process.c
@@ -1,6 +1,6 @@
/*
* NAF compressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*
* The FASTA/Q parser was originally based on Heng Li's kseq.h.
@@ -150,6 +150,18 @@ static inline void refill_in_buffer(void)
}
+// Returns the next input character without consuming it (in_begin is not advanced here).
+// When the buffered data is exhausted, the buffer is refilled first;
+// in_end == 0 after a refill is reported as INEOF.
+__attribute__((always_inline))
+static inline unsigned in_peek_char(void)
+{
+    if (in_begin < in_end) { return in_buffer[in_begin]; }
+
+    refill_in_buffer();
+    if (in_end == 0) { return INEOF; }
+
+    return in_buffer[in_begin];
+}
+
+
__attribute__((always_inline))
static inline unsigned in_get_char(void)
{
@@ -312,23 +324,27 @@ static void process_well_formed_fasta(void)
unsigned long long old_total_seq_size = seq_size_original + seq.length;
if (c != INEOF)
{
- unsigned long long old_len = old_total_seq_size;
- while ( (c = in_get_until_specific_char('\n', &seq)) != INEOF)
+ if (in_peek_char() == '>') { in_begin++; } // Empty sequence.
+ else
{
- unsigned long long new_len = seq_size_original + seq.length;
- if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; }
- old_len = new_len;
+ unsigned long long old_len = old_total_seq_size;
+ while ( (c = in_get_until_specific_char('\n', &seq)) != INEOF)
+ {
+ unsigned long long new_len = seq_size_original + seq.length;
+ if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; }
+ old_len = new_len;
- c = in_get_char();
- if (c == '>' || c == INEOF) { break; }
- else { in_begin--; }
- }
+ c = in_get_char();
+ if (c == '>' || c == INEOF) { break; }
+ else { in_begin--; }
+ }
- // If the last line is the longest, and has no end-of-line character, handle it correctly.
- if (c == INEOF)
- {
- unsigned long long new_len = seq_size_original + seq.length;
- if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; }
+ // If the last line is the longest, and has no end-of-line character, handle it correctly.
+ if (c == INEOF)
+ {
+ unsigned long long new_len = seq_size_original + seq.length;
+ if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; }
+ }
}
}
@@ -364,39 +380,43 @@ static void process_non_well_formed_fasta(void)
unsigned long long old_total_seq_size = seq_size_original + seq.length;
if (c != INEOF)
{
- unsigned long long old_len = old_total_seq_size;
- while ( (c = in_get_until(is_unexpected_arr, &seq)) != INEOF)
+ if (in_peek_char() == '>') { in_begin++; } // Empty sequence.
+ else
{
- if (is_eol_arr[c])
+ unsigned long long old_len = old_total_seq_size;
+ while ( (c = in_get_until(is_unexpected_arr, &seq)) != INEOF)
{
- unsigned long long new_len = seq_size_original + seq.length;
- if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; }
- old_len = new_len;
-
- c = in_get_char();
- if (!is_unexpected_arr[c]) { str_append_char(&seq, (unsigned char)c); continue; }
- else if (c == '>' || c == INEOF) { break; }
- else if (is_eol_arr[c])
+ if (is_eol_arr[c])
{
- while (c != INEOF && is_eol_arr[c]) { c = in_get_char(); }
- if (c == '>' || c == INEOF) { break; }
- else if (!is_unexpected_arr[c]) { str_append_char(&seq, (unsigned char)c); continue; }
+ unsigned long long new_len = seq_size_original + seq.length;
+ if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; }
+ old_len = new_len;
+
+ c = in_get_char();
+ if (!is_unexpected_arr[c]) { str_append_char(&seq, (unsigned char)c); continue; }
+ else if (c == '>' || c == INEOF) { break; }
+ else if (is_eol_arr[c])
+ {
+ while (c != INEOF && is_eol_arr[c]) { c = in_get_char(); }
+ if (c == '>' || c == INEOF) { break; }
+ else if (!is_unexpected_arr[c]) { str_append_char(&seq, (unsigned char)c); continue; }
+ else if (is_space_arr[c]) {}
+ else { unexpected_input_char(c); str_append_char(&seq, unexpected_seq_char_replacement); }
+ }
else if (is_space_arr[c]) {}
else { unexpected_input_char(c); str_append_char(&seq, unexpected_seq_char_replacement); }
}
else if (is_space_arr[c]) {}
+ else if (c == '>' && in_seq_type == seq_type_text) { str_append_char(&seq, (unsigned char)c); }
else { unexpected_input_char(c); str_append_char(&seq, unexpected_seq_char_replacement); }
}
- else if (is_space_arr[c]) {}
- else if (c == '>' && in_seq_type == seq_type_text) { str_append_char(&seq, (unsigned char)c); }
- else { unexpected_input_char(c); str_append_char(&seq, unexpected_seq_char_replacement); }
- }
- // If the last line is the longest, and has no end-of-line character, handle it correctly.
- if (c == INEOF)
- {
- unsigned long long new_len = seq_size_original + seq.length;
- if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; }
+ // If the last line is the longest, and has no end-of-line character, handle it correctly.
+ if (c == INEOF)
+ {
+ unsigned long long new_len = seq_size_original + seq.length;
+ if (new_len - old_len > longest_line_length) { longest_line_length = new_len - old_len; }
+ }
}
}
diff --git a/ennaf/src/tables.c b/ennaf/src/tables.c
index 324f6d9..a58aea9 100644
--- a/ennaf/src/tables.c
+++ b/ennaf/src/tables.c
@@ -1,6 +1,6 @@
/*
* NAF compressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
diff --git a/ennaf/src/utils.c b/ennaf/src/utils.c
index 23371a7..0f15731 100644
--- a/ennaf/src/utils.c
+++ b/ennaf/src/utils.c
@@ -1,6 +1,6 @@
/*
* NAF compressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
@@ -53,6 +53,13 @@ static void die(const char *format, ...)
}
+#define ZSTD_TRY(f) \
+do { \
+ size_t e = f; \
+ if (ZSTD_isError(e)) { die("zstd error: %s", ZSTD_getErrorName(e)); } \
+} while (0)
+
+
__attribute__ ((cold))
__attribute__ ((noreturn))
static void out_of_memory(const size_t size)
diff --git a/tests/Makefile b/tests/Makefile
index a407ccd..3c60f51 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -1,5 +1,5 @@
-.PHONY: default all clean
+.PHONY: default all large clean
export TMPDIR=temp
@@ -11,6 +11,12 @@ all:
@./test-runner.pl interface alphabet charcount small
@echo "Success!"
+large:
+ @diff -q Makefile Makefile
+ @mkdir -p temp
+ @./test-runner.pl large
+ @echo "Success!"
+
clean:
@rm -f */*.out
@rm -f */*.err
diff --git a/tests/interface/ennaf-version.err-ref b/tests/interface/ennaf-version.err-ref
index 3073b16..df71fb8 100644
--- a/tests/interface/ennaf-version.err-ref
+++ b/tests/interface/ennaf-version.err-ref
@@ -1,2 +1,2 @@
-ennaf - NAF compressor, version 1.2.0, 2020-09-01
-Copyright (c) 2018-2020 Kirill Kryukov
+ennaf - NAF compressor, version 1.3.0, 2021-05-17
+Copyright (c) 2018-2021 Kirill Kryukov
diff --git a/tests/interface/unnaf-version.err-ref b/tests/interface/unnaf-version.err-ref
index 628c12d..e2c76a7 100644
--- a/tests/interface/unnaf-version.err-ref
+++ b/tests/interface/unnaf-version.err-ref
@@ -1,2 +1,2 @@
-unnaf - NAF decompressor, version 1.2.0, 2020-09-01
-Copyright (c) 2018-2020 Kirill Kryukov
+unnaf - NAF decompressor, version 1.3.0, 2021-05-17
+Copyright (c) 2018-2021 Kirill Kryukov
diff --git a/tests/large/1-default-22-31.e.err-ref b/tests/large/1-default-22-31.e.err-ref
new file mode 100644
index 0000000..7009c6c
--- /dev/null
+++ b/tests/large/1-default-22-31.e.err-ref
@@ -0,0 +1,2 @@
+input has 1 unexpected DNA characters:
+ 'Z': 1
diff --git a/tests/large/1-default-22-31.out-ref b/tests/large/1-default-22-31.out-ref
new file mode 100644
index 0000000..5d2ae76
--- /dev/null
+++ b/tests/large/1-default-22-31.out-ref
@@ -0,0 +1,4 @@
+>1
+actgACGTnN
+>2 seq2
+a-tN-MY
diff --git a/tests/large/1-default-22-31.test b/tests/large/1-default-22-31.test
new file mode 100644
index 0000000..b45bb15
--- /dev/null
+++ b/tests/large/1-default-22-31.test
@@ -0,0 +1 @@
+ennaf -22 --long 31 {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err
diff --git a/tests/large/1-default-22-31.u.err-ref b/tests/large/1-default-22-31.u.err-ref
new file mode 100644
index 0000000..e69de29
diff --git a/tests/large/1.fa b/tests/large/1.fa
new file mode 100644
index 0000000..cf76544
--- /dev/null
+++ b/tests/large/1.fa
@@ -0,0 +1,4 @@
+>1
+actgACGTnN
+>2 seq2
+a-tZ-MY
diff --git a/tests/small/nodata1-default.e.err-ref b/tests/small/nodata1-default.e.err-ref
new file mode 100644
index 0000000..e69de29
diff --git a/tests/small/nodata1-default.out-ref b/tests/small/nodata1-default.out-ref
new file mode 100644
index 0000000..c6f41b0
--- /dev/null
+++ b/tests/small/nodata1-default.out-ref
@@ -0,0 +1,2 @@
+>1
+>2
diff --git a/tests/small/nodata1-default.test b/tests/small/nodata1-default.test
new file mode 100644
index 0000000..361ff7f
--- /dev/null
+++ b/tests/small/nodata1-default.test
@@ -0,0 +1 @@
+ennaf {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err
diff --git a/tests/small/nodata1-default.u.err-ref b/tests/small/nodata1-default.u.err-ref
new file mode 100644
index 0000000..e69de29
diff --git a/tests/small/nodata1.fa b/tests/small/nodata1.fa
new file mode 100644
index 0000000..c6f41b0
--- /dev/null
+++ b/tests/small/nodata1.fa
@@ -0,0 +1,2 @@
+>1
+>2
diff --git a/tests/small/nodata2-default.e.err-ref b/tests/small/nodata2-default.e.err-ref
new file mode 100644
index 0000000..e69de29
diff --git a/tests/small/nodata2-default.out-ref b/tests/small/nodata2-default.out-ref
new file mode 100644
index 0000000..ccc6ba7
--- /dev/null
+++ b/tests/small/nodata2-default.out-ref
@@ -0,0 +1,3 @@
+>1
+>2
+A
diff --git a/tests/small/nodata2-default.test b/tests/small/nodata2-default.test
new file mode 100644
index 0000000..361ff7f
--- /dev/null
+++ b/tests/small/nodata2-default.test
@@ -0,0 +1 @@
+ennaf {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err
diff --git a/tests/small/nodata2-default.u.err-ref b/tests/small/nodata2-default.u.err-ref
new file mode 100644
index 0000000..e69de29
diff --git a/tests/small/nodata2.fa b/tests/small/nodata2.fa
new file mode 100644
index 0000000..ccc6ba7
--- /dev/null
+++ b/tests/small/nodata2.fa
@@ -0,0 +1,3 @@
+>1
+>2
+A
diff --git a/tests/small/nodata3-default.e.err-ref b/tests/small/nodata3-default.e.err-ref
new file mode 100644
index 0000000..e69de29
diff --git a/tests/small/nodata3-default.out-ref b/tests/small/nodata3-default.out-ref
new file mode 100644
index 0000000..4a879d0
--- /dev/null
+++ b/tests/small/nodata3-default.out-ref
@@ -0,0 +1,5 @@
+>1
+A
+>2
+>3
+C
diff --git a/tests/small/nodata3-default.test b/tests/small/nodata3-default.test
new file mode 100644
index 0000000..361ff7f
--- /dev/null
+++ b/tests/small/nodata3-default.test
@@ -0,0 +1 @@
+ennaf {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err
diff --git a/tests/small/nodata3-default.u.err-ref b/tests/small/nodata3-default.u.err-ref
new file mode 100644
index 0000000..e69de29
diff --git a/tests/small/nodata3.fa b/tests/small/nodata3.fa
new file mode 100644
index 0000000..4a879d0
--- /dev/null
+++ b/tests/small/nodata3.fa
@@ -0,0 +1,5 @@
+>1
+A
+>2
+>3
+C
diff --git a/tests/small/nodata4-default.e.err-ref b/tests/small/nodata4-default.e.err-ref
new file mode 100644
index 0000000..e69de29
diff --git a/tests/small/nodata4-default.out-ref b/tests/small/nodata4-default.out-ref
new file mode 100644
index 0000000..b74e559
--- /dev/null
+++ b/tests/small/nodata4-default.out-ref
@@ -0,0 +1,3 @@
+>1
+A
+>2
diff --git a/tests/small/nodata4-default.test b/tests/small/nodata4-default.test
new file mode 100644
index 0000000..361ff7f
--- /dev/null
+++ b/tests/small/nodata4-default.test
@@ -0,0 +1 @@
+ennaf {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err
diff --git a/tests/small/nodata4-default.u.err-ref b/tests/small/nodata4-default.u.err-ref
new file mode 100644
index 0000000..e69de29
diff --git a/tests/small/nodata4.fa b/tests/small/nodata4.fa
new file mode 100644
index 0000000..b74e559
--- /dev/null
+++ b/tests/small/nodata4.fa
@@ -0,0 +1,3 @@
+>1
+A
+>2
diff --git a/tests/small/noname-default.e.err-ref b/tests/small/noname-default.e.err-ref
new file mode 100644
index 0000000..e69de29
diff --git a/tests/small/noname-default.out-ref b/tests/small/noname-default.out-ref
new file mode 100644
index 0000000..89417fa
--- /dev/null
+++ b/tests/small/noname-default.out-ref
@@ -0,0 +1,2 @@
+>
+AAGA
diff --git a/tests/small/noname-default.test b/tests/small/noname-default.test
new file mode 100644
index 0000000..361ff7f
--- /dev/null
+++ b/tests/small/noname-default.test
@@ -0,0 +1 @@
+ennaf {GROUP}.fa 2>{TEST}.e.err | unnaf >{TEST}.out 2>{TEST}.u.err
diff --git a/tests/small/noname-default.u.err-ref b/tests/small/noname-default.u.err-ref
new file mode 100644
index 0000000..e69de29
diff --git a/tests/small/noname.fa b/tests/small/noname.fa
new file mode 100644
index 0000000..89417fa
--- /dev/null
+++ b/tests/small/noname.fa
@@ -0,0 +1,2 @@
+>
+AAGA
diff --git a/tests/test-runner.pl b/tests/test-runner.pl
index f6f33e2..1d61d3b 100755
--- a/tests/test-runner.pl
+++ b/tests/test-runner.pl
@@ -1,7 +1,7 @@
#!/usr/bin/env perl
#
# Test runner script
-# Copyright (c) 2018-2019 Kirill Kryukov
+# Copyright (c) 2018-2021 Kirill Kryukov
# See README.md and LICENSE files of this repository
#
diff --git a/unnaf/src/files.c b/unnaf/src/files.c
index a975153..e726e19 100644
--- a/unnaf/src/files.c
+++ b/unnaf/src/files.c
@@ -1,10 +1,9 @@
/*
* NAF decompressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
-
static void open_input_file(void)
{
assert(IN == NULL);
diff --git a/unnaf/src/input.c b/unnaf/src/input.c
index dd94368..dab6a2e 100644
--- a/unnaf/src/input.c
+++ b/unnaf/src/input.c
@@ -1,6 +1,6 @@
/*
* NAF decompressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
@@ -268,6 +268,8 @@ static size_t initialize_input_decompression(void)
input_decompression_stream = ZSTD_createDStream();
if (!input_decompression_stream) { die("can't create input decompression stream\n"); }
+ ZSTD_TRY(ZSTD_DCtx_setParameter(input_decompression_stream, ZSTD_d_windowLogMax, ZSTD_WINDOWLOG_MAX));
+
size_t bytes_to_read = ZSTD_initDStream(input_decompression_stream);
if (ZSTD_isError(bytes_to_read)) { die("can't initialize input decompression stream: %s\n", ZSTD_getErrorName(bytes_to_read)); }
@@ -298,6 +300,8 @@ static void initialize_memory_decompression(void)
memory_decompression_stream = ZSTD_createDStream();
if (!memory_decompression_stream) { die("can't create memory decompression stream\n"); }
+ ZSTD_TRY(ZSTD_DCtx_setParameter(memory_decompression_stream, ZSTD_d_windowLogMax, ZSTD_WINDOWLOG_MAX));
+
memory_bytes_to_read = ZSTD_initDStream(memory_decompression_stream);
if (ZSTD_isError(memory_bytes_to_read)) { die("can't initialize memory decompression stream: %s\n", ZSTD_getErrorName(memory_bytes_to_read)); }
}
diff --git a/unnaf/src/output-fastq.c b/unnaf/src/output-fastq.c
index 50b924c..a7cbdcf 100644
--- a/unnaf/src/output-fastq.c
+++ b/unnaf/src/output-fastq.c
@@ -1,6 +1,6 @@
/*
* NAF decompressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
diff --git a/unnaf/src/output-sequences.c b/unnaf/src/output-sequences.c
index 9e632e9..fd035da 100644
--- a/unnaf/src/output-sequences.c
+++ b/unnaf/src/output-sequences.c
@@ -1,6 +1,6 @@
/*
* NAF decompressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
diff --git a/unnaf/src/output.c b/unnaf/src/output.c
index 5854586..4c716ee 100644
--- a/unnaf/src/output.c
+++ b/unnaf/src/output.c
@@ -1,10 +1,9 @@
/*
* NAF decompressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
-
static void print_list_of_parts(void)
{
int printed = 0;
@@ -388,10 +387,24 @@ static inline void print_dna_buffer_as_fasta(int masking)
total_seq_n_bp_remaining -= cur_seq_len_n_bp_remaining;
}
- if (lengths_buffer[cur_seq_len_index] != 4294967295u)
+ if (lengths_buffer[cur_seq_len_index] == 4294967295u)
+ {
+ cur_seq_len_index++;
+ }
+ else
{
fputc('\n', OUT);
+ cur_seq_len_index++;
cur_seq_index++;
+
+ // Print empty sequences without empty lines.
+ while (cur_seq_len_index < n_lengths && cur_seq_index < N && lengths_buffer[cur_seq_len_index] == 0)
+ {
+ print_fasta_name(cur_seq_index);
+ cur_seq_len_index++;
+ cur_seq_index++;
+ }
+
if (cur_seq_index < N)
{
print_fasta_name(cur_seq_index);
@@ -399,7 +412,6 @@ static inline void print_dna_buffer_as_fasta(int masking)
}
}
- cur_seq_len_index++;
if (cur_seq_len_index >= n_lengths) { break; }
cur_seq_len_n_bp_remaining = lengths_buffer[cur_seq_len_index];
@@ -607,10 +619,18 @@ static void print_fasta(int masking)
total_seq_length = read_number(IN);
compressed_seq_size = read_number(IN);
total_seq_n_bp_remaining = total_seq_length;
- cur_seq_len_n_bp_remaining = lengths_buffer[0];
- print_fasta_name(0);
+ while (cur_seq_len_index < n_lengths && cur_seq_index < N && lengths_buffer[cur_seq_len_index] == 0)
+ {
+ print_fasta_name(cur_seq_index);
+ cur_seq_len_index++;
+ cur_seq_index++;
+ }
+ if (cur_seq_index >= N) { return; }
+
+ print_fasta_name(cur_seq_index);
cur_line_n_bp_remaining = max_line_length;
+ cur_seq_len_n_bp_remaining = lengths_buffer[cur_seq_len_index];
size_t bytes_to_read = initialize_input_decompression();
size_t input_size;
diff --git a/unnaf/src/platform.h b/unnaf/src/platform.h
index d44e44d..5a6f792 100644
--- a/unnaf/src/platform.h
+++ b/unnaf/src/platform.h
@@ -1,6 +1,6 @@
/*
* NAF decompressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
diff --git a/unnaf/src/unnaf.c b/unnaf/src/unnaf.c
index eba28be..18b6ca9 100644
--- a/unnaf/src/unnaf.c
+++ b/unnaf/src/unnaf.c
@@ -1,12 +1,12 @@
/*
* NAF decompressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
-#define VERSION "1.2.0"
-#define DATE "2020-09-01"
-#define COPYRIGHT_YEARS "2018-2020"
+#define VERSION "1.3.0"
+#define DATE "2021-05-17"
+#define COPYRIGHT_YEARS "2018-2021"
#include "platform.h"
@@ -270,7 +270,9 @@ static void show_help(void)
" -c - Write to standard output\n"
" --line-length N - Use lines of width N for FASTA output\n"
" --no-mask - Ignore mask\n"
- " --binary - Binary output (no 0D 0A on Windows)\n"
+ " --binary-stdout - Set stdout stream to binary mode.\n"
+ " --binary-stderr - Set stderr stream to binary mode.\n"
+ " --binary - Shortcut for \"--binary-stdout --binary-stderr\"\n"
" -h, --help - Show help\n"
" -V, --version - Show version\n"
);
@@ -311,6 +313,7 @@ static void parse_command_line(int argc, char **argv)
if (!strcmp(argv[i], "--no-mask")) { use_mask = false; continue; }
if (!strcmp(argv[i], "--binary-stdout")) { binary_stdout = true; continue; }
if (!strcmp(argv[i], "--binary-stderr")) { if (!binary_stderr) { binary_stderr = true; change_stderr_to_binary(); } continue; }
+ if (!strcmp(argv[i], "--binary")) { binary_stdout = true; if (!binary_stderr) { binary_stderr = true; change_stderr_to_binary(); } continue; }
if (!strcmp(argv[i], "--help")) { show_help(); exit(0); }
if (!strcmp(argv[i], "--verbose")) { verbose = true; continue; }
if (!strcmp(argv[i], "--version")) { print_version = true; continue; }
diff --git a/unnaf/src/utils.c b/unnaf/src/utils.c
index 0a7f6c5..42828df 100644
--- a/unnaf/src/utils.c
+++ b/unnaf/src/utils.c
@@ -1,10 +1,9 @@
/*
* NAF decompressor
- * Copyright (c) 2018-2020 Kirill Kryukov
+ * Copyright (c) 2018-2021 Kirill Kryukov
* See README.md and LICENSE files of this repository
*/
-
//__attribute__ ((format (printf, 1, 2)))
static void msg(const char *format, ...)
{
@@ -41,6 +40,13 @@ static void die(const char *format, ...)
}
+#define ZSTD_TRY(f) \
+do { \
+ size_t e = f; \
+ if (ZSTD_isError(e)) { die("zstd error: %s", ZSTD_getErrorName(e)); } \
+} while (0)
+
+
__attribute__ ((cold))
__attribute__ ((noreturn))
static inline void incomplete(void)
diff --git a/zstd b/zstd
index b706286..a488ba1 160000
--- a/zstd
+++ b/zstd
@@ -1 +1 @@
-Subproject commit b706286adbba780006a47ef92df0ad7a785666b6
+Subproject commit a488ba114ec17ea1054b9057c26a046fc122b3b6