diff --git a/.gitignore b/.gitignore
index f0884ff..c2a1047 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,11 +14,10 @@
*.directory
unikmer/unikmer*
unikmer/binaries*
-unikmer/*.unik
doc/site/*
*ssshtest
-testdata/*.unik
+*.unik
t_*
*.nextflow.log*
*.brename_detail.txt
diff --git a/README.md b/README.md
index 0a4e0e0..9debfd7 100644
--- a/README.md
+++ b/README.md
@@ -183,57 +183,78 @@ label |encoded-kmera|gzip-compressedb|compact-fo
# counting (only keep the canonical k-mers and compact output)
# memusg -t unikmer count -k 23 Ecoli-IAI39.fasta.gz -o Ecoli-IAI39.fasta.gz.k23 --canonical --compact
$ memusg -t unikmer count -k 23 Ecoli-MG1655.fasta.gz -o Ecoli-MG1655.fasta.gz.k23 --canonical --compact
- elapsed time: 1.088s
- peak rss: 210.93 MB
-
+ elapsed time: 0.897s
+ peak rss: 192.41 MB
+
# counting (only keep the canonical k-mers and sort k-mers)
- # memusg -t unikmer count -k 23 Ecoli-IAI39.fasta.gz -o Ecoli-IAI39.fasta.gz.k23.sorted --canonical --compact --sort
- $ memusg -t unikmer count -k 23 Ecoli-MG1655.fasta.gz -o Ecoli-MG1655.fasta.gz.k23.sorted --canonical --compact --sort
- elapsed time: 2.063s
- peak rss: 337.55 MB
+ # memusg -t unikmer count -k 23 Ecoli-IAI39.fasta.gz -o Ecoli-IAI39.fasta.gz.k23.sorted --canonical --sort
+ $ memusg -t unikmer count -k 23 Ecoli-MG1655.fasta.gz -o Ecoli-MG1655.fasta.gz.k23.sorted --canonical --sort
+ elapsed time: 1.136s
+ peak rss: 227.28 MB
# counting and assigning global TaxIds
- $ unikmer count -k 23 -K -c -s Ecoli-IAI39.fasta.gz -o Ecoli-IAI39.fasta.gz.k23.sorted -t 585057
- $ unikmer count -k 23 -K -c -s Ecoli-MG1655.fasta.gz -o Ecoli-MG1655.fasta.gz.k23.sorted -t 511145
- $ unikmer count -k 23 -K -c -s A.muciniphila-ATCC_BAA-835.fasta.gz -o A.muciniphila-ATCC_BAA-835.fasta.gz.sorted -t 349741
-
+ $ unikmer count -k 23 -K -s Ecoli-IAI39.fasta.gz -o Ecoli-IAI39.fasta.gz.k23.sorted -t 585057
+ $ unikmer count -k 23 -K -s Ecoli-MG1655.fasta.gz -o Ecoli-MG1655.fasta.gz.k23.sorted -t 511145
+ $ unikmer count -k 23 -K -s A.muciniphila-ATCC_BAA-835.fasta.gz -o A.muciniphila-ATCC_BAA-835.fasta.gz.sorted -t 349741
+
+ # counting minimizers and outputting in linear order
+ $ unikmer count -k 23 -W 5 -H -K -l A.muciniphila-ATCC_BAA-835.fasta.gz -o A.muciniphila-ATCC_BAA-835.fasta.gz.m
# view
- $ unikmer view Ecoli-MG1655.fasta.gz.k23.sorted.unik --show-TaxId | head -n 3
+ $ unikmer view Ecoli-MG1655.fasta.gz.k23.sorted.unik --show-taxid | head -n 3
AAAAAAAAACCATCCAAATCTGG 511145
AAAAAAAAACCGCTAGTATATTC 511145
AAAAAAAAACCTGAAAAAAACGG 511145
-
+
+ # view (hashed k-mers need the original FASTA/Q file)
+ $ unikmer view --show-code --genome A.muciniphila-ATCC_BAA-835.fasta.gz A.muciniphila-ATCC_BAA-835.fasta.gz.m.unik | head -n 3
+ CATCCGCCATCTTTGGGGTGTCG 1210726578792
+ AGCGCAAAATCCCCAAACATGTA 2286899379883
+ AACTGATTTTTGATGATGACTCC 3542156397282
+
+ # find the positions of k-mers
+ $ seqkit locate -M A.muciniphila-ATCC_BAA-835.fasta.gz \
+ -f <(unikmer view -a -g A.muciniphila-ATCC_BAA-835.fasta.gz A.muciniphila-ATCC_BAA-835.fasta.gz.m.unik | seqkit head -n 5 ) \
+ | csvtk sort -t -k start:n | head -n 6 | csvtk pretty -t
+ seqID patternName pattern strand start end
+ ----------- ------------------- ----------------------- ------ ----- ---
+ NC_010655.1 2090893901864583115 ATCTTATAAAATAACCACATAAC + 3 25
+ NC_010655.1 696051979077366638 TTATAAAATAACCACATAACTTA + 6 28
+ NC_010655.1 390297872016815006 TATAAAATAACCACATAACTTAA + 7 29
+ NC_010655.1 2582400417208090837 AAAATAACCACATAACTTAAAAA + 10 32
+ NC_010655.1 3048591415312050785 TAACCACATAACTTAAAAAGAAT + 14 36
# stats
$ unikmer stats *.unik -a -j 10
- file k gzipped compact canonical sorted include-TaxId global-TaxId number
- A.muciniphila-ATCC_BAA-835.fasta.gz.sorted.unik 23 ✓ ✕ ✓ ✓ ✕ 349741 2,630,905
- Ecoli-IAI39.fasta.gz.k23.sorted.unik 23 ✓ ✕ ✓ ✓ ✕ 585057 4,902,266
- Ecoli-IAI39.fasta.gz.k23.unik 23 ✓ ✓ ✓ ✕ ✕ 4,902,266
- Ecoli-MG1655.fasta.gz.k23.sorted.unik 23 ✓ ✕ ✓ ✓ ✕ 511145 4,546,632
- Ecoli-MG1655.fasta.gz.k23.unik 23 ✓ ✓ ✓ ✕ ✕ 4,546,632
-
+ file k canonical hashed scaled include-taxid global-taxid sorted compact gzipped version number description
+ A.muciniphila-ATCC_BAA-835.fasta.gz.m.unik 23 ✓ ✓ ✕ ✕ ✕ ✕ ✓ v5.0 860,900
+ A.muciniphila-ATCC_BAA-835.fasta.gz.sorted.unik 23 ✓ ✕ ✕ ✕ 349741 ✓ ✕ ✓ v5.0 2,630,905
+ Ecoli-IAI39.fasta.gz.k23.sorted.unik 23 ✓ ✕ ✕ ✕ 585057 ✓ ✕ ✓ v5.0 4,902,266
+ Ecoli-IAI39.fasta.gz.k23.unik 23 ✓ ✕ ✕ ✕ ✕ ✓ ✓ v5.0 4,902,266
+ Ecoli-MG1655.fasta.gz.k23.sorted.unik 23 ✓ ✕ ✕ ✕ 511145 ✓ ✕ ✓ v5.0 4,546,632
+ Ecoli-MG1655.fasta.gz.k23.unik 23 ✓ ✕ ✕ ✕ ✕ ✓ ✓ v5.0 4,546,632
+
# concat
$ memusg -t unikmer concat *.k23.sorted.unik -o concat.k23 -c
- elapsed time: 1.205s
- peak rss: 60.07 MB
+ elapsed time: 1.020s
+ peak rss: 25.86 MB
+
# union
- $ memusg -t unikmer union *.k23.sorted.unik -o union.k23 -c -s
- elapsed time: 5.449s
- peak rss: 709.93 MB
+ $ memusg -t unikmer union *.k23.sorted.unik -o union.k23 -s
+ elapsed time: 3.991s
+ peak rss: 590.92 MB
# or sorting with limited memory.
# note that taxonomy database need some memory.
$ memusg -t unikmer sort *.k23.sorted.unik -o union2.k23 -u -m 1M
- elapsed time: 4.474s
- peak rss: 333.82 MB
+ elapsed time: 3.538s
+ peak rss: 324.2 MB
$ unikmer view -t union.k23.unik | md5sum
4c038832209278840d4d75944b29219c -
@@ -243,20 +264,20 @@ label |encoded-kmera|gzip-compressedb|compact-fo
# duplicated k-mers
$ memusg -t unikmer sort *.k23.sorted.unik -o dup.k23 -d -m 1M
- elapsed time: 4.374s
- peak rss: 306.06 MB
+ elapsed time: 1.143s
+ peak rss: 240.18 MB
# intersection
- $ memusg -t unikmer inter *.k23.sorted.unik -o inter.k23 -c -s
- elapsed time: 2.506s
- peak rss: 194.94 MB
+ $ memusg -t unikmer inter *.k23.sorted.unik -o inter.k23
+ elapsed time: 1.481s
+ peak rss: 399.94 MB
# difference
- $ memusg -t unikmer diff -j 10 *.k23.sorted.unik -o diff.k23 -c -s
- elapsed time: 2.179s
- peak rss: 177.79 MB
+ $ memusg -t unikmer diff -j 10 *.k23.sorted.unik -o diff.k23 -s
+ elapsed time: 0.793s
+ peak rss: 338.06 MB
$ ls -lh *.unik
@@ -274,18 +295,19 @@ label |encoded-kmera|gzip-compressedb|compact-fo
$ unikmer stats *.unik -a -j 10
- file k gzipped compact canonical sorted include-TaxId global-TaxId number
- A.muciniphila-ATCC_BAA-835.fasta.gz.sorted.unik 23 ✓ ✕ ✓ ✓ ✕ 349741 2,630,905
- concat.k23.unik 23 ✓ ✓ ✓ ✕ ✓ 9,448,898
- diff.k23.unik 23 ✓ ✕ ✓ ✓ ✓ 2,326,096
- dup.k23.unik 23 ✓ ✕ ✓ ✓ ✓ 2,576,169
- Ecoli-IAI39.fasta.gz.k23.sorted.unik 23 ✓ ✕ ✓ ✓ ✕ 585057 4,902,266
- Ecoli-IAI39.fasta.gz.k23.unik 23 ✓ ✓ ✓ ✕ ✕ 4,902,266
- Ecoli-MG1655.fasta.gz.k23.sorted.unik 23 ✓ ✕ ✓ ✓ ✕ 511145 4,546,632
- Ecoli-MG1655.fasta.gz.k23.unik 23 ✓ ✓ ✓ ✕ ✕ 4,546,632
- inter.k23.unik 23 ✓ ✕ ✓ ✓ ✓ 2,576,170
- union2.k23.unik 23 ✓ ✕ ✓ ✓ ✓ 6,872,728
- union.k23.unik 23 ✓ ✕ ✓ ✓ ✓ 6,872,728
+ file k canonical hashed scaled include-taxid global-taxid sorted compact gzipped version number description
+ A.muciniphila-ATCC_BAA-835.fasta.gz.m.unik 23 ✓ ✓ ✕ ✕ ✕ ✕ ✓ v5.0 860,900
+ A.muciniphila-ATCC_BAA-835.fasta.gz.sorted.unik 23 ✓ ✕ ✕ ✕ 349741 ✓ ✕ ✓ v5.0 2,630,905
+ concat.k23.unik 23 ✓ ✕ ✕ ✓ ✕ ✓ ✓ v5.0 -1
+ diff.k23.unik 23 ✓ ✕ ✕ ✓ ✕ ✕ ✓ v5.0 2,326,096
+ dup.k23.unik 23 ✓ ✕ ✕ ✓ ✓ ✕ ✓ v5.0 0
+ Ecoli-IAI39.fasta.gz.k23.sorted.unik 23 ✓ ✕ ✕ ✕ 585057 ✓ ✕ ✓ v5.0 4,902,266
+ Ecoli-IAI39.fasta.gz.k23.unik 23 ✓ ✕ ✕ ✕ ✕ ✓ ✓ v5.0 4,902,266
+ Ecoli-MG1655.fasta.gz.k23.sorted.unik 23 ✓ ✕ ✕ ✕ 511145 ✓ ✕ ✓ v5.0 4,546,632
+ Ecoli-MG1655.fasta.gz.k23.unik 23 ✓ ✕ ✕ ✕ ✕ ✓ ✓ v5.0 4,546,632
+ inter.k23.unik 23 ✓ ✕ ✕ ✓ ✓ ✕ ✓ v5.0 2,576,170
+ union2.k23.unik 23 ✓ ✕ ✕ ✓ ✓ ✕ ✓ v5.0 6,872,728
+ union.k23.unik 23 ✓ ✕ ✕ ✓ ✓ ✕ ✓ v5.0 6,872,728
# -----------------------------------------------------------------------------------------
diff --git a/unikmer/cmd/concat.go b/unikmer/cmd/concat.go
index a13f1cf..39899a7 100644
--- a/unikmer/cmd/concat.go
+++ b/unikmer/cmd/concat.go
@@ -64,7 +64,7 @@ Attentions:
sortedKmers := getFlagBool(cmd, "sorted")
globalTaxid := getFlagUint32(cmd, "taxid")
hasGlobalTaxid := globalTaxid > 0
- number := getFlagUint64(cmd, "number")
+ number := uint64(getFlagInt64(cmd, "number"))
if hasGlobalTaxid && opt.Verbose {
log.Warningf("discarding all taxids and assigning new global taxid: %d", globalTaxid)