Skip to content

Commit

Permalink
faster sorting
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Oct 23, 2020
1 parent cb70453 commit a0f6ac1
Show file tree
Hide file tree
Showing 9 changed files with 61 additions and 26 deletions.
3 changes: 2 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
- v0.13.0
- new command `unikmer common`: find k-mers shared by most of the given binary files.
- `unikmer common/count/diff/grep/rfilter/sort/split/union`: faster sorting.
- `unikmer uniqs`: better result for flag `--circular`.
- `unikmer search`: fix a bug when searching on database with >1 hashes.
- `unikmer search`: fix a bug when searching on database with more than one hash.
- v0.12.0
- `unikmer`:
- support longer k (k>32) by saving ntHash.
Expand Down
7 changes: 5 additions & 2 deletions unikmer/cmd/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@ import (
"io"
"os"
"runtime"
"sort"

"github.com/pkg/errors"
"github.com/shenwei356/unikmer"
"github.com/spf13/cobra"
"github.com/twotwotwo/sorts"
"github.com/twotwotwo/sorts/sortutil"
)

var commonCmd = &cobra.Command{
Expand All @@ -57,6 +58,7 @@ Tips:
Run: func(cmd *cobra.Command, args []string) {
opt := getOptions(cmd)
runtime.GOMAXPROCS(opt.NumCPUs)
sorts.MaxProcs = opt.NumCPUs

var err error

Expand Down Expand Up @@ -339,7 +341,8 @@ Tips:
log.Infof("no shared k-mers found")
}

sort.Sort(unikmer.CodeSlice(codes))
// sort.Sort(unikmer.CodeSlice(codes))
sortutil.Uint64s(codes)

if hasTaxid || hasMixTaxid {
for _, code := range codes {
Expand Down
7 changes: 5 additions & 2 deletions unikmer/cmd/count.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@ import (
"io"
"regexp"
"runtime"
"sort"
"strconv"

"github.com/pkg/errors"
"github.com/shenwei356/bio/seq"
"github.com/shenwei356/bio/seqio/fastx"
"github.com/shenwei356/unikmer"
"github.com/spf13/cobra"
"github.com/twotwotwo/sorts"
"github.com/twotwotwo/sorts/sortutil"
)

var countCmd = &cobra.Command{
Expand All @@ -44,6 +45,7 @@ var countCmd = &cobra.Command{
Run: func(cmd *cobra.Command, args []string) {
opt := getOptions(cmd)
runtime.GOMAXPROCS(opt.NumCPUs)
sorts.MaxProcs = opt.NumCPUs
seq.ValidateSeq = false

var err error
Expand Down Expand Up @@ -410,7 +412,8 @@ var countCmd = &cobra.Command{
if opt.Verbose {
log.Infof("sorting %d k-mers", len(codes))
}
sort.Sort(unikmer.CodeSlice(codes))
// sort.Sort(unikmer.CodeSlice(codes))
sortutil.Uint64s(codes)
if opt.Verbose {
log.Infof("done sorting")
}
Expand Down
7 changes: 5 additions & 2 deletions unikmer/cmd/diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@ import (
"io"
"os"
"runtime"
"sort"
"sync"

"github.com/pkg/errors"
"github.com/shenwei356/unikmer"
"github.com/spf13/cobra"
"github.com/twotwotwo/sorts"
"github.com/twotwotwo/sorts/sortutil"
)

var diffCmd = &cobra.Command{
Expand Down Expand Up @@ -81,6 +82,7 @@ Tips:
threads := opt.NumCPUs

runtime.GOMAXPROCS(threads)
sorts.MaxProcs = opt.NumCPUs

mc := make([]unikmer.CodeTaxid, 0, mapInitSize)

Expand Down Expand Up @@ -585,7 +587,8 @@ Tips:
if opt.Verbose {
log.Infof("sorting %d k-mers", len(codes))
}
sort.Sort(unikmer.CodeSlice(codes))
// sort.Sort(unikmer.CodeSlice(codes))
sortutil.Uint64s(codes)
if opt.Verbose {
log.Infof("done sorting")
}
Expand Down
16 changes: 11 additions & 5 deletions unikmer/cmd/grep.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ import (
"os"
"path/filepath"
"runtime"
"sort"
"strconv"
"sync"

Expand All @@ -37,6 +36,8 @@ import (
"github.com/shenwei356/unikmer"
"github.com/shenwei356/util/pathutil"
"github.com/spf13/cobra"
"github.com/twotwotwo/sorts"
"github.com/twotwotwo/sorts/sortutil"
)

var grepCmd = &cobra.Command{
Expand All @@ -58,6 +59,7 @@ Tips:
Run: func(cmd *cobra.Command, args []string) {
opt := getOptions(cmd)
runtime.GOMAXPROCS(opt.NumCPUs)
sorts.MaxProcs = opt.NumCPUs

var err error

Expand Down Expand Up @@ -681,12 +683,14 @@ Tips:
if opt.Verbose {
log.Infof("[file %d/%d] sorting %d k-mers", i+1, nfiles, len(_codesTaxids))
}
sort.Sort(unikmer.CodeTaxidSlice(_codesTaxids))
// sort.Sort(unikmer.CodeTaxidSlice(_codesTaxids))
sorts.Quicksort(unikmer.CodeTaxidSlice(_codesTaxids))
} else {
if opt.Verbose {
log.Infof("[file %d/%d] sorting %d k-mers", i+1, nfiles, len(_codes))
}
sort.Sort(unikmer.CodeSlice(_codes))
// sort.Sort(unikmer.CodeSlice(_codes))
sortutil.Uint64s(_codes)
}

if opt.Verbose {
Expand Down Expand Up @@ -787,12 +791,14 @@ Tips:
if opt.Verbose {
log.Infof("sorting %d k-mers", len(codesTaxids))
}
sort.Sort(unikmer.CodeTaxidSlice(codesTaxids))
// sort.Sort(unikmer.CodeTaxidSlice(codesTaxids))
sorts.Quicksort(unikmer.CodeTaxidSlice(codesTaxids))
} else {
if opt.Verbose {
log.Infof("sorting %d k-mers", len(codes))
}
sort.Sort(unikmer.CodeSlice(codes))
// sort.Sort(unikmer.CodeSlice(codes))
sortutil.Uint64s(codes)
}
if opt.Verbose {
log.Infof("done sorting")
Expand Down
2 changes: 2 additions & 0 deletions unikmer/cmd/rfilter.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
"github.com/shenwei356/util/pathutil"
"github.com/shenwei356/util/stringutil"
"github.com/spf13/cobra"
"github.com/twotwotwo/sorts"
)

// rfilterCmd represents
Expand All @@ -61,6 +62,7 @@ Rank file:
Run: func(cmd *cobra.Command, args []string) {
opt := getOptions(cmd)
runtime.GOMAXPROCS(opt.NumCPUs)
sorts.MaxProcs = opt.NumCPUs

var err error

Expand Down
22 changes: 15 additions & 7 deletions unikmer/cmd/sort.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,14 @@ import (
"os"
"path/filepath"
"runtime"
"sort"
"sync"

"github.com/pkg/errors"
"github.com/shenwei356/unikmer"
"github.com/shenwei356/util/pathutil"
"github.com/spf13/cobra"
"github.com/twotwotwo/sorts"
"github.com/twotwotwo/sorts/sortutil"
)

var sortCmd = &cobra.Command{
Expand All @@ -60,6 +61,7 @@ Tips:
Run: func(cmd *cobra.Command, args []string) {
opt := getOptions(cmd)
runtime.GOMAXPROCS(opt.NumCPUs)
sorts.MaxProcs = opt.NumCPUs

outFile0 := getFlagString(cmd, "out-prefix")
unique := getFlagBool(cmd, "unique")
Expand Down Expand Up @@ -261,12 +263,14 @@ Tips:
if opt.Verbose {
log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(mt))
}
sort.Sort(unikmer.CodeTaxidSlice(mt))
// sort.Sort(unikmer.CodeTaxidSlice(mt))
sorts.Quicksort(unikmer.CodeTaxidSlice(mt))
} else {
if opt.Verbose {
log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(m))
}
sort.Sort(unikmer.CodeSlice(m))
// sort.Sort(unikmer.CodeSlice(m))
sortutil.Uint64s(m)
}
if opt.Verbose {
log.Infof("[chunk %d] done sorting", iTmpFile)
Expand Down Expand Up @@ -322,12 +326,14 @@ Tips:
if opt.Verbose {
log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(mt))
}
sort.Sort(unikmer.CodeTaxidSlice(mt))
// sort.Sort(unikmer.CodeTaxidSlice(mt))
sorts.Quicksort(unikmer.CodeTaxidSlice(mt))
} else {
if opt.Verbose {
log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(m))
}
sort.Sort(unikmer.CodeSlice(m))
// sort.Sort(unikmer.CodeSlice(m))
sortutil.Uint64s(m)
}
if opt.Verbose {
log.Infof("[chunk %d] done sorting", iTmpFile)
Expand Down Expand Up @@ -446,12 +452,14 @@ Tips:
if opt.Verbose {
log.Infof("sorting %d k-mers", len(mt))
}
sort.Sort(unikmer.CodeTaxidSlice(mt))
// sort.Sort(unikmer.CodeTaxidSlice(mt))
sorts.Quicksort(unikmer.CodeTaxidSlice(mt))
} else {
if opt.Verbose {
log.Infof("sorting %d k-mers", len(m))
}
sort.Sort(unikmer.CodeSlice(m))
// sort.Sort(unikmer.CodeSlice(m))
sortutil.Uint64s(m)
}
if opt.Verbose {
log.Infof("done sorting")
Expand Down
13 changes: 9 additions & 4 deletions unikmer/cmd/split.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@ import (
"os"
"path/filepath"
"runtime"
"sort"
"sync"

"github.com/pkg/errors"
"github.com/shenwei356/util/pathutil"
"github.com/twotwotwo/sorts"
"github.com/twotwotwo/sorts/sortutil"

"github.com/shenwei356/unikmer"
"github.com/spf13/cobra"
Expand All @@ -57,6 +58,7 @@ Tips:
Run: func(cmd *cobra.Command, args []string) {
opt := getOptions(cmd)
runtime.GOMAXPROCS(opt.NumCPUs)
sorts.MaxProcs = opt.NumCPUs

outDir := getFlagString(cmd, "out-dir")
force := getFlagBool(cmd, "force")
Expand Down Expand Up @@ -300,12 +302,14 @@ Tips:
if opt.Verbose {
log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(mt))
}
sort.Sort(unikmer.CodeTaxidSlice(mt))
// sort.Sort(unikmer.CodeTaxidSlice(mt))
sorts.Quicksort(unikmer.CodeTaxidSlice(mt))
} else {
if opt.Verbose {
log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(m))
}
sort.Sort(unikmer.CodeSlice(m))
// sort.Sort(unikmer.CodeSlice(m))
sortutil.Uint64s(m)
}

var _n int64
Expand Down Expand Up @@ -376,7 +380,8 @@ Tips:
if opt.Verbose {
log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(m))
}
sort.Sort(unikmer.CodeSlice(m))
// sort.Sort(unikmer.CodeSlice(m))
sortutil.Uint64s(m)
if opt.Verbose {
log.Infof("[chunk %d] done sorting", iTmpFile)
log.Infof("[chunk %d] writing to file: %s", iTmpFile, outFile)
Expand Down
10 changes: 7 additions & 3 deletions unikmer/cmd/union.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@ import (
"io"
"os"
"runtime"
"sort"

"github.com/pkg/errors"
"github.com/shenwei356/unikmer"
"github.com/spf13/cobra"
"github.com/twotwotwo/sorts"
"github.com/twotwotwo/sorts/sortutil"
)

var unionCmd = &cobra.Command{
Expand All @@ -51,6 +52,7 @@ Tips:
Run: func(cmd *cobra.Command, args []string) {
opt := getOptions(cmd)
runtime.GOMAXPROCS(opt.NumCPUs)
sorts.MaxProcs = opt.NumCPUs

var err error

Expand Down Expand Up @@ -272,7 +274,8 @@ Tips:
if opt.Verbose {
log.Infof("sorting %d k-mers", len(codes))
}
sort.Sort(unikmer.CodeSlice(codes))
// sort.Sort(unikmer.CodeSlice(codes))
sortutil.Uint64s(codes)
if opt.Verbose {
log.Infof("done sorting")
}
Expand All @@ -292,7 +295,8 @@ Tips:
if opt.Verbose {
log.Infof("sorting %d k-mers", len(codes))
}
sort.Sort(unikmer.CodeSlice(codes))
// sort.Sort(unikmer.CodeSlice(codes))
sortutil.Uint64s(codes)
if opt.Verbose {
log.Infof("done sorting")
}
Expand Down

0 comments on commit a0f6ac1

Please sign in to comment.