From a0f6ac13c339b5268b8b3e6269c7a9eb809010bf Mon Sep 17 00:00:00 2001 From: Wei Shen Date: Fri, 23 Oct 2020 20:12:23 +0800 Subject: [PATCH] faster sorting --- CHANGES.md | 3 ++- unikmer/cmd/common.go | 7 +++++-- unikmer/cmd/count.go | 7 +++++-- unikmer/cmd/diff.go | 7 +++++-- unikmer/cmd/grep.go | 16 +++++++++++----- unikmer/cmd/rfilter.go | 2 ++ unikmer/cmd/sort.go | 22 +++++++++++++++------- unikmer/cmd/split.go | 13 +++++++++---- unikmer/cmd/union.go | 10 +++++++--- 9 files changed, 61 insertions(+), 26 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 396e284..6ce86aa 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,7 +1,8 @@ - v0.13.0 - new command `unikmer common`: Finding k-mers shared by most of multiple binary files. + - `unikmer common/count/diff/grep/rfilter/sort/split/union`: faster sorting. - `unikmer uniqs`: better result for flag `--circular`. - - `unikmer search`: fix a bug when searching on database with >1 hashes. + - `unikmer search`: fix a bug when searching on database with more than one hash. - v0.12.0 - `unikmer`: - support longer k (k>32) by saving ntHash. diff --git a/unikmer/cmd/common.go b/unikmer/cmd/common.go index fec9389..effb652 100644 --- a/unikmer/cmd/common.go +++ b/unikmer/cmd/common.go @@ -26,11 +26,12 @@ import ( "io" "os" "runtime" - "sort" "github.com/pkg/errors" "github.com/shenwei356/unikmer" "github.com/spf13/cobra" + "github.com/twotwotwo/sorts" + "github.com/twotwotwo/sorts/sortutil" ) var commonCmd = &cobra.Command{ @@ -57,6 +58,7 @@ Tips: Run: func(cmd *cobra.Command, args []string) { opt := getOptions(cmd) runtime.GOMAXPROCS(opt.NumCPUs) + sorts.MaxProcs = opt.NumCPUs var err error @@ -339,7 +341,8 @@ Tips: log.Infof("no shared k-mers found") } - sort.Sort(unikmer.CodeSlice(codes)) + // sort.Sort(unikmer.CodeSlice(codes)) + sortutil.Uint64s(codes) if hasTaxid || hasMixTaxid { for _, code := range codes { diff --git a/unikmer/cmd/count.go b/unikmer/cmd/count.go index 8661f72..6a2d626 100644 --- a/unikmer/cmd/count.go +++ b/unikmer/cmd/count.go @@ -25,7 +25,6 @@ import ( "io" "regexp" "runtime" - "sort" "strconv" "github.com/pkg/errors" @@ -33,6 +32,8 @@ import ( "github.com/shenwei356/bio/seqio/fastx" "github.com/shenwei356/unikmer" "github.com/spf13/cobra" + "github.com/twotwotwo/sorts" + "github.com/twotwotwo/sorts/sortutil" ) var countCmd = &cobra.Command{ @@ -44,6 +45,7 @@ var countCmd = &cobra.Command{ Run: func(cmd *cobra.Command, args []string) { opt := getOptions(cmd) runtime.GOMAXPROCS(opt.NumCPUs) + sorts.MaxProcs = opt.NumCPUs seq.ValidateSeq = false var err error @@ -410,7 +412,8 @@ var countCmd = &cobra.Command{ if opt.Verbose { log.Infof("sorting %d k-mers", len(codes)) } - sort.Sort(unikmer.CodeSlice(codes)) + // sort.Sort(unikmer.CodeSlice(codes)) + sortutil.Uint64s(codes) if opt.Verbose { log.Infof("done sorting") } diff --git a/unikmer/cmd/diff.go b/unikmer/cmd/diff.go index d55acf8..b7294c8 100644 --- a/unikmer/cmd/diff.go +++ b/unikmer/cmd/diff.go @@ -26,12 +26,13 @@ import ( "io" "os" "runtime" - "sort" "sync" "github.com/pkg/errors" "github.com/shenwei356/unikmer" "github.com/spf13/cobra" + "github.com/twotwotwo/sorts" + "github.com/twotwotwo/sorts/sortutil" ) var diffCmd = &cobra.Command{ @@ -81,6 +82,7 @@ Tips: threads := opt.NumCPUs runtime.GOMAXPROCS(threads) + sorts.MaxProcs = opt.NumCPUs mc := make([]unikmer.CodeTaxid, 0, mapInitSize) @@ -585,7 +587,8 @@ Tips: if opt.Verbose { log.Infof("sorting %d k-mers", len(codes)) } - sort.Sort(unikmer.CodeSlice(codes)) + // sort.Sort(unikmer.CodeSlice(codes)) + sortutil.Uint64s(codes) if opt.Verbose { log.Infof("done sorting") } diff --git a/unikmer/cmd/grep.go b/unikmer/cmd/grep.go index b5756bf..d57185f 100644 --- a/unikmer/cmd/grep.go +++ b/unikmer/cmd/grep.go @@ -27,7 +27,6 @@ import ( "os" "path/filepath" "runtime" - "sort" "strconv" "sync" @@ -37,6 +36,8 @@ import ( "github.com/shenwei356/unikmer" "github.com/shenwei356/util/pathutil" "github.com/spf13/cobra" + "github.com/twotwotwo/sorts" + "github.com/twotwotwo/sorts/sortutil" ) var grepCmd = &cobra.Command{ @@ -58,6 +59,7 @@ Tips: Run: func(cmd *cobra.Command, args []string) { opt := getOptions(cmd) runtime.GOMAXPROCS(opt.NumCPUs) + sorts.MaxProcs = opt.NumCPUs var err error @@ -681,12 +683,14 @@ Tips: if opt.Verbose { log.Infof("[file %d/%d] sorting %d k-mers", i+1, nfiles, len(_codesTaxids)) } - sort.Sort(unikmer.CodeTaxidSlice(_codesTaxids)) + // sort.Sort(unikmer.CodeTaxidSlice(_codesTaxids)) + sorts.Quicksort(unikmer.CodeTaxidSlice(_codesTaxids)) } else { if opt.Verbose { log.Infof("[file %d/%d] sorting %d k-mers", i+1, nfiles, len(_codes)) } - sort.Sort(unikmer.CodeSlice(_codes)) + // sort.Sort(unikmer.CodeSlice(_codes)) + sortutil.Uint64s(_codes) } if opt.Verbose { @@ -787,12 +791,14 @@ Tips: if opt.Verbose { log.Infof("sorting %d k-mers", len(codesTaxids)) } - sort.Sort(unikmer.CodeTaxidSlice(codesTaxids)) + // sort.Sort(unikmer.CodeTaxidSlice(codesTaxids)) + sorts.Quicksort(unikmer.CodeTaxidSlice(codesTaxids)) } else { if opt.Verbose { log.Infof("sorting %d k-mers", len(codes)) } - sort.Sort(unikmer.CodeSlice(codes)) + // sort.Sort(unikmer.CodeSlice(codes)) + sortutil.Uint64s(codes) } if opt.Verbose { log.Infof("done sorting") diff --git a/unikmer/cmd/rfilter.go b/unikmer/cmd/rfilter.go index fef71dc..c4c4f4f 100644 --- a/unikmer/cmd/rfilter.go +++ b/unikmer/cmd/rfilter.go @@ -36,6 +36,7 @@ import ( "github.com/shenwei356/util/pathutil" "github.com/shenwei356/util/stringutil" "github.com/spf13/cobra" + "github.com/twotwotwo/sorts" ) // rfilterCmd represents @@ -61,6 +62,7 @@ Rank file: Run: func(cmd *cobra.Command, args []string) { opt := getOptions(cmd) runtime.GOMAXPROCS(opt.NumCPUs) + sorts.MaxProcs = opt.NumCPUs var err error diff --git a/unikmer/cmd/sort.go b/unikmer/cmd/sort.go index a756609..9c62c0f 100644 --- a/unikmer/cmd/sort.go +++ b/unikmer/cmd/sort.go @@ -27,13 +27,14 @@ import ( "os" "path/filepath" "runtime" - "sort" "sync" "github.com/pkg/errors" "github.com/shenwei356/unikmer" "github.com/shenwei356/util/pathutil" "github.com/spf13/cobra" + "github.com/twotwotwo/sorts" + "github.com/twotwotwo/sorts/sortutil" ) var sortCmd = &cobra.Command{ @@ -60,6 +61,7 @@ Tips: Run: func(cmd *cobra.Command, args []string) { opt := getOptions(cmd) runtime.GOMAXPROCS(opt.NumCPUs) + sorts.MaxProcs = opt.NumCPUs outFile0 := getFlagString(cmd, "out-prefix") unique := getFlagBool(cmd, "unique") @@ -261,12 +263,14 @@ Tips: if opt.Verbose { log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(mt)) } - sort.Sort(unikmer.CodeTaxidSlice(mt)) + // sort.Sort(unikmer.CodeTaxidSlice(mt)) + sorts.Quicksort(unikmer.CodeTaxidSlice(mt)) } else { if opt.Verbose { log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(m)) } - sort.Sort(unikmer.CodeSlice(m)) + // sort.Sort(unikmer.CodeSlice(m)) + sortutil.Uint64s(m) } if opt.Verbose { log.Infof("[chunk %d] done sorting", iTmpFile) @@ -322,12 +326,14 @@ Tips: if opt.Verbose { log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(mt)) } - sort.Sort(unikmer.CodeTaxidSlice(mt)) + // sort.Sort(unikmer.CodeTaxidSlice(mt)) + sorts.Quicksort(unikmer.CodeTaxidSlice(mt)) } else { if opt.Verbose { log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(m)) } - sort.Sort(unikmer.CodeSlice(m)) + // sort.Sort(unikmer.CodeSlice(m)) + sortutil.Uint64s(m) } if opt.Verbose { log.Infof("[chunk %d] done sorting", iTmpFile) @@ -446,12 +452,14 @@ Tips: if opt.Verbose { log.Infof("sorting %d k-mers", len(mt)) } - sort.Sort(unikmer.CodeTaxidSlice(mt)) + // sort.Sort(unikmer.CodeTaxidSlice(mt)) + sorts.Quicksort(unikmer.CodeTaxidSlice(mt)) } else { if opt.Verbose { log.Infof("sorting %d k-mers", len(m)) } - sort.Sort(unikmer.CodeSlice(m)) + // sort.Sort(unikmer.CodeSlice(m)) + sortutil.Uint64s(m) } if opt.Verbose { log.Infof("done sorting") diff --git a/unikmer/cmd/split.go b/unikmer/cmd/split.go index 475173c..4313026 100644 --- a/unikmer/cmd/split.go +++ b/unikmer/cmd/split.go @@ -27,11 +27,12 @@ import ( "os" "path/filepath" "runtime" - "sort" "sync" "github.com/pkg/errors" "github.com/shenwei356/util/pathutil" + "github.com/twotwotwo/sorts" + "github.com/twotwotwo/sorts/sortutil" "github.com/shenwei356/unikmer" "github.com/spf13/cobra" @@ -57,6 +58,7 @@ Tips: Run: func(cmd *cobra.Command, args []string) { opt := getOptions(cmd) runtime.GOMAXPROCS(opt.NumCPUs) + sorts.MaxProcs = opt.NumCPUs outDir := getFlagString(cmd, "out-dir") force := getFlagBool(cmd, "force") @@ -300,12 +302,14 @@ Tips: if opt.Verbose { log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(mt)) } - sort.Sort(unikmer.CodeTaxidSlice(mt)) + // sort.Sort(unikmer.CodeTaxidSlice(mt)) + sorts.Quicksort(unikmer.CodeTaxidSlice(mt)) } else { if opt.Verbose { log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(m)) } - sort.Sort(unikmer.CodeSlice(m)) + // sort.Sort(unikmer.CodeSlice(m)) + sortutil.Uint64s(m) } var _n int64 @@ -376,7 +380,8 @@ Tips: if opt.Verbose { log.Infof("[chunk %d] sorting %d k-mers", iTmpFile, len(m)) } - sort.Sort(unikmer.CodeSlice(m)) + // sort.Sort(unikmer.CodeSlice(m)) + sortutil.Uint64s(m) if opt.Verbose { log.Infof("[chunk %d] done sorting", iTmpFile) log.Infof("[chunk %d] writing to file: %s", iTmpFile, outFile) diff --git a/unikmer/cmd/union.go b/unikmer/cmd/union.go index 2ad741a..edd43c5 100644 --- a/unikmer/cmd/union.go +++ b/unikmer/cmd/union.go @@ -26,11 +26,12 @@ import ( "io" "os" "runtime" - "sort" "github.com/pkg/errors" "github.com/shenwei356/unikmer" "github.com/spf13/cobra" + "github.com/twotwotwo/sorts" + "github.com/twotwotwo/sorts/sortutil" ) var unionCmd = &cobra.Command{ @@ -51,6 +52,7 @@ Tips: Run: func(cmd *cobra.Command, args []string) { opt := getOptions(cmd) runtime.GOMAXPROCS(opt.NumCPUs) + sorts.MaxProcs = opt.NumCPUs var err error @@ -272,7 +274,8 @@ Tips: if opt.Verbose { log.Infof("sorting %d k-mers", len(codes)) } - sort.Sort(unikmer.CodeSlice(codes)) + // sort.Sort(unikmer.CodeSlice(codes)) + sortutil.Uint64s(codes) if opt.Verbose { log.Infof("done sorting") } @@ -292,7 +295,8 @@ Tips: if opt.Verbose { log.Infof("sorting %d k-mers", len(codes)) } - sort.Sort(unikmer.CodeSlice(codes)) + // sort.Sort(unikmer.CodeSlice(codes)) + sortutil.Uint64s(codes) if opt.Verbose { log.Infof("done sorting") }