Merge pull request #84 from sanger-tol/dev

Release 1.2
sanger-tol · Dec 19, 2023 · 91efc18 · 91efc18
2 parents c5d5612 + db5c8b9
commit 91efc18
Show file tree

Hide file tree

Showing 58 changed files with 690 additions and 445 deletions.
diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml
@@ -8,21 +8,21 @@ jobs:
     # Only run if comment is on a PR with the main repo, and if it contains the magic keywords
     if: >
       contains(github.event.comment.html_url, '/pull/') &&
-      contains(github.event.comment.body, '@nf-core-bot fix linting') &&
+      contains(github.event.comment.body, '@sanger-tolsoft fix linting') &&
       github.repository == 'sanger-tol/readmapping'
     runs-on: ubuntu-latest
     steps:
-      # Use the @nf-core-bot token to check out so we can push later
+      # Use the @sanger-tolsoft token to check out so we can push later
       - uses: actions/checkout@v3
         with:
-          token: ${{ secrets.nf_core_bot_auth_token }}
+          token: ${{ secrets.sangertolsoft_access_token }}
 
       # Action runs on the issue comment, so we don't get the PR by default
       # Use the gh cli to check out the PR
       - name: Checkout Pull Request
         run: gh pr checkout ${{ github.event.issue.number }}
         env:
-          GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }}
+          GITHUB_TOKEN: ${{ secrets.sangertolsoft_access_token }}
 
       - uses: actions/setup-node@v3
 
@@ -34,9 +34,9 @@ jobs:
         id: prettier_status
         run: |
           if prettier --check ${GITHUB_WORKSPACE}; then
-            echo "name=result::pass" >> $GITHUB_OUTPUT
+            echo "result=pass" >> $GITHUB_OUTPUT
           else
-            echo "name=result::fail" >> $GITHUB_OUTPUT
+            echo "result=fail" >> $GITHUB_OUTPUT
           fi
 
       - name: Run 'prettier --write'
@@ -46,8 +46,8 @@ jobs:
       - name: Commit & push changes
         if: steps.prettier_status.outputs.result == 'fail'
         run: |
-          git config user.email "[email protected]"
-          git config user.name "nf-core-bot"
+          git config user.email "[email protected]"
+          git config user.name "sanger-tolsoft"
           git config push.default upstream
           git add .
           git status

diff --git a/.github/workflows/sangertest.yml → .github/workflows/sanger_test.yml b/.github/workflows/sangertest.yml → .github/workflows/sanger_test.yml
@@ -8,16 +8,22 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Launch workflow via tower
-        uses: nf-core/tower-action@v2
+        uses: seqeralabs/action-tower-launch@v2
         with:
           workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
           access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
           compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
-          pipeline: ${{ github.repository }}
           revision: ${{ github.sha }}
           workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ github.sha }}
           parameters: |
             {
               "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}",
             }
-          profiles: test,sanger,singularity
+          profiles: test,sanger,singularity,cleanup
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: |
+            tower_action_*.log
+            tower_action_*.json
diff --git a/.github/workflows/sangerfulltest.yml → .github/workflows/sanger_test_full.yml b/.github/workflows/sangerfulltest.yml → .github/workflows/sanger_test_full.yml
@@ -22,16 +22,22 @@ jobs:
         if: github.event_name == 'workflow_dispatch'
 
       - name: Launch workflow via tower
-        uses: nf-core/tower-action@v2
+        uses: seqeralabs/action-tower-launch@v2
         with:
           workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
           access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
           compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
-          pipeline: ${{ github.repository }}
           revision: ${{ env.REVISION }}
           workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
           parameters: |
             {
               "outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
             }
-          profiles: test_full,sanger,singularity
+          profiles: test_full,sanger,singularity,cleanup
+
+      - uses: actions/upload-artifact@v3
+        with:
+          name: Tower debug log file
+          path: |
+            tower_action_*.log
+            tower_action_*.json
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,44 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[1.2.0](https://github.com/sanger-tol/readmapping/releases/tag/1.2.0)] – Norwegian Ridgeback - [2023-12-19]
+
+### Enhancements & fixes
+
+- Restored recording read-groups (`@RG`) in the BAM/CRAM files.
+- Updated the CI procedure to use "sanger-tol" rather than "nf-core" names.
+- [crumble](https://github.com/jkbonfield/crumble) now used to compress the
+  PacBio HiFi alignments.
+- Execution statistics now under `pipeline_info/readmapping/` (to be consistent
+  with the other sanger-tol pipelines).
+- All resource requirements (memory, time, CPUs) now fit the actual usage. This
+  is achieved by automatically adjusting to the size of the input whenever
+  possible.
+- Added the `--use_work_dir_as_temp` parameter to make SAMTOOLS_COLLATE use its
+  work directory for temporary files instead of `$TMPDIR`. It can be used to avoid
+  leaving unwanted temporary files on an HPC.
+
+### Parameters
+
+| Old parameter | New parameter            |
+| ------------- | ------------------------ |
+|               | `--use_work_dir_as_temp` |
+
+> **NB:** Parameter has been **updated** if both old and new parameter information is present. <br/> **NB:** Parameter has been **added** if just the new parameter information is present. <br/> **NB:** Parameter has been **removed** if new parameter information isn't present.
+
+### Software dependencies
+
+Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference.
+
+| Dependency | Old version     | New version   |
+| ---------- | --------------- | ------------- |
+| `blast`    | 2.12.0          | 2.13.0        |
+| `crumble`  |                 | 0.9.1         |
+| `samtools` | 1.14 and 1.16.1 | 1.14 and 1.17 |
+| `multiqc`  | 1.13            | 1.14          |
+
+> **NB:** Dependency has been **updated** if both old and new version information is present. <br/> **NB:** Dependency has been **added** if just the new version information is present. <br/> **NB:** Dependency has been **removed** if new version information isn't present.
+
 ## [[1.1.0](https://github.com/sanger-tol/readmapping/releases/tag/1.1.0)] – Hebridean Black - [2023-03-16]
 
 ### Enhancements & fixes

diff --git a/conf/base.config b/conf/base.config
@@ -2,64 +2,135 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     sanger-tol/readmapping Nextflow base config file
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    A 'blank slate' config file, appropriate for general use on most high performance
-    compute environments. Assumes that all software is installed and available on
-    the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
-----------------------------------------------------------------------------------------
 */
 
-process {
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Increasing the number of CPUs often gives diminishing returns, so we increase it
+    following a logarithmic curve. Example (with base = 2):
+        - 0 < value <= 1: start + step
+        - 1 < value <= 2: start + 2*step
+        - 2 < value <= 4: start + 3*step
+        - 4 < value <= 8: start + 4*step
+    In order to support re-runs, the step increase may be multiplied by the attempt
+    number prior to calling this function.
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
 
-    cpus   = { check_max( 1    * task.attempt, 'cpus'   ) }
-    memory = { check_max( 6.GB * task.attempt, 'memory' ) }
-    time   = { check_max( 4.h  * task.attempt, 'time'   ) }
+// Logarithm of `value` in base `base`, clamped so that it is never negative:
+// any `value` at or below 1 yields 0 instead of a zero-or-negative logarithm.
+def positive_log(value, base) {
+    // Decide first, so that Math.log() is only evaluated when the guard is false
+    def at_or_below_one = value <= 1
+    // Change of base: log_b(x) = ln(x) / ln(b)
+    return at_or_below_one ? 0 : Math.log(value)/Math.log(base)
+}
+
+def log_increase_cpus(start, step, value, base) {
+    check_max(start + step * (Math.ceil(positive_log(value, base)) + 1), 'cpus')  // last expression is the return value
+}
 
-    errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
-    maxRetries    = 1
+
+process {
+
+    errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
+    maxRetries    = 5
     maxErrors     = '-1'
 
-    // Process-specific resource requirements
-    // NOTE - Please try and re-use the labels below as much as possible.
-    //        These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
-    //        If possible, it would be nice to keep the same label naming convention when
-    //        adding in your local modules too.
-    // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
-    withLabel:process_single {
-        cpus   = { check_max( 1                  , 'cpus'    ) }
-        memory = { check_max( 6.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 4.h  * task.attempt, 'time'    ) }
+    // In this configuration file, we give few resources by default and
+    // explicitly bump them up for some processes.
+    // All rules should still increase resources every attempt to allow the
+    // pipeline to self-heal from MEMLIMIT/RUNLIMIT.
+
+    // Default
+    cpus   = 1
+    memory = { check_max( 50.MB * task.attempt, 'memory' ) }
+    time   = { check_max( 30.min * task.attempt, 'time' ) }
+
+    withName: 'SAMTOOLS_(CONVERT|FILTER)' {
+        time   = { check_max( 1.hour * task.attempt, 'time' ) }
+    }
+
+    withName: 'SAMTOOLS_(FASTA)' {
+        time   = { check_max( 2.hour * task.attempt, 'time' ) }
+    }
+
+    withName: 'SAMTOOLS_(STATS)' {
+        // Actually less than 1 hour for PacBio HiFi data, but confirmed 3 hours for Hi-C
+        time   = { check_max( 4.hour * task.attempt, 'time' ) }
     }
-    withLabel:process_low {
-        cpus   = { check_max( 2     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 12.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 4.h   * task.attempt, 'time'    ) }
+
+    withName: 'SAMTOOLS_(COLLATE|FASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' {
+        time   = { check_max( 8.hour * task.attempt, 'time' ) }
     }
-    withLabel:process_medium {
-        cpus   = { check_max( 6     * task.attempt, 'cpus'    ) }
-        memory = { check_max( 36.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 8.h   * task.attempt, 'time'    ) }
+
+    withName: 'SAMTOOLS_(FLAGSTAT|IDXSTATS)' {
+        memory = { check_max( 250.MB  * task.attempt, 'memory'  ) }
     }
-    withLabel:process_high {
-        cpus   = { check_max( 12    * task.attempt, 'cpus'    ) }
-        memory = { check_max( 72.GB * task.attempt, 'memory'  ) }
-        time   = { check_max( 16.h  * task.attempt, 'time'    ) }
+
+    withName: '.*:ALIGN_(HIFI|HIC|ILLUMINA):.*:SAMTOOLS_(STATS|VIEW)' {
+        memory = { check_max( 1.GB  * task.attempt, 'memory'  ) }
     }
-    withLabel:process_long {
-        time   = { check_max( 20.h  * task.attempt, 'time'    ) }
+    withName: '.*:ALIGN_(CLR|ONT):.*:SAMTOOLS_(STATS|VIEW)' {
+        memory = { check_max( 2.GB  * task.attempt, 'memory'  ) }
     }
-    withLabel:process_high_memory {
-        memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+
+    withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
+        memory = { check_max( 1.GB  * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
     }
-    withLabel:error_ignore {
-        errorStrategy = 'ignore'
+
+    withName: 'SAMTOOLS_SORMADUP' {
+        cpus   = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
+        memory = { check_max( 10.GB + 0.6.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) }
+        time   = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
     }
-    withLabel:error_retry {
-        errorStrategy = 'retry'
-        maxRetries    = 2
+
+    withName: SAMTOOLS_SORT {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
+        // Memory increases by 768M for each thread (rounded up to 800 MB per CPU in the formula below)
+        memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) }
+        time   = { check_max(        8.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) }
     }
-    withName:BWAMEM2_INDEX {
-        memory = { check_max( 1.GB * Math.ceil( 28 * fasta.size() / 1000000000 ) * task.attempt, 'memory' ) }
+
+    withName: BLAST_BLASTN {
+        time   = { check_max(          2.hour  * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time'   ) }
+        memory = { check_max( 100.MB + 20.MB   * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
+        // The tool never seems to use more than 1 core even when given multiple. Sticking to 1 (the default)
     }
+
+    withName: BWAMEM2_INDEX {
+        memory = { check_max( 24.GB  * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'memory' ) }
+        time   = { check_max( 30.min * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'time'   ) }
+        // Not multithreaded
+    }
+
+    withName: BWAMEM2_MEM {
+        // On the first attempt, corresponds to 12 threads as the minimum, 24 threads if 3 billion reads
+        cpus   = { log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2) }
+        // Runtime for 1 billion reads on 12 threads is a function of the logarithm of the genome size
+        // Runtime is considered proportional to the number of reads and inversely to number of threads
+        time   = { check_max( 3.h * task.attempt *  Math.ceil(positive_log(meta2.genome_size/100000, 10)) * Math.ceil(meta.read_count/1000000000) * 12 / log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'time' ) }
+        // Base RAM usage is about 6 times the genome size. Each thread takes an additional 800 MB RAM
+        // Memory usage of SAMTOOLS_VIEW is negligible.
+        memory = { check_max( 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 800.MB * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) }
+    }
+
+    withName: MINIMAP2_ALIGN {
+        cpus   = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) }
+        memory = { check_max( (6.GB * Math.ceil( reference.size()  / 1000000000 ) + 4.GB * Math.ceil( meta.read_count / 1000000 )) * task.attempt, 'memory' ) }
+        time   = { check_max(        3.h  * Math.ceil( meta.read_count   / 1000000   ) * task.attempt, 'time'   ) }
+    }
+
+    withName: CRUMBLE {
+        // No correlation between memory usage and the number of reads or the genome size.
+        // Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB.
+        // The formula below tries to mimic that growth and relies on job retries being allowed.
+        memory = { check_max( task.attempt * (task.attempt + 1) * 512.MB, 'memory' ) }
+        // Slightly better correlation between runtime and the number of reads.
+        time   = { check_max( 1.5.h + 1.h  * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time'   ) }
+    }
+
     withName:CUSTOM_DUMPSOFTWAREVERSIONS {
         cache = false
     }