Merge pull request #306 from sanger-tol/main

Main to Dev

DLBPointon authored Jul 12, 2024
2 parents c3ecafe + b7475b7 commit 3f88a62

Showing 18 changed files with 104 additions and 254 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -22,7 +22,7 @@ jobs:
     name: Run pipeline with test data
     # Only run on push if this is the nf-core dev branch (merged PRs)
     if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'sanger-tol/treeval') }}"
-    runs-on: ubuntu-latest
+    runs-on: ubuntu2204-8c
     strategy:
       matrix:
         NXF_VER:
4 changes: 4 additions & 0 deletions LICENSE
@@ -19,3 +19,7 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+The filter_five_end.pl script has been taken from the Arima Mapping Pipeline, has not been modified, and is subject to the below license:
+
+Copyright (c) 2017 Arima Genomics, Inc.
89 changes: 71 additions & 18 deletions bin/generate_cram_csv.sh
@@ -1,29 +1,82 @@
 #!/bin/bash
-cram_path=$1
-chunkn=0
-for cram in ${cram_path}/*.cram; do
-    rgline=$(samtools view -H $cram|grep "RG"|sed 's/\t/\\t/g'|sed "s/'//g")
 
-    crampath=$(readlink -f ${cram})
+# generate_cram_csv.sh
+# -------------------
+# Generate a csv file describing the CRAM folder
+# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°>
+# Author = yy5
+# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°>
 
-    ncontainers=$(zcat ${crampath}.crai|wc -l)
-    base=$(basename $cram .cram)
+# Function to process chunking of a CRAM file
+chunk_cram() {
+    local cram=$1
+    local chunkn=$2
+    local outcsv=$3
+    realcram=$(readlink -f ${cram})
+    realcrai=$(readlink -f ${cram}.crai)
+    local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g")
+    local ncontainers=$(zcat "${realcrai}" | wc -l)
+    local base=$(basename "${realcram}" .cram)
+    local from=0
+    local to=10000
 
-    from=0
-    to=10000
-
 
-    while [ $to -lt $ncontainers ]
-    do
-        echo $crampath,${crampath}.crai,${from},${to},${base},${chunkn},${rgline}
-        from=$((to+1))
-        ((to+=10000))
+    while [ $to -lt $ncontainers ]; do
+        echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
+        from=$((to + 1))
+        ((to += 10000))
         ((chunkn++))
     done
 
-    if [ $from -le $ncontainers ]
-    then
-        echo $crampath,${crampath}.crai,${from},${ncontainers},${base},${chunkn},${rgline}
+    if [ $from -le $ncontainers ]; then
+        echo "${realcram},${realcrai},${from},${ncontainers},${base},${chunkn},${rgline}" >> $outcsv
         ((chunkn++))
     fi
+
+    echo $chunkn
+}
+
+# Function to process a CRAM file
+process_cram_file() {
+    local cram=$1
+    local chunkn=$2
+    local outcsv=$3
+
+    local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}')
+    local num_read_groups=$(echo "$read_groups" | wc -w)
+
+    if [ "$num_read_groups" -gt 1 ]; then
+        # Multiple read groups: process each separately
+        for rg in $read_groups; do
+            local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram"
+            samtools view -h -r "$rg" -o "$output_cram" "$cram"
+            samtools index "$output_cram"
+            chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv")
+        done
+    else
+        # Single read group or no read groups
+        chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv")
+    fi
+
+    echo $chunkn
+}
+
+#  /\_/\           /\_/\
+# ( o.o )  main   ( o.o )
+#  > ^ <           > ^ <
+
+# Check if cram_path is provided
+if [ -z "$1" ]; then
+    echo "Usage: $0 <cram_path> <output_csv>"
+    exit 1
+fi
+
+cram_path=$1
+chunkn=0
+outcsv=$2
+
+# Loop through each CRAM file in the specified directory. cram cannot be the symlinked cram
+for cram in ${cram_path}/*.cram; do
+    realcram=$(readlink -f $cram)
+    chunkn=$(process_cram_file $realcram $chunkn $outcsv)
 done
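
For reference, a minimal sketch of how the updated script is invoked; the directory and file names here are illustrative, not taken from the commit:

    # Each CRAM needs a .crai index alongside it, since chunk_cram counts
    # containers with: zcat <cram>.crai | wc -l
    samtools index hic_data/sample1.cram
    bash bin/generate_cram_csv.sh hic_data sample_cram.csv

    # Each row of sample_cram.csv has the form:
    # <cram>,<crai>,<from>,<to>,<base>,<chunk_number>,<@RG line>

The CSV is now written by the script itself through the second positional argument, which is why the GENERATE_CRAM_CSV module further down drops its ">>" redirection.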
14 changes: 7 additions & 7 deletions conf/base.config
@@ -128,7 +128,7 @@ process {
 
     withName: '.*:.*:GENE_ALIGNMENT:.*:MINIMAP2_ALIGN' {
         cpus = { check_max( 6 * task.attempt, 'cpus' ) }
-        memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 64 : Math.ceil( ( reference.size() / 1e+9 ) * 45 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
+        memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 70 : Math.ceil( ( reference.size() / 1e+9 ) * 50 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
         time = { check_max( 10.h * task.attempt, 'time' ) }
     }

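As a worked example of the updated GENE_ALIGNMENT memory rule, take a hypothetical 3e9-byte (3 Gb) reference at task.attempt = 1:

    memory = 1.GB * ceil( (3e9 / 1e9) * 50 ) * ceil( 1 * 1 )
           = 1.GB * 150
           = 150.GB

References under 2e9 bytes now get a flat 70 GB rather than the previous 64 GB.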
@@ -142,7 +142,7 @@ process {
     withName: '.*:.*:READ_COVERAGE:MINIMAP2_ALIGN' {
         cpus = { check_max( 20 * 1, 'cpus' ) }
         memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
-        time = { check_max( 1.h * ( reference.size() < 1e9 ? 10 : reference.size() < 10e9 ? 30 : 60), 'time' ) }
+        time = { check_max( 1.h * ( reference.size() < 1e9 ? 10 : reference.size() < 10e9 ? 30 : 48), 'time' ) }
     }
 
     withName: '.*:.*:READ_COVERAGE:BEDTOOLS_GENOMECOV' {
@@ -177,13 +177,13 @@ process {
     }
 
     withName: PRETEXTMAP_STANDRD{
-        cpus = { check_max( 16 * 1, 'cpus' ) }
+        cpus = { check_max( 8 * 1, 'cpus' ) }
         memory = { check_max( 3.GB * task.attempt, 'memory' ) }
     }
 
     withName: PRETEXTMAP_HIGHRES {
-        cpus = { check_max( 20 * task.attempt, 'cpus' ) }
-        memory = { check_max( 6.GB * Math.ceil( task.attempt * 2.6 ), 'memory' ) }
+        cpus = { check_max( 6 * task.attempt, 'cpus' ) }
+        memory = { check_max( 20.GB * Math.ceil( task.attempt * 2.6 ), 'memory' ) }
     }
 
     withName: PRETEXT_GRAPH {
@@ -207,7 +207,7 @@ process {
     // add a cpus 16 if bam.size() >= 50GB
     withName: BAMTOBED_SORT {
         cpus = { check_max( 12 * 1, 'cpus' ) }
-        memory = { check_max( 2.GB * Math.ceil( bam.size() / 1e+9 ) * task.attempt, 'memory' ) }
+        memory = { check_max( 3.GB * Math.ceil( bam.size() / 1e+9 ) * task.attempt, 'memory' ) }
     }
 
     withName: SAMTOOLS_MARKDUP {
@@ -240,7 +240,7 @@ process {
 
     // Large Genomes > 4Gb
     withName: FASTK_FASTK {
-        cpus = { check_max( 25 * task.attempt, 'cpus' ) }
+        cpus = { check_max( 8 * task.attempt, 'cpus' ) }
         memory = { check_max( 100.GB * task.attempt, 'memory' ) }
     }
 
37 changes: 10 additions & 27 deletions conf/modules.config
@@ -21,7 +21,7 @@ process {
 
     // Files to be uploaded to the TreeVal JBrowse2 instance
     // .genome, .gz.{tbi|csi}, .bigBed, .bigWig, .paf
-    withName: "GENERATE_GENOME_FILE|TABIX_BGZIPTABIX|UCSC_BEDTOBIGBED|UCSC_BEDGRAPHTOBIGWIG|BED2BW_NORMAL|BED2BW_AVGCOV|.*:.*:SYNTENY:MINIMAP2_ALIGN|.*:.*:.*:GENERATE_SORTED_GENOME:GNU_SORT|.*:.*:.*:GENERATE_UNSORTED_GENOME:CUSTOM_GETCHROMSIZES" {
+    withName: "GENERATE_GENOME|TABIX_BGZIPTABIX|UCSC_BEDTOBIGBED|UCSC_BEDGRAPHTOBIGWIG|BED2BW_NORMAL|BED2BW_AVGCOV|.*:.*:SYNTENY:MINIMAP2_ALIGN|.*:.*:.*:GENERATE_SORTED_GENOME:GNU_SORT|.*:.*:.*:GENERATE_UNSORTED_GENOME:CUSTOM_GETCHROMSIZES" {
         publishDir = [
             path: { "${params.outdir}/treeval_upload" },
             mode: params.publish_dir_mode,
@@ -40,7 +40,7 @@ process {
     }
 
     // Files to be used for pretext, likely to be deleted once the hic workflow is complete.
-    // .bed, .hr.pretext, .lr.pretext, needs centromere}
+    // .bed, .hr.pretext, .lr.pretext, needs centromere
     withName: 'REFORMAT_INTERSECT|SEQTK_CUTN|GAP_LENGTH|PRETEXT_INGEST_HIRES|PRETEXT_INGEST_SNDRD|COOLER_ZOOMIFY|COV_FOLDER|UCSC_BEDGRAPHTOBIGWIG|BED2BW_NORMAL|BED2BW_AVGCOV|EXTRACT_TELO|JUICER_TOOLS_PRE|SNAPSHOT_SRES|PRETEXT_GRAPH' {
         publishDir = [
             path: { "${params.outdir}/hic_files" },
@@ -186,6 +186,7 @@ process {
 
     withName: ".*:.*:.*:(GEN_ALIGNMENTS|RNA_ALIGNMENTS|CDS_ALIGNMENTS):MINIMAP2_ALIGN" {
         ext.args = {"-x splice ${meta.intron_size ? "-G ${meta.intron_size}" : "" } ${reference.size() > 2.5e9 ? (" -I " + Math.ceil(reference.size()/1e9)+"G") : ""} "}
+        ext.args2 = { "-T ${meta.id}_tmp" }
         ext.prefix = { "${meta.id}_alignment_${reference.getName().tokenize(".")[0]}" }
     }

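A note on the two ext.args2 additions in this file: in the nf-core MINIMAP2_ALIGN module, ext.args2 is forwarded to the samtools sort step that produces the sorted BAM (an assumption about the module's internals, which are not shown in this diff). For a hypothetical meta.id of "sample1", the rendered sort call would look roughly like:

    # temp files now get a per-sample prefix instead of the default,
    # avoiding collisions when several tasks sort in one work directory
    samtools sort -T sample1_tmp -o sample1_alignment_ref.sorted.bam -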
@@ -228,6 +229,7 @@ process {
     //
     withName: ".*:.*:SYNTENY:MINIMAP2_ALIGN" {
         ext.args = "-t 8 -x asm10"
+        ext.args2 = { "-T ${meta.id}_tmp" }
         ext.prefix = { "${meta.id}_synteny_${reference.getName().tokenize(".")[0]}" }
     }
 
@@ -237,7 +239,7 @@ process {
     //
     withName: ".*:.*:READ_COVERAGE:MINIMAP2_ALIGN" {
         ext.args = {"-x ${meta.readtype.equals("hifi") ? "map-hifi" : meta.readtype.equals("clr") ? "map-pb" : meta.readtype.equals("ont") ? "map-ont" : meta.readtype.equals("illumina") ? "sr" : ""} -N 1 -c ${reference.size() > 2.5e9 ? (" -I" + Math.ceil(reference.size()/1e9)+"G") : ""}" }
-        ext.prefix = { "${meta.id}_alignment_${reads.getName().tokenize('.')[0]}" }
+        ext.prefix = { "${meta.id}_alignment_${reads.getName().split('.fasta.gz|.fa.gz')[0]}" }
     }
 
     withName: ".*:.*:READ_COVERAGE:BEDTOOLS_GENOMECOV" {
@@ -270,7 +272,7 @@ process {
     }
 
     withName: ".*:.*:READ_COVERAGE:BED2BW_AVGCOV" {
-        ext.prefix = { "${meta.id}_coverage_avg" }
+        ext.prefix = { "${meta.id}_coverage_avg" }
     }
 
     //
@@ -315,12 +317,12 @@ process {
     // normal = standard run, pi = "pre-ingestion", hr = High res
     //
     withName: PRETEXTMAP_STANDRD {
-        ext.args = { "${meta.map_order.equals("length") ? "--sortby length": "--sortby nosort" } --mapq 0 --memory ${task.memory.giga}G" }
+        ext.args = { "${meta.map_order.equals("length") ? "--sortby length": "--sortby nosort" } --mapq 0 --memory ${task.memory.giga}" }
         ext.prefix = { "${meta.id}_normal_pi" }
     }
 
     withName: PRETEXTMAP_HIGHRES {
-        ext.args = { "${meta.map_order.equals("length") ? "--sortby length": "--sortby nosort" } --highRes --mapq 0 --memory ${task.memory.giga}G" }
+        ext.args = { "${meta.map_order.equals("length") ? "--sortby length": "--sortby nosort" } --highRes --mapq 0" }
         ext.prefix = { "${meta.id}_hr_pi" }
     }
 
@@ -401,26 +403,7 @@ process {
         ext.args = "-P."
     }
 
-    //
-    // SUBWORKFLOW: KMER_HAPLODUPE
-    //
-    withName: ".*:.*:KMER_READ_COVERAGE:CAT_CAT" {
-        ext.prefix = { "${meta.id}.fasta.gz" }
-    }
-
-    withName: ".*:.*:KMER_READ_COVERAGE:FASTK_FASTK" {
-        ext.args = { "-k${meta.kmer} -t -P." }
-    }
-
-    withName: ".*:.*:KMER_READ_COVERAGE:FKUTILS_FKPROF" {
-        ext.args = { "-f0.2 -o${meta.id}_k${meta2.kmer}_read.bed" }
-    }
-
-    withName: ".*:.*:KMER_READ_COVERAGE:GNU_SORT" {
-        ext.args = { "-k1,1 -k2,2n -S${task.memory.mega - 100}M -T ." }
-    }
-
-    withName: ".*:.*:KMER_READ_COVERAGE:UCSC_BEDGRAPHTOBIGWIG" {
-        ext.prefix = { "${meta.id}_coverage_kmer" }
+    withName: AVGCOV {
+        ext.args = "-T ./"
     }
 }
2 changes: 1 addition & 1 deletion conf/test_github.config
@@ -30,6 +30,6 @@ params {
 
 process {
     withName: PRETEXTMAP_HIGHRES {
-        memory = { check_max( 3.GB , 'memory' ) }
+        memory = { check_max( 8.GB , 'memory' ) }
     }
 }
2 changes: 1 addition & 1 deletion modules/local/avgcov.nf
@@ -22,7 +22,7 @@ process AVGCOV {
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "avgcov"
     """
-    get_avgcov.sh $bedfile $genomefile ${prefix}.bed
+    get_avgcov.sh $bedfile $genomefile ${prefix}.bed $args
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
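Combined with the new withName: AVGCOV { ext.args = "-T ./" } block in conf/modules.config above, the rendered command becomes something like the line below (input names are hypothetical, and it is an assumption that get_avgcov.sh passes the flag on to an internal sort so its temporary files land in the task work directory):

    get_avgcov.sh input.bed my.genome avgcov.bed -T ./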
2 changes: 1 addition & 1 deletion modules/local/generate_cram_csv.nf
@@ -15,7 +15,7 @@ process GENERATE_CRAM_CSV {
     script:
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
-    generate_cram_csv.sh $crampath >> ${prefix}_cram.csv
+    generate_cram_csv.sh $crampath ${prefix}_cram.csv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
1 change: 0 additions & 1 deletion modules/local/selfcomp_splitfasta.nf
@@ -49,4 +49,3 @@ process SELFCOMP_SPLITFASTA {
     END_VERSIONS
     """
 }
-
2 changes: 1 addition & 1 deletion nextflow.config
@@ -195,7 +195,7 @@ manifest {
     description = """A pipeline to generate supplemental data for genome curation"""
     mainScript = 'main.nf'
     nextflowVersion = '!>=22.10.1'
-    version = '1.1.0'
+    version = '1.1.1'
     doi = '10.5281/zenodo.10047653'
 }
 
1 change: 1 addition & 0 deletions subworkflows/local/generate_genome.nf
@@ -73,5 +73,6 @@ workflow GENERATE_GENOME {
         max_scaff_size = GET_LARGEST_SCAFF.out.scaff_size.toInteger()
         dot_genome = ch_genomesize
         ref_index = ch_genome_fai
+        ref = reference_file
         versions = ch_versions.ifEmpty(null)
 }
3 changes: 2 additions & 1 deletion subworkflows/local/hic_mapping.nf
@@ -43,6 +43,7 @@ workflow HIC_MAPPING {
     // COMMENT: 1000bp BIN SIZE INTERVALS FOR CLOAD
     ch_cool_bin = Channel.of( 1000 )
 
+
     //
     // LOGIC: make channel of hic reads as input for GENERATE_CRAM_CSV
     //
@@ -340,7 +341,7 @@ workflow HIC_MAPPING {
     // LOGIC: FOR REPORTING
     //
 
-    ch_cram_files = GrabFiles( get_reads_input )
+    ch_cram_files = GrabFiles( hic_reads_path )
 
     ch_cram_files
         .collect()