Merge pull request #306 from sanger-tol/main

Main to Dev

DLBPointon authored Jul 12, 2024
2 parents c3ecafe + b7475b7 commit 3f88a62

Showing 18 changed files with 104 additions and 254 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -22,7 +22,7 @@ jobs:
     name: Run pipeline with test data
     # Only run on push if this is the nf-core dev branch (merged PRs)
     if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'sanger-tol/treeval') }}"
-    runs-on: ubuntu-latest
+    runs-on: ubuntu2204-8c
     strategy:
       matrix:
         NXF_VER:
4 changes: 4 additions & 0 deletions LICENSE
@@ -19,3 +19,7 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+The filter_five_end.pl script has been taken from the Arima Mapping Pipeline, has not been modified, and is subject to the below license:
+
+Copyright (c) 2017 Arima Genomics, Inc.
89 changes: 71 additions & 18 deletions bin/generate_cram_csv.sh
@@ -1,29 +1,82 @@
 #!/bin/bash
-cram_path=$1
-chunkn=0
-for cram in ${cram_path}/*.cram; do
-    rgline=$(samtools view -H $cram|grep "RG"|sed 's/\t/\\t/g'|sed "s/'//g")
 
-    crampath=$(readlink -f ${cram})
+# generate_cram_csv.sh
+# -------------------
+# Generate a csv file describing the CRAM folder
+# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°>
+# Author = yy5
+# ><((((°> Y ><((((°> U ><((((°> M ><((((°> I ><((((°>
 
-    ncontainers=$(zcat ${crampath}.crai|wc -l)
-    base=$(basename $cram .cram)
+# Function to process chunking of a CRAM file
+chunk_cram() {
+    local cram=$1
+    local chunkn=$2
+    local outcsv=$3
+    realcram=$(readlink -f ${cram})
+    realcrai=$(readlink -f ${cram}.crai)
+    local rgline=$(samtools view -H "${realcram}" | grep "@RG" | sed 's/\t/\\t/g' | sed "s/'//g")
+    local ncontainers=$(zcat "${realcrai}" | wc -l)
+    local base=$(basename "${realcram}" .cram)
+    local from=0
+    local to=10000
 
-    from=0
-    to=10000
-
 
-    while [ $to -lt $ncontainers ]
-    do
-        echo $crampath,${crampath}.crai,${from},${to},${base},${chunkn},${rgline}
-        from=$((to+1))
-        ((to+=10000))
+    while [ $to -lt $ncontainers ]; do
+        echo "${realcram},${realcrai},${from},${to},${base},${chunkn},${rgline}" >> $outcsv
+        from=$((to + 1))
+        ((to += 10000))
         ((chunkn++))
     done
 
-    if [ $from -le $ncontainers ]
-    then
-        echo $crampath,${crampath}.crai,${from},${ncontainers},${base},${chunkn},${rgline}
+    if [ $from -le $ncontainers ]; then
+        echo "${realcram},${realcrai},${from},${ncontainers},${base},${chunkn},${rgline}" >> $outcsv
         ((chunkn++))
     fi
+
+    echo $chunkn
+}
+
+# Function to process a CRAM file
+process_cram_file() {
+    local cram=$1
+    local chunkn=$2
+    local outcsv=$3
+
+    local read_groups=$(samtools view -H "$cram" | grep '@RG' | awk '{for(i=1;i<=NF;i++){if($i ~ /^ID:/){print substr($i,4)}}}')
+    local num_read_groups=$(echo "$read_groups" | wc -w)
+
+    if [ "$num_read_groups" -gt 1 ]; then
+        # Multiple read groups: process each separately
+        for rg in $read_groups; do
+            local output_cram="$(basename "${cram%.cram}")_output_${rg}.cram"
+            samtools view -h -r "$rg" -o "$output_cram" "$cram"
+            samtools index "$output_cram"
+            chunkn=$(chunk_cram "$output_cram" "$chunkn" "$outcsv")
+        done
+    else
+        # Single read group or no read groups
+        chunkn=$(chunk_cram "$cram" "$chunkn" "$outcsv")
+    fi
+
+    echo $chunkn
+}
+
+#  /\_/\           /\_/\
+# ( o.o )  main   ( o.o )
+#  > ^ <           > ^ <
+
+# Check if cram_path is provided
+if [ -z "$1" ]; then
+    echo "Usage: $0 <cram_path> <output_csv>"
+    exit 1
+fi
+
+cram_path=$1
+chunkn=0
+outcsv=$2
+
+# Loop through each CRAM file in the specified directory. cram cannot be the symlinked cram
+for cram in ${cram_path}/*.cram; do
+    realcram=$(readlink -f $cram)
+    chunkn=$(process_cram_file $realcram $chunkn $outcsv)
 done
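
For reference, a minimal sketch of how the updated script is invoked; the directory and file names here are illustrative, not taken from the commit:

    # Each CRAM needs a .crai index alongside it, since chunk_cram counts
    # containers with: zcat <cram>.crai | wc -l
    samtools index hic_data/sample1.cram
    bash bin/generate_cram_csv.sh hic_data sample_cram.csv

    # Each row of sample_cram.csv has the form:
    # <cram>,<crai>,<from>,<to>,<base>,<chunk_number>,<@RG line>

The CSV is now written by the script itself through the second positional argument, which is why the GENERATE_CRAM_CSV module further down drops its ">>" redirection.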
14 changes: 7 additions & 7 deletions conf/base.config
@@ -128,7 +128,7 @@ process {
 
     withName: '.*:.*:GENE_ALIGNMENT:.*:MINIMAP2_ALIGN' {
         cpus = { check_max( 6 * task.attempt, 'cpus' ) }
-        memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 64 : Math.ceil( ( reference.size() / 1e+9 ) * 45 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
+        memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 70 : Math.ceil( ( reference.size() / 1e+9 ) * 50 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
         time = { check_max( 10.h * task.attempt, 'time' ) }
     }

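As a worked example of the updated GENE_ALIGNMENT memory rule, take a hypothetical 3e9-byte (3 Gb) reference at task.attempt = 1:

    memory = 1.GB * ceil( (3e9 / 1e9) * 50 ) * ceil( 1 * 1 )
           = 1.GB * 150
           = 150.GB

References under 2e9 bytes now get a flat 70 GB rather than the previous 64 GB.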
@@ -142,7 +142,7 @@ process {
     withName: '.*:.*:READ_COVERAGE:MINIMAP2_ALIGN' {
         cpus = { check_max( 20 * 1, 'cpus' ) }
         memory = { check_max( 1.GB * ( reference.size() < 2e9 ? 50 : Math.ceil( ( reference.size() / 1e+9 ) * 20 ) * Math.ceil( task.attempt * 1 ) ) , 'memory') }
-        time = { check_max( 1.h * ( reference.size() < 1e9 ? 10 : reference.size() < 10e9 ? 30 : 60), 'time' ) }
+        time = { check_max( 1.h * ( reference.size() < 1e9 ? 10 : reference.size() < 10e9 ? 30 : 48), 'time' ) }
     }
 
     withName: '.*:.*:READ_COVERAGE:BEDTOOLS_GENOMECOV' {
@@ -177,13 +177,13 @@ process {
     }
 
     withName: PRETEXTMAP_STANDRD{
-        cpus = { check_max( 16 * 1, 'cpus' ) }
+        cpus = { check_max( 8 * 1, 'cpus' ) }
         memory = { check_max( 3.GB * task.attempt, 'memory' ) }
     }
 
     withName: PRETEXTMAP_HIGHRES {
-        cpus = { check_max( 20 * task.attempt, 'cpus' ) }
-        memory = { check_max( 6.GB * Math.ceil( task.attempt * 2.6 ), 'memory' ) }
+        cpus = { check_max( 6 * task.attempt, 'cpus' ) }
+        memory = { check_max( 20.GB * Math.ceil( task.attempt * 2.6 ), 'memory' ) }
     }
 
     withName: PRETEXT_GRAPH {
@@ -207,7 +207,7 @@ process {
     // add a cpus 16 if bam.size() >= 50GB
     withName: BAMTOBED_SORT {
         cpus = { check_max( 12 * 1, 'cpus' ) }
-        memory = { check_max( 2.GB * Math.ceil( bam.size() / 1e+9 ) * task.attempt, 'memory' ) }
+        memory = { check_max( 3.GB * Math.ceil( bam.size() / 1e+9 ) * task.attempt, 'memory' ) }
     }
 
     withName: SAMTOOLS_MARKDUP {
@@ -240,7 +240,7 @@ process {
 
     // Large Genomes > 4Gb
     withName: FASTK_FASTK {
-        cpus = { check_max( 25 * task.attempt, 'cpus' ) }
+        cpus = { check_max( 8 * task.attempt, 'cpus' ) }
         memory = { check_max( 100.GB * task.attempt, 'memory' ) }
     }
 
37 changes: 10 additions & 27 deletions conf/modules.config
@@ -21,7 +21,7 @@ process {
 
     // Files to be uploaded to the TreeVal JBrowse2 instance
     // .genome, .gz.{tbi|csi}, .bigBed, .bigWig, .paf
-    withName: "GENERATE_GENOME_FILE|TABIX_BGZIPTABIX|UCSC_BEDTOBIGBED|UCSC_BEDGRAPHTOBIGWIG|BED2BW_NORMAL|BED2BW_AVGCOV|.*:.*:SYNTENY:MINIMAP2_ALIGN|.*:.*:.*:GENERATE_SORTED_GENOME:GNU_SORT|.*:.*:.*:GENERATE_UNSORTED_GENOME:CUSTOM_GETCHROMSIZES" {
+    withName: "GENERATE_GENOME|TABIX_BGZIPTABIX|UCSC_BEDTOBIGBED|UCSC_BEDGRAPHTOBIGWIG|BED2BW_NORMAL|BED2BW_AVGCOV|.*:.*:SYNTENY:MINIMAP2_ALIGN|.*:.*:.*:GENERATE_SORTED_GENOME:GNU_SORT|.*:.*:.*:GENERATE_UNSORTED_GENOME:CUSTOM_GETCHROMSIZES" {
         publishDir = [
             path: { "${params.outdir}/treeval_upload" },
             mode: params.publish_dir_mode,
@@ -40,7 +40,7 @@ process {
     }
 
     // Files to be used for pretext, likely to be deleted once the hic workflow is complete.
-    // .bed, .hr.pretext, .lr.pretext, needs centromere}
+    // .bed, .hr.pretext, .lr.pretext, needs centromere
     withName: 'REFORMAT_INTERSECT|SEQTK_CUTN|GAP_LENGTH|PRETEXT_INGEST_HIRES|PRETEXT_INGEST_SNDRD|COOLER_ZOOMIFY|COV_FOLDER|UCSC_BEDGRAPHTOBIGWIG|BED2BW_NORMAL|BED2BW_AVGCOV|EXTRACT_TELO|JUICER_TOOLS_PRE|SNAPSHOT_SRES|PRETEXT_GRAPH' {
         publishDir = [
             path: { "${params.outdir}/hic_files" },
@@ -186,6 +186,7 @@ process {
 
     withName: ".*:.*:.*:(GEN_ALIGNMENTS|RNA_ALIGNMENTS|CDS_ALIGNMENTS):MINIMAP2_ALIGN" {
         ext.args = {"-x splice ${meta.intron_size ? "-G ${meta.intron_size}" : "" } ${reference.size() > 2.5e9 ? (" -I " + Math.ceil(reference.size()/1e9)+"G") : ""} "}
+        ext.args2 = { "-T ${meta.id}_tmp" }
         ext.prefix = { "${meta.id}_alignment_${reference.getName().tokenize(".")[0]}" }
     }

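A note on the two ext.args2 additions in this file: in the nf-core MINIMAP2_ALIGN module, ext.args2 is forwarded to the samtools sort step that produces the sorted BAM (an assumption about the module's internals, which are not shown in this diff). For a hypothetical meta.id of "sample1", the rendered sort call would look roughly like:

    # temp files now get a per-sample prefix instead of the default,
    # avoiding collisions when several tasks sort in one work directory
    samtools sort -T sample1_tmp -o sample1_alignment_ref.sorted.bam -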
@@ -228,6 +229,7 @@ process {
     //
     withName: ".*:.*:SYNTENY:MINIMAP2_ALIGN" {
         ext.args = "-t 8 -x asm10"
+        ext.args2 = { "-T ${meta.id}_tmp" }
         ext.prefix = { "${meta.id}_synteny_${reference.getName().tokenize(".")[0]}" }
     }
 
@@ -237,7 +239,7 @@ process {
     //
     withName: ".*:.*:READ_COVERAGE:MINIMAP2_ALIGN" {
         ext.args = {"-x ${meta.readtype.equals("hifi") ? "map-hifi" : meta.readtype.equals("clr") ? "map-pb" : meta.readtype.equals("ont") ? "map-ont" : meta.readtype.equals("illumina") ? "sr" : ""} -N 1 -c ${reference.size() > 2.5e9 ? (" -I" + Math.ceil(reference.size()/1e9)+"G") : ""}" }
-        ext.prefix = { "${meta.id}_alignment_${reads.getName().tokenize('.')[0]}" }
+        ext.prefix = { "${meta.id}_alignment_${reads.getName().split('.fasta.gz|.fa.gz')[0]}" }
     }
 
     withName: ".*:.*:READ_COVERAGE:BEDTOOLS_GENOMECOV" {
@@ -270,7 +272,7 @@ process {
     }
 
     withName: ".*:.*:READ_COVERAGE:BED2BW_AVGCOV" {
-        ext.prefix = { "${meta.id}_coverage_avg" }
+        ext.prefix = { "${meta.id}_coverage_avg" }
     }
 
     //
@@ -315,12 +317,12 @@ process {
     // normal = standard run, pi = "pre-ingestion", hr = High res
     //
     withName: PRETEXTMAP_STANDRD {
-        ext.args = { "${meta.map_order.equals("length") ? "--sortby length": "--sortby nosort" } --mapq 0 --memory ${task.memory.giga}G" }
+        ext.args = { "${meta.map_order.equals("length") ? "--sortby length": "--sortby nosort" } --mapq 0 --memory ${task.memory.giga}" }
         ext.prefix = { "${meta.id}_normal_pi" }
     }
 
     withName: PRETEXTMAP_HIGHRES {
-        ext.args = { "${meta.map_order.equals("length") ? "--sortby length": "--sortby nosort" } --highRes --mapq 0 --memory ${task.memory.giga}G" }
+        ext.args = { "${meta.map_order.equals("length") ? "--sortby length": "--sortby nosort" } --highRes --mapq 0" }
         ext.prefix = { "${meta.id}_hr_pi" }
     }
 
@@ -401,26 +403,7 @@ process {
         ext.args = "-P."
     }
 
-    //
-    // SUBWORKFLOW: KMER_HAPLODUPE
-    //
-    withName: ".*:.*:KMER_READ_COVERAGE:CAT_CAT" {
-        ext.prefix = { "${meta.id}.fasta.gz" }
-    }
-
-    withName: ".*:.*:KMER_READ_COVERAGE:FASTK_FASTK" {
-        ext.args = { "-k${meta.kmer} -t -P." }
-    }
-
-    withName: ".*:.*:KMER_READ_COVERAGE:FKUTILS_FKPROF" {
-        ext.args = { "-f0.2 -o${meta.id}_k${meta2.kmer}_read.bed" }
-    }
-
-    withName: ".*:.*:KMER_READ_COVERAGE:GNU_SORT" {
-        ext.args = { "-k1,1 -k2,2n -S${task.memory.mega - 100}M -T ." }
-    }
-
-    withName: ".*:.*:KMER_READ_COVERAGE:UCSC_BEDGRAPHTOBIGWIG" {
-        ext.prefix = { "${meta.id}_coverage_kmer" }
+    withName: AVGCOV {
+        ext.args = "-T ./"
     }
 }
2 changes: 1 addition & 1 deletion conf/test_github.config
@@ -30,6 +30,6 @@ params {
 
 process {
     withName: PRETEXTMAP_HIGHRES {
-        memory = { check_max( 3.GB , 'memory' ) }
+        memory = { check_max( 8.GB , 'memory' ) }
     }
 }
2 changes: 1 addition & 1 deletion modules/local/avgcov.nf
@@ -22,7 +22,7 @@ process AVGCOV {
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "avgcov"
     """
-    get_avgcov.sh $bedfile $genomefile ${prefix}.bed
+    get_avgcov.sh $bedfile $genomefile ${prefix}.bed $args
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
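Combined with the new withName: AVGCOV { ext.args = "-T ./" } block in conf/modules.config above, the rendered command becomes something like the line below (input names are hypothetical, and it is an assumption that get_avgcov.sh passes the flag on to an internal sort so its temporary files land in the task work directory):

    get_avgcov.sh input.bed my.genome avgcov.bed -T ./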
2 changes: 1 addition & 1 deletion modules/local/generate_cram_csv.nf
@@ -15,7 +15,7 @@ process GENERATE_CRAM_CSV {
     script:
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
-    generate_cram_csv.sh $crampath >> ${prefix}_cram.csv
+    generate_cram_csv.sh $crampath ${prefix}_cram.csv
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
1 change: 0 additions & 1 deletion modules/local/selfcomp_splitfasta.nf
@@ -49,4 +49,3 @@ process SELFCOMP_SPLITFASTA {
     END_VERSIONS
     """
 }
-
2 changes: 1 addition & 1 deletion nextflow.config
@@ -195,7 +195,7 @@ manifest {
     description = """A pipeline to generate supplemental data for genome curation"""
     mainScript = 'main.nf'
     nextflowVersion = '!>=22.10.1'
-    version = '1.1.0'
+    version = '1.1.1'
     doi = '10.5281/zenodo.10047653'
 }
 
1 change: 1 addition & 0 deletions subworkflows/local/generate_genome.nf
@@ -73,5 +73,6 @@ workflow GENERATE_GENOME {
         max_scaff_size = GET_LARGEST_SCAFF.out.scaff_size.toInteger()
         dot_genome = ch_genomesize
         ref_index = ch_genome_fai
+        ref = reference_file
         versions = ch_versions.ifEmpty(null)
 }
3 changes: 2 additions & 1 deletion subworkflows/local/hic_mapping.nf
@@ -43,6 +43,7 @@ workflow HIC_MAPPING {
     // COMMENT: 1000bp BIN SIZE INTERVALS FOR CLOAD
     ch_cool_bin = Channel.of( 1000 )
 
+
     //
     // LOGIC: make channel of hic reads as input for GENERATE_CRAM_CSV
     //
@@ -340,7 +341,7 @@ workflow HIC_MAPPING {
     // LOGIC: FOR REPORTING
     //
 
-    ch_cram_files = GrabFiles( get_reads_input )
+    ch_cram_files = GrabFiles( hic_reads_path )
 
     ch_cram_files
         .collect()