Merge branch 'CW-3173' into 'dev'

Fix failure when a reference sequence name contains a comma [CW-3173]. Closes CW-3173. See merge request epi2melabs/workflows/wf-alignment!116.
epi2me-labs · Jan 3, 2024 · 6a9da93 · 6a9da93
2 parents 0c42cd3 + cc64275
commit 6a9da93
Show file tree

Hide file tree

Showing 5 changed files with 22 additions and 13 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v1.0.1]
+### Fixed
+- The workflow failing due to commas in reference sequence names.
+
+### Changed
+- How samples, reference files, and reference sequence names are listed in the summary section at the beginning of the report.
+
 ## [v1.0.0]
 ### Added
 - Memory requirements for each process.

diff --git a/bin/workflow_glue/report_utils/sections.py b/bin/workflow_glue/report_utils/sections.py
@@ -169,13 +169,15 @@ def summary(report, sample_names, ref_files, ref_seqs, stats_df, flagstat_df):
             quickly jump to an individual section with the links in the header bar.
             """
         )
-        ref_seqs_str = ", ".join(ref_seqs[:7]) + (", ..." if len(ref_seqs) > 7 else "")
+        ref_seqs_str = "&emsp;".join(ref_seqs[:7]) + (
+            "&emsp;..." if len(ref_seqs) > 7 else ""
+        )
         dom_util.raw(
             f"""
             <b>{len(sample_names)} {plural_s("sample", len(sample_names))}:</b><br>
-            {', '.join(sample_names)}<br><br>
+            {'&emsp;'.join(sample_names)}<br><br>
             <b>{len(ref_files)} {plural_s("reference file", len(ref_files))}:</b><br>
-            {', '.join(ref_files)}<br><br>
+            {'&emsp;'.join(ref_files)}<br><br>
             <b>{len(ref_seqs)} {plural_s("reference sequence", len(ref_seqs))}:</b><br>
             {ref_seqs_str}<br><br>
             """

diff --git a/main.nf b/main.nf
@@ -80,15 +80,15 @@ process addStepsColumn {
     label "wfalignment"
     cpus 1
     memory "2 GB"
-    input: path "lengths.csv"
-    output: path "lengths_with_steps.csv"
+    input: path "lengths.tsv"
+    output: path "lengths_with_steps.tsv"
     """
     #!/usr/bin/env python
     import pandas as pd
-    all = pd.read_csv('lengths.csv')
+    all = pd.read_csv('lengths.tsv', sep='\\t')
     all["step"] = all["lengths"]//200
     all = all.replace(0, 1)
-    all.to_csv('lengths_with_steps.csv', index=False, header=False)
+    all.to_csv('lengths_with_steps.tsv', index=False, header=False, sep='\\t')
     """
 }
 
@@ -106,7 +106,7 @@ process readDepthPerRef {
         def sample_name = meta["alias"]
         outfname = "${sample_name}.all_regions.bed.gz"
     """
-    while IFS=, read -r name lengths steps; do
+    while IFS=\$'\\t' read -r name lengths steps; do
         mosdepth -n --fast-mode --by "\$steps" --chrom "\$name" -t $task.cpus \
             ${sample_name}."\$name".temp $alignment \
         || echo "No alignments for "\$name""

diff --git a/nextflow.config b/nextflow.config
@@ -51,7 +51,7 @@ manifest {
     description     = 'Align Nanopore reads and visualize mapping statistics.'
     mainScript      = 'main.nf'
     nextflowVersion = '>=23.04.2'
-    version         = 'v1.0.0'
+    version         = 'v1.0.1'
 }
 
 epi2melabs {

diff --git a/subworkflows/process_references.nf b/subworkflows/process_references.nf
@@ -27,13 +27,13 @@ process fx2tab {
         path reference
     output:
         path "*.names.txt", emit: names
-        path "*.lengths.csv", emit: lengths
+        path "*.lengths.tsv", emit: lengths
     script:
     """
     seqkit fx2tab --length --name --only-id $reference > fx2tab.out
     cut -f1 fx2tab.out > ${reference}.names.txt
-    echo 'name,lengths' > ${reference}.lengths.csv
-    tr -s '[:blank:]' ',' < fx2tab.out >> ${reference}.lengths.csv
+    echo -e 'name\\tlengths' > ${reference}.lengths.tsv
+    cat fx2tab.out >> ${reference}.lengths.tsv
     """
 }
 
@@ -73,7 +73,7 @@ workflow process_references {
         names_per_ref_file = fx2tab.out.names
         lengths_per_ref_file = fx2tab.out.lengths
         lengths_combined = fx2tab.out.lengths.collectFile(
-            name: "combined_lengths.csv", keepHeader: true
+            name: "combined_lengths.tsv", keepHeader: true
             // we need to call `.first()` to get a value channel (`.collectFile()`
             // always returns a queue channel, even when it only produces a single file)
         ).first()