From 1ea68bc8c420d3f7292c8315bc6d6218ad7988be Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Fri, 29 Sep 2023 17:27:12 +0200 Subject: [PATCH 01/45] from megafusion to vcf_collect --- CHANGELOG.md | 10 ++++++++++ bin/{megafusion.py => vcf_collect.py} | 4 ++-- conf/modules.config | 4 +--- docs/output.md | 4 ++-- docs/usage.md | 2 +- modules/local/{megafusion => vcf_collect}/main.nf | 4 ++-- modules/local/{megafusion => vcf_collect}/meta.yml | 4 ++-- subworkflows/local/fusioninspector_workflow.nf | 6 +++--- tower.yml | 2 +- 9 files changed, 24 insertions(+), 16 deletions(-) rename bin/{megafusion.py => vcf_collect.py} (98%) rename modules/local/{megafusion => vcf_collect}/main.nf (88%) rename modules/local/{megafusion => vcf_collect}/meta.yml (95%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c9ff036..5550a93a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v3.0.0 - [date] + +### Added + +### Changed + +### Fixed + +### Removed + ## v2.4.0 - [2023/09/22] ### Added diff --git a/bin/megafusion.py b/bin/vcf_collect.py similarity index 98% rename from bin/megafusion.py rename to bin/vcf_collect.py index 76872b57..8d3e5367 100755 --- a/bin/megafusion.py +++ b/bin/vcf_collect.py @@ -189,7 +189,7 @@ def write_vcf(df_to_print, header, out_file): f.write(header.rstrip("\r\n") + "\n" + content) -def megafusion(fusioninspector_in_file, fusionreport_in_file, sample, out): +def vcf_collect(fusioninspector_in_file, fusionreport_in_file, sample, out): """Convert fusion information from FusionInspector and fusion-report into a vcf file. Adapted from https://github.com/J35P312/MegaFusion""" merged_df = build_fusioninspector_dataframe(fusioninspector_in_file, FUSIONINSPECTOR_MAP).join( read_build_fusionreport(fusionreport_in_file), how="left" @@ -203,7 +203,7 @@ def main(argv=None): if not args.fusioninspector.is_file() or not args.fusionreport.is_file(): logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!") sys.exit(2) - megafusion(args.fusioninspector, args.fusionreport, args.sample, args.out) + vcf_collect(args.fusioninspector, args.fusionreport, args.sample, args.out) if __name__ == "__main__": diff --git a/conf/modules.config b/conf/modules.config index b4dc96d6..66294dfb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -141,13 +141,11 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] } - withName: MEGAFUSION { + withName: VCF_COLLECT { ext.when = {!params.fusioninspector_only} ext.prefix = { "${meta.id}_fusion_data" } } - - withName: MULTIQC { ext.when = { !params.skip_qc } } diff --git a/docs/output.md b/docs/output.md index 4144891b..ac84bd06 100644 --- a/docs/output.md +++ b/docs/output.md @@ -239,12 +239,12 @@ The score is explained [on the original fusion-report github page](https://matq0 Quantifying abundances of transcripts from bulk and single-cell RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads. -### Megafusion +### Vcf_collect
Output files -- `megafusion` +- `vcf_collect` - `<sample>_fusion_data.vcf` - contains the fusions in vcf format with collected statistics.
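For downstream consumers of this file, a minimal sketch (standard library only) of pulling the collected statistics back out of the INFO column; the key names follow the header defined in bin/vcf_collect.py, while the input path is hypothetical:

```python
# Minimal sketch: read a vcf_collect-style VCF with the standard library
# and index the INFO column. Key names (GENEA, GENEB, SCORE, ...) follow
# the header defined in bin/vcf_collect.py; the input path is made up.
import csv

def read_fusion_vcf(path):
    records = []
    with open(path) as handle:
        for row in csv.reader(handle, delimiter="\t"):
            if not row or row[0].startswith("#"):
                continue  # skip meta-information and column-header lines
            info = dict(f.split("=", 1) for f in row[7].split(";") if "=" in f)
            records.append({"chrom": row[0], "pos": row[1], "info": info})
    return records

for rec in read_fusion_vcf("sample_fusion_data.vcf"):
    print(rec["info"].get("GENEA"), rec["info"].get("GENEB"), rec["info"].get("SCORE"))
```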
diff --git a/docs/usage.md b/docs/usage.md index 9df6d24e..8a2a725a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -15,7 +15,7 @@ The pipeline is divided into two parts: 2. Detecting fusions - Supported tools: `Arriba`, `FusionCatcher`, `pizzly`, `SQUID`, `STAR-Fusion`, and `StringTie` - QC: `Fastqc`, `MultiQC`, and `Qualimap rnaseq` - - Fusions visualization: `Arriba`, `fusion-report` and `FusionInspector`, VCF file creation based on `MegaFusion` + - Fusions visualization: `Arriba`, `fusion-report` and `FusionInspector`, `vcf_collect` (VCF file creation based on `MegaFusion`) ## Download and build references diff --git a/modules/local/megafusion/main.nf b/modules/local/vcf_collect/main.nf similarity index 88% rename from modules/local/megafusion/main.nf rename to modules/local/vcf_collect/main.nf index d8cb5db0..5c8a57bd 100644 --- a/modules/local/megafusion/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -1,4 +1,4 @@ -process MEGAFUSION { +process VCF_COLLECT { tag "$meta.id" label 'process_single' @@ -20,7 +20,7 @@ process MEGAFUSION { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - megafusion.py --fusioninspector $tsv --fusionreport $report --sample ${prefix} --out ${prefix}.vcf + vcf_collect.py --fusioninspector $tsv --fusionreport $report --sample ${prefix} --out ${prefix}.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/megafusion/meta.yml b/modules/local/vcf_collect/meta.yml similarity index 95% rename from modules/local/megafusion/meta.yml rename to modules/local/vcf_collect/meta.yml index 31343c7e..40bdd6c0 100644 --- a/modules/local/megafusion/meta.yml +++ b/modules/local/vcf_collect/meta.yml @@ -1,5 +1,5 @@ -name: megafusion -description: megafusion +name: vcf_collect +description: vcf_collect keywords: - sort tools: diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index 5fa21cf1..8cb45086 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -1,6 +1,6 @@ include { ARRIBA_VISUALISATION } from '../../modules/local/arriba/visualisation/main' include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' -include { MEGAFUSION } from '../../modules/local/megafusion/main' +include { VCF_COLLECT } from '../../modules/local/vcf_collect/main' include { FUSIONINSPECTOR } from '../../modules/local/fusioninspector/main' workflow FUSIONINSPECTOR_WORKFLOW { @@ -39,8 +39,8 @@ workflow FUSIONINSPECTOR_WORKFLOW { ch_versions = ch_versions.mix(FUSIONINSPECTOR.out.versions) fusion_data = FUSIONINSPECTOR.out.tsv.join(report) - MEGAFUSION(fusion_data) - ch_versions = ch_versions.mix(MEGAFUSION.out.versions) + VCF_COLLECT(fusion_data) + ch_versions = ch_versions.mix(VCF_COLLECT.out.versions) if ((params.starfusion || params.all || params.stringtie) && !params.fusioninspector_only && !params.skip_vis) { bam_sorted_indexed_fusions = bam_sorted_indexed.join(FUSIONINSPECTOR.out.tsv) diff --git a/tower.yml b/tower.yml index 5813f5d3..d051a618 100644 --- a/tower.yml +++ b/tower.yml @@ -13,7 +13,7 @@ reports: display: "FusionInspector TSV report" "**/fusionreport/*/*_fusionreport_index.html": display: "Fusion-report HTML report" - "**/megafusion/*_fusion_data.vcf": + "**/vcf_collect/*_fusion_data.vcf": display: "Collected statistics on each fusion fed to FusionInspector in VCF format" "**/picard/*.MarkDuplicates.metrics.txt": display: "Picard: Metrics from CollectRnaMetrics" From da68e1daa0b1bbc679d9045d1d074a3267c932e9 Mon Sep 17 
00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 2 Oct 2023 09:34:10 +0200 Subject: [PATCH 02/45] start collecting more information with vcf_collect --- bin/vcf_collect.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 8d3e5367..e5f3df88 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -22,6 +22,10 @@ "split_reads": {"column": 1, "delimiter": "\t", "element": 0}, "discordant_pairs": {"column": 2, "delimiter": "\t", "element": 0}, "ffpm": {"column": 25, "delimiter": "\t", "element": 0}, + "LeftGene": {"column": 5, "delimiter": "\t", "element": 0}, + "LeftBreakpoint": {"column": 7, "delimiter": ":", "element": 1}, + "RightGene": {"column": 8, "delimiter": "\t", "element": 0}, + "RightBreakpoint": {"column": 10, "delimiter": ":", "element": 1}, } From 41bd823ee27d18973464b8c0bc1d804a0f3e7110 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 09:47:33 +0100 Subject: [PATCH 03/45] add hgnc id and extra info for vcf file --- bin/vcf_collect.py | 273 +++++++++++++++++++++--------- modules/local/hgnc/main.nf | 41 +++++ modules/local/vcf_collect/main.nf | 13 +- nextflow.config | 2 + nextflow_schema.json | 10 ++ 5 files changed, 251 insertions(+), 88 deletions(-) create mode 100644 modules/local/hgnc/main.nf diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index e5f3df88..e590d88b 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -6,27 +6,56 @@ from pathlib import Path import pandas as pd import ast +from gtfparse import read_gtf logger = logging.getLogger() -FUSIONINSPECTOR_MAP = { - "fusion": {"column": 0, "delimiter": "\t", "element": 0}, - "chromosomeA": {"column": 7, "delimiter": ":", "element": 0}, - "chromosomeB": {"column": 10, "delimiter": ":", "element": 0}, - "posA": {"column": 7, "delimiter": ":", "element": 1}, - "posB": {"column": 10, "delimiter": ":", "element": 1}, - "strand1": {"column": 7, "delimiter": ":", "element": 2}, - "strand2": {"column": 10, "delimiter": ":", "element": 2}, - "geneA": {"column": 0, "delimiter": "--", "element": 0}, - "geneB": {"column": 0, "delimiter": "--", "element": 1}, - "split_reads": {"column": 1, "delimiter": "\t", "element": 0}, - "discordant_pairs": {"column": 2, "delimiter": "\t", "element": 0}, - "ffpm": {"column": 25, "delimiter": "\t", "element": 0}, - "LeftGene": {"column": 5, "delimiter": "\t", "element": 0}, - "LeftBreakpoint": {"column": 7, "delimiter": ":", "element": 1}, - "RightGene": {"column": 8, "delimiter": "\t", "element": 0}, - "RightBreakpoint": {"column": 10, "delimiter": ":", "element": 1}, -} + +def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample, out): + """ + Process FusionInspector and FusionReport data, + merge with GTF from FusionInspector and HGNC database, + and write a VCF file. + + Args: + fusioninspector_in_file (str): Path to FusionInspector input file. + fusionreport_in_file (str): Path to FusionReport input file. + sample (str): Sample name for the header. + hgnc (str): Path to HGNC file. + gtf (str): Path to GTF file. + out (str): Output VCF file path. 
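To make the PATCH 02 additions above concrete, here is a toy run of one column/delimiter/element lookup as consumed by read_fusioninspector(); the FusionInspector row is fabricated and truncated to eight columns:

```python
# Toy lookup mirroring read_fusioninspector(): take whitespace-split
# column 7, split it on ":" and keep element 1 (the position).
# The input row is fabricated and truncated to eight columns.
line = "GENE1--GENE2\t10\t4\tONLY_REF_SPLICE\tGENE1\t.\t.\tchr7:55087058:+"

spec = {"column": 7, "delimiter": ":", "element": 1}  # the "posA" entry
value = line.split()[spec["column"]].split(spec["delimiter"])[spec["element"]]
print(value)  # -> 55087058
```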
+ + Adapted from: https://github.com/J35P312/MegaFusion + """ + merged_df = build_fusioninspector_dataframe(fusioninspector_in_file).join( + read_build_fusionreport(fusionreport_in_file), how="outer", on='FUSION').reset_index() + + df = build_hgnc_dataframe(hgnc).merge(merged_df, how='right', left_on='ensembl_gene_id', + right_on='Left_ensembl_gene_id') + df = df.rename(columns={"hgnc_id": "Left_hgnc_id"}) + df = build_hgnc_dataframe(hgnc).merge(df, how='right', left_on='ensembl_gene_id', right_on='Right_ensembl_gene_id') + df = df.rename(columns={"hgnc_id": "Right_hgnc_id"}) + gtf_df = build_gtf_dataframe(gtf) + all_df = df.merge(gtf_df, how='left', left_on='CDS_LEFT_ID', right_on='Transcript_id') + all_df = all_df[(all_df['PosA'] >= all_df['orig_start']) & (all_df['PosA'] <= all_df['orig_end'])] + all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"}) + all_df = all_df.rename(columns={"exon_number": "Left_exon_number"}) + all_df = all_df[ + ['FUSION', 'GeneA', 'GeneB', 'PosA', 'PosB', 'ChromosomeA', 'ChromosomeB', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', + 'FOUND_IN', 'JunctionReadCount', 'SpanningFragCount', 'FFPM', 'PROT_FUSION_TYPE', 'CDS_LEFT_ID', + 'CDS_RIGHT_ID', 'Left_transcript_version', 'Left_exon_number', 'Left_hgnc_id', 'Right_hgnc_id', 'Strand1', + 'Strand2', 'annots']].drop_duplicates() + all_df = all_df.merge(gtf_df, how='left', left_on='CDS_RIGHT_ID', right_on='Transcript_id') + all_df = all_df[(all_df['PosB'] >= all_df['orig_start']) & (all_df['PosB'] <= all_df['orig_end'])] + all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"}) + all_df = all_df.rename(columns={"exon_number": "Right_exon_number"}) + all_df = all_df[ + ['FUSION', 'GeneA', 'GeneB', 'PosA', 'PosB', 'ChromosomeA', 'ChromosomeB', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', + 'FOUND_IN', 'JunctionReadCount', 'SpanningFragCount', 'FFPM', 'PROT_FUSION_TYPE', 'CDS_LEFT_ID', + 'CDS_RIGHT_ID', 'Left_transcript_version', 'Left_exon_number', 'Left_hgnc_id', 'Right_transcript_version', + 'Right_exon_number', 'Right_hgnc_id', 'Strand1', 'Strand2', 'annots']].drop_duplicates() + + return write_vcf(column_manipulation(all_df), header_def(sample), out) def parse_args(argv=None): @@ -47,17 +76,32 @@ def parse_args(argv=None): type=Path, help="Fusionreport output in TSV format.", ) + parser.add_argument( + "--fusioninspector_gtf", + metavar="GTF", + type=Path, + help="FusionInspector GTF output.", + ) + parser.add_argument( + "--hgnc", + metavar="HGNC", + type=Path, + help="HGNC database.", + ) parser.add_argument("--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample") parser.add_argument( "--out", metavar="OUT", type=Path, - help="Output path.", + help="VCF output path.", ) return parser.parse_args(argv) def header_def(sample): + """ + Define the header of the VCF file + """ return '##fileformat=VCFv4.1\n\ ##ALT=\n\ ##INFO=\n\ @@ -65,15 +109,23 @@ def header_def(sample): ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ +##INFO=\n\ +##INFO=\n\ ##INFO=\n\ ##INFO=\n\ -##INFO=\n\ -##INFO=\n\ -##INFO=\n\ -##INFO=\n\ -##INFO=\n\ +##INFO=\n\ ##INFO=\n\ ##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ ##FORMAT=\n\ ##FORMAT=\n\ ##FORMAT=\n\ @@ -83,42 +135,76 @@ def header_def(sample): ) -def read_fusioninspector(fusioninspector_file, col_num, delimiter, element): - with open(fusioninspector_file) as fusioninspector: - return [line.split()[col_num].split(delimiter)[element] for line in fusioninspector 
if not line.startswith("#")] +def build_fusioninspector_dataframe(file): + """ + Read FusionInspector output from a CSV file, preprocess the data, and set 'FUSION' as the index. + """ + df = pd.read_csv(file, sep="\t") + df = df.rename(columns={"#FusionName": "FUSION"}) + df[['ChromosomeA', 'PosA', 'Strand1']] = df['LeftBreakpoint'].str.split(':', expand=True) + df[['ChromosomeB', 'PosB', 'Strand2']] = df['RightBreakpoint'].str.split(':', expand=True) + df[['GeneA', 'GeneB']] = df['FUSION'].str.split('--', expand=True) + df[['LeftGeneName', 'Left_ensembl_gene_id']] = df['LeftGene'].str.split('^', expand=True) + df[['RightGeneName', 'Right_ensembl_gene_id']] = df['RightGene'].str.split('^', expand=True) + return df.set_index(['FUSION']) -def build_fusioninspector_dataframe(file, map): - new_dict = {} - for key in FUSIONINSPECTOR_MAP: - new_dict[key] = read_fusioninspector( - file, - map[key]["column"], - map[key]["delimiter"], - map[key]["element"], - ) - return pd.DataFrame.from_dict(new_dict).set_index("fusion") +def replace_value_with_column_name(row, value_to_replace, column_name): + """ + Replace a specific value in a row with the corresponding column name. + """ + new_values = '' + for col_name, value in row.items(): + if col_name == column_name: + if value == value_to_replace: + new_values = col_name + else: + new_values = '' + return new_values + + +def concatenate_columns(row): + """ + Concatenate non-empty values in a row into a single string separated by commas. + """ + non_empty_values = [str(value) for value in row if value != ''] + return ','.join(non_empty_values) def read_build_fusionreport(fusionreport_file): + """ + Read and preprocess fusion-report data from a file, including handling missing tool columns, + getting the columns with each tool and create a new FOUND_IN column with all the tool hits. + Convert the list of databases in FOUND_DB into a joined string with a comma separator. + Make all column headers uppercase. 
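The FOUND_IN assembly that this docstring describes can be reproduced in isolation; a toy pandas frame with fabricated tool flags (only two of the tool columns shown):

```python
# Toy version of the FOUND_IN construction: per-tool "true" flags are
# replaced by the tool name, then non-empty names are comma-joined.
# The input frame is fabricated.
import pandas as pd

df = pd.DataFrame({"arriba": ["true", ""], "starfusion": ["true", "true"]})
for tool in ["arriba", "starfusion"]:
    df[tool] = df[tool].apply(lambda v, name=tool: name if v == "true" else "")
df["FOUND_IN"] = df[["arriba", "starfusion"]].apply(
    lambda row: ",".join(v for v in row if v), axis=1
)
print(df["FOUND_IN"].tolist())  # -> ['arriba,starfusion', 'starfusion']
```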
+ """ with open(fusionreport_file) as f: from_html = [line.split('rows": [')[1] for line in f if 'name="fusion_list' in line] expression = from_html[0].split('], "tool')[0] - fusion_report = pd.DataFrame.from_dict(ast.literal_eval(expression)).set_index("fusion") - if not "arriba" in fusion_report.columns: + fusion_report = pd.DataFrame.from_dict(ast.literal_eval(expression)) + if "arriba" not in fusion_report.columns: fusion_report["arriba"] = "" - if not "fusioncatcher" in fusion_report.columns: + if "fusioncatcher" not in fusion_report.columns: fusion_report["fusioncatcher"] = "" - if not "pizzly" in fusion_report.columns: - fusion_report["pizzly"] = "" - if not "squid" in fusion_report.columns: - fusion_report["squid"] = "" - if not "starfusion" in fusion_report.columns: + if "starfusion" not in fusion_report.columns: fusion_report["starfusion"] = "" - return fusion_report + fusion_report['arriba'] = fusion_report[['arriba']].apply(replace_value_with_column_name, + args=('true', 'arriba'), axis=1) + fusion_report['fusioncatcher'] = fusion_report[['fusioncatcher']].apply(replace_value_with_column_name, + args=('true', 'fusioncatcher'), axis=1) + fusion_report['starfusion'] = fusion_report[['starfusion']].apply(replace_value_with_column_name, + args=('true', 'starfusion'), axis=1) + fusion_report['FOUND_IN'] = fusion_report[['arriba', 'starfusion', + 'fusioncatcher']].apply(concatenate_columns, axis=1) + fusion_report.columns = fusion_report.columns.str.upper() + fusion_report['FOUND_DB'] = fusion_report['FOUND_DB'].apply(lambda x: ', '.join(x)) + return fusion_report[['FUSION', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', 'FOUND_IN']].set_index(['FUSION']) def column_manipulation(df): + """ + Manipulate and prepare DataFrame for VCF file creation. + """ df["ALT"] = "" df = df.reset_index() df["FORMAT"] = "GT:DV:RV:FFPM" @@ -131,46 +217,58 @@ def column_manipulation(df): for index, row in df.iterrows(): # ALT - if not row["strand1"] in ["+", "-"] or not row["strand2"] in ["+", "-"]: - df.loc[index, "ALT"] = "N[{}:{}[".format(df["chromosomeB"], row["posB"]) - elif row["strand1"] == "-" and row["strand2"] == "-": - df.loc[index, "ALT"] = "[{}:{}[N".format(row["chromosomeB"], row["posB"]) - elif row["strand1"] == "+" and row["strand2"] == "-": - df.loc[index, "ALT"] = "N]{}:{}]".format(row["chromosomeB"], row["posB"]) - elif row["strand1"] == "-" and row["strand2"] == "+": - df.loc[index, "ALT"] = "N]{}:{}]".format(row["chromosomeB"], row["posB"]) + if not row["Strand1"] in ["+", "-"] or not row["Strand2"] in ["+", "-"]: + df.loc[index, "ALT"] = "N[{}:{}[".format(df["ChromosomeB"], row["PosB"]) + elif row["Strand1"] == "-" and row["Strand2"] == "-": + df.loc[index, "ALT"] = "[{}:{}[N".format(row["ChromosomeB"], row["PosB"]) + elif row["Strand1"] == "+" and row["Strand2"] == "-": + df.loc[index, "ALT"] = "N]{}:{}]".format(row["ChromosomeB"], row["PosB"]) + elif row["Strand1"] == "-" and row["Strand2"] == "+": + df.loc[index, "ALT"] = "N]{}:{}]".format(row["ChromosomeB"], row["PosB"]) else: - df.loc[index, "ALT"] = "N[{}:{}[".format(row["chromosomeB"], row["posB"]) + df.loc[index, "ALT"] = "N[{}:{}[".format(row["ChromosomeB"], row["PosB"]) # INFO df.loc[index, "INFO"] = ( - "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};ORIENTATION={},{};FOUND_DB={};" - "ARRIBA={};FUSIONCATCHER={};PIZZLY={};SQUID={};STARFUSION={};TOOL_HITS={};SCORE={}".format( - row["chromosomeA"], - row["chromosomeB"], - row["geneA"], - row["geneB"], - row["strand1"], - row["strand2"], - row["found_db"], - row["arriba"], - 
row["fusioncatcher"], - row["pizzly"], - row["squid"], - row["starfusion"], - row["tools_hits"], - row["score"], + "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};POSA={};POSB={};ORIENTATION={},{};FOUND_DB={};" + "FOUND_IN={};;TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" + "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={};" + "EXON_NUMBER_B={};ANNOTATIONS={}".format( + row["ChromosomeA"], + row["ChromosomeB"], + row["GeneA"], + row["GeneB"], + row['PosA'], + row['PosB'], + row["Strand1"], + row["Strand2"], + row["FOUND_DB"], + row["FOUND_IN"], + row["TOOLS_HITS"], + row["SCORE"], + row["PROT_FUSION_TYPE"], + row["CDS_LEFT_ID"], + row["CDS_RIGHT_ID"], + row["Left_transcript_version"], + row["Right_transcript_version"], + row["Left_hgnc_id"], + row["Right_hgnc_id"], + row["Left_exon_number"], + row["Right_exon_number"], + row["annots"], ) ) - # FORMAT - df.loc[index, "Sample"] = "./1:{}:{}:{}".format(row["split_reads"], row["discordant_pairs"], row["ffpm"]) + df.loc[index, "Sample"] = "./1:{}:{}:{}".format(row["JunctionReadCount"], row["SpanningFragCount"], row["FFPM"]) return df def write_vcf(df_to_print, header, out_file): + """ + Write a VCF file with a specified DataFrame, header, and output file path. + """ df_to_print[ [ - "chromosomeA", - "posA", + "ChromosomeA", + "PosA", "ID", "REF", "ALT", @@ -193,12 +291,23 @@ def write_vcf(df_to_print, header, out_file): f.write(header.rstrip("\r\n") + "\n" + content) -def vcf_collect(fusioninspector_in_file, fusionreport_in_file, sample, out): - """Convert fusion information from FusionInspector and fusion-report into a vcf file. Adapted from https://github.com/J35P312/MegaFusion""" - merged_df = build_fusioninspector_dataframe(fusioninspector_in_file, FUSIONINSPECTOR_MAP).join( - read_build_fusionreport(fusionreport_in_file), how="left" - ) - write_vcf(column_manipulation(merged_df), header_def(sample), out) +def build_hgnc_dataframe(file): + """ + Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. + """ + df = pd.read_csv(file, sep="\t", low_memory=False) + return df[['hgnc_id', 'ensembl_gene_id']].dropna() + + +def build_gtf_dataframe(file): + """ + Build a DataFrame from GTF file, extracting relevant columns. + """ + df = read_gtf(file) + df[['fusion_dump', 'Transcript_id']] = df['transcript_id'].str.split('^', expand=True) + df[['orig_chromosome', 'orig_start', 'orig_end', 'orig_dir']] = df['orig_coord_info'].str.split(',', expand=True) +# return df + return df[['Transcript_id', 'transcript_version', 'exon_number', 'exon_id', 'orig_start', 'orig_end']] def main(argv=None): @@ -207,7 +316,7 @@ def main(argv=None): if not args.fusioninspector.is_file() or not args.fusionreport.is_file(): logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!") sys.exit(2) - vcf_collect(args.fusioninspector, args.fusionreport, args.sample, args.out) + vcf_collect(args.fusioninspector, args.fusionreport, args.fusioninspector_gtf, args.hgnc, args.sample, args.out) if __name__ == "__main__": diff --git a/modules/local/hgnc/main.nf b/modules/local/hgnc/main.nf new file mode 100644 index 00000000..7211cb71 --- /dev/null +++ b/modules/local/hgnc/main.nf @@ -0,0 +1,41 @@ +process HGNC_DOWNLOAD { + tag "hgnc" + label 'process_low' + + conda "bioconda::gnu-wget=1.18" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h5bf99c6_5' : + 'quay.io/biocontainers/gnu-wget:1.18--h5bf99c6_5' }" + + input: + + output: + path "hgnc_complete_set.txt" , emit: hgnc_ref + path "HGNC-DB-timestamp.txt" , emit: hgnc_date + + path "versions.yml" , emit: versions + + + script: + """ + wget https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt + date+%Y-%m-%d/%H:%M: > HGNC-DB-timestamp.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(echo wget -V 2>&1 | grep "GNU Wget" | cut -d" " -f3 > versions.yml) + END_VERSIONS + """ + + stub: + """ + touch "hgnc_complete_set.txt" + touch "HGNC-DB-timestamp.txt" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(echo wget -V 2>&1 | grep "GNU Wget" | cut -d" " -f3 > versions.yml) + END_VERSIONS + """ + +} diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index 5c8a57bd..208286c4 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,13 +2,13 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + conda "bioconda:: gtfparse =2.0.1" + container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_0" input: - tuple val(meta), path(tsv), path(report) + tuple val(meta), path(tsv), path(out_gtf), path(report) + path hgnc_ref + path hgnc_date output: path "versions.yml" , emit: versions @@ -20,11 +20,12 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $tsv --fusionreport $report --sample ${prefix} --out ${prefix}.vcf + vcf_collect.py --fusioninspector $tsv --fusionreport $report --fusioninspector_gtf $out_gtf --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') + HGNC DB retrieval: \$(cat $hgnc_date) END_VERSIONS """ diff --git a/nextflow.config b/nextflow.config index a81e3ea2..9bd3c50c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -73,6 +73,8 @@ params { arriba_ref_known_fusions = "${params.genomes_base}/arriba/known_fusions_hg38_GRCh38_v2.3.0.tsv.gz" arriba_ref_protein_domains = "${params.genomes_base}/arriba/protein_domains_hg38_GRCh38_v2.3.0.gff3" fusioncatcher_ref = "${params.genomes_base}/fusioncatcher/human_v102" + hgcn_ref = "${params.genomes_base}/hgnc/hgnc_complete_set.txt" + hgcn_date = "${params.genomes_base}/hgnc/HGNC-DB-timestamp.txt" pizzly_ref = "${params.genomes_base}/pizzly/kallisto" squid_ref = "${params.genomes_base}/squid" starfusion_ref = "${params.genomes_base}/starfusion/ctat_genome_lib_build_dir" diff --git a/nextflow_schema.json b/nextflow_schema.json index 1c8962f9..c4be8a48 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -183,6 +183,16 @@ "fa_icon": "far fa-file-code", "description": "Path to fusionreport references" }, + "hgnc_ref": { + "type": "string", + "fa_icon": "far fa-file-code", + "description": "Path to HGNC database file" + }, + "hgnc_date": { + "type": "string", + "fa_icon": "far fa-file-code", + "description": "Path to HGNC timestamp file for database retrieval" + }, "pizzly": { "type": "boolean", "fa_icon": "far fa-file-code", From 4ba9728b1cf55bd8bb371c7d717b30443643b9b1 Mon Sep 17 00:00:00 2001 From: Annick 
Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 09:52:07 +0100 Subject: [PATCH 04/45] add hgnc id and extra info for vcf file --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 9bd3c50c..3d1e0b41 100644 --- a/nextflow.config +++ b/nextflow.config @@ -73,8 +73,8 @@ params { arriba_ref_known_fusions = "${params.genomes_base}/arriba/known_fusions_hg38_GRCh38_v2.3.0.tsv.gz" arriba_ref_protein_domains = "${params.genomes_base}/arriba/protein_domains_hg38_GRCh38_v2.3.0.gff3" fusioncatcher_ref = "${params.genomes_base}/fusioncatcher/human_v102" - hgcn_ref = "${params.genomes_base}/hgnc/hgnc_complete_set.txt" - hgcn_date = "${params.genomes_base}/hgnc/HGNC-DB-timestamp.txt" + hgnc_ref = "${params.genomes_base}/hgnc/hgnc_complete_set.txt" + hgnc_date = "${params.genomes_base}/hgnc/HGNC-DB-timestamp.txt" pizzly_ref = "${params.genomes_base}/pizzly/kallisto" squid_ref = "${params.genomes_base}/squid" starfusion_ref = "${params.genomes_base}/starfusion/ctat_genome_lib_build_dir" From b0c0f18555a9a60af1bbac59365b831ea0ebfaf3 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:17:57 +0100 Subject: [PATCH 05/45] fix channel i/o --- modules/local/fusioninspector/main.nf | 1 + subworkflows/local/fusioninspector_workflow.nf | 8 +++++--- workflows/rnafusion.nf | 8 +++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/modules/local/fusioninspector/main.nf b/modules/local/fusioninspector/main.nf index 4d24a635..1b1a7169 100644 --- a/modules/local/fusioninspector/main.nf +++ b/modules/local/fusioninspector/main.nf @@ -11,6 +11,7 @@ process FUSIONINSPECTOR { output: tuple val(meta), path("*FusionInspector.fusions.tsv") , emit: tsv + tuple val(meta), path("*.gtf") , emit: out_gtf path "*" , emit: output path "versions.yml" , emit: versions diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index b0d30cc7..d0453723 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -8,11 +8,13 @@ workflow FUSIONINSPECTOR_WORKFLOW { reads fusion_list fusion_list_filtered - report + fusionreport_out bam_sorted_indexed ch_gtf ch_arriba_ref_protein_domains ch_arriba_ref_cytobands + ch_hgnc_ref + ch_hgnc_date main: ch_versions = Channel.empty() @@ -38,9 +40,9 @@ workflow FUSIONINSPECTOR_WORKFLOW { FUSIONINSPECTOR( ch_reads_fusion, index) ch_versions = ch_versions.mix(FUSIONINSPECTOR.out.versions) + fusion_data = FUSIONINSPECTOR.out.tsv.join(FUSIONINSPECTOR.out.ch_out_gtf).join(fusionreport_out) - fusion_data = FUSIONINSPECTOR.out.tsv.join(report) - VCF_COLLECT(fusion_data) + VCF_COLLECT(fusion_data, hgnc_ref, hgnc_date) ch_versions = ch_versions.mix(VCF_COLLECT.out.versions) if ((params.starfusion || params.all || params.stringtie) && !params.fusioninspector_only && !params.skip_vis) { diff --git a/workflows/rnafusion.nf b/workflows/rnafusion.nf index 494e34e8..6792eff8 100644 --- a/workflows/rnafusion.nf +++ b/workflows/rnafusion.nf @@ -32,8 +32,8 @@ ch_arriba_ref_blacklist = Channel.fromPath(params.arriba_ref_blacklist).map { it ch_arriba_ref_known_fusions = Channel.fromPath(params.arriba_ref_known_fusions).map { it -> [[id:it.Name], it] }.collect() ch_arriba_ref_protein_domains = Channel.fromPath(params.arriba_ref_protein_domains).map { it -> [[id:it.Name], it] }.collect() ch_arriba_ref_cytobands = 
Channel.fromPath(params.arriba_ref_cytobands).map { it -> [[id:it.Name], it] }.collect() - - +ch_hgnc_ref = Channel.fromPath(params.hgnc_ref).map { it -> [[id:it.Name], it] }.collect() +ch_hgnc_date = Channel.fromPath(params.hgnc_date).map { it -> [[id:it.Name], it] }.collect() ch_fasta = Channel.fromPath(params.fasta).map { it -> [[id:it.Name], it] }.collect() ch_gtf = Channel.fromPath(params.gtf).map { it -> [[id:it.Name], it] }.collect() ch_transcript = Channel.fromPath(params.transcript).map { it -> [[id:it.Name], it] }.collect() @@ -225,7 +225,9 @@ workflow RNAFUSION { STARFUSION_WORKFLOW.out.ch_bam_sorted_indexed, ch_chrgtf, ch_arriba_ref_protein_domains, - ch_arriba_ref_cytobands + ch_arriba_ref_cytobands, + ch_hgnc_ref, + ch_hgnc_date ) ch_versions = ch_versions.mix(FUSIONINSPECTOR_WORKFLOW.out.versions.first().ifEmpty(null)) From 605280b0e52d7bf390312ccdfc12407caa926173 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:19:09 +0100 Subject: [PATCH 06/45] black --- bin/vcf_collect.py | 144 +++++++++++++++++++++++++++++++-------------- 1 file changed, 100 insertions(+), 44 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index e590d88b..bef72dc4 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -27,33 +27,85 @@ def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample Adapted from: https://github.com/J35P312/MegaFusion """ - merged_df = build_fusioninspector_dataframe(fusioninspector_in_file).join( - read_build_fusionreport(fusionreport_in_file), how="outer", on='FUSION').reset_index() + merged_df = ( + build_fusioninspector_dataframe(fusioninspector_in_file) + .join(read_build_fusionreport(fusionreport_in_file), how="outer", on="FUSION") + .reset_index() + ) - df = build_hgnc_dataframe(hgnc).merge(merged_df, how='right', left_on='ensembl_gene_id', - right_on='Left_ensembl_gene_id') + df = build_hgnc_dataframe(hgnc).merge( + merged_df, how="right", left_on="ensembl_gene_id", right_on="Left_ensembl_gene_id" + ) df = df.rename(columns={"hgnc_id": "Left_hgnc_id"}) - df = build_hgnc_dataframe(hgnc).merge(df, how='right', left_on='ensembl_gene_id', right_on='Right_ensembl_gene_id') + df = build_hgnc_dataframe(hgnc).merge(df, how="right", left_on="ensembl_gene_id", right_on="Right_ensembl_gene_id") df = df.rename(columns={"hgnc_id": "Right_hgnc_id"}) gtf_df = build_gtf_dataframe(gtf) - all_df = df.merge(gtf_df, how='left', left_on='CDS_LEFT_ID', right_on='Transcript_id') - all_df = all_df[(all_df['PosA'] >= all_df['orig_start']) & (all_df['PosA'] <= all_df['orig_end'])] + all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id") + all_df = all_df[(all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"])] all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Left_exon_number"}) all_df = all_df[ - ['FUSION', 'GeneA', 'GeneB', 'PosA', 'PosB', 'ChromosomeA', 'ChromosomeB', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', - 'FOUND_IN', 'JunctionReadCount', 'SpanningFragCount', 'FFPM', 'PROT_FUSION_TYPE', 'CDS_LEFT_ID', - 'CDS_RIGHT_ID', 'Left_transcript_version', 'Left_exon_number', 'Left_hgnc_id', 'Right_hgnc_id', 'Strand1', - 'Strand2', 'annots']].drop_duplicates() - all_df = all_df.merge(gtf_df, how='left', left_on='CDS_RIGHT_ID', right_on='Transcript_id') - all_df = all_df[(all_df['PosB'] >= all_df['orig_start']) & (all_df['PosB'] <= all_df['orig_end'])] + [ + 
"FUSION", + "GeneA", + "GeneB", + "PosA", + "PosB", + "ChromosomeA", + "ChromosomeB", + "TOOLS_HITS", + "SCORE", + "FOUND_DB", + "FOUND_IN", + "JunctionReadCount", + "SpanningFragCount", + "FFPM", + "PROT_FUSION_TYPE", + "CDS_LEFT_ID", + "CDS_RIGHT_ID", + "Left_transcript_version", + "Left_exon_number", + "Left_hgnc_id", + "Right_hgnc_id", + "Strand1", + "Strand2", + "annots", + ] + ].drop_duplicates() + all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id") + all_df = all_df[(all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])] all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Right_exon_number"}) all_df = all_df[ - ['FUSION', 'GeneA', 'GeneB', 'PosA', 'PosB', 'ChromosomeA', 'ChromosomeB', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', - 'FOUND_IN', 'JunctionReadCount', 'SpanningFragCount', 'FFPM', 'PROT_FUSION_TYPE', 'CDS_LEFT_ID', - 'CDS_RIGHT_ID', 'Left_transcript_version', 'Left_exon_number', 'Left_hgnc_id', 'Right_transcript_version', - 'Right_exon_number', 'Right_hgnc_id', 'Strand1', 'Strand2', 'annots']].drop_duplicates() + [ + "FUSION", + "GeneA", + "GeneB", + "PosA", + "PosB", + "ChromosomeA", + "ChromosomeB", + "TOOLS_HITS", + "SCORE", + "FOUND_DB", + "FOUND_IN", + "JunctionReadCount", + "SpanningFragCount", + "FFPM", + "PROT_FUSION_TYPE", + "CDS_LEFT_ID", + "CDS_RIGHT_ID", + "Left_transcript_version", + "Left_exon_number", + "Left_hgnc_id", + "Right_transcript_version", + "Right_exon_number", + "Right_hgnc_id", + "Strand1", + "Strand2", + "annots", + ] + ].drop_duplicates() return write_vcf(column_manipulation(all_df), header_def(sample), out) @@ -141,25 +193,25 @@ def build_fusioninspector_dataframe(file): """ df = pd.read_csv(file, sep="\t") df = df.rename(columns={"#FusionName": "FUSION"}) - df[['ChromosomeA', 'PosA', 'Strand1']] = df['LeftBreakpoint'].str.split(':', expand=True) - df[['ChromosomeB', 'PosB', 'Strand2']] = df['RightBreakpoint'].str.split(':', expand=True) - df[['GeneA', 'GeneB']] = df['FUSION'].str.split('--', expand=True) - df[['LeftGeneName', 'Left_ensembl_gene_id']] = df['LeftGene'].str.split('^', expand=True) - df[['RightGeneName', 'Right_ensembl_gene_id']] = df['RightGene'].str.split('^', expand=True) - return df.set_index(['FUSION']) + df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(":", expand=True) + df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True) + df[["GeneA", "GeneB"]] = df["FUSION"].str.split("--", expand=True) + df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", expand=True) + df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True) + return df.set_index(["FUSION"]) def replace_value_with_column_name(row, value_to_replace, column_name): """ Replace a specific value in a row with the corresponding column name. """ - new_values = '' + new_values = "" for col_name, value in row.items(): if col_name == column_name: if value == value_to_replace: new_values = col_name else: - new_values = '' + new_values = "" return new_values @@ -167,8 +219,8 @@ def concatenate_columns(row): """ Concatenate non-empty values in a row into a single string separated by commas. 
""" - non_empty_values = [str(value) for value in row if value != ''] - return ','.join(non_empty_values) + non_empty_values = [str(value) for value in row if value != ""] + return ",".join(non_empty_values) def read_build_fusionreport(fusionreport_file): @@ -188,17 +240,21 @@ def read_build_fusionreport(fusionreport_file): fusion_report["fusioncatcher"] = "" if "starfusion" not in fusion_report.columns: fusion_report["starfusion"] = "" - fusion_report['arriba'] = fusion_report[['arriba']].apply(replace_value_with_column_name, - args=('true', 'arriba'), axis=1) - fusion_report['fusioncatcher'] = fusion_report[['fusioncatcher']].apply(replace_value_with_column_name, - args=('true', 'fusioncatcher'), axis=1) - fusion_report['starfusion'] = fusion_report[['starfusion']].apply(replace_value_with_column_name, - args=('true', 'starfusion'), axis=1) - fusion_report['FOUND_IN'] = fusion_report[['arriba', 'starfusion', - 'fusioncatcher']].apply(concatenate_columns, axis=1) + fusion_report["arriba"] = fusion_report[["arriba"]].apply( + replace_value_with_column_name, args=("true", "arriba"), axis=1 + ) + fusion_report["fusioncatcher"] = fusion_report[["fusioncatcher"]].apply( + replace_value_with_column_name, args=("true", "fusioncatcher"), axis=1 + ) + fusion_report["starfusion"] = fusion_report[["starfusion"]].apply( + replace_value_with_column_name, args=("true", "starfusion"), axis=1 + ) + fusion_report["FOUND_IN"] = fusion_report[["arriba", "starfusion", "fusioncatcher"]].apply( + concatenate_columns, axis=1 + ) fusion_report.columns = fusion_report.columns.str.upper() - fusion_report['FOUND_DB'] = fusion_report['FOUND_DB'].apply(lambda x: ', '.join(x)) - return fusion_report[['FUSION', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', 'FOUND_IN']].set_index(['FUSION']) + fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ", ".join(x)) + return fusion_report[["FUSION", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(["FUSION"]) def column_manipulation(df): @@ -237,8 +293,8 @@ def column_manipulation(df): row["ChromosomeB"], row["GeneA"], row["GeneB"], - row['PosA'], - row['PosB'], + row["PosA"], + row["PosB"], row["Strand1"], row["Strand2"], row["FOUND_DB"], @@ -296,7 +352,7 @@ def build_hgnc_dataframe(file): Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. """ df = pd.read_csv(file, sep="\t", low_memory=False) - return df[['hgnc_id', 'ensembl_gene_id']].dropna() + return df[["hgnc_id", "ensembl_gene_id"]].dropna() def build_gtf_dataframe(file): @@ -304,10 +360,10 @@ def build_gtf_dataframe(file): Build a DataFrame from GTF file, extracting relevant columns. 
""" df = read_gtf(file) - df[['fusion_dump', 'Transcript_id']] = df['transcript_id'].str.split('^', expand=True) - df[['orig_chromosome', 'orig_start', 'orig_end', 'orig_dir']] = df['orig_coord_info'].str.split(',', expand=True) -# return df - return df[['Transcript_id', 'transcript_version', 'exon_number', 'exon_id', 'orig_start', 'orig_end']] + df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True) + df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(",", expand=True) + # return df + return df[["Transcript_id", "transcript_version", "exon_number", "exon_id", "orig_start", "orig_end"]] def main(argv=None): From 36519afd15cf328d032b601f3d7afa315e327599 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:22:42 +0100 Subject: [PATCH 07/45] fix bug in channel i/o --- subworkflows/local/fusioninspector_workflow.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index d0453723..84b4d73f 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -40,7 +40,7 @@ workflow FUSIONINSPECTOR_WORKFLOW { FUSIONINSPECTOR( ch_reads_fusion, index) ch_versions = ch_versions.mix(FUSIONINSPECTOR.out.versions) - fusion_data = FUSIONINSPECTOR.out.tsv.join(FUSIONINSPECTOR.out.ch_out_gtf).join(fusionreport_out) + fusion_data = FUSIONINSPECTOR.out.tsv.join(FUSIONINSPECTOR.out.out_gtf).join(fusionreport_out) VCF_COLLECT(fusion_data, hgnc_ref, hgnc_date) ch_versions = ch_versions.mix(VCF_COLLECT.out.versions) From fd80774d8637ce7bda39b475af0e18e5f80dcfa4 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:26:20 +0100 Subject: [PATCH 08/45] fix bug in channel i/o --- subworkflows/local/fusioninspector_workflow.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index 84b4d73f..3eb477d0 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -42,7 +42,7 @@ workflow FUSIONINSPECTOR_WORKFLOW { ch_versions = ch_versions.mix(FUSIONINSPECTOR.out.versions) fusion_data = FUSIONINSPECTOR.out.tsv.join(FUSIONINSPECTOR.out.out_gtf).join(fusionreport_out) - VCF_COLLECT(fusion_data, hgnc_ref, hgnc_date) + VCF_COLLECT(fusion_data, ch_hgnc_ref, ch_hgnc_date) ch_versions = ch_versions.mix(VCF_COLLECT.out.versions) if ((params.starfusion || params.all || params.stringtie) && !params.fusioninspector_only && !params.skip_vis) { From 3db4665030553175f3ada4c9205f7f8ffe75529f Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:33:14 +0100 Subject: [PATCH 09/45] add hgnc download to build_references workflow --- workflows/build_references.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/build_references.nf b/workflows/build_references.nf index 4921443a..a8130a40 100644 --- a/workflows/build_references.nf +++ b/workflows/build_references.nf @@ -35,6 +35,7 @@ workflow BUILD_REFERENCES { def fake_meta = [:] fake_meta.id = "Homo_sapiens.${params.genome}.${params.ensembl_version}" ENSEMBL_DOWNLOAD( params.ensembl_version, params.genome, fake_meta ) + HGNC_DOWNLOAD( ) SAMTOOLS_FAIDX(ENSEMBL_DOWNLOAD.out.fasta, [[],[]]) From 
71b15d5d205420fda437f96afd0a81c11aeca9b1 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:35:47 +0100 Subject: [PATCH 10/45] add hgnc download to build_references workflow --- workflows/build_references.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/build_references.nf b/workflows/build_references.nf index a8130a40..0ebf3c08 100644 --- a/workflows/build_references.nf +++ b/workflows/build_references.nf @@ -8,6 +8,7 @@ include { ARRIBA_DOWNLOAD } from '../modules/local/arriba/downlo include { ENSEMBL_DOWNLOAD } from '../modules/local/ensembl/main' include { FUSIONCATCHER_DOWNLOAD } from '../modules/local/fusioncatcher/download/main' include { FUSIONREPORT_DOWNLOAD } from '../modules/local/fusionreport/download/main' +include { HGNC_DOWNLOAD } from '../modules/local/hgnc/main' include { STARFUSION_BUILD } from '../modules/local/starfusion/build/main' include { STARFUSION_DOWNLOAD } from '../modules/local/starfusion/download/main' include { GTF_TO_REFFLAT } from '../modules/local/uscs/custom_gtftogenepred/main' From 52bf0040b2c2bf378d73f04b09f53fbd47589027 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 12:03:09 +0100 Subject: [PATCH 11/45] add meta info for hgnc --- modules/local/vcf_collect/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index 208286c4..0f3d0257 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -7,8 +7,8 @@ process VCF_COLLECT { input: tuple val(meta), path(tsv), path(out_gtf), path(report) - path hgnc_ref - path hgnc_date + tuple val(meta2), path(hgnc_ref) + tuple val(meta3), path(hgnc_date) output: path "versions.yml" , emit: versions From b07cee34be10de3509d9d47738287372485b6bfd Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Tue, 31 Oct 2023 21:11:37 +0100 Subject: [PATCH 12/45] update gtfparse containers --- modules/local/vcf_collect/main.nf | 4 ++-- nextflow.config | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index 0f3d0257..b28bd08e 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,8 +2,8 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "bioconda:: gtfparse =2.0.1" - container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_0" + conda "bioconda::gtfparse=2.0.1" + container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" input: tuple val(meta), path(tsv), path(out_gtf), path(report) diff --git a/nextflow.config b/nextflow.config index d609a3ef..993f22ae 100644 --- a/nextflow.config +++ b/nextflow.config @@ -24,11 +24,10 @@ params { multiqc_methods_description = null // Genome - genome = 'GRCh38' + genomes = 'GRCh38' genomes_base = "${params.outdir}/references" ensembl_version = 102 read_length = 100 - genomes = [:] starfusion_build = true // Filtering From bdb1a6ea68298092a9266f89632415294a763e49 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Tue, 31 Oct 2023 21:15:13 +0100 Subject: [PATCH 13/45] fix bug with param genome.s --- nextflow.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 993f22ae..d609a3ef 100644 --- a/nextflow.config +++ b/nextflow.config @@ 
-24,10 +24,11 @@ params { multiqc_methods_description = null // Genome - genomes = 'GRCh38' + genome = 'GRCh38' genomes_base = "${params.outdir}/references" ensembl_version = 102 read_length = 100 + genomes = [:] starfusion_build = true // Filtering From aa746115e72b22387b9e48efc43780d1456d352c Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:24:53 +0100 Subject: [PATCH 14/45] add AGAT/CONVERTSPGFF2TSV to convert gtf to tsv --- bin/vcf_collect.py | 5 ++--- modules/local/vcf_collect/main.nf | 4 ++-- subworkflows/local/fusioninspector_workflow.nf | 6 +++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index bef72dc4..ec08e432 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -357,12 +357,11 @@ def build_hgnc_dataframe(file): def build_gtf_dataframe(file): """ - Build a DataFrame from GTF file, extracting relevant columns. + Build a DataFrame from GTF file converted in TSV, extracting relevant columns. """ - df = read_gtf(file) + df = pd.read_csv(file, sep="\t") df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True) df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(",", expand=True) - # return df return df[["Transcript_id", "transcript_version", "exon_number", "exon_id", "orig_start", "orig_end"]] diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index b28bd08e..df999204 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -6,7 +6,7 @@ process VCF_COLLECT { container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" input: - tuple val(meta), path(tsv), path(out_gtf), path(report) + tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) tuple val(meta2), path(hgnc_ref) tuple val(meta3), path(hgnc_date) @@ -20,7 +20,7 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $tsv --fusionreport $report --fusioninspector_gtf $out_gtf --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf + vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index 3eb477d0..640b25fd 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -1,3 +1,4 @@ +include { AGAT_CONVERTSPGFF2TSV } from '../../modules/nf-core/agat/visualisation/main' include { ARRIBA_VISUALISATION } from '../../modules/local/arriba/visualisation/main' include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { VCF_COLLECT } from '../../modules/local/vcf_collect/main' @@ -40,8 +41,11 @@ workflow FUSIONINSPECTOR_WORKFLOW { FUSIONINSPECTOR( ch_reads_fusion, index) ch_versions = ch_versions.mix(FUSIONINSPECTOR.out.versions) - fusion_data = FUSIONINSPECTOR.out.tsv.join(FUSIONINSPECTOR.out.out_gtf).join(fusionreport_out) + AGAT_CONVERTSPGFF2TSV(FUSIONINSPECTOR.out.out_gtf) + ch_versions = ch_versions.mix(AGAT_CONVERTSPGFF2TSV.out.versions) + + fusion_data = FUSIONINSPECTOR.out.tsv.join(AGAT_CONVERTSPGFF2TSV.out.tsv).join(fusionreport_out) VCF_COLLECT(fusion_data, ch_hgnc_ref, ch_hgnc_date) 
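The three-way join feeding VCF_COLLECT here behaves like a keyed intersection; a rough Python analogy with fabricated meta keys and file names (not pipeline code):

```python
# Rough analogy for FUSIONINSPECTOR.out.tsv.join(...).join(fusionreport_out):
# Nextflow channels emit (meta, file) tuples and .join matches on meta,
# dropping keys absent from either side. All values are fabricated.
tsv = {"sample1": "s1.fusions.tsv", "sample2": "s2.fusions.tsv"}
gtf_tsv = {"sample1": "s1.gtf.tsv"}
report = {"sample1": "s1_index.html", "sample2": "s2_index.html"}

fusion_data = {
    k: (tsv[k], gtf_tsv[k], report[k])
    for k in tsv.keys() & gtf_tsv.keys() & report.keys()
}
print(fusion_data)  # only sample1 survives the join
```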
ch_versions = ch_versions.mix(VCF_COLLECT.out.versions) From cc4ec30388dce42bdb7fd8ac1bbdbe1caf558f94 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 6 Nov 2023 10:20:04 +0100 Subject: [PATCH 15/45] install agat/convertspgff2tsv from nf-core --- modules.json | 5 ++ .../agat/convertspgff2tsv/environment.yml | 6 ++ modules/nf-core/agat/convertspgff2tsv/main.nf | 35 +++++++++ .../nf-core/agat/convertspgff2tsv/meta.yml | 38 ++++++++++ modules/nf-core/picard/markduplicates/main.nf | 65 +++++++++++++++++ .../nf-core/picard/markduplicates/meta.yml | 71 +++++++++++++++++++ .../execution_trace_2023-11-01_15-05-02.txt | 1 + 7 files changed, 221 insertions(+) create mode 100644 modules/nf-core/agat/convertspgff2tsv/environment.yml create mode 100644 modules/nf-core/agat/convertspgff2tsv/main.nf create mode 100644 modules/nf-core/agat/convertspgff2tsv/meta.yml create mode 100644 modules/nf-core/picard/markduplicates/main.nf create mode 100644 modules/nf-core/picard/markduplicates/meta.yml create mode 100644 null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt diff --git a/modules.json b/modules.json index 9805e2ef..beb0e954 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "agat/convertspgff2tsv": { + "branch": "master", + "git_sha": "53e6fd5d80141e00a3b70762f4361f6af1f4303b", + "installed_by": ["modules"] + }, "arriba": { "branch": "master", "git_sha": "ea9e2892a9d12e8769402f12096219942bcf6536", diff --git a/modules/nf-core/agat/convertspgff2tsv/environment.yml b/modules/nf-core/agat/convertspgff2tsv/environment.yml new file mode 100644 index 00000000..9ca0ea28 --- /dev/null +++ b/modules/nf-core/agat/convertspgff2tsv/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::agat=1.2.0 diff --git a/modules/nf-core/agat/convertspgff2tsv/main.nf b/modules/nf-core/agat/convertspgff2tsv/main.nf new file mode 100644 index 00000000..cef48360 --- /dev/null +++ b/modules/nf-core/agat/convertspgff2tsv/main.nf @@ -0,0 +1,35 @@ +process AGAT_CONVERTSPGFF2TSV { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/agat:1.2.0--pl5321hdfd78af_0' : + 'biocontainers/agat:1.2.0--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(gff) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + agat_convert_sp_gff2tsv.pl \\ + --gff $gff \\ + --output ${prefix}.tsv \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_convert_sp_gff2tsv.pl --help | sed '3!d; s/.*v//' | sed 's/ .*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/agat/convertspgff2tsv/meta.yml b/modules/nf-core/agat/convertspgff2tsv/meta.yml new file mode 100644 index 00000000..f5865dfe --- /dev/null +++ b/modules/nf-core/agat/convertspgff2tsv/meta.yml @@ -0,0 +1,38 @@ +name: agat_convertspgff2tsv +description: | + Converts a GFF/GTF file into a TSV file +keywords: + - genome + - gff + - gtf + - conversion + - tsv +tools: + - agat: + description: "AGAT is a toolkit for manipulation and getting information from GFF/GTF files" + homepage: "https://github.com/NBISweden/AGAT" + documentation: "https://agat.readthedocs.io/" + tool_dev_url: "https://github.com/NBISweden/AGAT" + doi: "10.5281/zenodo.3552717" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gff: + type: file + description: Annotation file in GFF3/GTF format + pattern: "*.{gff, gtf}" +output: + - tsv: + type: file + description: Annotation file in TSV format + pattern: "*.{gtf}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@rannick" diff --git a/modules/nf-core/picard/markduplicates/main.nf b/modules/nf-core/picard/markduplicates/main.nf new file mode 100644 index 00000000..ebfa0864 --- /dev/null +++ b/modules/nf-core/picard/markduplicates/main.nf @@ -0,0 +1,65 @@ +process PICARD_MARKDUPLICATES { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::picard=3.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : + 'biocontainers/picard:3.0.0--hdfd78af_1' }" + + input: + tuple val(meta), path(bam) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.metrics.txt"), emit: metrics + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def avail_mem = 3072 + if (!task.memory) { + log.info '[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ + """ + picard \\ + -Xmx${avail_mem}M \\ + MarkDuplicates \\ + $args \\ + --INPUT $bam \\ + --OUTPUT ${prefix}.bam \\ + --REFERENCE_SEQUENCE $fasta \\ + --METRICS_FILE ${prefix}.MarkDuplicates.metrics.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch ${prefix}.bam + touch ${prefix}.bam.bai + touch ${prefix}.MarkDuplicates.metrics.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ +} diff --git a/modules/nf-core/picard/markduplicates/meta.yml b/modules/nf-core/picard/markduplicates/meta.yml new file mode 100644 index 00000000..f7693d2f --- /dev/null +++ b/modules/nf-core/picard/markduplicates/meta.yml @@ -0,0 +1,71 @@ +name: picard_markduplicates +description: Locate and tag duplicate reads in a BAM file +keywords: + - markduplicates + - pcr + - duplicates + - bam + - sam + - cram +tools: + - picard: + description: | + A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) + data and formats such as SAM/BAM/CRAM and VCF. + homepage: https://broadinstitute.github.io/picard/ + documentation: https://broadinstitute.github.io/picard/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Reference genome fasta index + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file with duplicate reads marked/removed + pattern: "*.{bam}" + - bai: + type: file + description: An optional BAM index file. 
If desired, --CREATE_INDEX must be passed as a flag + pattern: "*.{bai}" + - metrics: + type: file + description: Duplicate metrics file generated by picard + pattern: "*.{metrics.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@projectoriented" + - "@ramprasadn" diff --git a/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt b/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt new file mode 100644 index 00000000..6b739acd --- /dev/null +++ b/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt @@ -0,0 +1 @@ +task_id hash native_id name status exit submit duration realtime %cpu peak_rss peak_vmem rchar wchar From 0c3ffaed91d6129217bdd6be8c9fddb002b70e21 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 6 Nov 2023 10:27:33 +0100 Subject: [PATCH 16/45] correct typo --- subworkflows/local/fusioninspector_workflow.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index 640b25fd..e616ee5b 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -1,4 +1,4 @@ -include { AGAT_CONVERTSPGFF2TSV } from '../../modules/nf-core/agat/visualisation/main' +include { AGAT_CONVERTSPGFF2TSV } from '../../modules/nf-core/agat/convertspgff2tsv/main' include { ARRIBA_VISUALISATION } from '../../modules/local/arriba/visualisation/main' include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { VCF_COLLECT } from '../../modules/local/vcf_collect/main' From ba55fa3fdffda0b8c280e3807702bf4889b239eb Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 6 Nov 2023 12:08:31 +0100 Subject: [PATCH 17/45] vcf_collect takes tsv_coding_effect channel instead of tsv --- modules/local/fusioninspector/main.nf | 17 +++++++++++------ subworkflows/local/fusioninspector_workflow.nf | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/modules/local/fusioninspector/main.nf b/modules/local/fusioninspector/main.nf index 1b1a7169..11013837 100644 --- a/modules/local/fusioninspector/main.nf +++ b/modules/local/fusioninspector/main.nf @@ -10,10 +10,11 @@ process FUSIONINSPECTOR { path reference output: - tuple val(meta), path("*FusionInspector.fusions.tsv") , emit: tsv - tuple val(meta), path("*.gtf") , emit: out_gtf - path "*" , emit: output - path "versions.yml" , emit: versions + tuple val(meta), path("*FusionInspector.fusions.tsv") , emit: tsv + tuple val(meta), path("*FusionInspector.fusions.tsv.annotated.coding_effect") , emit: tsv_coding_effect + tuple val(meta), path("*.gtf") , emit: out_gtf + path "*" , emit: output + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -40,9 +41,13 @@ process FUSIONINSPECTOR { """ stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ - touch FusionInspector.log - touch FusionInspector.fusions.tsv + touch ${prefix}.FusionInspector.log + touch ${prefix}.FusionInspector.fusions.tsv + touch ${prefix}.FusionInspector.fusions.tsv.annotated.coding_effect + touch ${prefix}.gtf cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index e616ee5b..32173e81 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ 
b/subworkflows/local/fusioninspector_workflow.nf
@@ -45,7 +45,7 @@ workflow FUSIONINSPECTOR_WORKFLOW {
     AGAT_CONVERTSPGFF2TSV(FUSIONINSPECTOR.out.out_gtf)
     ch_versions = ch_versions.mix(AGAT_CONVERTSPGFF2TSV.out.versions)

-    fusion_data = FUSIONINSPECTOR.out.tsv.join(AGAT_CONVERTSPGFF2TSV.out.tsv).join(fusionreport_out)
+    fusion_data = FUSIONINSPECTOR.out.tsv_coding_effect.join(AGAT_CONVERTSPGFF2TSV.out.tsv).join(fusionreport_out)

     VCF_COLLECT(fusion_data, ch_hgnc_ref, ch_hgnc_date)
     ch_versions = ch_versions.mix(VCF_COLLECT.out.versions)

From 1b9c18115689e32039b56f40dbbd90731f572031 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 6 Nov 2023 12:36:47 +0100
Subject: [PATCH 18/45] remove empty entry

---
 bin/vcf_collect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index ec08e432..cd268ac1 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -286,7 +286,7 @@ def column_manipulation(df):
         # INFO
         df.loc[index, "INFO"] = (
             "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};POSA={};POSB={};ORIENTATION={},{};FOUND_DB={};"
-            "FOUND_IN={};;TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};"
+            "FOUND_IN={};TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};"
             "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={};"
             "EXON_NUMBER_B={};ANNOTATIONS={}".format(
                 row["ChromosomeA"],

From 775444e4fb7dc863b7f008a79632200e73b03498 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 6 Nov 2023 15:15:23 +0100
Subject: [PATCH 19/45] make optional gtf and coding effects

---
 modules/local/fusioninspector/main.nf | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/modules/local/fusioninspector/main.nf b/modules/local/fusioninspector/main.nf
index 11013837..c7fcd3f0 100644
--- a/modules/local/fusioninspector/main.nf
+++ b/modules/local/fusioninspector/main.nf
@@ -10,11 +10,11 @@ process FUSIONINSPECTOR {
     path reference

     output:
-    tuple val(meta), path("*FusionInspector.fusions.tsv") , emit: tsv
-    tuple val(meta), path("*FusionInspector.fusions.tsv.annotated.coding_effect") , emit: tsv_coding_effect
-    tuple val(meta), path("*.gtf") , emit: out_gtf
-    path "*" , emit: output
-    path "versions.yml" , emit: versions
+    tuple val(meta), path("*FusionInspector.fusions.tsv") , emit: tsv
+    tuple val(meta), path("*.coding_effect")              , optional:true, emit: tsv_coding_effect
+    tuple val(meta), path("*.gtf")                        , optional:true, emit: out_gtf
+    path "*"                                              , emit: output
+    path "versions.yml"                                   , emit: versions

     when:
     task.ext.when == null || task.ext.when
@@ -42,7 +42,6 @@ process FUSIONINSPECTOR {

     stub:
     def prefix = task.ext.prefix ?: "${meta.id}"
-
     """
     touch ${prefix}.FusionInspector.log
     touch ${prefix}.FusionInspector.fusions.tsv

From 4a38147d79fa7a59032c38c689b54034ad15e106 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 6 Nov 2023 16:36:03 +0100
Subject: [PATCH 20/45] remove HGNC: from HGNC id column

---
 bin/vcf_collect.py                            |  1 +
 modules.json                                  |  5 --
 modules/nf-core/picard/markduplicates/main.nf | 65 -----------------
 .../nf-core/picard/markduplicates/meta.yml    | 71 -------------------
 .../execution_trace_2023-11-01_15-05-02.txt   |  1 -
 5 files changed, 1 insertion(+), 142 deletions(-)
 delete mode 100644 modules/nf-core/picard/markduplicates/main.nf
 delete mode 100644 modules/nf-core/picard/markduplicates/meta.yml
 delete
mode 100644 null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index cd268ac1..39da16ba 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -352,6 +352,7 @@ def build_hgnc_dataframe(file): Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. """ df = pd.read_csv(file, sep="\t", low_memory=False) + df['hgnc_id'] = df['hgnc_id'].str.replace("HGNC:","") return df[["hgnc_id", "ensembl_gene_id"]].dropna() diff --git a/modules.json b/modules.json index beb0e954..a6ffdfd5 100644 --- a/modules.json +++ b/modules.json @@ -70,11 +70,6 @@ "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", "installed_by": ["modules"] }, - "picard/markduplicates": { - "branch": "master", - "git_sha": "2ee934606f1fdf7fc1cb05d6e8abc13bec8ab448", - "installed_by": ["modules"] - }, "samtools/faidx": { "branch": "master", "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", diff --git a/modules/nf-core/picard/markduplicates/main.nf b/modules/nf-core/picard/markduplicates/main.nf deleted file mode 100644 index ebfa0864..00000000 --- a/modules/nf-core/picard/markduplicates/main.nf +++ /dev/null @@ -1,65 +0,0 @@ -process PICARD_MARKDUPLICATES { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::picard=3.0.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : - 'biocontainers/picard:3.0.0--hdfd78af_1' }" - - input: - tuple val(meta), path(bam) - tuple val(meta2), path(fasta) - tuple val(meta3), path(fai) - - output: - tuple val(meta), path("*.bam") , emit: bam - tuple val(meta), path("*.bai") , optional:true, emit: bai - tuple val(meta), path("*.metrics.txt"), emit: metrics - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def avail_mem = 3072 - if (!task.memory) { - log.info '[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' - } else { - avail_mem = (task.memory.mega*0.8).intValue() - } - - if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - - """ - picard \\ - -Xmx${avail_mem}M \\ - MarkDuplicates \\ - $args \\ - --INPUT $bam \\ - --OUTPUT ${prefix}.bam \\ - --REFERENCE_SEQUENCE $fasta \\ - --METRICS_FILE ${prefix}.MarkDuplicates.metrics.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
- """ - touch ${prefix}.bam - touch ${prefix}.bam.bai - touch ${prefix}.MarkDuplicates.metrics.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) - END_VERSIONS - """ -} diff --git a/modules/nf-core/picard/markduplicates/meta.yml b/modules/nf-core/picard/markduplicates/meta.yml deleted file mode 100644 index f7693d2f..00000000 --- a/modules/nf-core/picard/markduplicates/meta.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: picard_markduplicates -description: Locate and tag duplicate reads in a BAM file -keywords: - - markduplicates - - pcr - - duplicates - - bam - - sam - - cram -tools: - - picard: - description: | - A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) - data and formats such as SAM/BAM/CRAM and VCF. - homepage: https://broadinstitute.github.io/picard/ - documentation: https://broadinstitute.github.io/picard/ - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM file - pattern: "*.{bam,cram,sam}" - - meta2: - type: map - description: | - Groovy Map containing reference information - e.g. [ id:'genome' ] - - fasta: - type: file - description: Reference genome fasta file - pattern: "*.{fasta,fa}" - - meta3: - type: map - description: | - Groovy Map containing reference information - e.g. [ id:'genome' ] - - fai: - type: file - description: Reference genome fasta index - pattern: "*.{fai}" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM file with duplicate reads marked/removed - pattern: "*.{bam}" - - bai: - type: file - description: An optional BAM index file. 
If desired, --CREATE_INDEX must be passed as a flag - pattern: "*.{bai}" - - metrics: - type: file - description: Duplicate metrics file generated by picard - pattern: "*.{metrics.txt}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@drpatelh" - - "@projectoriented" - - "@ramprasadn" diff --git a/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt b/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt deleted file mode 100644 index 6b739acd..00000000 --- a/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt +++ /dev/null @@ -1 +0,0 @@ -task_id hash native_id name status exit submit duration realtime %cpu peak_rss peak_vmem rchar wchar From 6f2a4ea24d3c6f1004e74fab5c93d898c5ebb1c5 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 6 Nov 2023 17:06:07 +0100 Subject: [PATCH 21/45] fix hgnc date timestamping --- modules/local/hgnc/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/hgnc/main.nf b/modules/local/hgnc/main.nf index 7211cb71..1b3808f6 100644 --- a/modules/local/hgnc/main.nf +++ b/modules/local/hgnc/main.nf @@ -19,7 +19,7 @@ process HGNC_DOWNLOAD { script: """ wget https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt - date+%Y-%m-%d/%H:%M: > HGNC-DB-timestamp.txt + date +%Y-%m-%d/%H:%M > HGNC-DB-timestamp.txt cat <<-END_VERSIONS > versions.yml "${task.process}": From f4c091214c983584bb933ec039a177455ab787c6 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 6 Nov 2023 21:43:14 +0100 Subject: [PATCH 22/45] black --- bin/vcf_collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 39da16ba..4b3ec199 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -352,7 +352,7 @@ def build_hgnc_dataframe(file): Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. 
""" df = pd.read_csv(file, sep="\t", low_memory=False) - df['hgnc_id'] = df['hgnc_id'].str.replace("HGNC:","") + df["hgnc_id"] = df["hgnc_id"].str.replace("HGNC:", "") return df[["hgnc_id", "ensembl_gene_id"]].dropna() From 4337f4ae1653361393190618077291b14d08ca76 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Tue, 7 Nov 2023 21:49:47 +0100 Subject: [PATCH 23/45] add support for case fusioninspector filters out a fusion from fusionreport --- bin/vcf_collect.py | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 4b3ec199..408abfe9 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -11,7 +11,7 @@ logger = logging.getLogger() -def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample, out): +def vcf_collect(fusioninspector_in_file, fusionreport_in_file, sample, hgnc, gtf, out): """ Process FusionInspector and FusionReport data, merge with GTF from FusionInspector and HGNC database, @@ -33,14 +33,17 @@ def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample .reset_index() ) - df = build_hgnc_dataframe(hgnc).merge( + df = build_hgcn_dataframe(hgnc).merge( merged_df, how="right", left_on="ensembl_gene_id", right_on="Left_ensembl_gene_id" ) df = df.rename(columns={"hgnc_id": "Left_hgnc_id"}) - df = build_hgnc_dataframe(hgnc).merge(df, how="right", left_on="ensembl_gene_id", right_on="Right_ensembl_gene_id") + df = build_hgcn_dataframe(hgnc).merge(df, how="right", left_on="ensembl_gene_id", right_on="Right_ensembl_gene_id") df = df.rename(columns={"hgnc_id": "Right_hgnc_id"}) gtf_df = build_gtf_dataframe(gtf) all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id") + all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].fillna(0) + all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].astype(int) + all_df = all_df[(all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"])] all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Left_exon_number"}) @@ -73,7 +76,10 @@ def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample ] ].drop_duplicates() all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id") + all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(0) + all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype(int) all_df = all_df[(all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])] + all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Right_exon_number"}) all_df = all_df[ @@ -195,7 +201,6 @@ def build_fusioninspector_dataframe(file): df = df.rename(columns={"#FusionName": "FUSION"}) df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(":", expand=True) df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True) - df[["GeneA", "GeneB"]] = df["FUSION"].str.split("--", expand=True) df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", expand=True) df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True) return df.set_index(["FUSION"]) @@ -234,11 
+239,11 @@ def read_build_fusionreport(fusionreport_file): from_html = [line.split('rows": [')[1] for line in f if 'name="fusion_list' in line] expression = from_html[0].split('], "tool')[0] fusion_report = pd.DataFrame.from_dict(ast.literal_eval(expression)) - if "arriba" not in fusion_report.columns: + if not "arriba" in fusion_report.columns: fusion_report["arriba"] = "" - if "fusioncatcher" not in fusion_report.columns: + if not "fusioncatcher" in fusion_report.columns: fusion_report["fusioncatcher"] = "" - if "starfusion" not in fusion_report.columns: + if not "starfusion" in fusion_report.columns: fusion_report["starfusion"] = "" fusion_report["arriba"] = fusion_report[["arriba"]].apply( replace_value_with_column_name, args=("true", "arriba"), axis=1 @@ -254,7 +259,11 @@ def read_build_fusionreport(fusionreport_file): ) fusion_report.columns = fusion_report.columns.str.upper() fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ", ".join(x)) - return fusion_report[["FUSION", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(["FUSION"]) + fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True) + + return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index( + ["FUSION"] + ) def column_manipulation(df): @@ -270,10 +279,12 @@ def column_manipulation(df): df["REF"] = "N" df["INFO"] = "" df["Sample"] = "" + df["Strand1"] = df["Strand1"].astype(str) for index, row in df.iterrows(): - # ALT - if not row["Strand1"] in ["+", "-"] or not row["Strand2"] in ["+", "-"]: + if row["Strand1"] == "nan": + df.loc[index, "ALT"] = "nan" + elif not row["Strand1"] in ["+", "-"] or not row["Strand2"] in ["+", "-"]: df.loc[index, "ALT"] = "N[{}:{}[".format(df["ChromosomeB"], row["PosB"]) elif row["Strand1"] == "-" and row["Strand2"] == "-": df.loc[index, "ALT"] = "[{}:{}[N".format(row["ChromosomeB"], row["PosB"]) @@ -283,12 +294,12 @@ def column_manipulation(df): df.loc[index, "ALT"] = "N]{}:{}]".format(row["ChromosomeB"], row["PosB"]) else: df.loc[index, "ALT"] = "N[{}:{}[".format(row["ChromosomeB"], row["PosB"]) - # INFO + df.loc[index, "INFO"] = ( "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};POSA={};POSB={};ORIENTATION={},{};FOUND_DB={};" - "FOUND_IN={};TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" - "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={};" - "EXON_NUMBER_B={};ANNOTATIONS={}".format( + "FOUND_IN={};;TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" + "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={},EXON_NUMBER_B={};" + "ANNOTATIONS={}".format( row["ChromosomeA"], row["ChromosomeB"], row["GeneA"], @@ -347,7 +358,7 @@ def write_vcf(df_to_print, header, out_file): f.write(header.rstrip("\r\n") + "\n" + content) -def build_hgnc_dataframe(file): +def build_hgcn_dataframe(file): """ Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. 
""" From be99aef3a95573574dc9a9bac357dc20fac66f5d Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 8 Nov 2023 13:14:40 +0100 Subject: [PATCH 24/45] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c9dcad59..81535f94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `--extreme_sensitivity` used for fusioninspector to minimize fusioninspector filtering [#424](https://github.com/nf-core/rnafusion/pull/424) - `--extreme_sensitivity` removed in favor of `--max_sensitivity --max_mate_dist 10000000 --annotate --examine_coding_effect` to collect more data from fusioninspector [#426](https://github.com/nf-core/rnafusion/pull/426) - `Arriba` updated to 2.4.0 [#429](https://github.com/nf-core/rnafusion/pull/429) +- Change megafusion into vcf_collect, taking into account e.g. the annotation and coding effects outputs from fusioninspector, HGNC ids, frame status... [#414](https://github.com/nf-core/rnafusion/pull/414) ### Fixed From 1c109ffcdfb765f49df852636e9bde4b331246e9 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Fri, 10 Nov 2023 10:34:00 +0100 Subject: [PATCH 25/45] Update bin/vcf_collect.py Co-authored-by: Eva C <29628428+fevac@users.noreply.github.com> --- bin/vcf_collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 408abfe9..69d39f34 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -193,7 +193,7 @@ def header_def(sample): ) -def build_fusioninspector_dataframe(file): +def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: """ Read FusionInspector output from a CSV file, preprocess the data, and set 'FUSION' as the index. """ From ff51b1cee25120fc583c9b20ebba37620ac91359 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Fri, 10 Nov 2023 10:36:44 +0100 Subject: [PATCH 26/45] Apply suggestions from code review Co-authored-by: Eva C <29628428+fevac@users.noreply.github.com> --- bin/vcf_collect.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 69d39f34..7d43a98a 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -11,7 +11,7 @@ logger = logging.getLogger() -def vcf_collect(fusioninspector_in_file, fusionreport_in_file, sample, hgnc, gtf, out): +def vcf_collect(fusioninspector_in_file: str, fusionreport_in_file: str, sample: str, hgnc: str, gtf: str, out_file) -> None: """ Process FusionInspector and FusionReport data, merge with GTF from FusionInspector and HGNC database, @@ -206,7 +206,7 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: return df.set_index(["FUSION"]) -def replace_value_with_column_name(row, value_to_replace, column_name): +def replace_value_with_column_name(row: pd.Series, value_to_replace: str, column_name: str) -> str: """ Replace a specific value in a row with the corresponding column name. """ @@ -220,7 +220,7 @@ def replace_value_with_column_name(row, value_to_replace, column_name): return new_values -def concatenate_columns(row): +def concatenate_columns(row: pd.Series) -> str: """ Concatenate non-empty values in a row into a single string separated by commas. 
""" @@ -228,7 +228,7 @@ def concatenate_columns(row): return ",".join(non_empty_values) -def read_build_fusionreport(fusionreport_file): +def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame: """ Read and preprocess fusion-report data from a file, including handling missing tool columns, getting the columns with each tool and create a new FOUND_IN column with all the tool hits. @@ -266,7 +266,7 @@ def read_build_fusionreport(fusionreport_file): ) -def column_manipulation(df): +def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: """ Manipulate and prepare DataFrame for VCF file creation. """ @@ -297,7 +297,7 @@ def column_manipulation(df): df.loc[index, "INFO"] = ( "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};POSA={};POSB={};ORIENTATION={},{};FOUND_DB={};" - "FOUND_IN={};;TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" + "FOUND_IN={};TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={},EXON_NUMBER_B={};" "ANNOTATIONS={}".format( row["ChromosomeA"], @@ -328,7 +328,7 @@ def column_manipulation(df): return df -def write_vcf(df_to_print, header, out_file): +def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: """ Write a VCF file with a specified DataFrame, header, and output file path. """ @@ -358,7 +358,7 @@ def write_vcf(df_to_print, header, out_file): f.write(header.rstrip("\r\n") + "\n" + content) -def build_hgcn_dataframe(file): +def build_hgcn_dataframe(file: str) -> pd.DataFrame: """ Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. """ @@ -367,7 +367,7 @@ def build_hgcn_dataframe(file): return df[["hgnc_id", "ensembl_gene_id"]].dropna() -def build_gtf_dataframe(file): +def build_gtf_dataframe(file: str) -> pd.DataFrame: """ Build a DataFrame from GTF file converted in TSV, extracting relevant columns. 
""" From 92a3dadf2b80eb151746c5217b37d6d44328d953 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:04:16 +0100 Subject: [PATCH 27/45] use fstrings --- bin/vcf_collect.py | 47 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 7d43a98a..c5157512 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -285,46 +285,27 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: if row["Strand1"] == "nan": df.loc[index, "ALT"] = "nan" elif not row["Strand1"] in ["+", "-"] or not row["Strand2"] in ["+", "-"]: - df.loc[index, "ALT"] = "N[{}:{}[".format(df["ChromosomeB"], row["PosB"]) + df.loc[index, "ALT"] = f'N[{df["ChromosomeB"]}:{row["PosB"]}[' elif row["Strand1"] == "-" and row["Strand2"] == "-": - df.loc[index, "ALT"] = "[{}:{}[N".format(row["ChromosomeB"], row["PosB"]) + df.loc[index, "ALT"] = f'[{row["ChromosomeB"]}:{row["PosB"]}[N' elif row["Strand1"] == "+" and row["Strand2"] == "-": - df.loc[index, "ALT"] = "N]{}:{}]".format(row["ChromosomeB"], row["PosB"]) + df.loc[index, "ALT"] = f'N]{row["ChromosomeB"]}:{row["PosB"]}]' elif row["Strand1"] == "-" and row["Strand2"] == "+": - df.loc[index, "ALT"] = "N]{}:{}]".format(row["ChromosomeB"], row["PosB"]) + df.loc[index, "ALT"] = f'N]{row["ChromosomeB"]}:{row["PosB"]}]' else: - df.loc[index, "ALT"] = "N[{}:{}[".format(row["ChromosomeB"], row["PosB"]) + df.loc[index, "ALT"] = f'N[{row["ChromosomeB"]}:{row["PosB"]}[' df.loc[index, "INFO"] = ( - "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};POSA={};POSB={};ORIENTATION={},{};FOUND_DB={};" - "FOUND_IN={};TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" - "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={},EXON_NUMBER_B={};" - "ANNOTATIONS={}".format( - row["ChromosomeA"], - row["ChromosomeB"], - row["GeneA"], - row["GeneB"], - row["PosA"], - row["PosB"], - row["Strand1"], - row["Strand2"], - row["FOUND_DB"], - row["FOUND_IN"], - row["TOOLS_HITS"], - row["SCORE"], - row["PROT_FUSION_TYPE"], - row["CDS_LEFT_ID"], - row["CDS_RIGHT_ID"], - row["Left_transcript_version"], - row["Right_transcript_version"], - row["Left_hgnc_id"], - row["Right_hgnc_id"], - row["Left_exon_number"], - row["Right_exon_number"], - row["annots"], - ) + f"SVTYPE=BND;CHRA={row['ChromosomeA']};CHRB={row['ChromosomeB']};GENEA={row['GeneA']};GENEB={row['GeneB']};" + f"POSA={row['PosA']};POSB={row['PosB']};ORIENTATION={row['Strand1']},{row['Strand2']};FOUND_DB={row['FOUND_DB']};" + f"FOUND_IN={row['FOUND_IN']};TOOL_HITS={row['TOOLS_HITS']};SCORE={row['SCORE']};FRAME_STATUS={row['PROT_FUSION_TYPE']};" + f"TRANSCRIPT_ID_A={row['CDS_LEFT_ID']};TRANSCRIPT_ID_B={row['CDS_RIGHT_ID']};" + f"TRANSCRIPT_VERSION_A={row['Left_transcript_version']};TRANSCRIPT_VERSION_B={row['Right_transcript_version']};" + f"HGNC_ID_A={row['Left_hgnc_id']};HGNC_ID_B={row['Right_hgnc_id']};" + f"EXON_NUMBER_A={row['Left_exon_number']},EXON_NUMBER_B={row['Right_exon_number']};" + f"ANNOTATIONS={row['annots']}" ) - df.loc[index, "Sample"] = "./1:{}:{}:{}".format(row["JunctionReadCount"], row["SpanningFragCount"], row["FFPM"]) + df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}" return df From 7a700f0ff6b00a82e057d32a81ea06f717a7a437 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:05:30 +0100 Subject: 
[PATCH 28/45] add info on GTF output file from fusioninspector --- bin/vcf_collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index c5157512..3761a11c 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -22,7 +22,7 @@ def vcf_collect(fusioninspector_in_file: str, fusionreport_in_file: str, sample: fusionreport_in_file (str): Path to FusionReport input file. sample (str): Sample name for the header. hgnc (str): Path to HGNC file. - gtf (str): Path to GTF file. + gtf (str): Path to output GTF file from FusionInspector. out (str): Output VCF file path. Adapted from: https://github.com/J35P312/MegaFusion From 22e4502c76aa635aa142006fd9c926e7ae373d6f Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:06:59 +0100 Subject: [PATCH 29/45] black --- bin/vcf_collect.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 3761a11c..14855fd6 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -11,7 +11,9 @@ logger = logging.getLogger() -def vcf_collect(fusioninspector_in_file: str, fusionreport_in_file: str, sample: str, hgnc: str, gtf: str, out_file) -> None: +def vcf_collect( + fusioninspector_in_file: str, fusionreport_in_file: str, sample: str, hgnc: str, gtf: str, out_file +) -> None: """ Process FusionInspector and FusionReport data, merge with GTF from FusionInspector and HGNC database, From e59823c3f7b054e262df0d5bfdd4f693017aba1d Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:18:55 +0100 Subject: [PATCH 30/45] fix header typer --- bin/vcf_collect.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 14855fd6..7156a806 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -6,7 +6,8 @@ from pathlib import Path import pandas as pd import ast -from gtfparse import read_gtf +import numpy as np +import csv logger = logging.getLogger() @@ -176,16 +177,16 @@ def header_def(sample): ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ -##INFO=\n\ -##INFO=\n\ -##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ -##INFO=\n\ +##INFO=\n\ ##FORMAT=\n\ ##FORMAT=\n\ ##FORMAT=\n\ @@ -194,7 +195,6 @@ def header_def(sample): sample ) - def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: """ Read FusionInspector output from a CSV file, preprocess the data, and set 'FUSION' as the index. 
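Aside on the ALT-allele branching that PATCH 23 extended and PATCH 27 rewrote with f-strings: it follows the VCF breakend (BND) notation, in which the bracket direction encodes how the two breakpoint partners are joined. A minimal standalone sketch of that logic, mirroring the committed branches (the function name and the literal inputs are illustrative, not part of the script):

def bnd_alt(strand_a: str, strand_b: str, chrom_b: str, pos_b: int) -> str:
    # Mirrors the ALT construction in column_manipulation(): the bracket
    # direction follows the strand pair of the two fusion partners.
    if strand_a == "nan":
        # FusionInspector filtered this fusion out, so no strand is known.
        return "nan"
    if strand_a not in ["+", "-"] or strand_b not in ["+", "-"]:
        return f"N[{chrom_b}:{pos_b}["
    if strand_a == "-" and strand_b == "-":
        return f"[{chrom_b}:{pos_b}[N"
    if strand_a != strand_b:
        return f"N]{chrom_b}:{pos_b}]"  # covers both "+"/"-" and "-"/"+"
    return f"N[{chrom_b}:{pos_b}["  # "+"/"+"

print(bnd_alt("+", "+", "chr12", 12006495))  # N[chr12:12006495[

One detail worth noting when reading the committed version: the fallback branch for unrecognised strands interpolates df["ChromosomeB"] rather than row["ChromosomeB"], whereas the sketch uses the per-row value throughout.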
From afda8fda8c5cb7674b6f437e384ab1537552237a Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:19:02 +0100 Subject: [PATCH 31/45] mend --- conf/modules.config | 1 - modules/local/vcf_collect/main.nf | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f49dd02c..39135aa3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -344,6 +344,5 @@ process { withName: VCF_COLLECT { ext.when = {!params.fusioninspector_only} - ext.prefix = { "${meta.id}_fusion_data" } } } diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index df999204..c634a6a3 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,8 +2,10 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "bioconda::gtfparse=2.0.1" - container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) @@ -20,7 +22,7 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf + vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": From f14bd90558e09d6dcfaf6190199be5b640247789 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:22:08 +0100 Subject: [PATCH 32/45] Revert "mend" This reverts commit afda8fda8c5cb7674b6f437e384ab1537552237a. --- conf/modules.config | 1 + modules/local/vcf_collect/main.nf | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 39135aa3..f49dd02c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -344,5 +344,6 @@ process { withName: VCF_COLLECT { ext.when = {!params.fusioninspector_only} + ext.prefix = { "${meta.id}_fusion_data" } } } diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index c634a6a3..df999204 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,10 +2,8 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + conda "bioconda::gtfparse=2.0.1" + container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" input: tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) @@ -22,7 +20,7 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf + vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": From 55d95d1f8c81e78bdabf7e2dfded65b95bd69b67 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:24:02 +0100 Subject: [PATCH 33/45] revert to plain pandas container --- modules/local/vcf_collect/main.nf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index df999204..c634a6a3 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,8 +2,10 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "bioconda::gtfparse=2.0.1" - container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) @@ -20,7 +22,7 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf + vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": From 6bc7351dd1bf9e43e5c9c69312594d245841c0dd Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:24:52 +0100 Subject: [PATCH 34/45] sample should just be meta.id, add fusion_data in module for output name --- conf/modules.config | 1 - 1 file changed, 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index f49dd02c..39135aa3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -344,6 +344,5 @@ process { withName: VCF_COLLECT { ext.when = {!params.fusioninspector_only} - ext.prefix = { "${meta.id}_fusion_data" } } } From d559232b5da7a028c15b53a80e462ef352ee6dab Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:53:27 +0100 Subject: [PATCH 35/45] remove quotes and flatten the pseudo list in annots --- bin/vcf_collect.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 7156a806..d52031ef 100755 --- a/bin/vcf_collect.py +++ 
b/bin/vcf_collect.py @@ -159,7 +159,7 @@ def parse_args(argv=None): return parser.parse_args(argv) -def header_def(sample): +def header_def(sample: str) -> str: """ Define the header of the VCF file """ @@ -195,6 +195,14 @@ def header_def(sample): sample ) + +def convert_to_list(annots_str: str) -> list: + try: + return ast.literal_eval(annots_str) + except (SyntaxError, ValueError): + return np.nan + + def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: """ Read FusionInspector output from a CSV file, preprocess the data, and set 'FUSION' as the index. @@ -205,6 +213,11 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True) df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", expand=True) df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True) + df["annots"] = ( + df["annots"] + .apply(convert_to_list) + .apply(lambda x: ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else "") + ) return df.set_index(["FUSION"]) @@ -328,12 +341,7 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: "FORMAT", "Sample", ] - ].to_csv( - path_or_buf=out_file, - sep="\t", - header=None, - index=False, - ) + ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE) with open(out_file, "r+") as f: content = f.read() From d0a4bf2202945fcef640d335d0dddc91a00e7408 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:53:50 +0100 Subject: [PATCH 36/45] back to original container --- modules/local/vcf_collect/main.nf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index c634a6a3..5028b2c5 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,10 +2,8 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + conda "bioconda::gtfparse=2.0.1" + container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" input: tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) From b46894252fc26553a02a1bad13162a6b6113e595 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 20:12:36 +0100 Subject: [PATCH 37/45] revert to pandas --- modules/local/vcf_collect/main.nf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index 5028b2c5..c634a6a3 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,8 +2,10 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "bioconda::gtfparse=2.0.1" - container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) From 960db4cf090ced0022d7956dcfcf70ec3f7ded1c Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Tue, 14 Nov 2023 22:52:38 +0100 Subject: [PATCH 38/45] apply fixes --- bin/vcf_collect.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index d52031ef..8e15ccfc 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -11,9 +11,11 @@ logger = logging.getLogger() +# vcf_collect(args.fusioninspector, args.fusionreport, args.fusioninspector_gtf, args.hgnc, args.sample, args.out) + def vcf_collect( - fusioninspector_in_file: str, fusionreport_in_file: str, sample: str, hgnc: str, gtf: str, out_file + fusioninspector_in_file: str, fusionreport_in_file: str, gtf: str, hgnc: str, sample: str, out_file ) -> None: """ Process FusionInspector and FusionReport data, @@ -25,7 +27,7 @@ def vcf_collect( fusionreport_in_file (str): Path to FusionReport input file. sample (str): Sample name for the header. hgnc (str): Path to HGNC file. - gtf (str): Path to output GTF file from FusionInspector. + gtf (str): Path to output GTF file from FusionInspector in TSV format. out (str): Output VCF file path. Adapted from: https://github.com/J35P312/MegaFusion @@ -116,7 +118,7 @@ def vcf_collect( ] ].drop_duplicates() - return write_vcf(column_manipulation(all_df), header_def(sample), out) + return write_vcf(column_manipulation(all_df), header_def(sample), out_file) def parse_args(argv=None): @@ -371,7 +373,12 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame: def main(argv=None): """Coordinate argument parsing and program execution.""" args = parse_args(argv) - if not args.fusioninspector.is_file() or not args.fusionreport.is_file(): + if ( + not args.fusioninspector.is_file() + or not args.fusionreport.is_file() + or not args.fusioninspector_gtf + or not args.hgnc + ): logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!") sys.exit(2) vcf_collect(args.fusioninspector, args.fusionreport, args.fusioninspector_gtf, args.hgnc, args.sample, args.out) From 852c6ac8290aa3819f51ee0e4ebcd36b465caade Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 15 Nov 2023 09:33:18 +0100 Subject: [PATCH 39/45] compress vcf output --- bin/vcf_collect.py | 5 +++-- modules/local/vcf_collect/main.nf | 4 ++-- modules/local/vcf_collect/meta.yml | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 8e15ccfc..6a97fb50 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -8,6 +8,7 @@ import ast import numpy as np import csv +import gzip logger = logging.getLogger() @@ -343,9 +344,9 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: "FORMAT", "Sample", ] - ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE) + ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE, compression='gzip') - with open(out_file, "r+") as f: + with gzip.open(out_file, "r+") as f: content = f.read() f.seek(0, 0) f.write(header.rstrip("\r\n") + "\n" + content) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index c634a6a3..42f94c40 100644 --- 
a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -14,7 +14,7 @@ process VCF_COLLECT { output: path "versions.yml" , emit: versions - tuple val(meta), path("*vcf") , emit: vcf + tuple val(meta), path("*vcf.gz") , emit: vcf when: task.ext.when == null || task.ext.when @@ -22,7 +22,7 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf + vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/vcf_collect/meta.yml b/modules/local/vcf_collect/meta.yml index 40bdd6c0..de4667bb 100644 --- a/modules/local/vcf_collect/meta.yml +++ b/modules/local/vcf_collect/meta.yml @@ -32,8 +32,8 @@ output: pattern: "versions.yml" - vcf: type: file - description: File containing the summary of all fusions as vcf file - pattern: "*.tsv" + description: File containing the summary of all fusions as compressed vcf file + pattern: "*.vcf.gz" authors: - "@rannick" From 9643ed80b047441eda5735a6c61857a6e5b0a12c Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 15 Nov 2023 09:33:40 +0100 Subject: [PATCH 40/45] black --- bin/vcf_collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 6a97fb50..4d677de0 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -344,7 +344,7 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: "FORMAT", "Sample", ] - ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE, compression='gzip') + ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE, compression="gzip") with gzip.open(out_file, "r+") as f: content = f.read() From bb703dd103fa91ab4e2f28b12c673d21dbbe6232 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 15 Nov 2023 09:44:42 +0100 Subject: [PATCH 41/45] modules updatge --- modules.json | 40 +- .../agat/convertspgff2tsv/environment.yml | 1 + modules/nf-core/cat/cat/environment.yml | 7 + modules/nf-core/cat/cat/main.nf | 2 +- modules/nf-core/cat/cat/meta.yml | 7 +- modules/nf-core/cat/cat/tests/main.nf.test | 153 ++++++ .../nf-core/cat/cat/tests/main.nf.test.snap | 121 +++++ .../cat/tests/nextflow_unzipped_zipped.config | 6 + .../cat/tests/nextflow_zipped_unzipped.config | 8 + modules/nf-core/cat/cat/tests/tags.yml | 2 + modules/nf-core/cat/fastq/environment.yml | 7 + modules/nf-core/cat/fastq/main.nf | 2 +- modules/nf-core/cat/fastq/meta.yml | 4 +- modules/nf-core/cat/fastq/tests/main.nf.test | 143 ++++++ .../nf-core/cat/fastq/tests/main.nf.test.snap | 78 +++ modules/nf-core/cat/fastq/tests/tags.yml | 2 + .../dumpsoftwareversions/environment.yml | 7 + .../custom/dumpsoftwareversions/main.nf | 6 +- .../custom/dumpsoftwareversions/meta.yml | 7 +- .../dumpsoftwareversions/tests/main.nf.test | 38 ++ .../tests/main.nf.test.snap | 27 + .../dumpsoftwareversions/tests/tags.yml | 2 + modules/nf-core/fastp/environment.yml | 7 + modules/nf-core/fastp/main.nf | 2 +- modules/nf-core/fastp/meta.yml | 4 +- 
modules/nf-core/fastp/tests/main.nf.test | 485 ++++++++++++++++++ modules/nf-core/fastp/tests/main.nf.test.snap | 52 ++ modules/nf-core/fastp/tests/nextflow.config | 6 + modules/nf-core/fastp/tests/tags.yml | 2 + modules/nf-core/fastqc/environment.yml | 7 + modules/nf-core/fastqc/main.nf | 6 +- modules/nf-core/fastqc/meta.yml | 5 + modules/nf-core/fastqc/tests/main.nf.test | 23 +- .../nf-core/fastqc/tests/main.nf.test.snap | 10 + modules/nf-core/fastqc/tests/tags.yml | 2 + .../gatk4/bedtointervallist/environment.yml | 7 + .../nf-core/gatk4/bedtointervallist/main.nf | 2 +- .../nf-core/gatk4/bedtointervallist/meta.yml | 3 + .../createsequencedictionary/environment.yml | 7 + .../gatk4/createsequencedictionary/main.nf | 2 +- .../gatk4/createsequencedictionary/meta.yml | 4 +- .../gatk4/markduplicates/environment.yml | 8 + modules/nf-core/gatk4/markduplicates/main.nf | 2 +- modules/nf-core/gatk4/markduplicates/meta.yml | 12 +- modules/nf-core/multiqc/environment.yml | 7 + modules/nf-core/multiqc/main.nf | 6 +- modules/nf-core/multiqc/meta.yml | 11 +- .../collectinsertsizemetrics/environment.yml | 7 + .../picard/collectinsertsizemetrics/main.nf | 6 +- .../picard/collectinsertsizemetrics/meta.yml | 5 +- .../picard/collectwgsmetrics/environment.yml | 8 + .../nf-core/picard/collectwgsmetrics/main.nf | 6 +- .../nf-core/picard/collectwgsmetrics/meta.yml | 5 + .../nf-core/samtools/faidx/environment.yml | 7 + modules/nf-core/samtools/faidx/main.nf | 2 +- modules/nf-core/samtools/faidx/meta.yml | 4 + .../nf-core/samtools/index/environment.yml | 7 + modules/nf-core/samtools/index/main.nf | 2 +- modules/nf-core/samtools/index/meta.yml | 4 + modules/nf-core/samtools/sort/environment.yml | 7 + modules/nf-core/samtools/sort/main.nf | 2 +- modules/nf-core/samtools/sort/meta.yml | 3 + .../nf-core/samtools/sort/tests/main.nf.test | 70 +++ .../samtools/sort/tests/main.nf.test.snap | 39 ++ .../samtools/sort/tests/nextflow.config | 7 + modules/nf-core/samtools/sort/tests/tags.yml | 3 + modules/nf-core/samtools/view/environment.yml | 7 + modules/nf-core/samtools/view/main.nf | 2 +- modules/nf-core/samtools/view/meta.yml | 5 + modules/nf-core/star/align/environment.yml | 9 + modules/nf-core/star/align/main.nf | 2 +- modules/nf-core/star/align/meta.yml | 6 +- .../star/genomegenerate/environment.yml | 9 + modules/nf-core/star/genomegenerate/main.nf | 2 +- modules/nf-core/star/genomegenerate/meta.yml | 5 +- .../nf-core/stringtie/merge/environment.yml | 7 + modules/nf-core/stringtie/merge/main.nf | 2 +- modules/nf-core/stringtie/merge/meta.yml | 3 +- .../stringtie/stringtie/environment.yml | 7 + modules/nf-core/stringtie/stringtie/main.nf | 2 +- modules/nf-core/stringtie/stringtie/meta.yml | 3 +- 81 files changed, 1531 insertions(+), 86 deletions(-) create mode 100644 modules/nf-core/cat/cat/environment.yml create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test.snap create mode 100644 modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config create mode 100644 modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config create mode 100644 modules/nf-core/cat/cat/tests/tags.yml create mode 100644 modules/nf-core/cat/fastq/environment.yml create mode 100644 modules/nf-core/cat/fastq/tests/main.nf.test create mode 100644 modules/nf-core/cat/fastq/tests/main.nf.test.snap create mode 100644 modules/nf-core/cat/fastq/tests/tags.yml create mode 100644 modules/nf-core/custom/dumpsoftwareversions/environment.yml create mode 100644 
modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test create mode 100644 modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap create mode 100644 modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml create mode 100644 modules/nf-core/fastp/environment.yml create mode 100644 modules/nf-core/fastp/tests/main.nf.test create mode 100644 modules/nf-core/fastp/tests/main.nf.test.snap create mode 100644 modules/nf-core/fastp/tests/nextflow.config create mode 100644 modules/nf-core/fastp/tests/tags.yml create mode 100644 modules/nf-core/fastqc/environment.yml create mode 100644 modules/nf-core/fastqc/tests/main.nf.test.snap create mode 100644 modules/nf-core/fastqc/tests/tags.yml create mode 100644 modules/nf-core/gatk4/bedtointervallist/environment.yml create mode 100644 modules/nf-core/gatk4/createsequencedictionary/environment.yml create mode 100644 modules/nf-core/gatk4/markduplicates/environment.yml create mode 100644 modules/nf-core/multiqc/environment.yml create mode 100644 modules/nf-core/picard/collectinsertsizemetrics/environment.yml create mode 100644 modules/nf-core/picard/collectwgsmetrics/environment.yml create mode 100644 modules/nf-core/samtools/faidx/environment.yml create mode 100644 modules/nf-core/samtools/index/environment.yml create mode 100644 modules/nf-core/samtools/sort/environment.yml create mode 100644 modules/nf-core/samtools/sort/tests/main.nf.test create mode 100644 modules/nf-core/samtools/sort/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/sort/tests/nextflow.config create mode 100644 modules/nf-core/samtools/sort/tests/tags.yml create mode 100644 modules/nf-core/samtools/view/environment.yml create mode 100644 modules/nf-core/star/align/environment.yml create mode 100644 modules/nf-core/star/genomegenerate/environment.yml create mode 100644 modules/nf-core/stringtie/merge/environment.yml create mode 100644 modules/nf-core/stringtie/stringtie/environment.yml diff --git a/modules.json b/modules.json index a6ffdfd5..724634dc 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "agat/convertspgff2tsv": { "branch": "master", - "git_sha": "53e6fd5d80141e00a3b70762f4361f6af1f4303b", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "arriba": { @@ -17,97 +17,97 @@ }, "cat/cat": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "cat/fastq": { "branch": "master", - "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4", + "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", "installed_by": ["modules"] }, "fastp": { "branch": "master", - "git_sha": "d497a4868ace3302016ea8ed4b395072d5e833cd", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "9a4517e720bc812e95b56d23d15a1653b6db4f53", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "gatk4/bedtointervallist": { "branch": "master", - "git_sha": "cf8f9ace77aac01caa5c7cb92af5bbda7adb77bd", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "gatk4/createsequencedictionary": { "branch": "master", - "git_sha": "cf8f9ace77aac01caa5c7cb92af5bbda7adb77bd", + 
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "gatk4/markduplicates": { "branch": "master", - "git_sha": "2aa9c2981930687792ed861b0a5f9ff7bb568a7d", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "a6e11ac655e744f7ebc724be669dd568ffdc0e80", + "git_sha": "214d575774c172062924ad3564b4f66655600730", "installed_by": ["modules"] }, "picard/collectinsertsizemetrics": { "branch": "master", - "git_sha": "240937a2a9c30298110753292be041188891f2cb", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "picard/collectwgsmetrics": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "samtools/faidx": { "branch": "master", - "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "samtools/index": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "samtools/sort": { "branch": "master", - "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "samtools/view": { "branch": "master", - "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "star/align": { "branch": "master", - "git_sha": "cc08a888069f67cab8120259bddab8032d4c0fe3", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "star/genomegenerate": { "branch": "master", - "git_sha": "cc08a888069f67cab8120259bddab8032d4c0fe3", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "stringtie/merge": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "stringtie/stringtie": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] } } diff --git a/modules/nf-core/agat/convertspgff2tsv/environment.yml b/modules/nf-core/agat/convertspgff2tsv/environment.yml index 9ca0ea28..b5fdf3db 100644 --- a/modules/nf-core/agat/convertspgff2tsv/environment.yml +++ b/modules/nf-core/agat/convertspgff2tsv/environment.yml @@ -1,3 +1,4 @@ +name: agat_convertspgff2tsv channels: - conda-forge - bioconda diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 00000000..17a04ef2 --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf index 9f062219..4264a92c 100644 --- a/modules/nf-core/cat/cat/main.nf +++ b/modules/nf-core/cat/cat/main.nf @@ -2,7 +2,7 @@ process CAT_CAT { tag "$meta.id" label 'process_low' - conda "conda-forge::pigz=2.3.4" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : 'biocontainers/pigz:2.3.4' }" diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml index 8acc0bfa..00a8db0b 100644 --- a/modules/nf-core/cat/cat/meta.yml +++ b/modules/nf-core/cat/cat/meta.yml @@ -7,9 +7,7 @@ keywords: tools: - cat: description: Just concatenation - documentation: https://man7.org/linux/man-pages/man1/cat.1.html - licence: ["GPL-3.0-or-later"] input: - meta: @@ -21,7 +19,6 @@ input: type: file description: List of compressed / uncompressed files pattern: "*" - output: - versions: type: file @@ -31,7 +28,9 @@ output: type: file description: Concatenated file. Will be gzipped if file_out ends with ".gz" pattern: "${file_out}" - authors: - "@erikrikarddaniel" - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 00000000..5766daaf --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,153 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_zipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_zipped_zipped_size")} + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_unzipped_zipped_size")} + ) + } + } + + 
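The gzipped test cases above all follow one assertion pattern: a gzip stream embeds metadata such as a timestamp, so the compressed file's checksum is not guaranteed stable across runs, and the tests snapshot the decompressed content instead — a few leading lines plus the total line count. A minimal sketch of such a `then` block (the snapshot names here are placeholders, not names used by this module):

    then {
        // decompress the file from the (meta, file) tuple emitted on the file_out channel
        def lines = path(process.out.file_out.get(0).get(1)).linesGzip
        assertAll(
            { assert process.success },
            { assert snapshot(lines[0..5]).match("example_lines") },  // first six decompressed lines
            { assert snapshot(lines.size()).match("example_size") }   // total line count
        )
    }

Uncompressed outputs, by contrast, can be snapshotted wholesale with `snapshot(process.out).match()`, as in the unzipped cases.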
test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")} + ) + } + } +} + diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 00000000..423571ba --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,121 @@ +{ + "test_cat_unzipped_zipped_size": { + "content": [ + 375 + ], + "timestamp": "2023-10-16T14:33:08.049445686" + }, + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped_lines": { + "content": [ + [ + "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", + "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", + "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", + "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" + ] + ], + "timestamp": "2023-10-16T14:32:33.629048645" + }, + "test_cat_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + 
"GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:08.038830506" + }, + "test_cat_one_file_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:21.39642399" + }, + "test_cat_zipped_zipped_size": { + "content": [ + 78 + ], + "timestamp": "2023-10-16T14:32:33.641869244" + }, + "test_cat_one_file_unzipped_zipped_size": { + "content": [ + 374 + ], + "timestamp": "2023-10-16T14:33:21.4094373" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 00000000..ec26b0fd --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 00000000..fbc79783 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml new file mode 100644 index 00000000..37b578f5 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/tags.yml @@ -0,0 +1,2 @@ +cat/cat: + - modules/nf-core/cat/cat/** diff --git a/modules/nf-core/cat/fastq/environment.yml b/modules/nf-core/cat/fastq/environment.yml new file mode 100644 index 00000000..bff93add --- /dev/null +++ b/modules/nf-core/cat/fastq/environment.yml @@ -0,0 +1,7 @@ +name: cat_fastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf index 5021e6fc..3d963784 100644 --- a/modules/nf-core/cat/fastq/main.nf +++ b/modules/nf-core/cat/fastq/main.nf @@ -2,7 +2,7 @@ process CAT_FASTQ { tag "$meta.id" label 'process_single' - conda "conda-forge::sed=4.7" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'nf-core/ubuntu:20.04' }" diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml index 8a39e309..db4ac3c7 100644 --- a/modules/nf-core/cat/fastq/meta.yml +++ b/modules/nf-core/cat/fastq/meta.yml @@ -34,7 +34,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@joseespinosa" - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test b/modules/nf-core/cat/fastq/tests/main.nf.test new file mode 100644 index 00000000..f5f94182 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test @@ -0,0 +1,143 @@ +nextflow_process { + + name "Test Process CAT_FASTQ" + script "../main.nf" + process "CAT_FASTQ" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/fastq" + + test("test_cat_fastq_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_single_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + 
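Each of these cases builds its input the same way: a two-element tuple of a meta map (carrying at least `id` and `single_end`) and a list of FASTQ files resolved from nf-core's shared `params.test_data` map. A standalone sketch of that convention — the sample id below is hypothetical, the test-data keys are the ones used in these tests:

    // hypothetical input tuple in the shape CAT_FASTQ expects
    input[0] = [
        [ id:'sampleA', single_end:false ],  // meta map; drives output naming
        [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true),
          file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ]
    ]

`checkIfExists: true` makes staging fail fast when a test file cannot be reached, rather than failing later inside the process.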
test("test_cat_fastq_single_end_single_file") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } +} diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test.snap b/modules/nf-core/cat/fastq/tests/main.nf.test.snap new file mode 100644 index 00000000..ec2342e5 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test.snap @@ -0,0 +1,78 @@ +{ + "test_cat_fastq_single_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d" + ] + ] + ], + "timestamp": "2023-10-17T23:19:12.990284837" + }, + "test_cat_fastq_single_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66" + ] + ] + ], + "timestamp": "2023-10-17T23:19:31.554568147" + }, + "test_cat_fastq_single_end_single_file": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,e325ef7deb4023447a1f074e285761af" + ] + ] + ], + "timestamp": "2023-10-17T23:19:49.629360033" + }, + "test_cat_fastq_paired_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66", + "test_2.merged.fastq.gz:md5,fe9f266f43a6fc3dcab690a18419a56e" + ] + ] + ] + ], + "timestamp": "2023-10-17T23:19:40.711617539" + }, + "test_cat_fastq_paired_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d", + "test_2.merged.fastq.gz:md5,77c8e966e130d8c6b6ec9be52fcb2bda" + ] + ] + ] + ], + "timestamp": "2023-10-18T07:53:20.923560211" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/fastq/tests/tags.yml b/modules/nf-core/cat/fastq/tests/tags.yml new file mode 100644 index 00000000..6ac43614 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/tags.yml @@ -0,0 +1,2 @@ +cat/fastq: + - modules/nf-core/cat/fastq/** diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml new file mode 100644 index 00000000..f0c63f69 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -0,0 +1,7 @@ +name: custom_dumpsoftwareversions +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.17 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index c9d014b1..7685b33c 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.15" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : - 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index c32657de..5f15a5fd 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,4 +1,4 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: @@ -16,7 +16,6 @@ input: type: file description: YML file containing software versions pattern: "*.yml" - output: - yml: type: file @@ -30,7 +29,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@drpatelh" - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test new file mode 100644 index 00000000..eec1db10 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -0,0 +1,38 @@ +nextflow_process { + + name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS" + script "../main.nf" + process "CUSTOM_DUMPSOFTWAREVERSIONS" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "dumpsoftwareversions" + tag "custom/dumpsoftwareversions" + + test("Should run without failures") { + when { + process { + """ + def tool1_version = ''' + TOOL1: + tool1: 0.11.9 + '''.stripIndent() + + def tool2_version = ''' + TOOL2: + tool2: 1.9 + '''.stripIndent() + + input[0] = Channel.of(tool1_version, tool2_version).collectFile() + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap new file mode 100644 index 00000000..4274ed57 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -0,0 +1,27 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ], + "1": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "2": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "mqc_yml": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "versions": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "yml": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ] + } + ], + "timestamp": "2023-11-03T14:43:22.157011" + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml new file mode 100644 index 00000000..405aa24a --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml @@ -0,0 +1,2 @@ +custom/dumpsoftwareversions: + - modules/nf-core/custom/dumpsoftwareversions/** diff --git a/modules/nf-core/fastp/environment.yml b/modules/nf-core/fastp/environment.yml new file mode 100644 index 
00000000..70389e66 --- /dev/null +++ b/modules/nf-core/fastp/environment.yml @@ -0,0 +1,7 @@ +name: fastp +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastp=0.23.4 diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf index 831b7f12..c8e815ae 100644 --- a/modules/nf-core/fastp/main.nf +++ b/modules/nf-core/fastp/main.nf @@ -2,7 +2,7 @@ process FASTP { tag "$meta.id" label 'process_medium' - conda "bioconda::fastp=0.23.4" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : 'biocontainers/fastp:0.23.4--h5f740d0_0' }" diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml index 197ea7ca..c22a16ab 100644 --- a/modules/nf-core/fastp/meta.yml +++ b/modules/nf-core/fastp/meta.yml @@ -33,7 +33,6 @@ input: - save_merged: type: boolean description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz` - output: - meta: type: map @@ -71,3 +70,6 @@ output: authors: - "@drpatelh" - "@kevinmenden" +maintainers: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/fastp/tests/main.nf.test b/modules/nf-core/fastp/tests/main.nf.test new file mode 100644 index 00000000..f610b735 --- /dev/null +++ b/modules/nf-core/fastp/tests/main.nf.test @@ -0,0 +1,485 @@ +nextflow_process { + + name "Test Process FASTP" + script "../main.nf" + process "FASTP" + tag "modules" + tag "modules_nfcore" + tag "fastp" + + test("test_fastp_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ + [ id:'test', single_end:true ], + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:12.922000 K (92.984097%)", + "single end (151 cycles)" ] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 99" ] + def read_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("test_fastp_single_end_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } 
+ + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "The input has little adapter percentage (~0.000000%), probably it's trimmed before."] + def log_text = [ "No adapter detected for read1", + "Q30 bases: 12281(88.3716%)"] + def json_text = ['"passed_filter_reads": 198'] + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("fastp test_fastp_interleaved") { + config './nextflow.config' + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "paired end (151 cycles + 151 cycles)"] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 198"] + def read_lines = [ "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("fastp test_fastp_interleaved_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_single_end_trim_fail") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = true + save_merged = false + + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:12.922000 K (92.984097%)", + "single end (151 cycles)"] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 99" ] + def read_lines = [ "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + 
"TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { failed_read_lines.each { failed_read_line -> + { assert path(process.out.reads_fail.get(0).get(1)).linesGzip.contains(failed_read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("test_fastp_single_end_trim_fail_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_trim_fail") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = true + save_merged = false + + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "The input has little adapter percentage (~0.000000%), probably it's trimmed before."] + def log_text = [ "No adapter detected for read1", + "Q30 bases: 12281(88.3716%)"] + def json_text = ['"passed_filter_reads": 198'] + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { failed_read2_lines.each { failed_read2_line -> + { assert path(process.out.reads_fail.get(0).get(1).get(1)).linesGzip.contains(failed_read2_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = true + + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "
"] + def log_text = [ "Merged and filtered:", + "total reads: 75", + "total bases: 13683"] + def json_text = ['"merged_and_filtered": {', '"total_reads": 75', '"total_bases": 13683'] + def read1_lines = [ "@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/1", + "CCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTC", + "AAAAAEAEEAEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEE/EAEEEEEE/6EEEEEEEEEEAEEAEEE/EE/AEEAEEEEEAEEEA/EEAAEAE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { read_merged_lines.each { read_merged_line -> + { assert path(process.out.reads_merged.get(0).get(1)).linesGzip.contains(read_merged_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged_adapterlist") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/fastp/adapters.fasta", checkIfExists: true) + save_trimmed_fail = false + save_merged = true + + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "
"] + def log_text = [ "Merged and filtered:", + "total reads: 75", + "total bases: 13683"] + def json_text = ['"merged_and_filtered": {', '"total_reads": 75', '"total_bases": 13683',"--adapter_fasta"] + def read1_lines = ["@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/1", + "CCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTC", + "AAAAAEAEEAEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEE/EAEEEEEE/6EEEEEEEEEEAEEAEEE/EE/AEEAEEEEEAEEEA/EEAAEAE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { read_merged_lines.each { read_merged_line -> + { assert path(process.out.reads_merged.get(0).get(1)).linesGzip.contains(read_merged_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } +} diff --git a/modules/nf-core/fastp/tests/main.nf.test.snap b/modules/nf-core/fastp/tests/main.nf.test.snap new file mode 100644 index 00000000..0fa68c7d --- /dev/null +++ b/modules/nf-core/fastp/tests/main.nf.test.snap @@ -0,0 +1,52 @@ +{ + "fastp test_fastp_interleaved_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,168f516f7bd4b7b6c32da7cba87299a4" + ] + ] + ], + "timestamp": "2023-10-17T11:04:45.794175881" + }, + "test_fastp_single_end_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,c852d7a6dba5819e4ac8d9673bedcacc" + ] + ] + ], + "timestamp": "2023-10-17T11:04:10.566343705" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "timestamp": "2023-10-17T11:04:10.582076024" + }, + "test_fastp_single_end_trim_fail_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,9a7ee180f000e8d00c7fb67f06293eb5" + ] + ] + ], + "timestamp": "2023-10-17T11:05:00.379878948" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastp/tests/nextflow.config b/modules/nf-core/fastp/tests/nextflow.config new file mode 100644 index 00000000..0f7849ad --- /dev/null +++ b/modules/nf-core/fastp/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + + withName: FASTP { + ext.args = "--interleaved_in" + } +} diff --git a/modules/nf-core/fastp/tests/tags.yml b/modules/nf-core/fastp/tests/tags.yml new file mode 100644 index 00000000..c1afcce7 --- /dev/null +++ b/modules/nf-core/fastp/tests/tags.yml @@ -0,0 +1,2 @@ +fastp: + - modules/nf-core/fastp/** diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml new file mode 100644 index 00000000..1787b38a --- /dev/null +++ b/modules/nf-core/fastqc/environment.yml @@ -0,0 +1,7 @@ +name: fastqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 249f9064..50e59f2b 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ 
-2,10 +2,10 @@ process FASTQC { tag "$meta.id" label 'process_medium' - conda "bioconda::fastqc=0.11.9" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'biocontainers/fastqc:0.11.9--0' }" + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" input: tuple val(meta), path(reads) diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml index 4da5bb5a..ee5507e0 100644 --- a/modules/nf-core/fastqc/meta.yml +++ b/modules/nf-core/fastqc/meta.yml @@ -50,3 +50,8 @@ authors: - "@grst" - "@ewels" - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@grst" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test index 3961de60..6437a144 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -1,13 +1,18 @@ nextflow_process { name "Test Process FASTQC" - script "modules/nf-core/fastqc/main.nf" + script "../main.nf" process "FASTQC" + tag "modules" + tag "modules_nfcore" tag "fastqc" test("Single-Read") { when { + params { + outdir = "$outputDir" + } process { """ input[0] = [ @@ -21,12 +26,16 @@ nextflow_process { } then { - assert process.success - assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" - assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") - assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" + assertAll ( + { assert process.success }, + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. + // looks like this:
<div id="header_filename">Mon 2 Oct 2023<br/>test.gz</div>
+ // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 + { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" }, + { assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match("versions") }, + { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" } + ) } - } - } diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap new file mode 100644 index 00000000..636a32ce --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "timestamp": "2023-10-09T23:40:54+0000" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml new file mode 100644 index 00000000..7834294b --- /dev/null +++ b/modules/nf-core/fastqc/tests/tags.yml @@ -0,0 +1,2 @@ +fastqc: + - modules/nf-core/fastqc/** diff --git a/modules/nf-core/gatk4/bedtointervallist/environment.yml b/modules/nf-core/gatk4/bedtointervallist/environment.yml new file mode 100644 index 00000000..e7cb4280 --- /dev/null +++ b/modules/nf-core/gatk4/bedtointervallist/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_bedtointervallist +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/bedtointervallist/main.nf b/modules/nf-core/gatk4/bedtointervallist/main.nf index 24968c38..88b24b1a 100644 --- a/modules/nf-core/gatk4/bedtointervallist/main.nf +++ b/modules/nf-core/gatk4/bedtointervallist/main.nf @@ -2,7 +2,7 @@ process GATK4_BEDTOINTERVALLIST { tag "$meta.id" label 'process_medium' - conda "bioconda::gatk4=4.4.0.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" diff --git a/modules/nf-core/gatk4/bedtointervallist/meta.yml b/modules/nf-core/gatk4/bedtointervallist/meta.yml index 83617a7c..187da885 100644 --- a/modules/nf-core/gatk4/bedtointervallist/meta.yml +++ b/modules/nf-core/gatk4/bedtointervallist/meta.yml @@ -46,3 +46,6 @@ output: authors: - "@kevinmenden" - "@ramprasadn" +maintainers: + - "@kevinmenden" + - "@ramprasadn" diff --git a/modules/nf-core/gatk4/createsequencedictionary/environment.yml b/modules/nf-core/gatk4/createsequencedictionary/environment.yml new file mode 100644 index 00000000..db663e14 --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_createsequencedictionary +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/createsequencedictionary/main.nf b/modules/nf-core/gatk4/createsequencedictionary/main.nf index 3e4efdd9..b47ad162 100644 --- a/modules/nf-core/gatk4/createsequencedictionary/main.nf +++ b/modules/nf-core/gatk4/createsequencedictionary/main.nf @@ -2,7 +2,7 @@ process GATK4_CREATESEQUENCEDICTIONARY { tag "$fasta" label 'process_medium' - conda "bioconda::gatk4=4.4.0.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" diff --git a/modules/nf-core/gatk4/createsequencedictionary/meta.yml b/modules/nf-core/gatk4/createsequencedictionary/meta.yml index 9b8b8c89..f9d70be0 100644 --- a/modules/nf-core/gatk4/createsequencedictionary/meta.yml +++ b/modules/nf-core/gatk4/createsequencedictionary/meta.yml @@ -15,7 +15,6 @@ tools: documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s doi: 10.1158/1538-7445.AM2017-3590 licence: ["Apache-2.0"] - input: - meta: type: map @@ -38,3 +37,6 @@ output: authors: - "@maxulysse" - "@ramprasadn" +maintainers: + - "@maxulysse" + - "@ramprasadn" diff --git a/modules/nf-core/gatk4/markduplicates/environment.yml b/modules/nf-core/gatk4/markduplicates/environment.yml new file mode 100644 index 00000000..9adad104 --- /dev/null +++ b/modules/nf-core/gatk4/markduplicates/environment.yml @@ -0,0 +1,8 @@ +name: gatk4_markduplicates +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 + - bioconda::samtools=1.17 diff --git a/modules/nf-core/gatk4/markduplicates/main.nf b/modules/nf-core/gatk4/markduplicates/main.nf index 59e52a3d..564b86d3 100644 --- a/modules/nf-core/gatk4/markduplicates/main.nf +++ b/modules/nf-core/gatk4/markduplicates/main.nf @@ -2,7 +2,7 @@ process GATK4_MARKDUPLICATES { tag "$meta.id" label 'process_medium' - conda "bioconda::gatk4=4.4.0.0 bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-d9e7bad0f7fbc8f4458d5c3ab7ffaaf0235b59fb:f857e2d6cc88d35580d01cf39e0959a68b83c1d9-0': 'biocontainers/mulled-v2-d9e7bad0f7fbc8f4458d5c3ab7ffaaf0235b59fb:f857e2d6cc88d35580d01cf39e0959a68b83c1d9-0' }" diff --git a/modules/nf-core/gatk4/markduplicates/meta.yml b/modules/nf-core/gatk4/markduplicates/meta.yml index d3e75505..b0f09d4b 100644 --- a/modules/nf-core/gatk4/markduplicates/meta.yml +++ b/modules/nf-core/gatk4/markduplicates/meta.yml @@ -7,16 +7,12 @@ keywords: - sort tools: - gatk4: - description: - Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools - with a primary focus on variant discovery and genotyping. Its powerful processing engine - and high-performance computing features make it capable of taking on projects of any size. + description: Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools with a primary focus on variant discovery and genotyping. Its powerful processing engine and high-performance computing features make it capable of taking on projects of any size. 
homepage: https://gatk.broadinstitute.org/hc/en-us documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard- tool_dev_url: https://github.com/broadinstitute/gatk doi: 10.1158/1538-7445.AM2017-3590 licence: ["MIT"] - input: - meta: type: map @@ -35,7 +31,6 @@ input: type: file description: Fasta index file pattern: "*.{fai}" - output: - meta: type: map @@ -66,8 +61,11 @@ output: type: file description: Duplicate metrics file generated by GATK pattern: "*.{metrics.txt}" - authors: - "@ajodeh-juma" - "@FriederikeHanssen" - "@maxulysse" +maintainers: + - "@ajodeh-juma" + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml new file mode 100644 index 00000000..d2a9f21a --- /dev/null +++ b/modules/nf-core/multiqc/environment.yml @@ -0,0 +1,7 @@ +name: multiqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.17 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 65d7dd0d..2bbc3983 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_single' - conda "bioconda::multiqc=1.15" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : - 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index f93b5ee5..f1aa660e 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,5 +1,5 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json -name: MultiQC +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: multiqc description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: - QC @@ -13,7 +13,6 @@ tools: homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ licence: ["GPL-3.0-or-later"] - input: - multiqc_files: type: file @@ -31,7 +30,6 @@ input: type: file description: Optional logo file for MultiQC pattern: "*.{png}" - output: - report: type: file @@ -54,3 +52,8 @@ authors: - "@bunop" - "@drpatelh" - "@jfy133" +maintainers: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/picard/collectinsertsizemetrics/environment.yml b/modules/nf-core/picard/collectinsertsizemetrics/environment.yml new file mode 100644 index 00000000..5c85f872 --- /dev/null +++ b/modules/nf-core/picard/collectinsertsizemetrics/environment.yml @@ -0,0 +1,7 @@ +name: picard_collectinsertsizemetrics +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::picard=3.1.0 diff --git a/modules/nf-core/picard/collectinsertsizemetrics/main.nf b/modules/nf-core/picard/collectinsertsizemetrics/main.nf index 1d538fae..48e4d2ad 100644 --- a/modules/nf-core/picard/collectinsertsizemetrics/main.nf +++ b/modules/nf-core/picard/collectinsertsizemetrics/main.nf @@ -2,10 +2,10 @@ process PICARD_COLLECTINSERTSIZEMETRICS { tag "$meta.id" label 'process_single' - conda "bioconda::picard=3.0.0" + 
conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : - 'biocontainers/picard:3.0.0--hdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/picard:3.1.0--hdfd78af_0' : + 'biocontainers/picard:3.1.0--hdfd78af_0' }" input: tuple val(meta), path(bam) diff --git a/modules/nf-core/picard/collectinsertsizemetrics/meta.yml b/modules/nf-core/picard/collectinsertsizemetrics/meta.yml index e611bdd4..efd5abe0 100644 --- a/modules/nf-core/picard/collectinsertsizemetrics/meta.yml +++ b/modules/nf-core/picard/collectinsertsizemetrics/meta.yml @@ -6,7 +6,6 @@ keywords: - insert - statistics - bam - tools: - "picard": description: "Java tools for working with NGS data in the BAM format" @@ -14,7 +13,6 @@ tools: documentation: "https://broadinstitute.github.io/picard/" tool_dev_url: "https://github.com/broadinstitute/picard" licence: "['MIT']" - input: - meta: type: map @@ -25,7 +23,6 @@ input: type: file description: BAM/CRAM/SAM file pattern: "*.{bam,cram,sam}" - output: - meta: type: map @@ -46,3 +43,5 @@ output: pattern: "*.txt" authors: - "@FerriolCalvet" +maintainers: + - "@FerriolCalvet" diff --git a/modules/nf-core/picard/collectwgsmetrics/environment.yml b/modules/nf-core/picard/collectwgsmetrics/environment.yml new file mode 100644 index 00000000..8adda491 --- /dev/null +++ b/modules/nf-core/picard/collectwgsmetrics/environment.yml @@ -0,0 +1,8 @@ +name: picard_collectwgsmetrics +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::picard=3.1.0 + - r::r-base diff --git a/modules/nf-core/picard/collectwgsmetrics/main.nf b/modules/nf-core/picard/collectwgsmetrics/main.nf index 1d59334c..67aa5b5e 100644 --- a/modules/nf-core/picard/collectwgsmetrics/main.nf +++ b/modules/nf-core/picard/collectwgsmetrics/main.nf @@ -2,10 +2,10 @@ process PICARD_COLLECTWGSMETRICS { tag "$meta.id" label 'process_single' - conda "bioconda::picard=3.0.0 r::r-base" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : - 'biocontainers/picard:3.0.0--hdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/picard:3.1.0--hdfd78af_0' : + 'biocontainers/picard:3.1.0--hdfd78af_0' }" input: tuple val(meta), path(bam), path(bai) diff --git a/modules/nf-core/picard/collectwgsmetrics/meta.yml b/modules/nf-core/picard/collectwgsmetrics/meta.yml index 19906f08..5576ef92 100644 --- a/modules/nf-core/picard/collectwgsmetrics/meta.yml +++ b/modules/nf-core/picard/collectwgsmetrics/meta.yml @@ -68,3 +68,8 @@ authors: - "@flowuenne" - "@lassefolkersen" - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@flowuenne" + - "@lassefolkersen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/faidx/environment.yml b/modules/nf-core/samtools/faidx/environment.yml new file mode 100644 index 00000000..73badedb --- /dev/null +++ b/modules/nf-core/samtools/faidx/environment.yml @@ -0,0 +1,7 @@ +name: samtools_faidx +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf index 59ed3088..3aa98822 100644 --- a/modules/nf-core/samtools/faidx/main.nf +++ b/modules/nf-core/samtools/faidx/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_FAIDX { tag "$fasta" label 'process_single' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml index 957b25e5..e189af28 100644 --- a/modules/nf-core/samtools/faidx/meta.yml +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -55,3 +55,7 @@ authors: - "@drpatelh" - "@ewels" - "@phue" +maintainers: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/samtools/index/environment.yml b/modules/nf-core/samtools/index/environment.yml new file mode 100644 index 00000000..3c6f95b2 --- /dev/null +++ b/modules/nf-core/samtools/index/environment.yml @@ -0,0 +1,7 @@ +name: samtools_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf index 0b20aa4b..256bd7c4 100644 --- a/modules/nf-core/samtools/index/main.nf +++ b/modules/nf-core/samtools/index/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_INDEX { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml index 8bd2fa6f..01a4ee03 100644 --- a/modules/nf-core/samtools/index/meta.yml +++ b/modules/nf-core/samtools/index/meta.yml @@ -51,3 +51,7 @@ authors: - "@drpatelh" - "@ewels" - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/sort/environment.yml b/modules/nf-core/samtools/sort/environment.yml new file mode 100644 index 00000000..508659f0 --- /dev/null +++ b/modules/nf-core/samtools/sort/environment.yml @@ -0,0 +1,7 @@ +name: samtools_sort +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf index 2b7753fd..60f0c634 100644 --- a/modules/nf-core/samtools/sort/main.nf +++ b/modules/nf-core/samtools/sort/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_SORT { tag "$meta.id" label 'process_medium' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml index 07328431..2200de72 100644 --- a/modules/nf-core/samtools/sort/meta.yml +++ b/modules/nf-core/samtools/sort/meta.yml @@ -46,3 +46,6 @@ output: authors: - "@drpatelh" - "@ewels" +maintainers: + - "@drpatelh" + - "@ewels" diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test b/modules/nf-core/samtools/sort/tests/main.nf.test new file mode 100644 index 00000000..1f72f3b9 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test @@ -0,0 +1,70 @@ +nextflow_process { + + name "Test Process SAMTOOLS_SORT" + script "../main.nf" + process "SAMTOOLS_SORT" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/sort" + + test("test_samtools_sort") { + + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + [ + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_samtools_sort_stub") { + + config "./nextflow.config" + options "-stub-run" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + [ + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test.snap b/modules/nf-core/samtools/sort/tests/main.nf.test.snap new file mode 100644 index 00000000..a43566da --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test.snap @@ -0,0 +1,39 @@ +{ + "test_samtools_sort": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,a29570e7607d217c2fa4d75829e09cd7" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,46f7a36082fa1f68285fe30d689244e8" + ], + "bam": [ + [ + { + "id": 
"test", + "single_end": false + }, + "test.sorted.bam:md5,a29570e7607d217c2fa4d75829e09cd7" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,46f7a36082fa1f68285fe30d689244e8" + ] + } + ], + "timestamp": "2023-10-17T17:21:46.5427968" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/sort/tests/nextflow.config b/modules/nf-core/samtools/sort/tests/nextflow.config new file mode 100644 index 00000000..d0f35086 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_SORT { + ext.prefix = { "${meta.id}.sorted" } + } + +} diff --git a/modules/nf-core/samtools/sort/tests/tags.yml b/modules/nf-core/samtools/sort/tests/tags.yml new file mode 100644 index 00000000..cd63ea20 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/tags.yml @@ -0,0 +1,3 @@ +samtools/sort: + - modules/nf-core/samtools/sort/** + - tests/modules/nf-core/samtools/sort/** diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml new file mode 100644 index 00000000..141e7bd8 --- /dev/null +++ b/modules/nf-core/samtools/view/environment.yml @@ -0,0 +1,7 @@ +name: samtools_view +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf index cb91facf..ddf3f88a 100644 --- a/modules/nf-core/samtools/view/main.nf +++ b/modules/nf-core/samtools/view/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_VIEW { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml index 3b05450b..3dadafae 100644 --- a/modules/nf-core/samtools/view/meta.yml +++ b/modules/nf-core/samtools/view/meta.yml @@ -82,3 +82,8 @@ authors: - "@joseespinosa" - "@FriederikeHanssen" - "@priyanka-surana" +maintainers: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/star/align/environment.yml b/modules/nf-core/star/align/environment.yml new file mode 100644 index 00000000..6db20988 --- /dev/null +++ b/modules/nf-core/star/align/environment.yml @@ -0,0 +1,9 @@ +name: star_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::star=2.7.10a + - bioconda::samtools=1.16.1 + - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/align/main.nf b/modules/nf-core/star/align/main.nf index d0e20384..fa645a6d 100644 --- a/modules/nf-core/star/align/main.nf +++ b/modules/nf-core/star/align/main.nf @@ -2,7 +2,7 @@ process STAR_ALIGN { tag "$meta.id" label 'process_high' - conda "bioconda::star=2.7.10a bioconda::samtools=1.16.1 conda-forge::gawk=5.1.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
        'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' :
        'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }"
diff --git a/modules/nf-core/star/align/meta.yml b/modules/nf-core/star/align/meta.yml
index 3d8fed0c..e80dbb7d 100644
--- a/modules/nf-core/star/align/meta.yml
+++ b/modules/nf-core/star/align/meta.yml
@@ -52,7 +52,6 @@ input:
   - seq_center:
       type: string
       description: Sequencing center
-
 output:
   - bam:
       type: file
@@ -106,8 +105,11 @@ output:
       type: file
       description: STAR output bedGraph format file(s) (optional)
       pattern: "*.bg"
-
 authors:
   - "@kevinmenden"
   - "@drpatelh"
   - "@praveenraj2018"
+maintainers:
+  - "@kevinmenden"
+  - "@drpatelh"
+  - "@praveenraj2018"
diff --git a/modules/nf-core/star/genomegenerate/environment.yml b/modules/nf-core/star/genomegenerate/environment.yml
new file mode 100644
index 00000000..0b35ff51
--- /dev/null
+++ b/modules/nf-core/star/genomegenerate/environment.yml
@@ -0,0 +1,9 @@
+name: star_genomegenerate
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::star=2.7.10a
+  - bioconda::samtools=1.16.1
+  - conda-forge::gawk=5.1.0
diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf
index 43424042..473e62a6 100644
--- a/modules/nf-core/star/genomegenerate/main.nf
+++ b/modules/nf-core/star/genomegenerate/main.nf
@@ -2,7 +2,7 @@ process STAR_GENOMEGENERATE {
     tag "$fasta"
     label 'process_high'
 
-    conda "bioconda::star=2.7.10a bioconda::samtools=1.16.1 conda-forge::gawk=5.1.0"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' :
        'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }"
diff --git a/modules/nf-core/star/genomegenerate/meta.yml b/modules/nf-core/star/genomegenerate/meta.yml
index eba2d9cf..1061e1b8 100644
--- a/modules/nf-core/star/genomegenerate/meta.yml
+++ b/modules/nf-core/star/genomegenerate/meta.yml
@@ -31,7 +31,6 @@ input:
   - gtf:
       type: file
       description: GTF file of the reference genome
-
 output:
   - meta:
       type: map
@@ -46,7 +45,9 @@ output:
       type: file
       description: File containing software versions
       pattern: "versions.yml"
-
 authors:
   - "@kevinmenden"
   - "@drpatelh"
+maintainers:
+  - "@kevinmenden"
+  - "@drpatelh"
diff --git a/modules/nf-core/stringtie/merge/environment.yml b/modules/nf-core/stringtie/merge/environment.yml
new file mode 100644
index 00000000..9914b202
--- /dev/null
+++ b/modules/nf-core/stringtie/merge/environment.yml
@@ -0,0 +1,7 @@
+name: stringtie_merge
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::stringtie=2.2.1
diff --git a/modules/nf-core/stringtie/merge/main.nf b/modules/nf-core/stringtie/merge/main.nf
index 12224f78..c2568219 100644
--- a/modules/nf-core/stringtie/merge/main.nf
+++ b/modules/nf-core/stringtie/merge/main.nf
@@ -2,7 +2,7 @@ process STRINGTIE_MERGE {
     label 'process_medium'
 
     // Note: 2.7X indices incompatible with AWS iGenomes.
-    conda "bioconda::stringtie=2.2.1"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/stringtie:2.2.1--hecb563c_2' :
        'biocontainers/stringtie:2.2.1--hecb563c_2' }"
diff --git a/modules/nf-core/stringtie/merge/meta.yml b/modules/nf-core/stringtie/merge/meta.yml
index 2e9784fe..5d02d678 100644
--- a/modules/nf-core/stringtie/merge/meta.yml
+++ b/modules/nf-core/stringtie/merge/meta.yml
@@ -32,6 +32,7 @@ output:
       type: file
       description: File containing software versions
       pattern: "versions.yml"
-
 authors:
   - "@yuukiiwa"
+maintainers:
+  - "@yuukiiwa"
diff --git a/modules/nf-core/stringtie/stringtie/environment.yml b/modules/nf-core/stringtie/stringtie/environment.yml
new file mode 100644
index 00000000..7a0eccdb
--- /dev/null
+++ b/modules/nf-core/stringtie/stringtie/environment.yml
@@ -0,0 +1,7 @@
+name: stringtie_stringtie
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::stringtie=2.2.1
diff --git a/modules/nf-core/stringtie/stringtie/main.nf b/modules/nf-core/stringtie/stringtie/main.nf
index d0f8b563..6e25ba27 100644
--- a/modules/nf-core/stringtie/stringtie/main.nf
+++ b/modules/nf-core/stringtie/stringtie/main.nf
@@ -2,7 +2,7 @@ process STRINGTIE_STRINGTIE {
     tag "$meta.id"
     label 'process_medium'
 
-    conda "bioconda::stringtie=2.2.1"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/stringtie:2.2.1--hecb563c_2' :
        'biocontainers/stringtie:2.2.1--hecb563c_2' }"
diff --git a/modules/nf-core/stringtie/stringtie/meta.yml b/modules/nf-core/stringtie/stringtie/meta.yml
index 75518470..d8ebdd88 100644
--- a/modules/nf-core/stringtie/stringtie/meta.yml
+++ b/modules/nf-core/stringtie/stringtie/meta.yml
@@ -5,7 +5,6 @@ keywords:
   - assembly
   - quantification
   - gtf
-
 tools:
   - stringtie2:
       description: |
@@ -55,3 +54,5 @@ output:
       pattern: "versions.yml"
 authors:
   - "@drpatelh"
+maintainers:
+  - "@drpatelh"
From ef9f57a869e79446397e22c260aa9c80cab962ab Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Wed, 15 Nov 2023 10:40:39 +0100
Subject: [PATCH 42/45] move compression out of vcf_collect

---
 bin/vcf_collect.py                | 5 ++---
 modules/local/vcf_collect/main.nf | 3 ++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index 4d677de0..8e15ccfc 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -8,7 +8,6 @@
 import ast
 import numpy as np
 import csv
-import gzip
 
 logger = logging.getLogger()
 
@@ -344,9 +343,9 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None:
             "FORMAT",
             "Sample",
         ]
-    ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE, compression="gzip")
+    ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE)
 
-    with gzip.open(out_file, "r+") as f:
+    with open(out_file, "r+") as f:
         content = f.read()
         f.seek(0, 0)
         f.write(header.rstrip("\r\n") + "\n" + content)
diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf
index 42f94c40..1b8e56fe 100644
--- a/modules/local/vcf_collect/main.nf
+++ b/modules/local/vcf_collect/main.nf
@@ -22,7 +22,8 @@ process VCF_COLLECT {
     script:
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
-    vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf.gz
+    vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf
+    gzip ${prefix}_fusion_data.vcf
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
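Background for PATCH 42: the previous revision asked pandas to gzip the table (compression="gzip" in to_csv) and then reopened the same file with gzip.open(out_file, "r+") to prepend the VCF header. The gzip module treats "r+" as a read-only mode, so the in-place header write fails on a compressed stream; writing plain text and compressing afterwards with gzip in the process script keeps the prepend step valid. A minimal sketch of the resulting pattern, simplified from write_vcf above (names are illustrative):

    import csv
    import pandas as pd

    def write_vcf_plain(df: pd.DataFrame, header: str, out_file: str) -> None:
        # Write the records as uncompressed TSV; compression now happens
        # afterwards, outside Python (the `gzip` call in the module script).
        df.to_csv(out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE)
        # Prepending the header only works on a plain-text file: read it back,
        # seek to the start, and rewrite header + content in place.
        with open(out_file, "r+") as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header.rstrip("\r\n") + "\n" + content)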
From f33a5e9f36271d07c11cb28b762a2c3859aa7302 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Wed, 15 Nov 2023 13:25:59 +0100
Subject: [PATCH 43/45] fix ,/;

---
 bin/vcf_collect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index 8e15ccfc..57cfbcaa 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -319,7 +319,7 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
             f"TRANSCRIPT_ID_A={row['CDS_LEFT_ID']};TRANSCRIPT_ID_B={row['CDS_RIGHT_ID']};"
             f"TRANSCRIPT_VERSION_A={row['Left_transcript_version']};TRANSCRIPT_VERSION_B={row['Right_transcript_version']};"
             f"HGNC_ID_A={row['Left_hgnc_id']};HGNC_ID_B={row['Right_hgnc_id']};"
-            f"EXON_NUMBER_A={row['Left_exon_number']},EXON_NUMBER_B={row['Right_exon_number']};"
+            f"EXON_NUMBER_A={row['Left_exon_number']};EXON_NUMBER_B={row['Right_exon_number']};"
             f"ANNOTATIONS={row['annots']}"
         )
         df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}"
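The one-character change in PATCH 43 matters because the VCF specification delimits INFO entries with semicolons, while commas separate multiple values within one key. With the old comma, EXON_NUMBER_B was parsed as a second value of EXON_NUMBER_A rather than as its own key. A toy illustration, assuming a naive key=value parser:

    # Semicolons separate INFO keys; commas separate values within a key.
    info = "EXON_NUMBER_A=3,EXON_NUMBER_B=5;ANNOTATIONS=[]"
    keys = [entry.split("=")[0] for entry in info.split(";")]
    print(keys)  # ['EXON_NUMBER_A', 'ANNOTATIONS'] -- EXON_NUMBER_B is swallowed into the first value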
From a8b07438c70907de25960cfbf934e1b175e82cd2 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Thu, 16 Nov 2023 21:35:59 +0100
Subject: [PATCH 44/45] fix values display vcf

---
 bin/vcf_collect.py                | 13 ++++++++++++-
 modules/local/vcf_collect/main.nf |  2 +-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index 57cfbcaa..dbbd384e 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -275,7 +275,7 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
         concatenate_columns, axis=1
     )
     fusion_report.columns = fusion_report.columns.str.upper()
-    fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ", ".join(x))
+    fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x))
     fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True)
 
     return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(
@@ -297,6 +297,17 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
     df["INFO"] = ""
     df["Sample"] = ""
     df["Strand1"] = df["Strand1"].astype(str)
+    df["JunctionReadCount"] = df["JunctionReadCount"].fillna(0).astype(int).astype(str)
+    df["SpanningFragCount"] = df["SpanningFragCount"].fillna(0).astype(int).astype(str)
+    df["FFPM"] = df["FFPM"].fillna(0).astype(float).astype(str)
+    df["ChromosomeA"] = df["ChromosomeA"].fillna(0).astype(int).astype(str)
+    df["ChromosomeB"] = df["ChromosomeB"].fillna(0).astype(int).astype(str)
+    df["Left_hgnc_id"] = df["Left_hgnc_id"].fillna(0).astype(int).astype(str)
+    df["Right_hgnc_id"] = df["Right_hgnc_id"].fillna(0).astype(int).astype(str)
+    df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str)
+    df["Right_exon_number"] = df["Right_exon_number"].fillna(0).astype(int).astype(str)
+    df["Left_transcript_version"] = df["Left_transcript_version"].fillna(0).astype(int).astype(str)
+    df["Right_transcript_version"] = df["Right_transcript_version"].fillna(0).astype(int).astype(str)
 
     for index, row in df.iterrows():
         if row["Strand1"] == "nan":
diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf
index 1b8e56fe..2af4a777 100644
--- a/modules/local/vcf_collect/main.nf
+++ b/modules/local/vcf_collect/main.nf
@@ -2,7 +2,7 @@ process VCF_COLLECT {
     tag "$meta.id"
     label 'process_single'
 
-    conda "conda-forge::python=3.8.3"
+    conda "conda-forge::pandas=1.5.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
        'quay.io/biocontainers/pandas:1.5.2' }"
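The fillna/astype chains added in PATCH 44 pin down how pandas prints these columns: any integer column that picked up NaN during the outer join is promoted to float64, so read counts would render as "14.0" or "nan" in the VCF body. Filling with 0 and casting through int back to str restores clean integer display. A minimal demonstration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"JunctionReadCount": [14, np.nan]})
    print(df["JunctionReadCount"].astype(str).tolist())
    # ['14.0', 'nan'] -- a single NaN promotes the whole column to float64

    fixed = df["JunctionReadCount"].fillna(0).astype(int).astype(str)
    print(fixed.tolist())
    # ['14', '0']

(PATCH 45 below drops the int cast for ChromosomeA/B again, presumably because chromosome names such as X and Y are not integers.)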
all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])] + all_df = all_df[ + ((all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])) + | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0)) + ] all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Right_exon_number"}) + all_df = all_df[ [ "FUSION", @@ -300,8 +321,8 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: df["JunctionReadCount"] = df["JunctionReadCount"].fillna(0).astype(int).astype(str) df["SpanningFragCount"] = df["SpanningFragCount"].fillna(0).astype(int).astype(str) df["FFPM"] = df["FFPM"].fillna(0).astype(float).astype(str) - df["ChromosomeA"] = df["ChromosomeA"].fillna(0).astype(int).astype(str) - df["ChromosomeB"] = df["ChromosomeB"].fillna(0).astype(int).astype(str) + df["ChromosomeA"] = df["ChromosomeA"].fillna(0).astype(str) + df["ChromosomeB"] = df["ChromosomeB"].fillna(0).astype(str) df["Left_hgnc_id"] = df["Left_hgnc_id"].fillna(0).astype(int).astype(str) df["Right_hgnc_id"] = df["Right_hgnc_id"].fillna(0).astype(int).astype(str) df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str) @@ -334,6 +355,7 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: f"ANNOTATIONS={row['annots']}" ) df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}" + return df @@ -362,13 +384,13 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: f.write(header.rstrip("\r\n") + "\n" + content) -def build_hgcn_dataframe(file: str) -> pd.DataFrame: +def build_hgnc_dataframe(file: str) -> pd.DataFrame: """ Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. """ df = pd.read_csv(file, sep="\t", low_memory=False) df["hgnc_id"] = df["hgnc_id"].str.replace("HGNC:", "") - return df[["hgnc_id", "ensembl_gene_id"]].dropna() + return df[["hgnc_id", "ensembl_gene_id", "symbol"]].dropna() def build_gtf_dataframe(file: str) -> pd.DataFrame: