From 1ea68bc8c420d3f7292c8315bc6d6218ad7988be Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Fri, 29 Sep 2023 17:27:12 +0200 Subject: [PATCH 01/45] from megafusion to vcf_collect --- CHANGELOG.md | 10 ++++++++++ bin/{megafusion.py => vcf_collect.py} | 4 ++-- conf/modules.config | 4 +--- docs/output.md | 4 ++-- docs/usage.md | 2 +- modules/local/{megafusion => vcf_collect}/main.nf | 4 ++-- modules/local/{megafusion => vcf_collect}/meta.yml | 4 ++-- subworkflows/local/fusioninspector_workflow.nf | 6 +++--- tower.yml | 2 +- 9 files changed, 24 insertions(+), 16 deletions(-) rename bin/{megafusion.py => vcf_collect.py} (98%) rename modules/local/{megafusion => vcf_collect}/main.nf (88%) rename modules/local/{megafusion => vcf_collect}/meta.yml (95%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c9ff036..5550a93a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v3.0.0 - [date] + +### Added + +### Changed + +### Fixed + +### Removed + ## v2.4.0 - [2023/09/22] ### Added diff --git a/bin/megafusion.py b/bin/vcf_collect.py similarity index 98% rename from bin/megafusion.py rename to bin/vcf_collect.py index 76872b57..8d3e5367 100755 --- a/bin/megafusion.py +++ b/bin/vcf_collect.py @@ -189,7 +189,7 @@ def write_vcf(df_to_print, header, out_file): f.write(header.rstrip("\r\n") + "\n" + content) -def megafusion(fusioninspector_in_file, fusionreport_in_file, sample, out): +def vcf_collect(fusioninspector_in_file, fusionreport_in_file, sample, out): """Convert fusion information from FusionInspector and fusion-report into a vcf file. Adapted from https://github.com/J35P312/MegaFusion""" merged_df = build_fusioninspector_dataframe(fusioninspector_in_file, FUSIONINSPECTOR_MAP).join( read_build_fusionreport(fusionreport_in_file), how="left" @@ -203,7 +203,7 @@ def main(argv=None): if not args.fusioninspector.is_file() or not args.fusionreport.is_file(): logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!") sys.exit(2) - megafusion(args.fusioninspector, args.fusionreport, args.sample, args.out) + vcf_collect(args.fusioninspector, args.fusionreport, args.sample, args.out) if __name__ == "__main__": diff --git a/conf/modules.config b/conf/modules.config index b4dc96d6..66294dfb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -141,13 +141,11 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, ] } - withName: MEGAFUSION { + withName: VCF_COLLECT { ext.when = {!params.fusioninspector_only} ext.prefix = { "${meta.id}_fusion_data" } } - - withName: MULTIQC { ext.when = { !params.skip_qc } } diff --git a/docs/output.md b/docs/output.md index 4144891b..ac84bd06 100644 --- a/docs/output.md +++ b/docs/output.md @@ -239,12 +239,12 @@ The score is explained [on the original fusion-report github page](https://matq0 Quantifying abundances of transcripts from bulk and single-cell RNA-Seq data, or more generally of target sequences using high-throughput sequencing reads. -### Megafusion +### Vcf_collect
Output files -- `megafusion` +- `vcf_collect` - `<sample>_fusion_data.vcf` - contains the fusions in vcf format with collected statistics.
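For downstream consumers of this file, a minimal sketch (standard library only) of pulling the collected statistics back out of the INFO column; the key names follow the header defined in bin/vcf_collect.py, while the input path is hypothetical:

```python
# Minimal sketch: read a vcf_collect-style VCF with the standard library
# and index the INFO column. Key names (GENEA, GENEB, SCORE, ...) follow
# the header defined in bin/vcf_collect.py; the input path is made up.
import csv

def read_fusion_vcf(path):
    records = []
    with open(path) as handle:
        for row in csv.reader(handle, delimiter="\t"):
            if not row or row[0].startswith("#"):
                continue  # skip meta-information and column-header lines
            info = dict(f.split("=", 1) for f in row[7].split(";") if "=" in f)
            records.append({"chrom": row[0], "pos": row[1], "info": info})
    return records

for rec in read_fusion_vcf("sample_fusion_data.vcf"):
    print(rec["info"].get("GENEA"), rec["info"].get("GENEB"), rec["info"].get("SCORE"))
```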
diff --git a/docs/usage.md b/docs/usage.md index 9df6d24e..8a2a725a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -15,7 +15,7 @@ The pipeline is divided into two parts: 2. Detecting fusions - Supported tools: `Arriba`, `FusionCatcher`, `pizzly`, `SQUID`, `STAR-Fusion`, and `StringTie` - QC: `Fastqc`, `MultiQC`, and `Qualimap rnaseq` - - Fusions visualization: `Arriba`, `fusion-report` and `FusionInspector`, VCF file creation based on `MegaFusion` + - Fusions visualization: `Arriba`, `fusion-report` and `FusionInspector`, `vcf_collect` (VCF file creation based on `MegaFusion`) ## Download and build references diff --git a/modules/local/megafusion/main.nf b/modules/local/vcf_collect/main.nf similarity index 88% rename from modules/local/megafusion/main.nf rename to modules/local/vcf_collect/main.nf index d8cb5db0..5c8a57bd 100644 --- a/modules/local/megafusion/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -1,4 +1,4 @@ -process MEGAFUSION { +process VCF_COLLECT { tag "$meta.id" label 'process_single' @@ -20,7 +20,7 @@ process MEGAFUSION { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - megafusion.py --fusioninspector $tsv --fusionreport $report --sample ${prefix} --out ${prefix}.vcf + vcf_collect.py --fusioninspector $tsv --fusionreport $report --sample ${prefix} --out ${prefix}.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/megafusion/meta.yml b/modules/local/vcf_collect/meta.yml similarity index 95% rename from modules/local/megafusion/meta.yml rename to modules/local/vcf_collect/meta.yml index 31343c7e..40bdd6c0 100644 --- a/modules/local/megafusion/meta.yml +++ b/modules/local/vcf_collect/meta.yml @@ -1,5 +1,5 @@ -name: megafusion -description: megafusion +name: vcf_collect +description: vcf_collect keywords: - sort tools: diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index 5fa21cf1..8cb45086 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -1,6 +1,6 @@ include { ARRIBA_VISUALISATION } from '../../modules/local/arriba/visualisation/main' include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' -include { MEGAFUSION } from '../../modules/local/megafusion/main' +include { VCF_COLLECT } from '../../modules/local/vcf_collect/main' include { FUSIONINSPECTOR } from '../../modules/local/fusioninspector/main' workflow FUSIONINSPECTOR_WORKFLOW { @@ -39,8 +39,8 @@ workflow FUSIONINSPECTOR_WORKFLOW { ch_versions = ch_versions.mix(FUSIONINSPECTOR.out.versions) fusion_data = FUSIONINSPECTOR.out.tsv.join(report) - MEGAFUSION(fusion_data) - ch_versions = ch_versions.mix(MEGAFUSION.out.versions) + VCF_COLLECT(fusion_data) + ch_versions = ch_versions.mix(VCF_COLLECT.out.versions) if ((params.starfusion || params.all || params.stringtie) && !params.fusioninspector_only && !params.skip_vis) { bam_sorted_indexed_fusions = bam_sorted_indexed.join(FUSIONINSPECTOR.out.tsv) diff --git a/tower.yml b/tower.yml index 5813f5d3..d051a618 100644 --- a/tower.yml +++ b/tower.yml @@ -13,7 +13,7 @@ reports: display: "FusionInspector TSV report" "**/fusionreport/*/*_fusionreport_index.html": display: "Fusion-report HTML report" - "**/megafusion/*_fusion_data.vcf": + "**/vcf_collect/*_fusion_data.vcf": display: "Collected statistics on each fusion fed to FusionInspector in VCF format" "**/picard/*.MarkDuplicates.metrics.txt": display: "Picard: Metrics from CollectRnaMetrics" From da68e1daa0b1bbc679d9045d1d074a3267c932e9 Mon Sep 17 
00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 2 Oct 2023 09:34:10 +0200 Subject: [PATCH 02/45] start collecting more information with vcf_collect --- bin/vcf_collect.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 8d3e5367..e5f3df88 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -22,6 +22,10 @@ "split_reads": {"column": 1, "delimiter": "\t", "element": 0}, "discordant_pairs": {"column": 2, "delimiter": "\t", "element": 0}, "ffpm": {"column": 25, "delimiter": "\t", "element": 0}, + "LeftGene": {"column": 5, "delimiter": "\t", "element": 0}, + "LeftBreakpoint": {"column": 7, "delimiter": ":", "element": 1}, + "RightGene": {"column": 8, "delimiter": "\t", "element": 0}, + "RightBreakpoint": {"column": 10, "delimiter": ":", "element": 1}, } From 41bd823ee27d18973464b8c0bc1d804a0f3e7110 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 09:47:33 +0100 Subject: [PATCH 03/45] add hgnc id and extra info for vcf file --- bin/vcf_collect.py | 273 +++++++++++++++++++++--------- modules/local/hgnc/main.nf | 41 +++++ modules/local/vcf_collect/main.nf | 13 +- nextflow.config | 2 + nextflow_schema.json | 10 ++ 5 files changed, 251 insertions(+), 88 deletions(-) create mode 100644 modules/local/hgnc/main.nf diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index e5f3df88..e590d88b 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -6,27 +6,56 @@ from pathlib import Path import pandas as pd import ast +from gtfparse import read_gtf logger = logging.getLogger() -FUSIONINSPECTOR_MAP = { - "fusion": {"column": 0, "delimiter": "\t", "element": 0}, - "chromosomeA": {"column": 7, "delimiter": ":", "element": 0}, - "chromosomeB": {"column": 10, "delimiter": ":", "element": 0}, - "posA": {"column": 7, "delimiter": ":", "element": 1}, - "posB": {"column": 10, "delimiter": ":", "element": 1}, - "strand1": {"column": 7, "delimiter": ":", "element": 2}, - "strand2": {"column": 10, "delimiter": ":", "element": 2}, - "geneA": {"column": 0, "delimiter": "--", "element": 0}, - "geneB": {"column": 0, "delimiter": "--", "element": 1}, - "split_reads": {"column": 1, "delimiter": "\t", "element": 0}, - "discordant_pairs": {"column": 2, "delimiter": "\t", "element": 0}, - "ffpm": {"column": 25, "delimiter": "\t", "element": 0}, - "LeftGene": {"column": 5, "delimiter": "\t", "element": 0}, - "LeftBreakpoint": {"column": 7, "delimiter": ":", "element": 1}, - "RightGene": {"column": 8, "delimiter": "\t", "element": 0}, - "RightBreakpoint": {"column": 10, "delimiter": ":", "element": 1}, -} + +def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample, out): + """ + Process FusionInspector and FusionReport data, + merge with GTF from FusionInspector and HGNC database, + and write a VCF file. + + Args: + fusioninspector_in_file (str): Path to FusionInspector input file. + fusionreport_in_file (str): Path to FusionReport input file. + sample (str): Sample name for the header. + hgnc (str): Path to HGNC file. + gtf (str): Path to GTF file. + out (str): Output VCF file path. 
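To make the PATCH 02 additions above concrete, here is a toy run of one column/delimiter/element lookup as consumed by read_fusioninspector(); the FusionInspector row is fabricated and truncated to eight columns:

```python
# Toy lookup mirroring read_fusioninspector(): take whitespace-split
# column 7, split it on ":" and keep element 1 (the position).
# The input row is fabricated and truncated to eight columns.
line = "GENE1--GENE2\t10\t4\tONLY_REF_SPLICE\tGENE1\t.\t.\tchr7:55087058:+"

spec = {"column": 7, "delimiter": ":", "element": 1}  # the "posA" entry
value = line.split()[spec["column"]].split(spec["delimiter"])[spec["element"]]
print(value)  # -> 55087058
```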
+ + Adapted from: https://github.com/J35P312/MegaFusion + """ + merged_df = build_fusioninspector_dataframe(fusioninspector_in_file).join( + read_build_fusionreport(fusionreport_in_file), how="outer", on='FUSION').reset_index() + + df = build_hgnc_dataframe(hgnc).merge(merged_df, how='right', left_on='ensembl_gene_id', + right_on='Left_ensembl_gene_id') + df = df.rename(columns={"hgnc_id": "Left_hgnc_id"}) + df = build_hgnc_dataframe(hgnc).merge(df, how='right', left_on='ensembl_gene_id', right_on='Right_ensembl_gene_id') + df = df.rename(columns={"hgnc_id": "Right_hgnc_id"}) + gtf_df = build_gtf_dataframe(gtf) + all_df = df.merge(gtf_df, how='left', left_on='CDS_LEFT_ID', right_on='Transcript_id') + all_df = all_df[(all_df['PosA'] >= all_df['orig_start']) & (all_df['PosA'] <= all_df['orig_end'])] + all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"}) + all_df = all_df.rename(columns={"exon_number": "Left_exon_number"}) + all_df = all_df[ + ['FUSION', 'GeneA', 'GeneB', 'PosA', 'PosB', 'ChromosomeA', 'ChromosomeB', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', + 'FOUND_IN', 'JunctionReadCount', 'SpanningFragCount', 'FFPM', 'PROT_FUSION_TYPE', 'CDS_LEFT_ID', + 'CDS_RIGHT_ID', 'Left_transcript_version', 'Left_exon_number', 'Left_hgnc_id', 'Right_hgnc_id', 'Strand1', + 'Strand2', 'annots']].drop_duplicates() + all_df = all_df.merge(gtf_df, how='left', left_on='CDS_RIGHT_ID', right_on='Transcript_id') + all_df = all_df[(all_df['PosB'] >= all_df['orig_start']) & (all_df['PosB'] <= all_df['orig_end'])] + all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"}) + all_df = all_df.rename(columns={"exon_number": "Right_exon_number"}) + all_df = all_df[ + ['FUSION', 'GeneA', 'GeneB', 'PosA', 'PosB', 'ChromosomeA', 'ChromosomeB', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', + 'FOUND_IN', 'JunctionReadCount', 'SpanningFragCount', 'FFPM', 'PROT_FUSION_TYPE', 'CDS_LEFT_ID', + 'CDS_RIGHT_ID', 'Left_transcript_version', 'Left_exon_number', 'Left_hgnc_id', 'Right_transcript_version', + 'Right_exon_number', 'Right_hgnc_id', 'Strand1', 'Strand2', 'annots']].drop_duplicates() + + return write_vcf(column_manipulation(all_df), header_def(sample), out) def parse_args(argv=None): @@ -47,17 +76,32 @@ def parse_args(argv=None): type=Path, help="Fusionreport output in TSV format.", ) + parser.add_argument( + "--fusioninspector_gtf", + metavar="GTF", + type=Path, + help="FusionInspector GTF output.", + ) + parser.add_argument( + "--hgnc", + metavar="HGNC", + type=Path, + help="HGNC database.", + ) parser.add_argument("--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample") parser.add_argument( "--out", metavar="OUT", type=Path, - help="Output path.", + help="VCF output path.", ) return parser.parse_args(argv) def header_def(sample): + """ + Define the header of the VCF file + """ return '##fileformat=VCFv4.1\n\ ##ALT=\n\ ##INFO=\n\ @@ -65,15 +109,23 @@ def header_def(sample): ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ +##INFO=\n\ +##INFO=\n\ ##INFO=\n\ ##INFO=\n\ -##INFO=\n\ -##INFO=\n\ -##INFO=\n\ -##INFO=\n\ -##INFO=\n\ +##INFO=\n\ ##INFO=\n\ ##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ ##FORMAT=\n\ ##FORMAT=\n\ ##FORMAT=\n\ @@ -83,42 +135,76 @@ def header_def(sample): ) -def read_fusioninspector(fusioninspector_file, col_num, delimiter, element): - with open(fusioninspector_file) as fusioninspector: - return [line.split()[col_num].split(delimiter)[element] for line in fusioninspector 
if not line.startswith("#")] +def build_fusioninspector_dataframe(file): + """ + Read FusionInspector output from a CSV file, preprocess the data, and set 'FUSION' as the index. + """ + df = pd.read_csv(file, sep="\t") + df = df.rename(columns={"#FusionName": "FUSION"}) + df[['ChromosomeA', 'PosA', 'Strand1']] = df['LeftBreakpoint'].str.split(':', expand=True) + df[['ChromosomeB', 'PosB', 'Strand2']] = df['RightBreakpoint'].str.split(':', expand=True) + df[['GeneA', 'GeneB']] = df['FUSION'].str.split('--', expand=True) + df[['LeftGeneName', 'Left_ensembl_gene_id']] = df['LeftGene'].str.split('^', expand=True) + df[['RightGeneName', 'Right_ensembl_gene_id']] = df['RightGene'].str.split('^', expand=True) + return df.set_index(['FUSION']) -def build_fusioninspector_dataframe(file, map): - new_dict = {} - for key in FUSIONINSPECTOR_MAP: - new_dict[key] = read_fusioninspector( - file, - map[key]["column"], - map[key]["delimiter"], - map[key]["element"], - ) - return pd.DataFrame.from_dict(new_dict).set_index("fusion") +def replace_value_with_column_name(row, value_to_replace, column_name): + """ + Replace a specific value in a row with the corresponding column name. + """ + new_values = '' + for col_name, value in row.items(): + if col_name == column_name: + if value == value_to_replace: + new_values = col_name + else: + new_values = '' + return new_values + + +def concatenate_columns(row): + """ + Concatenate non-empty values in a row into a single string separated by commas. + """ + non_empty_values = [str(value) for value in row if value != ''] + return ','.join(non_empty_values) def read_build_fusionreport(fusionreport_file): + """ + Read and preprocess fusion-report data from a file, including handling missing tool columns, + getting the columns with each tool and create a new FOUND_IN column with all the tool hits. + Convert the list of databases in FOUND_DB into a joined string with a comma separator. + Make all column headers uppercase. 
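The FOUND_IN assembly that this docstring describes can be reproduced in isolation; a toy pandas frame with fabricated tool flags (only two of the tool columns shown):

```python
# Toy version of the FOUND_IN construction: per-tool "true" flags are
# replaced by the tool name, then non-empty names are comma-joined.
# The input frame is fabricated.
import pandas as pd

df = pd.DataFrame({"arriba": ["true", ""], "starfusion": ["true", "true"]})
for tool in ["arriba", "starfusion"]:
    df[tool] = df[tool].apply(lambda v, name=tool: name if v == "true" else "")
df["FOUND_IN"] = df[["arriba", "starfusion"]].apply(
    lambda row: ",".join(v for v in row if v), axis=1
)
print(df["FOUND_IN"].tolist())  # -> ['arriba,starfusion', 'starfusion']
```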
+ """ with open(fusionreport_file) as f: from_html = [line.split('rows": [')[1] for line in f if 'name="fusion_list' in line] expression = from_html[0].split('], "tool')[0] - fusion_report = pd.DataFrame.from_dict(ast.literal_eval(expression)).set_index("fusion") - if not "arriba" in fusion_report.columns: + fusion_report = pd.DataFrame.from_dict(ast.literal_eval(expression)) + if "arriba" not in fusion_report.columns: fusion_report["arriba"] = "" - if not "fusioncatcher" in fusion_report.columns: + if "fusioncatcher" not in fusion_report.columns: fusion_report["fusioncatcher"] = "" - if not "pizzly" in fusion_report.columns: - fusion_report["pizzly"] = "" - if not "squid" in fusion_report.columns: - fusion_report["squid"] = "" - if not "starfusion" in fusion_report.columns: + if "starfusion" not in fusion_report.columns: fusion_report["starfusion"] = "" - return fusion_report + fusion_report['arriba'] = fusion_report[['arriba']].apply(replace_value_with_column_name, + args=('true', 'arriba'), axis=1) + fusion_report['fusioncatcher'] = fusion_report[['fusioncatcher']].apply(replace_value_with_column_name, + args=('true', 'fusioncatcher'), axis=1) + fusion_report['starfusion'] = fusion_report[['starfusion']].apply(replace_value_with_column_name, + args=('true', 'starfusion'), axis=1) + fusion_report['FOUND_IN'] = fusion_report[['arriba', 'starfusion', + 'fusioncatcher']].apply(concatenate_columns, axis=1) + fusion_report.columns = fusion_report.columns.str.upper() + fusion_report['FOUND_DB'] = fusion_report['FOUND_DB'].apply(lambda x: ', '.join(x)) + return fusion_report[['FUSION', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', 'FOUND_IN']].set_index(['FUSION']) def column_manipulation(df): + """ + Manipulate and prepare DataFrame for VCF file creation. + """ df["ALT"] = "" df = df.reset_index() df["FORMAT"] = "GT:DV:RV:FFPM" @@ -131,46 +217,58 @@ def column_manipulation(df): for index, row in df.iterrows(): # ALT - if not row["strand1"] in ["+", "-"] or not row["strand2"] in ["+", "-"]: - df.loc[index, "ALT"] = "N[{}:{}[".format(df["chromosomeB"], row["posB"]) - elif row["strand1"] == "-" and row["strand2"] == "-": - df.loc[index, "ALT"] = "[{}:{}[N".format(row["chromosomeB"], row["posB"]) - elif row["strand1"] == "+" and row["strand2"] == "-": - df.loc[index, "ALT"] = "N]{}:{}]".format(row["chromosomeB"], row["posB"]) - elif row["strand1"] == "-" and row["strand2"] == "+": - df.loc[index, "ALT"] = "N]{}:{}]".format(row["chromosomeB"], row["posB"]) + if not row["Strand1"] in ["+", "-"] or not row["Strand2"] in ["+", "-"]: + df.loc[index, "ALT"] = "N[{}:{}[".format(df["ChromosomeB"], row["PosB"]) + elif row["Strand1"] == "-" and row["Strand2"] == "-": + df.loc[index, "ALT"] = "[{}:{}[N".format(row["ChromosomeB"], row["PosB"]) + elif row["Strand1"] == "+" and row["Strand2"] == "-": + df.loc[index, "ALT"] = "N]{}:{}]".format(row["ChromosomeB"], row["PosB"]) + elif row["Strand1"] == "-" and row["Strand2"] == "+": + df.loc[index, "ALT"] = "N]{}:{}]".format(row["ChromosomeB"], row["PosB"]) else: - df.loc[index, "ALT"] = "N[{}:{}[".format(row["chromosomeB"], row["posB"]) + df.loc[index, "ALT"] = "N[{}:{}[".format(row["ChromosomeB"], row["PosB"]) # INFO df.loc[index, "INFO"] = ( - "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};ORIENTATION={},{};FOUND_DB={};" - "ARRIBA={};FUSIONCATCHER={};PIZZLY={};SQUID={};STARFUSION={};TOOL_HITS={};SCORE={}".format( - row["chromosomeA"], - row["chromosomeB"], - row["geneA"], - row["geneB"], - row["strand1"], - row["strand2"], - row["found_db"], - row["arriba"], - 
row["fusioncatcher"], - row["pizzly"], - row["squid"], - row["starfusion"], - row["tools_hits"], - row["score"], + "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};POSA={};POSB={};ORIENTATION={},{};FOUND_DB={};" + "FOUND_IN={};;TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" + "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={};" + "EXON_NUMBER_B={};ANNOTATIONS={}".format( + row["ChromosomeA"], + row["ChromosomeB"], + row["GeneA"], + row["GeneB"], + row['PosA'], + row['PosB'], + row["Strand1"], + row["Strand2"], + row["FOUND_DB"], + row["FOUND_IN"], + row["TOOLS_HITS"], + row["SCORE"], + row["PROT_FUSION_TYPE"], + row["CDS_LEFT_ID"], + row["CDS_RIGHT_ID"], + row["Left_transcript_version"], + row["Right_transcript_version"], + row["Left_hgnc_id"], + row["Right_hgnc_id"], + row["Left_exon_number"], + row["Right_exon_number"], + row["annots"], ) ) - # FORMAT - df.loc[index, "Sample"] = "./1:{}:{}:{}".format(row["split_reads"], row["discordant_pairs"], row["ffpm"]) + df.loc[index, "Sample"] = "./1:{}:{}:{}".format(row["JunctionReadCount"], row["SpanningFragCount"], row["FFPM"]) return df def write_vcf(df_to_print, header, out_file): + """ + Write a VCF file with a specified DataFrame, header, and output file path. + """ df_to_print[ [ - "chromosomeA", - "posA", + "ChromosomeA", + "PosA", "ID", "REF", "ALT", @@ -193,12 +291,23 @@ def write_vcf(df_to_print, header, out_file): f.write(header.rstrip("\r\n") + "\n" + content) -def vcf_collect(fusioninspector_in_file, fusionreport_in_file, sample, out): - """Convert fusion information from FusionInspector and fusion-report into a vcf file. Adapted from https://github.com/J35P312/MegaFusion""" - merged_df = build_fusioninspector_dataframe(fusioninspector_in_file, FUSIONINSPECTOR_MAP).join( - read_build_fusionreport(fusionreport_in_file), how="left" - ) - write_vcf(column_manipulation(merged_df), header_def(sample), out) +def build_hgnc_dataframe(file): + """ + Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. + """ + df = pd.read_csv(file, sep="\t", low_memory=False) + return df[['hgnc_id', 'ensembl_gene_id']].dropna() + + +def build_gtf_dataframe(file): + """ + Build a DataFrame from GTF file, extracting relevant columns. + """ + df = read_gtf(file) + df[['fusion_dump', 'Transcript_id']] = df['transcript_id'].str.split('^', expand=True) + df[['orig_chromosome', 'orig_start', 'orig_end', 'orig_dir']] = df['orig_coord_info'].str.split(',', expand=True) +# return df + return df[['Transcript_id', 'transcript_version', 'exon_number', 'exon_id', 'orig_start', 'orig_end']] def main(argv=None): @@ -207,7 +316,7 @@ def main(argv=None): if not args.fusioninspector.is_file() or not args.fusionreport.is_file(): logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!") sys.exit(2) - vcf_collect(args.fusioninspector, args.fusionreport, args.sample, args.out) + vcf_collect(args.fusioninspector, args.fusionreport, args.fusioninspector_gtf, args.hgnc, args.sample, args.out) if __name__ == "__main__": diff --git a/modules/local/hgnc/main.nf b/modules/local/hgnc/main.nf new file mode 100644 index 00000000..7211cb71 --- /dev/null +++ b/modules/local/hgnc/main.nf @@ -0,0 +1,41 @@ +process HGNC_DOWNLOAD { + tag "hgnc" + label 'process_low' + + conda "bioconda::gnu-wget=1.18" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h5bf99c6_5' : + 'quay.io/biocontainers/gnu-wget:1.18--h5bf99c6_5' }" + + input: + + output: + path "hgnc_complete_set.txt" , emit: hgnc_ref + path "HGNC-DB-timestamp.txt" , emit: hgnc_date + + path "versions.yml" , emit: versions + + + script: + """ + wget https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt + date+%Y-%m-%d/%H:%M: > HGNC-DB-timestamp.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(echo wget -V 2>&1 | grep "GNU Wget" | cut -d" " -f3 > versions.yml) + END_VERSIONS + """ + + stub: + """ + touch "hgnc_complete_set.txt" + touch "HGNC-DB-timestamp.txt" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(echo wget -V 2>&1 | grep "GNU Wget" | cut -d" " -f3 > versions.yml) + END_VERSIONS + """ + +} diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index 5c8a57bd..208286c4 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,13 +2,13 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + conda "bioconda:: gtfparse =2.0.1" + container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_0" input: - tuple val(meta), path(tsv), path(report) + tuple val(meta), path(tsv), path(out_gtf), path(report) + path hgnc_ref + path hgnc_date output: path "versions.yml" , emit: versions @@ -20,11 +20,12 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $tsv --fusionreport $report --sample ${prefix} --out ${prefix}.vcf + vcf_collect.py --fusioninspector $tsv --fusionreport $report --fusioninspector_gtf $out_gtf --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": python: \$(python --version | sed 's/Python //g') + HGNC DB retrieval: \$(cat $hgnc_date) END_VERSIONS """ diff --git a/nextflow.config b/nextflow.config index a81e3ea2..9bd3c50c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -73,6 +73,8 @@ params { arriba_ref_known_fusions = "${params.genomes_base}/arriba/known_fusions_hg38_GRCh38_v2.3.0.tsv.gz" arriba_ref_protein_domains = "${params.genomes_base}/arriba/protein_domains_hg38_GRCh38_v2.3.0.gff3" fusioncatcher_ref = "${params.genomes_base}/fusioncatcher/human_v102" + hgcn_ref = "${params.genomes_base}/hgnc/hgnc_complete_set.txt" + hgcn_date = "${params.genomes_base}/hgnc/HGNC-DB-timestamp.txt" pizzly_ref = "${params.genomes_base}/pizzly/kallisto" squid_ref = "${params.genomes_base}/squid" starfusion_ref = "${params.genomes_base}/starfusion/ctat_genome_lib_build_dir" diff --git a/nextflow_schema.json b/nextflow_schema.json index 1c8962f9..c4be8a48 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -183,6 +183,16 @@ "fa_icon": "far fa-file-code", "description": "Path to fusionreport references" }, + "hgnc_ref": { + "type": "string", + "fa_icon": "far fa-file-code", + "description": "Path to HGNC database file" + }, + "hgnc_date": { + "type": "string", + "fa_icon": "far fa-file-code", + "description": "Path to HGNC timestamp file for database retrieval" + }, "pizzly": { "type": "boolean", "fa_icon": "far fa-file-code", From 4ba9728b1cf55bd8bb371c7d717b30443643b9b1 Mon Sep 17 00:00:00 2001 From: Annick 
Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 09:52:07 +0100 Subject: [PATCH 04/45] add hgnc id and extra info for vcf file --- nextflow.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 9bd3c50c..3d1e0b41 100644 --- a/nextflow.config +++ b/nextflow.config @@ -73,8 +73,8 @@ params { arriba_ref_known_fusions = "${params.genomes_base}/arriba/known_fusions_hg38_GRCh38_v2.3.0.tsv.gz" arriba_ref_protein_domains = "${params.genomes_base}/arriba/protein_domains_hg38_GRCh38_v2.3.0.gff3" fusioncatcher_ref = "${params.genomes_base}/fusioncatcher/human_v102" - hgcn_ref = "${params.genomes_base}/hgnc/hgnc_complete_set.txt" - hgcn_date = "${params.genomes_base}/hgnc/HGNC-DB-timestamp.txt" + hgnc_ref = "${params.genomes_base}/hgnc/hgnc_complete_set.txt" + hgnc_date = "${params.genomes_base}/hgnc/HGNC-DB-timestamp.txt" pizzly_ref = "${params.genomes_base}/pizzly/kallisto" squid_ref = "${params.genomes_base}/squid" starfusion_ref = "${params.genomes_base}/starfusion/ctat_genome_lib_build_dir" From b0c0f18555a9a60af1bbac59365b831ea0ebfaf3 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:17:57 +0100 Subject: [PATCH 05/45] fix channel i/o --- modules/local/fusioninspector/main.nf | 1 + subworkflows/local/fusioninspector_workflow.nf | 8 +++++--- workflows/rnafusion.nf | 8 +++++--- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/modules/local/fusioninspector/main.nf b/modules/local/fusioninspector/main.nf index 4d24a635..1b1a7169 100644 --- a/modules/local/fusioninspector/main.nf +++ b/modules/local/fusioninspector/main.nf @@ -11,6 +11,7 @@ process FUSIONINSPECTOR { output: tuple val(meta), path("*FusionInspector.fusions.tsv") , emit: tsv + tuple val(meta), path("*.gtf") , emit: out_gtf path "*" , emit: output path "versions.yml" , emit: versions diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index b0d30cc7..d0453723 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -8,11 +8,13 @@ workflow FUSIONINSPECTOR_WORKFLOW { reads fusion_list fusion_list_filtered - report + fusionreport_out bam_sorted_indexed ch_gtf ch_arriba_ref_protein_domains ch_arriba_ref_cytobands + ch_hgnc_ref + ch_hgnc_date main: ch_versions = Channel.empty() @@ -38,9 +40,9 @@ workflow FUSIONINSPECTOR_WORKFLOW { FUSIONINSPECTOR( ch_reads_fusion, index) ch_versions = ch_versions.mix(FUSIONINSPECTOR.out.versions) + fusion_data = FUSIONINSPECTOR.out.tsv.join(FUSIONINSPECTOR.out.ch_out_gtf).join(fusionreport_out) - fusion_data = FUSIONINSPECTOR.out.tsv.join(report) - VCF_COLLECT(fusion_data) + VCF_COLLECT(fusion_data, hgnc_ref, hgnc_date) ch_versions = ch_versions.mix(VCF_COLLECT.out.versions) if ((params.starfusion || params.all || params.stringtie) && !params.fusioninspector_only && !params.skip_vis) { diff --git a/workflows/rnafusion.nf b/workflows/rnafusion.nf index 494e34e8..6792eff8 100644 --- a/workflows/rnafusion.nf +++ b/workflows/rnafusion.nf @@ -32,8 +32,8 @@ ch_arriba_ref_blacklist = Channel.fromPath(params.arriba_ref_blacklist).map { it ch_arriba_ref_known_fusions = Channel.fromPath(params.arriba_ref_known_fusions).map { it -> [[id:it.Name], it] }.collect() ch_arriba_ref_protein_domains = Channel.fromPath(params.arriba_ref_protein_domains).map { it -> [[id:it.Name], it] }.collect() ch_arriba_ref_cytobands = 
Channel.fromPath(params.arriba_ref_cytobands).map { it -> [[id:it.Name], it] }.collect() - - +ch_hgnc_ref = Channel.fromPath(params.hgnc_ref).map { it -> [[id:it.Name], it] }.collect() +ch_hgnc_date = Channel.fromPath(params.hgnc_date).map { it -> [[id:it.Name], it] }.collect() ch_fasta = Channel.fromPath(params.fasta).map { it -> [[id:it.Name], it] }.collect() ch_gtf = Channel.fromPath(params.gtf).map { it -> [[id:it.Name], it] }.collect() ch_transcript = Channel.fromPath(params.transcript).map { it -> [[id:it.Name], it] }.collect() @@ -225,7 +225,9 @@ workflow RNAFUSION { STARFUSION_WORKFLOW.out.ch_bam_sorted_indexed, ch_chrgtf, ch_arriba_ref_protein_domains, - ch_arriba_ref_cytobands + ch_arriba_ref_cytobands, + ch_hgnc_ref, + ch_hgnc_date ) ch_versions = ch_versions.mix(FUSIONINSPECTOR_WORKFLOW.out.versions.first().ifEmpty(null)) From 605280b0e52d7bf390312ccdfc12407caa926173 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:19:09 +0100 Subject: [PATCH 06/45] black --- bin/vcf_collect.py | 144 +++++++++++++++++++++++++++++++-------------- 1 file changed, 100 insertions(+), 44 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index e590d88b..bef72dc4 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -27,33 +27,85 @@ def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample Adapted from: https://github.com/J35P312/MegaFusion """ - merged_df = build_fusioninspector_dataframe(fusioninspector_in_file).join( - read_build_fusionreport(fusionreport_in_file), how="outer", on='FUSION').reset_index() + merged_df = ( + build_fusioninspector_dataframe(fusioninspector_in_file) + .join(read_build_fusionreport(fusionreport_in_file), how="outer", on="FUSION") + .reset_index() + ) - df = build_hgnc_dataframe(hgnc).merge(merged_df, how='right', left_on='ensembl_gene_id', - right_on='Left_ensembl_gene_id') + df = build_hgnc_dataframe(hgnc).merge( + merged_df, how="right", left_on="ensembl_gene_id", right_on="Left_ensembl_gene_id" + ) df = df.rename(columns={"hgnc_id": "Left_hgnc_id"}) - df = build_hgnc_dataframe(hgnc).merge(df, how='right', left_on='ensembl_gene_id', right_on='Right_ensembl_gene_id') + df = build_hgnc_dataframe(hgnc).merge(df, how="right", left_on="ensembl_gene_id", right_on="Right_ensembl_gene_id") df = df.rename(columns={"hgnc_id": "Right_hgnc_id"}) gtf_df = build_gtf_dataframe(gtf) - all_df = df.merge(gtf_df, how='left', left_on='CDS_LEFT_ID', right_on='Transcript_id') - all_df = all_df[(all_df['PosA'] >= all_df['orig_start']) & (all_df['PosA'] <= all_df['orig_end'])] + all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id") + all_df = all_df[(all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"])] all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Left_exon_number"}) all_df = all_df[ - ['FUSION', 'GeneA', 'GeneB', 'PosA', 'PosB', 'ChromosomeA', 'ChromosomeB', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', - 'FOUND_IN', 'JunctionReadCount', 'SpanningFragCount', 'FFPM', 'PROT_FUSION_TYPE', 'CDS_LEFT_ID', - 'CDS_RIGHT_ID', 'Left_transcript_version', 'Left_exon_number', 'Left_hgnc_id', 'Right_hgnc_id', 'Strand1', - 'Strand2', 'annots']].drop_duplicates() - all_df = all_df.merge(gtf_df, how='left', left_on='CDS_RIGHT_ID', right_on='Transcript_id') - all_df = all_df[(all_df['PosB'] >= all_df['orig_start']) & (all_df['PosB'] <= all_df['orig_end'])] + [ + 
"FUSION", + "GeneA", + "GeneB", + "PosA", + "PosB", + "ChromosomeA", + "ChromosomeB", + "TOOLS_HITS", + "SCORE", + "FOUND_DB", + "FOUND_IN", + "JunctionReadCount", + "SpanningFragCount", + "FFPM", + "PROT_FUSION_TYPE", + "CDS_LEFT_ID", + "CDS_RIGHT_ID", + "Left_transcript_version", + "Left_exon_number", + "Left_hgnc_id", + "Right_hgnc_id", + "Strand1", + "Strand2", + "annots", + ] + ].drop_duplicates() + all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id") + all_df = all_df[(all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])] all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Right_exon_number"}) all_df = all_df[ - ['FUSION', 'GeneA', 'GeneB', 'PosA', 'PosB', 'ChromosomeA', 'ChromosomeB', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', - 'FOUND_IN', 'JunctionReadCount', 'SpanningFragCount', 'FFPM', 'PROT_FUSION_TYPE', 'CDS_LEFT_ID', - 'CDS_RIGHT_ID', 'Left_transcript_version', 'Left_exon_number', 'Left_hgnc_id', 'Right_transcript_version', - 'Right_exon_number', 'Right_hgnc_id', 'Strand1', 'Strand2', 'annots']].drop_duplicates() + [ + "FUSION", + "GeneA", + "GeneB", + "PosA", + "PosB", + "ChromosomeA", + "ChromosomeB", + "TOOLS_HITS", + "SCORE", + "FOUND_DB", + "FOUND_IN", + "JunctionReadCount", + "SpanningFragCount", + "FFPM", + "PROT_FUSION_TYPE", + "CDS_LEFT_ID", + "CDS_RIGHT_ID", + "Left_transcript_version", + "Left_exon_number", + "Left_hgnc_id", + "Right_transcript_version", + "Right_exon_number", + "Right_hgnc_id", + "Strand1", + "Strand2", + "annots", + ] + ].drop_duplicates() return write_vcf(column_manipulation(all_df), header_def(sample), out) @@ -141,25 +193,25 @@ def build_fusioninspector_dataframe(file): """ df = pd.read_csv(file, sep="\t") df = df.rename(columns={"#FusionName": "FUSION"}) - df[['ChromosomeA', 'PosA', 'Strand1']] = df['LeftBreakpoint'].str.split(':', expand=True) - df[['ChromosomeB', 'PosB', 'Strand2']] = df['RightBreakpoint'].str.split(':', expand=True) - df[['GeneA', 'GeneB']] = df['FUSION'].str.split('--', expand=True) - df[['LeftGeneName', 'Left_ensembl_gene_id']] = df['LeftGene'].str.split('^', expand=True) - df[['RightGeneName', 'Right_ensembl_gene_id']] = df['RightGene'].str.split('^', expand=True) - return df.set_index(['FUSION']) + df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(":", expand=True) + df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True) + df[["GeneA", "GeneB"]] = df["FUSION"].str.split("--", expand=True) + df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", expand=True) + df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True) + return df.set_index(["FUSION"]) def replace_value_with_column_name(row, value_to_replace, column_name): """ Replace a specific value in a row with the corresponding column name. """ - new_values = '' + new_values = "" for col_name, value in row.items(): if col_name == column_name: if value == value_to_replace: new_values = col_name else: - new_values = '' + new_values = "" return new_values @@ -167,8 +219,8 @@ def concatenate_columns(row): """ Concatenate non-empty values in a row into a single string separated by commas. 
""" - non_empty_values = [str(value) for value in row if value != ''] - return ','.join(non_empty_values) + non_empty_values = [str(value) for value in row if value != ""] + return ",".join(non_empty_values) def read_build_fusionreport(fusionreport_file): @@ -188,17 +240,21 @@ def read_build_fusionreport(fusionreport_file): fusion_report["fusioncatcher"] = "" if "starfusion" not in fusion_report.columns: fusion_report["starfusion"] = "" - fusion_report['arriba'] = fusion_report[['arriba']].apply(replace_value_with_column_name, - args=('true', 'arriba'), axis=1) - fusion_report['fusioncatcher'] = fusion_report[['fusioncatcher']].apply(replace_value_with_column_name, - args=('true', 'fusioncatcher'), axis=1) - fusion_report['starfusion'] = fusion_report[['starfusion']].apply(replace_value_with_column_name, - args=('true', 'starfusion'), axis=1) - fusion_report['FOUND_IN'] = fusion_report[['arriba', 'starfusion', - 'fusioncatcher']].apply(concatenate_columns, axis=1) + fusion_report["arriba"] = fusion_report[["arriba"]].apply( + replace_value_with_column_name, args=("true", "arriba"), axis=1 + ) + fusion_report["fusioncatcher"] = fusion_report[["fusioncatcher"]].apply( + replace_value_with_column_name, args=("true", "fusioncatcher"), axis=1 + ) + fusion_report["starfusion"] = fusion_report[["starfusion"]].apply( + replace_value_with_column_name, args=("true", "starfusion"), axis=1 + ) + fusion_report["FOUND_IN"] = fusion_report[["arriba", "starfusion", "fusioncatcher"]].apply( + concatenate_columns, axis=1 + ) fusion_report.columns = fusion_report.columns.str.upper() - fusion_report['FOUND_DB'] = fusion_report['FOUND_DB'].apply(lambda x: ', '.join(x)) - return fusion_report[['FUSION', 'TOOLS_HITS', 'SCORE', 'FOUND_DB', 'FOUND_IN']].set_index(['FUSION']) + fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ", ".join(x)) + return fusion_report[["FUSION", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(["FUSION"]) def column_manipulation(df): @@ -237,8 +293,8 @@ def column_manipulation(df): row["ChromosomeB"], row["GeneA"], row["GeneB"], - row['PosA'], - row['PosB'], + row["PosA"], + row["PosB"], row["Strand1"], row["Strand2"], row["FOUND_DB"], @@ -296,7 +352,7 @@ def build_hgnc_dataframe(file): Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. """ df = pd.read_csv(file, sep="\t", low_memory=False) - return df[['hgnc_id', 'ensembl_gene_id']].dropna() + return df[["hgnc_id", "ensembl_gene_id"]].dropna() def build_gtf_dataframe(file): @@ -304,10 +360,10 @@ def build_gtf_dataframe(file): Build a DataFrame from GTF file, extracting relevant columns. 
""" df = read_gtf(file) - df[['fusion_dump', 'Transcript_id']] = df['transcript_id'].str.split('^', expand=True) - df[['orig_chromosome', 'orig_start', 'orig_end', 'orig_dir']] = df['orig_coord_info'].str.split(',', expand=True) -# return df - return df[['Transcript_id', 'transcript_version', 'exon_number', 'exon_id', 'orig_start', 'orig_end']] + df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True) + df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(",", expand=True) + # return df + return df[["Transcript_id", "transcript_version", "exon_number", "exon_id", "orig_start", "orig_end"]] def main(argv=None): From 36519afd15cf328d032b601f3d7afa315e327599 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:22:42 +0100 Subject: [PATCH 07/45] fix bug in channel i/o --- subworkflows/local/fusioninspector_workflow.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index d0453723..84b4d73f 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -40,7 +40,7 @@ workflow FUSIONINSPECTOR_WORKFLOW { FUSIONINSPECTOR( ch_reads_fusion, index) ch_versions = ch_versions.mix(FUSIONINSPECTOR.out.versions) - fusion_data = FUSIONINSPECTOR.out.tsv.join(FUSIONINSPECTOR.out.ch_out_gtf).join(fusionreport_out) + fusion_data = FUSIONINSPECTOR.out.tsv.join(FUSIONINSPECTOR.out.out_gtf).join(fusionreport_out) VCF_COLLECT(fusion_data, hgnc_ref, hgnc_date) ch_versions = ch_versions.mix(VCF_COLLECT.out.versions) From fd80774d8637ce7bda39b475af0e18e5f80dcfa4 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:26:20 +0100 Subject: [PATCH 08/45] fix bug in channel i/o --- subworkflows/local/fusioninspector_workflow.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index 84b4d73f..3eb477d0 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -42,7 +42,7 @@ workflow FUSIONINSPECTOR_WORKFLOW { ch_versions = ch_versions.mix(FUSIONINSPECTOR.out.versions) fusion_data = FUSIONINSPECTOR.out.tsv.join(FUSIONINSPECTOR.out.out_gtf).join(fusionreport_out) - VCF_COLLECT(fusion_data, hgnc_ref, hgnc_date) + VCF_COLLECT(fusion_data, ch_hgnc_ref, ch_hgnc_date) ch_versions = ch_versions.mix(VCF_COLLECT.out.versions) if ((params.starfusion || params.all || params.stringtie) && !params.fusioninspector_only && !params.skip_vis) { From 3db4665030553175f3ada4c9205f7f8ffe75529f Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:33:14 +0100 Subject: [PATCH 09/45] add hgnc download to build_references workflow --- workflows/build_references.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/build_references.nf b/workflows/build_references.nf index 4921443a..a8130a40 100644 --- a/workflows/build_references.nf +++ b/workflows/build_references.nf @@ -35,6 +35,7 @@ workflow BUILD_REFERENCES { def fake_meta = [:] fake_meta.id = "Homo_sapiens.${params.genome}.${params.ensembl_version}" ENSEMBL_DOWNLOAD( params.ensembl_version, params.genome, fake_meta ) + HGNC_DOWNLOAD( ) SAMTOOLS_FAIDX(ENSEMBL_DOWNLOAD.out.fasta, [[],[]]) From 
71b15d5d205420fda437f96afd0a81c11aeca9b1 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 10:35:47 +0100 Subject: [PATCH 10/45] add hgnc download to build_references workflow --- workflows/build_references.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/build_references.nf b/workflows/build_references.nf index a8130a40..0ebf3c08 100644 --- a/workflows/build_references.nf +++ b/workflows/build_references.nf @@ -8,6 +8,7 @@ include { ARRIBA_DOWNLOAD } from '../modules/local/arriba/downlo include { ENSEMBL_DOWNLOAD } from '../modules/local/ensembl/main' include { FUSIONCATCHER_DOWNLOAD } from '../modules/local/fusioncatcher/download/main' include { FUSIONREPORT_DOWNLOAD } from '../modules/local/fusionreport/download/main' +include { HGNC_DOWNLOAD } from '../modules/local/hgnc/main' include { STARFUSION_BUILD } from '../modules/local/starfusion/build/main' include { STARFUSION_DOWNLOAD } from '../modules/local/starfusion/download/main' include { GTF_TO_REFFLAT } from '../modules/local/uscs/custom_gtftogenepred/main' From 52bf0040b2c2bf378d73f04b09f53fbd47589027 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 30 Oct 2023 12:03:09 +0100 Subject: [PATCH 11/45] add meta info for hgnc --- modules/local/vcf_collect/main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index 208286c4..0f3d0257 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -7,8 +7,8 @@ process VCF_COLLECT { input: tuple val(meta), path(tsv), path(out_gtf), path(report) - path hgnc_ref - path hgnc_date + tuple val(meta2), path(hgnc_ref) + tuple val(meta3), path(hgnc_date) output: path "versions.yml" , emit: versions From b07cee34be10de3509d9d47738287372485b6bfd Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Tue, 31 Oct 2023 21:11:37 +0100 Subject: [PATCH 12/45] update gtfparse containers --- modules/local/vcf_collect/main.nf | 4 ++-- nextflow.config | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index 0f3d0257..b28bd08e 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,8 +2,8 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "bioconda:: gtfparse =2.0.1" - container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_0" + conda "bioconda::gtfparse=2.0.1" + container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" input: tuple val(meta), path(tsv), path(out_gtf), path(report) diff --git a/nextflow.config b/nextflow.config index d609a3ef..993f22ae 100644 --- a/nextflow.config +++ b/nextflow.config @@ -24,11 +24,10 @@ params { multiqc_methods_description = null // Genome - genome = 'GRCh38' + genomes = 'GRCh38' genomes_base = "${params.outdir}/references" ensembl_version = 102 read_length = 100 - genomes = [:] starfusion_build = true // Filtering From bdb1a6ea68298092a9266f89632415294a763e49 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Tue, 31 Oct 2023 21:15:13 +0100 Subject: [PATCH 13/45] fix bug with param genome.s --- nextflow.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 993f22ae..d609a3ef 100644 --- a/nextflow.config +++ b/nextflow.config @@ 
-24,10 +24,11 @@ params { multiqc_methods_description = null // Genome - genomes = 'GRCh38' + genome = 'GRCh38' genomes_base = "${params.outdir}/references" ensembl_version = 102 read_length = 100 + genomes = [:] starfusion_build = true // Filtering From aa746115e72b22387b9e48efc43780d1456d352c Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:24:53 +0100 Subject: [PATCH 14/45] add AGAT/CONVERTSPGFF2TSV to convert gtf to tsv --- bin/vcf_collect.py | 5 ++--- modules/local/vcf_collect/main.nf | 4 ++-- subworkflows/local/fusioninspector_workflow.nf | 6 +++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index bef72dc4..ec08e432 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -357,12 +357,11 @@ def build_hgnc_dataframe(file): def build_gtf_dataframe(file): """ - Build a DataFrame from GTF file, extracting relevant columns. + Build a DataFrame from GTF file converted in TSV, extracting relevant columns. """ - df = read_gtf(file) + df = pd.read_csv(file, sep="\t") df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True) df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(",", expand=True) - # return df return df[["Transcript_id", "transcript_version", "exon_number", "exon_id", "orig_start", "orig_end"]] diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index b28bd08e..df999204 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -6,7 +6,7 @@ process VCF_COLLECT { container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" input: - tuple val(meta), path(tsv), path(out_gtf), path(report) + tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) tuple val(meta2), path(hgnc_ref) tuple val(meta3), path(hgnc_date) @@ -20,7 +20,7 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $tsv --fusionreport $report --fusioninspector_gtf $out_gtf --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf + vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index 3eb477d0..640b25fd 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -1,3 +1,4 @@ +include { AGAT_CONVERTSPGFF2TSV } from '../../modules/nf-core/agat/visualisation/main' include { ARRIBA_VISUALISATION } from '../../modules/local/arriba/visualisation/main' include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { VCF_COLLECT } from '../../modules/local/vcf_collect/main' @@ -40,8 +41,11 @@ workflow FUSIONINSPECTOR_WORKFLOW { FUSIONINSPECTOR( ch_reads_fusion, index) ch_versions = ch_versions.mix(FUSIONINSPECTOR.out.versions) - fusion_data = FUSIONINSPECTOR.out.tsv.join(FUSIONINSPECTOR.out.out_gtf).join(fusionreport_out) + AGAT_CONVERTSPGFF2TSV(FUSIONINSPECTOR.out.out_gtf) + ch_versions = ch_versions.mix(AGAT_CONVERTSPGFF2TSV.out.versions) + + fusion_data = FUSIONINSPECTOR.out.tsv.join(AGAT_CONVERTSPGFF2TSV.out.tsv).join(fusionreport_out) VCF_COLLECT(fusion_data, ch_hgnc_ref, ch_hgnc_date) 
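The three-way join feeding VCF_COLLECT here behaves like a keyed intersection; a rough Python analogy with fabricated meta keys and file names (not pipeline code):

```python
# Rough analogy for FUSIONINSPECTOR.out.tsv.join(...).join(fusionreport_out):
# Nextflow channels emit (meta, file) tuples and .join matches on meta,
# dropping keys absent from either side. All values are fabricated.
tsv = {"sample1": "s1.fusions.tsv", "sample2": "s2.fusions.tsv"}
gtf_tsv = {"sample1": "s1.gtf.tsv"}
report = {"sample1": "s1_index.html", "sample2": "s2_index.html"}

fusion_data = {
    k: (tsv[k], gtf_tsv[k], report[k])
    for k in tsv.keys() & gtf_tsv.keys() & report.keys()
}
print(fusion_data)  # only sample1 survives the join
```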
ch_versions = ch_versions.mix(VCF_COLLECT.out.versions) From cc4ec30388dce42bdb7fd8ac1bbdbe1caf558f94 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 6 Nov 2023 10:20:04 +0100 Subject: [PATCH 15/45] install agat/convertspgff2tsv from nf-core --- modules.json | 5 ++ .../agat/convertspgff2tsv/environment.yml | 6 ++ modules/nf-core/agat/convertspgff2tsv/main.nf | 35 +++++++++ .../nf-core/agat/convertspgff2tsv/meta.yml | 38 ++++++++++ modules/nf-core/picard/markduplicates/main.nf | 65 +++++++++++++++++ .../nf-core/picard/markduplicates/meta.yml | 71 +++++++++++++++++++ .../execution_trace_2023-11-01_15-05-02.txt | 1 + 7 files changed, 221 insertions(+) create mode 100644 modules/nf-core/agat/convertspgff2tsv/environment.yml create mode 100644 modules/nf-core/agat/convertspgff2tsv/main.nf create mode 100644 modules/nf-core/agat/convertspgff2tsv/meta.yml create mode 100644 modules/nf-core/picard/markduplicates/main.nf create mode 100644 modules/nf-core/picard/markduplicates/meta.yml create mode 100644 null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt diff --git a/modules.json b/modules.json index 9805e2ef..beb0e954 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "agat/convertspgff2tsv": { + "branch": "master", + "git_sha": "53e6fd5d80141e00a3b70762f4361f6af1f4303b", + "installed_by": ["modules"] + }, "arriba": { "branch": "master", "git_sha": "ea9e2892a9d12e8769402f12096219942bcf6536", diff --git a/modules/nf-core/agat/convertspgff2tsv/environment.yml b/modules/nf-core/agat/convertspgff2tsv/environment.yml new file mode 100644 index 00000000..9ca0ea28 --- /dev/null +++ b/modules/nf-core/agat/convertspgff2tsv/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::agat=1.2.0 diff --git a/modules/nf-core/agat/convertspgff2tsv/main.nf b/modules/nf-core/agat/convertspgff2tsv/main.nf new file mode 100644 index 00000000..cef48360 --- /dev/null +++ b/modules/nf-core/agat/convertspgff2tsv/main.nf @@ -0,0 +1,35 @@ +process AGAT_CONVERTSPGFF2TSV { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/agat:1.2.0--pl5321hdfd78af_0' : + 'biocontainers/agat:1.2.0--pl5321hdfd78af_0' }" + + input: + tuple val(meta), path(gff) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + agat_convert_sp_gff2tsv.pl \\ + --gff $gff \\ + --output ${prefix}.tsv \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + agat: \$(agat_convert_sp_gff2tsv.pl --help | sed '3!d; s/.*v//' | sed 's/ .*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/agat/convertspgff2tsv/meta.yml b/modules/nf-core/agat/convertspgff2tsv/meta.yml new file mode 100644 index 00000000..f5865dfe --- /dev/null +++ b/modules/nf-core/agat/convertspgff2tsv/meta.yml @@ -0,0 +1,38 @@ +name: agat_convertspgff2tsv +description: | + Converts a GFF/GTF file into a TSV file +keywords: + - genome + - gff + - gtf + - conversion + - tsv +tools: + - agat: + description: "AGAT is a toolkit for manipulation and getting information from GFF/GTF files" + homepage: "https://github.com/NBISweden/AGAT" + documentation: "https://agat.readthedocs.io/" + tool_dev_url: "https://github.com/NBISweden/AGAT" + doi: "10.5281/zenodo.3552717" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - gff: + type: file + description: Annotation file in GFF3/GTF format + pattern: "*.{gff, gtf}" +output: + - tsv: + type: file + description: Annotation file in TSV format + pattern: "*.{gtf}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@rannick" diff --git a/modules/nf-core/picard/markduplicates/main.nf b/modules/nf-core/picard/markduplicates/main.nf new file mode 100644 index 00000000..ebfa0864 --- /dev/null +++ b/modules/nf-core/picard/markduplicates/main.nf @@ -0,0 +1,65 @@ +process PICARD_MARKDUPLICATES { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::picard=3.0.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : + 'biocontainers/picard:3.0.0--hdfd78af_1' }" + + input: + tuple val(meta), path(bam) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("*.bam") , emit: bam + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.metrics.txt"), emit: metrics + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def avail_mem = 3072 + if (!task.memory) { + log.info '[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
+ + """ + picard \\ + -Xmx${avail_mem}M \\ + MarkDuplicates \\ + $args \\ + --INPUT $bam \\ + --OUTPUT ${prefix}.bam \\ + --REFERENCE_SEQUENCE $fasta \\ + --METRICS_FILE ${prefix}.MarkDuplicates.metrics.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + touch ${prefix}.bam + touch ${prefix}.bam.bai + touch ${prefix}.MarkDuplicates.metrics.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) + END_VERSIONS + """ +} diff --git a/modules/nf-core/picard/markduplicates/meta.yml b/modules/nf-core/picard/markduplicates/meta.yml new file mode 100644 index 00000000..f7693d2f --- /dev/null +++ b/modules/nf-core/picard/markduplicates/meta.yml @@ -0,0 +1,71 @@ +name: picard_markduplicates +description: Locate and tag duplicate reads in a BAM file +keywords: + - markduplicates + - pcr + - duplicates + - bam + - sam + - cram +tools: + - picard: + description: | + A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) + data and formats such as SAM/BAM/CRAM and VCF. + homepage: https://broadinstitute.github.io/picard/ + documentation: https://broadinstitute.github.io/picard/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Reference genome fasta index + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file with duplicate reads marked/removed + pattern: "*.{bam}" + - bai: + type: file + description: An optional BAM index file. 
If desired, --CREATE_INDEX must be passed as a flag + pattern: "*.{bai}" + - metrics: + type: file + description: Duplicate metrics file generated by picard + pattern: "*.{metrics.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@projectoriented" + - "@ramprasadn" diff --git a/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt b/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt new file mode 100644 index 00000000..6b739acd --- /dev/null +++ b/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt @@ -0,0 +1 @@ +task_id hash native_id name status exit submit duration realtime %cpu peak_rss peak_vmem rchar wchar From 0c3ffaed91d6129217bdd6be8c9fddb002b70e21 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 6 Nov 2023 10:27:33 +0100 Subject: [PATCH 16/45] correct typo --- subworkflows/local/fusioninspector_workflow.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index 640b25fd..e616ee5b 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ b/subworkflows/local/fusioninspector_workflow.nf @@ -1,4 +1,4 @@ -include { AGAT_CONVERTSPGFF2TSV } from '../../modules/nf-core/agat/visualisation/main' +include { AGAT_CONVERTSPGFF2TSV } from '../../modules/nf-core/agat/convertspgff2tsv/main' include { ARRIBA_VISUALISATION } from '../../modules/local/arriba/visualisation/main' include { CAT_CAT } from '../../modules/nf-core/cat/cat/main' include { VCF_COLLECT } from '../../modules/local/vcf_collect/main' From ba55fa3fdffda0b8c280e3807702bf4889b239eb Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 6 Nov 2023 12:08:31 +0100 Subject: [PATCH 17/45] vcf_collect takes tsv_coding_effect channel instead of tsv --- modules/local/fusioninspector/main.nf | 17 +++++++++++------ subworkflows/local/fusioninspector_workflow.nf | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/modules/local/fusioninspector/main.nf b/modules/local/fusioninspector/main.nf index 1b1a7169..11013837 100644 --- a/modules/local/fusioninspector/main.nf +++ b/modules/local/fusioninspector/main.nf @@ -10,10 +10,11 @@ process FUSIONINSPECTOR { path reference output: - tuple val(meta), path("*FusionInspector.fusions.tsv") , emit: tsv - tuple val(meta), path("*.gtf") , emit: out_gtf - path "*" , emit: output - path "versions.yml" , emit: versions + tuple val(meta), path("*FusionInspector.fusions.tsv") , emit: tsv + tuple val(meta), path("*FusionInspector.fusions.tsv.annotated.coding_effect") , emit: tsv_coding_effect + tuple val(meta), path("*.gtf") , emit: out_gtf + path "*" , emit: output + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when @@ -40,9 +41,13 @@ process FUSIONINSPECTOR { """ stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ - touch FusionInspector.log - touch FusionInspector.fusions.tsv + touch ${prefix}.FusionInspector.log + touch ${prefix}.FusionInspector.fusions.tsv + touch ${prefix}.FusionInspector.fusions.tsv.annotated.coding_effect + touch ${prefix}.gtf cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/subworkflows/local/fusioninspector_workflow.nf b/subworkflows/local/fusioninspector_workflow.nf index e616ee5b..32173e81 100644 --- a/subworkflows/local/fusioninspector_workflow.nf +++ 
b/subworkflows/local/fusioninspector_workflow.nf
@@ -45,7 +45,7 @@ workflow FUSIONINSPECTOR_WORKFLOW {
     AGAT_CONVERTSPGFF2TSV(FUSIONINSPECTOR.out.out_gtf)
     ch_versions = ch_versions.mix(AGAT_CONVERTSPGFF2TSV.out.versions)

-    fusion_data = FUSIONINSPECTOR.out.tsv.join(AGAT_CONVERTSPGFF2TSV.out.tsv).join(fusionreport_out)
+    fusion_data = FUSIONINSPECTOR.out.tsv_coding_effect.join(AGAT_CONVERTSPGFF2TSV.out.tsv).join(fusionreport_out)

     VCF_COLLECT(fusion_data, ch_hgnc_ref, ch_hgnc_date)
     ch_versions = ch_versions.mix(VCF_COLLECT.out.versions)

From 1b9c18115689e32039b56f40dbbd90731f572031 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 6 Nov 2023 12:36:47 +0100
Subject: [PATCH 18/45] remove empty entry

---
 bin/vcf_collect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index ec08e432..cd268ac1 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -286,7 +286,7 @@ def column_manipulation(df):
         # INFO
         df.loc[index, "INFO"] = (
             "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};POSA={};POSB={};ORIENTATION={},{};FOUND_DB={};"
-            "FOUND_IN={};;TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};"
+            "FOUND_IN={};TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};"
             "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={};"
             "EXON_NUMBER_B={};ANNOTATIONS={}".format(
                 row["ChromosomeA"],

From 775444e4fb7dc863b7f008a79632200e73b03498 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 6 Nov 2023 15:15:23 +0100
Subject: [PATCH 19/45] make optional gtf and coding effects

---
 modules/local/fusioninspector/main.nf | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/modules/local/fusioninspector/main.nf b/modules/local/fusioninspector/main.nf
index 11013837..c7fcd3f0 100644
--- a/modules/local/fusioninspector/main.nf
+++ b/modules/local/fusioninspector/main.nf
@@ -10,11 +10,11 @@ process FUSIONINSPECTOR {
     path reference

     output:
-    tuple val(meta), path("*FusionInspector.fusions.tsv") , emit: tsv
-    tuple val(meta), path("*FusionInspector.fusions.tsv.annotated.coding_effect") , emit: tsv_coding_effect
-    tuple val(meta), path("*.gtf") , emit: out_gtf
-    path "*" , emit: output
-    path "versions.yml" , emit: versions
+    tuple val(meta), path("*FusionInspector.fusions.tsv") , emit: tsv
+    tuple val(meta), path("*.coding_effect")              , optional:true, emit: tsv_coding_effect
+    tuple val(meta), path("*.gtf")                        , optional:true, emit: out_gtf
+    path "*"                                              , emit: output
+    path "versions.yml"                                   , emit: versions

     when:
     task.ext.when == null || task.ext.when
@@ -42,7 +42,6 @@ process FUSIONINSPECTOR {

     stub:
     def prefix = task.ext.prefix ?: "${meta.id}"
-
     """
     touch ${prefix}.FusionInspector.log
     touch ${prefix}.FusionInspector.fusions.tsv

From 4a38147d79fa7a59032c38c689b54034ad15e106 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 6 Nov 2023 16:36:03 +0100
Subject: [PATCH 20/45] remove HGNC: from HGNC id column

---
 bin/vcf_collect.py                            |  1 +
 modules.json                                  |  5 --
 modules/nf-core/picard/markduplicates/main.nf | 65 -----------------
 .../nf-core/picard/markduplicates/meta.yml    | 71 -------------------
 .../execution_trace_2023-11-01_15-05-02.txt   |  1 -
 5 files changed, 1 insertion(+), 142 deletions(-)
 delete mode 100644 modules/nf-core/picard/markduplicates/main.nf
 delete mode 100644 modules/nf-core/picard/markduplicates/meta.yml
 delete
mode 100644 null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index cd268ac1..39da16ba 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -352,6 +352,7 @@ def build_hgnc_dataframe(file): Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. """ df = pd.read_csv(file, sep="\t", low_memory=False) + df['hgnc_id'] = df['hgnc_id'].str.replace("HGNC:","") return df[["hgnc_id", "ensembl_gene_id"]].dropna() diff --git a/modules.json b/modules.json index beb0e954..a6ffdfd5 100644 --- a/modules.json +++ b/modules.json @@ -70,11 +70,6 @@ "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", "installed_by": ["modules"] }, - "picard/markduplicates": { - "branch": "master", - "git_sha": "2ee934606f1fdf7fc1cb05d6e8abc13bec8ab448", - "installed_by": ["modules"] - }, "samtools/faidx": { "branch": "master", "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", diff --git a/modules/nf-core/picard/markduplicates/main.nf b/modules/nf-core/picard/markduplicates/main.nf deleted file mode 100644 index ebfa0864..00000000 --- a/modules/nf-core/picard/markduplicates/main.nf +++ /dev/null @@ -1,65 +0,0 @@ -process PICARD_MARKDUPLICATES { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::picard=3.0.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : - 'biocontainers/picard:3.0.0--hdfd78af_1' }" - - input: - tuple val(meta), path(bam) - tuple val(meta2), path(fasta) - tuple val(meta3), path(fai) - - output: - tuple val(meta), path("*.bam") , emit: bam - tuple val(meta), path("*.bai") , optional:true, emit: bai - tuple val(meta), path("*.metrics.txt"), emit: metrics - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def avail_mem = 3072 - if (!task.memory) { - log.info '[Picard MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' - } else { - avail_mem = (task.memory.mega*0.8).intValue() - } - - if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" - - """ - picard \\ - -Xmx${avail_mem}M \\ - MarkDuplicates \\ - $args \\ - --INPUT $bam \\ - --OUTPUT ${prefix}.bam \\ - --REFERENCE_SEQUENCE $fasta \\ - --METRICS_FILE ${prefix}.MarkDuplicates.metrics.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" 
- """ - touch ${prefix}.bam - touch ${prefix}.bam.bai - touch ${prefix}.MarkDuplicates.metrics.txt - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - picard: \$(echo \$(picard MarkDuplicates --version 2>&1) | grep -o 'Version:.*' | cut -f2- -d:) - END_VERSIONS - """ -} diff --git a/modules/nf-core/picard/markduplicates/meta.yml b/modules/nf-core/picard/markduplicates/meta.yml deleted file mode 100644 index f7693d2f..00000000 --- a/modules/nf-core/picard/markduplicates/meta.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: picard_markduplicates -description: Locate and tag duplicate reads in a BAM file -keywords: - - markduplicates - - pcr - - duplicates - - bam - - sam - - cram -tools: - - picard: - description: | - A set of command line tools (in Java) for manipulating high-throughput sequencing (HTS) - data and formats such as SAM/BAM/CRAM and VCF. - homepage: https://broadinstitute.github.io/picard/ - documentation: https://broadinstitute.github.io/picard/ - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM file - pattern: "*.{bam,cram,sam}" - - meta2: - type: map - description: | - Groovy Map containing reference information - e.g. [ id:'genome' ] - - fasta: - type: file - description: Reference genome fasta file - pattern: "*.{fasta,fa}" - - meta3: - type: map - description: | - Groovy Map containing reference information - e.g. [ id:'genome' ] - - fai: - type: file - description: Reference genome fasta index - pattern: "*.{fai}" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM file with duplicate reads marked/removed - pattern: "*.{bam}" - - bai: - type: file - description: An optional BAM index file. 
If desired, --CREATE_INDEX must be passed as a flag - pattern: "*.{bai}" - - metrics: - type: file - description: Duplicate metrics file generated by picard - pattern: "*.{metrics.txt}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@drpatelh" - - "@projectoriented" - - "@ramprasadn" diff --git a/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt b/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt deleted file mode 100644 index 6b739acd..00000000 --- a/null/pipeline_info/execution_trace_2023-11-01_15-05-02.txt +++ /dev/null @@ -1 +0,0 @@ -task_id hash native_id name status exit submit duration realtime %cpu peak_rss peak_vmem rchar wchar From 6f2a4ea24d3c6f1004e74fab5c93d898c5ebb1c5 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 6 Nov 2023 17:06:07 +0100 Subject: [PATCH 21/45] fix hgnc date timestamping --- modules/local/hgnc/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/hgnc/main.nf b/modules/local/hgnc/main.nf index 7211cb71..1b3808f6 100644 --- a/modules/local/hgnc/main.nf +++ b/modules/local/hgnc/main.nf @@ -19,7 +19,7 @@ process HGNC_DOWNLOAD { script: """ wget https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt - date+%Y-%m-%d/%H:%M: > HGNC-DB-timestamp.txt + date +%Y-%m-%d/%H:%M > HGNC-DB-timestamp.txt cat <<-END_VERSIONS > versions.yml "${task.process}": From f4c091214c983584bb933ec039a177455ab787c6 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 6 Nov 2023 21:43:14 +0100 Subject: [PATCH 22/45] black --- bin/vcf_collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 39da16ba..4b3ec199 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -352,7 +352,7 @@ def build_hgnc_dataframe(file): Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. 
""" df = pd.read_csv(file, sep="\t", low_memory=False) - df['hgnc_id'] = df['hgnc_id'].str.replace("HGNC:","") + df["hgnc_id"] = df["hgnc_id"].str.replace("HGNC:", "") return df[["hgnc_id", "ensembl_gene_id"]].dropna() From 4337f4ae1653361393190618077291b14d08ca76 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Tue, 7 Nov 2023 21:49:47 +0100 Subject: [PATCH 23/45] add support for case fusioninspector filters out a fusion from fusionreport --- bin/vcf_collect.py | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 4b3ec199..408abfe9 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -11,7 +11,7 @@ logger = logging.getLogger() -def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample, out): +def vcf_collect(fusioninspector_in_file, fusionreport_in_file, sample, hgnc, gtf, out): """ Process FusionInspector and FusionReport data, merge with GTF from FusionInspector and HGNC database, @@ -33,14 +33,17 @@ def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample .reset_index() ) - df = build_hgnc_dataframe(hgnc).merge( + df = build_hgcn_dataframe(hgnc).merge( merged_df, how="right", left_on="ensembl_gene_id", right_on="Left_ensembl_gene_id" ) df = df.rename(columns={"hgnc_id": "Left_hgnc_id"}) - df = build_hgnc_dataframe(hgnc).merge(df, how="right", left_on="ensembl_gene_id", right_on="Right_ensembl_gene_id") + df = build_hgcn_dataframe(hgnc).merge(df, how="right", left_on="ensembl_gene_id", right_on="Right_ensembl_gene_id") df = df.rename(columns={"hgnc_id": "Right_hgnc_id"}) gtf_df = build_gtf_dataframe(gtf) all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id") + all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].fillna(0) + all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].astype(int) + all_df = all_df[(all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"])] all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Left_exon_number"}) @@ -73,7 +76,10 @@ def vcf_collect(fusioninspector_in_file, fusionreport_in_file, gtf, hgnc, sample ] ].drop_duplicates() all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id") + all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(0) + all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype(int) all_df = all_df[(all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])] + all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Right_exon_number"}) all_df = all_df[ @@ -195,7 +201,6 @@ def build_fusioninspector_dataframe(file): df = df.rename(columns={"#FusionName": "FUSION"}) df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(":", expand=True) df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True) - df[["GeneA", "GeneB"]] = df["FUSION"].str.split("--", expand=True) df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", expand=True) df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True) return df.set_index(["FUSION"]) @@ -234,11 
+239,11 @@ def read_build_fusionreport(fusionreport_file): from_html = [line.split('rows": [')[1] for line in f if 'name="fusion_list' in line] expression = from_html[0].split('], "tool')[0] fusion_report = pd.DataFrame.from_dict(ast.literal_eval(expression)) - if "arriba" not in fusion_report.columns: + if not "arriba" in fusion_report.columns: fusion_report["arriba"] = "" - if "fusioncatcher" not in fusion_report.columns: + if not "fusioncatcher" in fusion_report.columns: fusion_report["fusioncatcher"] = "" - if "starfusion" not in fusion_report.columns: + if not "starfusion" in fusion_report.columns: fusion_report["starfusion"] = "" fusion_report["arriba"] = fusion_report[["arriba"]].apply( replace_value_with_column_name, args=("true", "arriba"), axis=1 @@ -254,7 +259,11 @@ def read_build_fusionreport(fusionreport_file): ) fusion_report.columns = fusion_report.columns.str.upper() fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ", ".join(x)) - return fusion_report[["FUSION", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(["FUSION"]) + fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True) + + return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index( + ["FUSION"] + ) def column_manipulation(df): @@ -270,10 +279,12 @@ def column_manipulation(df): df["REF"] = "N" df["INFO"] = "" df["Sample"] = "" + df["Strand1"] = df["Strand1"].astype(str) for index, row in df.iterrows(): - # ALT - if not row["Strand1"] in ["+", "-"] or not row["Strand2"] in ["+", "-"]: + if row["Strand1"] == "nan": + df.loc[index, "ALT"] = "nan" + elif not row["Strand1"] in ["+", "-"] or not row["Strand2"] in ["+", "-"]: df.loc[index, "ALT"] = "N[{}:{}[".format(df["ChromosomeB"], row["PosB"]) elif row["Strand1"] == "-" and row["Strand2"] == "-": df.loc[index, "ALT"] = "[{}:{}[N".format(row["ChromosomeB"], row["PosB"]) @@ -283,12 +294,12 @@ def column_manipulation(df): df.loc[index, "ALT"] = "N]{}:{}]".format(row["ChromosomeB"], row["PosB"]) else: df.loc[index, "ALT"] = "N[{}:{}[".format(row["ChromosomeB"], row["PosB"]) - # INFO + df.loc[index, "INFO"] = ( "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};POSA={};POSB={};ORIENTATION={},{};FOUND_DB={};" - "FOUND_IN={};TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" - "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={};" - "EXON_NUMBER_B={};ANNOTATIONS={}".format( + "FOUND_IN={};;TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" + "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={},EXON_NUMBER_B={};" + "ANNOTATIONS={}".format( row["ChromosomeA"], row["ChromosomeB"], row["GeneA"], @@ -347,7 +358,7 @@ def write_vcf(df_to_print, header, out_file): f.write(header.rstrip("\r\n") + "\n" + content) -def build_hgnc_dataframe(file): +def build_hgcn_dataframe(file): """ Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. 
""" From be99aef3a95573574dc9a9bac357dc20fac66f5d Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 8 Nov 2023 13:14:40 +0100 Subject: [PATCH 24/45] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c9dcad59..81535f94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - `--extreme_sensitivity` used for fusioninspector to minimize fusioninspector filtering [#424](https://github.com/nf-core/rnafusion/pull/424) - `--extreme_sensitivity` removed in favor of `--max_sensitivity --max_mate_dist 10000000 --annotate --examine_coding_effect` to collect more data from fusioninspector [#426](https://github.com/nf-core/rnafusion/pull/426) - `Arriba` updated to 2.4.0 [#429](https://github.com/nf-core/rnafusion/pull/429) +- Change megafusion into vcf_collect, taking into account e.g. the annotation and coding effects outputs from fusioninspector, HGNC ids, frame status... [#414](https://github.com/nf-core/rnafusion/pull/414) ### Fixed From 1c109ffcdfb765f49df852636e9bde4b331246e9 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Fri, 10 Nov 2023 10:34:00 +0100 Subject: [PATCH 25/45] Update bin/vcf_collect.py Co-authored-by: Eva C <29628428+fevac@users.noreply.github.com> --- bin/vcf_collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 408abfe9..69d39f34 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -193,7 +193,7 @@ def header_def(sample): ) -def build_fusioninspector_dataframe(file): +def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: """ Read FusionInspector output from a CSV file, preprocess the data, and set 'FUSION' as the index. """ From ff51b1cee25120fc583c9b20ebba37620ac91359 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Fri, 10 Nov 2023 10:36:44 +0100 Subject: [PATCH 26/45] Apply suggestions from code review Co-authored-by: Eva C <29628428+fevac@users.noreply.github.com> --- bin/vcf_collect.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 69d39f34..7d43a98a 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -11,7 +11,7 @@ logger = logging.getLogger() -def vcf_collect(fusioninspector_in_file, fusionreport_in_file, sample, hgnc, gtf, out): +def vcf_collect(fusioninspector_in_file: str, fusionreport_in_file: str, sample: str, hgnc: str, gtf: str, out_file) -> None: """ Process FusionInspector and FusionReport data, merge with GTF from FusionInspector and HGNC database, @@ -206,7 +206,7 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: return df.set_index(["FUSION"]) -def replace_value_with_column_name(row, value_to_replace, column_name): +def replace_value_with_column_name(row: pd.Series, value_to_replace: str, column_name: str) -> str: """ Replace a specific value in a row with the corresponding column name. """ @@ -220,7 +220,7 @@ def replace_value_with_column_name(row, value_to_replace, column_name): return new_values -def concatenate_columns(row): +def concatenate_columns(row: pd.Series) -> str: """ Concatenate non-empty values in a row into a single string separated by commas. 
""" @@ -228,7 +228,7 @@ def concatenate_columns(row): return ",".join(non_empty_values) -def read_build_fusionreport(fusionreport_file): +def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame: """ Read and preprocess fusion-report data from a file, including handling missing tool columns, getting the columns with each tool and create a new FOUND_IN column with all the tool hits. @@ -266,7 +266,7 @@ def read_build_fusionreport(fusionreport_file): ) -def column_manipulation(df): +def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: """ Manipulate and prepare DataFrame for VCF file creation. """ @@ -297,7 +297,7 @@ def column_manipulation(df): df.loc[index, "INFO"] = ( "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};POSA={};POSB={};ORIENTATION={},{};FOUND_DB={};" - "FOUND_IN={};;TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" + "FOUND_IN={};TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={},EXON_NUMBER_B={};" "ANNOTATIONS={}".format( row["ChromosomeA"], @@ -328,7 +328,7 @@ def column_manipulation(df): return df -def write_vcf(df_to_print, header, out_file): +def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: """ Write a VCF file with a specified DataFrame, header, and output file path. """ @@ -358,7 +358,7 @@ def write_vcf(df_to_print, header, out_file): f.write(header.rstrip("\r\n") + "\n" + content) -def build_hgcn_dataframe(file): +def build_hgcn_dataframe(file: str) -> pd.DataFrame: """ Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. """ @@ -367,7 +367,7 @@ def build_hgcn_dataframe(file): return df[["hgnc_id", "ensembl_gene_id"]].dropna() -def build_gtf_dataframe(file): +def build_gtf_dataframe(file: str) -> pd.DataFrame: """ Build a DataFrame from GTF file converted in TSV, extracting relevant columns. 
""" From 92a3dadf2b80eb151746c5217b37d6d44328d953 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:04:16 +0100 Subject: [PATCH 27/45] use fstrings --- bin/vcf_collect.py | 47 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 7d43a98a..c5157512 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -285,46 +285,27 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: if row["Strand1"] == "nan": df.loc[index, "ALT"] = "nan" elif not row["Strand1"] in ["+", "-"] or not row["Strand2"] in ["+", "-"]: - df.loc[index, "ALT"] = "N[{}:{}[".format(df["ChromosomeB"], row["PosB"]) + df.loc[index, "ALT"] = f'N[{df["ChromosomeB"]}:{row["PosB"]}[' elif row["Strand1"] == "-" and row["Strand2"] == "-": - df.loc[index, "ALT"] = "[{}:{}[N".format(row["ChromosomeB"], row["PosB"]) + df.loc[index, "ALT"] = f'[{row["ChromosomeB"]}:{row["PosB"]}[N' elif row["Strand1"] == "+" and row["Strand2"] == "-": - df.loc[index, "ALT"] = "N]{}:{}]".format(row["ChromosomeB"], row["PosB"]) + df.loc[index, "ALT"] = f'N]{row["ChromosomeB"]}:{row["PosB"]}]' elif row["Strand1"] == "-" and row["Strand2"] == "+": - df.loc[index, "ALT"] = "N]{}:{}]".format(row["ChromosomeB"], row["PosB"]) + df.loc[index, "ALT"] = f'N]{row["ChromosomeB"]}:{row["PosB"]}]' else: - df.loc[index, "ALT"] = "N[{}:{}[".format(row["ChromosomeB"], row["PosB"]) + df.loc[index, "ALT"] = f'N[{row["ChromosomeB"]}:{row["PosB"]}[' df.loc[index, "INFO"] = ( - "SVTYPE=BND;CHRA={};CHRB={};GENEA={};GENEB={};POSA={};POSB={};ORIENTATION={},{};FOUND_DB={};" - "FOUND_IN={};TOOL_HITS={};SCORE={};FRAME_STATUS={};TRANSCRIPT_ID_A={};TRANSCRIPT_ID_B={};" - "TRANSCRIPT_VERSION_A={};TRANSCRIPT_VERSION_B={};HGNC_ID_A={};HGNC_ID_B={};EXON_NUMBER_A={},EXON_NUMBER_B={};" - "ANNOTATIONS={}".format( - row["ChromosomeA"], - row["ChromosomeB"], - row["GeneA"], - row["GeneB"], - row["PosA"], - row["PosB"], - row["Strand1"], - row["Strand2"], - row["FOUND_DB"], - row["FOUND_IN"], - row["TOOLS_HITS"], - row["SCORE"], - row["PROT_FUSION_TYPE"], - row["CDS_LEFT_ID"], - row["CDS_RIGHT_ID"], - row["Left_transcript_version"], - row["Right_transcript_version"], - row["Left_hgnc_id"], - row["Right_hgnc_id"], - row["Left_exon_number"], - row["Right_exon_number"], - row["annots"], - ) + f"SVTYPE=BND;CHRA={row['ChromosomeA']};CHRB={row['ChromosomeB']};GENEA={row['GeneA']};GENEB={row['GeneB']};" + f"POSA={row['PosA']};POSB={row['PosB']};ORIENTATION={row['Strand1']},{row['Strand2']};FOUND_DB={row['FOUND_DB']};" + f"FOUND_IN={row['FOUND_IN']};TOOL_HITS={row['TOOLS_HITS']};SCORE={row['SCORE']};FRAME_STATUS={row['PROT_FUSION_TYPE']};" + f"TRANSCRIPT_ID_A={row['CDS_LEFT_ID']};TRANSCRIPT_ID_B={row['CDS_RIGHT_ID']};" + f"TRANSCRIPT_VERSION_A={row['Left_transcript_version']};TRANSCRIPT_VERSION_B={row['Right_transcript_version']};" + f"HGNC_ID_A={row['Left_hgnc_id']};HGNC_ID_B={row['Right_hgnc_id']};" + f"EXON_NUMBER_A={row['Left_exon_number']},EXON_NUMBER_B={row['Right_exon_number']};" + f"ANNOTATIONS={row['annots']}" ) - df.loc[index, "Sample"] = "./1:{}:{}:{}".format(row["JunctionReadCount"], row["SpanningFragCount"], row["FFPM"]) + df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}" return df From 7a700f0ff6b00a82e057d32a81ea06f717a7a437 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:05:30 +0100 Subject: 
[PATCH 28/45] add info on GTF output file from fusioninspector --- bin/vcf_collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index c5157512..3761a11c 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -22,7 +22,7 @@ def vcf_collect(fusioninspector_in_file: str, fusionreport_in_file: str, sample: fusionreport_in_file (str): Path to FusionReport input file. sample (str): Sample name for the header. hgnc (str): Path to HGNC file. - gtf (str): Path to GTF file. + gtf (str): Path to output GTF file from FusionInspector. out (str): Output VCF file path. Adapted from: https://github.com/J35P312/MegaFusion From 22e4502c76aa635aa142006fd9c926e7ae373d6f Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 10:06:59 +0100 Subject: [PATCH 29/45] black --- bin/vcf_collect.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 3761a11c..14855fd6 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -11,7 +11,9 @@ logger = logging.getLogger() -def vcf_collect(fusioninspector_in_file: str, fusionreport_in_file: str, sample: str, hgnc: str, gtf: str, out_file) -> None: +def vcf_collect( + fusioninspector_in_file: str, fusionreport_in_file: str, sample: str, hgnc: str, gtf: str, out_file +) -> None: """ Process FusionInspector and FusionReport data, merge with GTF from FusionInspector and HGNC database, From e59823c3f7b054e262df0d5bfdd4f693017aba1d Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:18:55 +0100 Subject: [PATCH 30/45] fix header typer --- bin/vcf_collect.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 14855fd6..7156a806 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -6,7 +6,8 @@ from pathlib import Path import pandas as pd import ast -from gtfparse import read_gtf +import numpy as np +import csv logger = logging.getLogger() @@ -176,16 +177,16 @@ def header_def(sample): ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ -##INFO=\n\ -##INFO=\n\ -##INFO=\n\ +##INFO=\n\ +##INFO=\n\ +##INFO=\n\ ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ ##INFO=\n\ -##INFO=\n\ +##INFO=\n\ ##FORMAT=\n\ ##FORMAT=\n\ ##FORMAT=\n\ @@ -194,7 +195,6 @@ def header_def(sample): sample ) - def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: """ Read FusionInspector output from a CSV file, preprocess the data, and set 'FUSION' as the index. 
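Aside on the ALT-allele branching that PATCH 23 extended and PATCH 27 rewrote with f-strings: it follows the VCF breakend (BND) notation, in which the bracket direction encodes how the two breakpoint partners are joined. A minimal standalone sketch of that logic, mirroring the committed branches (the function name and the literal inputs are illustrative, not part of the script):

def bnd_alt(strand_a: str, strand_b: str, chrom_b: str, pos_b: int) -> str:
    # Mirrors the ALT construction in column_manipulation(): the bracket
    # direction follows the strand pair of the two fusion partners.
    if strand_a == "nan":
        # FusionInspector filtered this fusion out, so no strand is known.
        return "nan"
    if strand_a not in ["+", "-"] or strand_b not in ["+", "-"]:
        return f"N[{chrom_b}:{pos_b}["
    if strand_a == "-" and strand_b == "-":
        return f"[{chrom_b}:{pos_b}[N"
    if strand_a != strand_b:
        return f"N]{chrom_b}:{pos_b}]"  # covers both "+"/"-" and "-"/"+"
    return f"N[{chrom_b}:{pos_b}["  # "+"/"+"

print(bnd_alt("+", "+", "chr12", 12006495))  # N[chr12:12006495[

One detail worth noting when reading the committed version: the fallback branch for unrecognised strands interpolates df["ChromosomeB"] rather than row["ChromosomeB"], whereas the sketch uses the per-row value throughout.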
From afda8fda8c5cb7674b6f437e384ab1537552237a Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:19:02 +0100 Subject: [PATCH 31/45] mend --- conf/modules.config | 1 - modules/local/vcf_collect/main.nf | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index f49dd02c..39135aa3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -344,6 +344,5 @@ process { withName: VCF_COLLECT { ext.when = {!params.fusioninspector_only} - ext.prefix = { "${meta.id}_fusion_data" } } } diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index df999204..c634a6a3 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,8 +2,10 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "bioconda::gtfparse=2.0.1" - container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) @@ -20,7 +22,7 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf + vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": From f14bd90558e09d6dcfaf6190199be5b640247789 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:22:08 +0100 Subject: [PATCH 32/45] Revert "mend" This reverts commit afda8fda8c5cb7674b6f437e384ab1537552237a. --- conf/modules.config | 1 + modules/local/vcf_collect/main.nf | 8 +++----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 39135aa3..f49dd02c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -344,5 +344,6 @@ process { withName: VCF_COLLECT { ext.when = {!params.fusioninspector_only} + ext.prefix = { "${meta.id}_fusion_data" } } } diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index c634a6a3..df999204 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,10 +2,8 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + conda "bioconda::gtfparse=2.0.1" + container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" input: tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) @@ -22,7 +20,7 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf + vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": From 55d95d1f8c81e78bdabf7e2dfded65b95bd69b67 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:24:02 +0100 Subject: [PATCH 33/45] revert to plain pandas container --- modules/local/vcf_collect/main.nf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index df999204..c634a6a3 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,8 +2,10 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "bioconda::gtfparse=2.0.1" - container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) @@ -20,7 +22,7 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}.vcf + vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf cat <<-END_VERSIONS > versions.yml "${task.process}": From 6bc7351dd1bf9e43e5c9c69312594d245841c0dd Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:24:52 +0100 Subject: [PATCH 34/45] sample should just be meta.id, add fusion_data in module for output name --- conf/modules.config | 1 - 1 file changed, 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index f49dd02c..39135aa3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -344,6 +344,5 @@ process { withName: VCF_COLLECT { ext.when = {!params.fusioninspector_only} - ext.prefix = { "${meta.id}_fusion_data" } } } From d559232b5da7a028c15b53a80e462ef352ee6dab Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:53:27 +0100 Subject: [PATCH 35/45] remove quotes and flatten the pseudo list in annots --- bin/vcf_collect.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 7156a806..d52031ef 100755 --- a/bin/vcf_collect.py +++ 
b/bin/vcf_collect.py @@ -159,7 +159,7 @@ def parse_args(argv=None): return parser.parse_args(argv) -def header_def(sample): +def header_def(sample: str) -> str: """ Define the header of the VCF file """ @@ -195,6 +195,14 @@ def header_def(sample): sample ) + +def convert_to_list(annots_str: str) -> list: + try: + return ast.literal_eval(annots_str) + except (SyntaxError, ValueError): + return np.nan + + def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: """ Read FusionInspector output from a CSV file, preprocess the data, and set 'FUSION' as the index. @@ -205,6 +213,11 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True) df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", expand=True) df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True) + df["annots"] = ( + df["annots"] + .apply(convert_to_list) + .apply(lambda x: ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else "") + ) return df.set_index(["FUSION"]) @@ -328,12 +341,7 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: "FORMAT", "Sample", ] - ].to_csv( - path_or_buf=out_file, - sep="\t", - header=None, - index=False, - ) + ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE) with open(out_file, "r+") as f: content = f.read() From d0a4bf2202945fcef640d335d0dddc91a00e7408 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 17:53:50 +0100 Subject: [PATCH 36/45] back to original container --- modules/local/vcf_collect/main.nf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index c634a6a3..5028b2c5 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,10 +2,8 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : - 'quay.io/biocontainers/pandas:1.5.2' }" + conda "bioconda::gtfparse=2.0.1" + container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" input: tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) From b46894252fc26553a02a1bad13162a6b6113e595 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 13 Nov 2023 20:12:36 +0100 Subject: [PATCH 37/45] revert to pandas --- modules/local/vcf_collect/main.nf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index 5028b2c5..c634a6a3 100644 --- a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -2,8 +2,10 @@ process VCF_COLLECT { tag "$meta.id" label 'process_single' - conda "bioconda::gtfparse=2.0.1" - container "quay.io/biocontainers/gtfparse:2.0.1--pyh7cba7a3_1" + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pandas:1.5.2' : + 'quay.io/biocontainers/pandas:1.5.2' }" input: tuple val(meta), path(fusioninspector_tsv), path(fusioninspector_gtf_tsv), path(fusionreport_report) From 960db4cf090ced0022d7956dcfcf70ec3f7ded1c Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Tue, 14 Nov 2023 22:52:38 +0100 Subject: [PATCH 38/45] apply fixes --- bin/vcf_collect.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index d52031ef..8e15ccfc 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -11,9 +11,11 @@ logger = logging.getLogger() +# vcf_collect(args.fusioninspector, args.fusionreport, args.fusioninspector_gtf, args.hgnc, args.sample, args.out) + def vcf_collect( - fusioninspector_in_file: str, fusionreport_in_file: str, sample: str, hgnc: str, gtf: str, out_file + fusioninspector_in_file: str, fusionreport_in_file: str, gtf: str, hgnc: str, sample: str, out_file ) -> None: """ Process FusionInspector and FusionReport data, @@ -25,7 +27,7 @@ def vcf_collect( fusionreport_in_file (str): Path to FusionReport input file. sample (str): Sample name for the header. hgnc (str): Path to HGNC file. - gtf (str): Path to output GTF file from FusionInspector. + gtf (str): Path to output GTF file from FusionInspector in TSV format. out (str): Output VCF file path. Adapted from: https://github.com/J35P312/MegaFusion @@ -116,7 +118,7 @@ def vcf_collect( ] ].drop_duplicates() - return write_vcf(column_manipulation(all_df), header_def(sample), out) + return write_vcf(column_manipulation(all_df), header_def(sample), out_file) def parse_args(argv=None): @@ -371,7 +373,12 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame: def main(argv=None): """Coordinate argument parsing and program execution.""" args = parse_args(argv) - if not args.fusioninspector.is_file() or not args.fusionreport.is_file(): + if ( + not args.fusioninspector.is_file() + or not args.fusionreport.is_file() + or not args.fusioninspector_gtf + or not args.hgnc + ): logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!") sys.exit(2) vcf_collect(args.fusioninspector, args.fusionreport, args.fusioninspector_gtf, args.hgnc, args.sample, args.out) From 852c6ac8290aa3819f51ee0e4ebcd36b465caade Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 15 Nov 2023 09:33:18 +0100 Subject: [PATCH 39/45] compress vcf output --- bin/vcf_collect.py | 5 +++-- modules/local/vcf_collect/main.nf | 4 ++-- modules/local/vcf_collect/meta.yml | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 8e15ccfc..6a97fb50 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -8,6 +8,7 @@ import ast import numpy as np import csv +import gzip logger = logging.getLogger() @@ -343,9 +344,9 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: "FORMAT", "Sample", ] - ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE) + ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE, compression='gzip') - with open(out_file, "r+") as f: + with gzip.open(out_file, "r+") as f: content = f.read() f.seek(0, 0) f.write(header.rstrip("\r\n") + "\n" + content) diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf index c634a6a3..42f94c40 100644 --- 
a/modules/local/vcf_collect/main.nf +++ b/modules/local/vcf_collect/main.nf @@ -14,7 +14,7 @@ process VCF_COLLECT { output: path "versions.yml" , emit: versions - tuple val(meta), path("*vcf") , emit: vcf + tuple val(meta), path("*vcf.gz") , emit: vcf when: task.ext.when == null || task.ext.when @@ -22,7 +22,7 @@ process VCF_COLLECT { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf + vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf.gz cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/vcf_collect/meta.yml b/modules/local/vcf_collect/meta.yml index 40bdd6c0..de4667bb 100644 --- a/modules/local/vcf_collect/meta.yml +++ b/modules/local/vcf_collect/meta.yml @@ -32,8 +32,8 @@ output: pattern: "versions.yml" - vcf: type: file - description: File containing the summary of all fusions as vcf file - pattern: "*.tsv" + description: File containing the summary of all fusions as compressed vcf file + pattern: "*.vcf.gz" authors: - "@rannick" From 9643ed80b047441eda5735a6c61857a6e5b0a12c Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 15 Nov 2023 09:33:40 +0100 Subject: [PATCH 40/45] black --- bin/vcf_collect.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 6a97fb50..4d677de0 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -344,7 +344,7 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: "FORMAT", "Sample", ] - ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE, compression='gzip') + ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE, compression="gzip") with gzip.open(out_file, "r+") as f: content = f.read() From bb703dd103fa91ab4e2f28b12c673d21dbbe6232 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Wed, 15 Nov 2023 09:44:42 +0100 Subject: [PATCH 41/45] modules updatge --- modules.json | 40 +- .../agat/convertspgff2tsv/environment.yml | 1 + modules/nf-core/cat/cat/environment.yml | 7 + modules/nf-core/cat/cat/main.nf | 2 +- modules/nf-core/cat/cat/meta.yml | 7 +- modules/nf-core/cat/cat/tests/main.nf.test | 153 ++++++ .../nf-core/cat/cat/tests/main.nf.test.snap | 121 +++++ .../cat/tests/nextflow_unzipped_zipped.config | 6 + .../cat/tests/nextflow_zipped_unzipped.config | 8 + modules/nf-core/cat/cat/tests/tags.yml | 2 + modules/nf-core/cat/fastq/environment.yml | 7 + modules/nf-core/cat/fastq/main.nf | 2 +- modules/nf-core/cat/fastq/meta.yml | 4 +- modules/nf-core/cat/fastq/tests/main.nf.test | 143 ++++++ .../nf-core/cat/fastq/tests/main.nf.test.snap | 78 +++ modules/nf-core/cat/fastq/tests/tags.yml | 2 + .../dumpsoftwareversions/environment.yml | 7 + .../custom/dumpsoftwareversions/main.nf | 6 +- .../custom/dumpsoftwareversions/meta.yml | 7 +- .../dumpsoftwareversions/tests/main.nf.test | 38 ++ .../tests/main.nf.test.snap | 27 + .../dumpsoftwareversions/tests/tags.yml | 2 + modules/nf-core/fastp/environment.yml | 7 + modules/nf-core/fastp/main.nf | 2 +- modules/nf-core/fastp/meta.yml | 4 +- 
modules/nf-core/fastp/tests/main.nf.test | 485 ++++++++++++++++++ modules/nf-core/fastp/tests/main.nf.test.snap | 52 ++ modules/nf-core/fastp/tests/nextflow.config | 6 + modules/nf-core/fastp/tests/tags.yml | 2 + modules/nf-core/fastqc/environment.yml | 7 + modules/nf-core/fastqc/main.nf | 6 +- modules/nf-core/fastqc/meta.yml | 5 + modules/nf-core/fastqc/tests/main.nf.test | 23 +- .../nf-core/fastqc/tests/main.nf.test.snap | 10 + modules/nf-core/fastqc/tests/tags.yml | 2 + .../gatk4/bedtointervallist/environment.yml | 7 + .../nf-core/gatk4/bedtointervallist/main.nf | 2 +- .../nf-core/gatk4/bedtointervallist/meta.yml | 3 + .../createsequencedictionary/environment.yml | 7 + .../gatk4/createsequencedictionary/main.nf | 2 +- .../gatk4/createsequencedictionary/meta.yml | 4 +- .../gatk4/markduplicates/environment.yml | 8 + modules/nf-core/gatk4/markduplicates/main.nf | 2 +- modules/nf-core/gatk4/markduplicates/meta.yml | 12 +- modules/nf-core/multiqc/environment.yml | 7 + modules/nf-core/multiqc/main.nf | 6 +- modules/nf-core/multiqc/meta.yml | 11 +- .../collectinsertsizemetrics/environment.yml | 7 + .../picard/collectinsertsizemetrics/main.nf | 6 +- .../picard/collectinsertsizemetrics/meta.yml | 5 +- .../picard/collectwgsmetrics/environment.yml | 8 + .../nf-core/picard/collectwgsmetrics/main.nf | 6 +- .../nf-core/picard/collectwgsmetrics/meta.yml | 5 + .../nf-core/samtools/faidx/environment.yml | 7 + modules/nf-core/samtools/faidx/main.nf | 2 +- modules/nf-core/samtools/faidx/meta.yml | 4 + .../nf-core/samtools/index/environment.yml | 7 + modules/nf-core/samtools/index/main.nf | 2 +- modules/nf-core/samtools/index/meta.yml | 4 + modules/nf-core/samtools/sort/environment.yml | 7 + modules/nf-core/samtools/sort/main.nf | 2 +- modules/nf-core/samtools/sort/meta.yml | 3 + .../nf-core/samtools/sort/tests/main.nf.test | 70 +++ .../samtools/sort/tests/main.nf.test.snap | 39 ++ .../samtools/sort/tests/nextflow.config | 7 + modules/nf-core/samtools/sort/tests/tags.yml | 3 + modules/nf-core/samtools/view/environment.yml | 7 + modules/nf-core/samtools/view/main.nf | 2 +- modules/nf-core/samtools/view/meta.yml | 5 + modules/nf-core/star/align/environment.yml | 9 + modules/nf-core/star/align/main.nf | 2 +- modules/nf-core/star/align/meta.yml | 6 +- .../star/genomegenerate/environment.yml | 9 + modules/nf-core/star/genomegenerate/main.nf | 2 +- modules/nf-core/star/genomegenerate/meta.yml | 5 +- .../nf-core/stringtie/merge/environment.yml | 7 + modules/nf-core/stringtie/merge/main.nf | 2 +- modules/nf-core/stringtie/merge/meta.yml | 3 +- .../stringtie/stringtie/environment.yml | 7 + modules/nf-core/stringtie/stringtie/main.nf | 2 +- modules/nf-core/stringtie/stringtie/meta.yml | 3 +- 81 files changed, 1531 insertions(+), 86 deletions(-) create mode 100644 modules/nf-core/cat/cat/environment.yml create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test create mode 100644 modules/nf-core/cat/cat/tests/main.nf.test.snap create mode 100644 modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config create mode 100644 modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config create mode 100644 modules/nf-core/cat/cat/tests/tags.yml create mode 100644 modules/nf-core/cat/fastq/environment.yml create mode 100644 modules/nf-core/cat/fastq/tests/main.nf.test create mode 100644 modules/nf-core/cat/fastq/tests/main.nf.test.snap create mode 100644 modules/nf-core/cat/fastq/tests/tags.yml create mode 100644 modules/nf-core/custom/dumpsoftwareversions/environment.yml create mode 100644 
modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test create mode 100644 modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap create mode 100644 modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml create mode 100644 modules/nf-core/fastp/environment.yml create mode 100644 modules/nf-core/fastp/tests/main.nf.test create mode 100644 modules/nf-core/fastp/tests/main.nf.test.snap create mode 100644 modules/nf-core/fastp/tests/nextflow.config create mode 100644 modules/nf-core/fastp/tests/tags.yml create mode 100644 modules/nf-core/fastqc/environment.yml create mode 100644 modules/nf-core/fastqc/tests/main.nf.test.snap create mode 100644 modules/nf-core/fastqc/tests/tags.yml create mode 100644 modules/nf-core/gatk4/bedtointervallist/environment.yml create mode 100644 modules/nf-core/gatk4/createsequencedictionary/environment.yml create mode 100644 modules/nf-core/gatk4/markduplicates/environment.yml create mode 100644 modules/nf-core/multiqc/environment.yml create mode 100644 modules/nf-core/picard/collectinsertsizemetrics/environment.yml create mode 100644 modules/nf-core/picard/collectwgsmetrics/environment.yml create mode 100644 modules/nf-core/samtools/faidx/environment.yml create mode 100644 modules/nf-core/samtools/index/environment.yml create mode 100644 modules/nf-core/samtools/sort/environment.yml create mode 100644 modules/nf-core/samtools/sort/tests/main.nf.test create mode 100644 modules/nf-core/samtools/sort/tests/main.nf.test.snap create mode 100644 modules/nf-core/samtools/sort/tests/nextflow.config create mode 100644 modules/nf-core/samtools/sort/tests/tags.yml create mode 100644 modules/nf-core/samtools/view/environment.yml create mode 100644 modules/nf-core/star/align/environment.yml create mode 100644 modules/nf-core/star/genomegenerate/environment.yml create mode 100644 modules/nf-core/stringtie/merge/environment.yml create mode 100644 modules/nf-core/stringtie/stringtie/environment.yml diff --git a/modules.json b/modules.json index a6ffdfd5..724634dc 100644 --- a/modules.json +++ b/modules.json @@ -7,7 +7,7 @@ "nf-core": { "agat/convertspgff2tsv": { "branch": "master", - "git_sha": "53e6fd5d80141e00a3b70762f4361f6af1f4303b", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "arriba": { @@ -17,97 +17,97 @@ }, "cat/cat": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "cat/fastq": { "branch": "master", - "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4", + "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", "installed_by": ["modules"] }, "fastp": { "branch": "master", - "git_sha": "d497a4868ace3302016ea8ed4b395072d5e833cd", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "9a4517e720bc812e95b56d23d15a1653b6db4f53", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "gatk4/bedtointervallist": { "branch": "master", - "git_sha": "cf8f9ace77aac01caa5c7cb92af5bbda7adb77bd", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "gatk4/createsequencedictionary": { "branch": "master", - "git_sha": "cf8f9ace77aac01caa5c7cb92af5bbda7adb77bd", + 
"git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "gatk4/markduplicates": { "branch": "master", - "git_sha": "2aa9c2981930687792ed861b0a5f9ff7bb568a7d", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "a6e11ac655e744f7ebc724be669dd568ffdc0e80", + "git_sha": "214d575774c172062924ad3564b4f66655600730", "installed_by": ["modules"] }, "picard/collectinsertsizemetrics": { "branch": "master", - "git_sha": "240937a2a9c30298110753292be041188891f2cb", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "picard/collectwgsmetrics": { "branch": "master", - "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "samtools/faidx": { "branch": "master", - "git_sha": "fd742419940e01ba1c5ecb172c3e32ec840662fe", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "samtools/index": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "samtools/sort": { "branch": "master", - "git_sha": "a0f7be95788366c1923171e358da7d049eb440f9", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "samtools/view": { "branch": "master", - "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "star/align": { "branch": "master", - "git_sha": "cc08a888069f67cab8120259bddab8032d4c0fe3", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "star/genomegenerate": { "branch": "master", - "git_sha": "cc08a888069f67cab8120259bddab8032d4c0fe3", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "stringtie/merge": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] }, "stringtie/stringtie": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", "installed_by": ["modules"] } } diff --git a/modules/nf-core/agat/convertspgff2tsv/environment.yml b/modules/nf-core/agat/convertspgff2tsv/environment.yml index 9ca0ea28..b5fdf3db 100644 --- a/modules/nf-core/agat/convertspgff2tsv/environment.yml +++ b/modules/nf-core/agat/convertspgff2tsv/environment.yml @@ -1,3 +1,4 @@ +name: agat_convertspgff2tsv channels: - conda-forge - bioconda diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 00000000..17a04ef2 --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf index 9f062219..4264a92c 100644 --- a/modules/nf-core/cat/cat/main.nf +++ b/modules/nf-core/cat/cat/main.nf @@ -2,7 +2,7 @@ process CAT_CAT { tag "$meta.id" label 'process_low' - conda "conda-forge::pigz=2.3.4" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : 'biocontainers/pigz:2.3.4' }" diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml index 8acc0bfa..00a8db0b 100644 --- a/modules/nf-core/cat/cat/meta.yml +++ b/modules/nf-core/cat/cat/meta.yml @@ -7,9 +7,7 @@ keywords: tools: - cat: description: Just concatenation - documentation: https://man7.org/linux/man-pages/man1/cat.1.html - licence: ["GPL-3.0-or-later"] input: - meta: @@ -21,7 +19,6 @@ input: type: file description: List of compressed / uncompressed files pattern: "*" - output: - versions: type: file @@ -31,7 +28,9 @@ output: type: file description: Concatenated file. Will be gzipped if file_out ends with ".gz" pattern: "${file_out}" - authors: - "@erikrikarddaniel" - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 00000000..5766daaf --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,153 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_zipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_zipped_zipped_size")} + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_unzipped_zipped_size")} + ) + } + } + + 
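The gzipped test cases above all follow one assertion pattern: a gzip stream embeds metadata such as a timestamp, so the compressed file's checksum is not guaranteed stable across runs, and the tests snapshot the decompressed content instead — a few leading lines plus the total line count. A minimal sketch of such a `then` block (the snapshot names here are placeholders, not names used by this module):

    then {
        // decompress the file from the (meta, file) tuple emitted on the file_out channel
        def lines = path(process.out.file_out.get(0).get(1)).linesGzip
        assertAll(
            { assert process.success },
            { assert snapshot(lines[0..5]).match("example_lines") },  // first six decompressed lines
            { assert snapshot(lines.size()).match("example_size") }   // total line count
        )
    }

Uncompressed outputs, by contrast, can be snapshotted wholesale with `snapshot(process.out).match()`, as in the unzipped cases.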
test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")} + ) + } + } +} + diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 00000000..423571ba --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,121 @@ +{ + "test_cat_unzipped_zipped_size": { + "content": [ + 375 + ], + "timestamp": "2023-10-16T14:33:08.049445686" + }, + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped_lines": { + "content": [ + [ + "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", + "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", + "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", + "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" + ] + ], + "timestamp": "2023-10-16T14:32:33.629048645" + }, + "test_cat_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + 
"GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:08.038830506" + }, + "test_cat_one_file_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:21.39642399" + }, + "test_cat_zipped_zipped_size": { + "content": [ + 78 + ], + "timestamp": "2023-10-16T14:32:33.641869244" + }, + "test_cat_one_file_unzipped_zipped_size": { + "content": [ + 374 + ], + "timestamp": "2023-10-16T14:33:21.4094373" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 00000000..ec26b0fd --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 00000000..fbc79783 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml new file mode 100644 index 00000000..37b578f5 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/tags.yml @@ -0,0 +1,2 @@ +cat/cat: + - modules/nf-core/cat/cat/** diff --git a/modules/nf-core/cat/fastq/environment.yml b/modules/nf-core/cat/fastq/environment.yml new file mode 100644 index 00000000..bff93add --- /dev/null +++ b/modules/nf-core/cat/fastq/environment.yml @@ -0,0 +1,7 @@ +name: cat_fastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf index 5021e6fc..3d963784 100644 --- a/modules/nf-core/cat/fastq/main.nf +++ b/modules/nf-core/cat/fastq/main.nf @@ -2,7 +2,7 @@ process CAT_FASTQ { tag "$meta.id" label 'process_single' - conda "conda-forge::sed=4.7" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : 'nf-core/ubuntu:20.04' }" diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml index 8a39e309..db4ac3c7 100644 --- a/modules/nf-core/cat/fastq/meta.yml +++ b/modules/nf-core/cat/fastq/meta.yml @@ -34,7 +34,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@joseespinosa" - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test b/modules/nf-core/cat/fastq/tests/main.nf.test new file mode 100644 index 00000000..f5f94182 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test @@ -0,0 +1,143 @@ +nextflow_process { + + name "Test Process CAT_FASTQ" + script "../main.nf" + process "CAT_FASTQ" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/fastq" + + test("test_cat_fastq_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_single_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + 
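Each of these cases builds its input the same way: a two-element tuple of a meta map (carrying at least `id` and `single_end`) and a list of FASTQ files resolved from nf-core's shared `params.test_data` map. A standalone sketch of that convention — the sample id below is hypothetical, the test-data keys are the ones used in these tests:

    // hypothetical input tuple in the shape CAT_FASTQ expects
    input[0] = [
        [ id:'sampleA', single_end:false ],  // meta map; drives output naming
        [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true),
          file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ]
    ]

`checkIfExists: true` makes staging fail fast when a test file cannot be reached, rather than failing later inside the process.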
test("test_cat_fastq_single_end_single_file") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } +} diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test.snap b/modules/nf-core/cat/fastq/tests/main.nf.test.snap new file mode 100644 index 00000000..ec2342e5 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test.snap @@ -0,0 +1,78 @@ +{ + "test_cat_fastq_single_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d" + ] + ] + ], + "timestamp": "2023-10-17T23:19:12.990284837" + }, + "test_cat_fastq_single_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66" + ] + ] + ], + "timestamp": "2023-10-17T23:19:31.554568147" + }, + "test_cat_fastq_single_end_single_file": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,e325ef7deb4023447a1f074e285761af" + ] + ] + ], + "timestamp": "2023-10-17T23:19:49.629360033" + }, + "test_cat_fastq_paired_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66", + "test_2.merged.fastq.gz:md5,fe9f266f43a6fc3dcab690a18419a56e" + ] + ] + ] + ], + "timestamp": "2023-10-17T23:19:40.711617539" + }, + "test_cat_fastq_paired_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d", + "test_2.merged.fastq.gz:md5,77c8e966e130d8c6b6ec9be52fcb2bda" + ] + ] + ] + ], + "timestamp": "2023-10-18T07:53:20.923560211" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/fastq/tests/tags.yml b/modules/nf-core/cat/fastq/tests/tags.yml new file mode 100644 index 00000000..6ac43614 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/tags.yml @@ -0,0 +1,2 @@ +cat/fastq: + - modules/nf-core/cat/fastq/** diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml new file mode 100644 index 00000000..f0c63f69 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -0,0 +1,7 @@ +name: custom_dumpsoftwareversions +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.17 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index c9d014b1..7685b33c 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.15" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : - 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml index c32657de..5f15a5fd 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,4 +1,4 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: @@ -16,7 +16,6 @@ input: type: file description: YML file containing software versions pattern: "*.yml" - output: - yml: type: file @@ -30,7 +29,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@drpatelh" - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test new file mode 100644 index 00000000..eec1db10 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -0,0 +1,38 @@ +nextflow_process { + + name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS" + script "../main.nf" + process "CUSTOM_DUMPSOFTWAREVERSIONS" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "dumpsoftwareversions" + tag "custom/dumpsoftwareversions" + + test("Should run without failures") { + when { + process { + """ + def tool1_version = ''' + TOOL1: + tool1: 0.11.9 + '''.stripIndent() + + def tool2_version = ''' + TOOL2: + tool2: 1.9 + '''.stripIndent() + + input[0] = Channel.of(tool1_version, tool2_version).collectFile() + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap new file mode 100644 index 00000000..4274ed57 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -0,0 +1,27 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ], + "1": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "2": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "mqc_yml": [ + "software_versions_mqc.yml:md5,2570f4ba271ad08357b0d3d32a9cf84d" + ], + "versions": [ + "versions.yml:md5,3843ac526e762117eedf8825b40683df" + ], + "yml": [ + "software_versions.yml:md5,1c851188476409cda5752ce971b20b58" + ] + } + ], + "timestamp": "2023-11-03T14:43:22.157011" + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml new file mode 100644 index 00000000..405aa24a --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml @@ -0,0 +1,2 @@ +custom/dumpsoftwareversions: + - modules/nf-core/custom/dumpsoftwareversions/** diff --git a/modules/nf-core/fastp/environment.yml b/modules/nf-core/fastp/environment.yml new file mode 100644 index 
00000000..70389e66 --- /dev/null +++ b/modules/nf-core/fastp/environment.yml @@ -0,0 +1,7 @@ +name: fastp +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastp=0.23.4 diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf index 831b7f12..c8e815ae 100644 --- a/modules/nf-core/fastp/main.nf +++ b/modules/nf-core/fastp/main.nf @@ -2,7 +2,7 @@ process FASTP { tag "$meta.id" label 'process_medium' - conda "bioconda::fastp=0.23.4" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : 'biocontainers/fastp:0.23.4--h5f740d0_0' }" diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml index 197ea7ca..c22a16ab 100644 --- a/modules/nf-core/fastp/meta.yml +++ b/modules/nf-core/fastp/meta.yml @@ -33,7 +33,6 @@ input: - save_merged: type: boolean description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz` - output: - meta: type: map @@ -71,3 +70,6 @@ output: authors: - "@drpatelh" - "@kevinmenden" +maintainers: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/fastp/tests/main.nf.test b/modules/nf-core/fastp/tests/main.nf.test new file mode 100644 index 00000000..f610b735 --- /dev/null +++ b/modules/nf-core/fastp/tests/main.nf.test @@ -0,0 +1,485 @@ +nextflow_process { + + name "Test Process FASTP" + script "../main.nf" + process "FASTP" + tag "modules" + tag "modules_nfcore" + tag "fastp" + + test("test_fastp_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ + [ id:'test', single_end:true ], + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:12.922000 K (92.984097%)", + "single end (151 cycles)" ] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 99" ] + def read_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("test_fastp_single_end_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } 
+ + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "The input has little adapter percentage (~0.000000%), probably it's trimmed before."] + def log_text = [ "No adapter detected for read1", + "Q30 bases: 12281(88.3716%)"] + def json_text = ['"passed_filter_reads": 198'] + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("fastp test_fastp_interleaved") { + config './nextflow.config' + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "paired end (151 cycles + 151 cycles)"] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 198"] + def read_lines = [ "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("fastp test_fastp_interleaved_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_single_end_trim_fail") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = true + save_merged = false + + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:12.922000 K (92.984097%)", + "single end (151 cycles)"] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 99" ] + def read_lines = [ "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + 
"TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { failed_read_lines.each { failed_read_line -> + { assert path(process.out.reads_fail.get(0).get(1)).linesGzip.contains(failed_read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("test_fastp_single_end_trim_fail_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_trim_fail") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = true + save_merged = false + + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "The input has little adapter percentage (~0.000000%), probably it's trimmed before."] + def log_text = [ "No adapter detected for read1", + "Q30 bases: 12281(88.3716%)"] + def json_text = ['"passed_filter_reads": 198'] + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { failed_read2_lines.each { failed_read2_line -> + { assert path(process.out.reads_fail.get(0).get(1).get(1)).linesGzip.contains(failed_read2_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = true + + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "
"] + def log_text = [ "Merged and filtered:", + "total reads: 75", + "total bases: 13683"] + def json_text = ['"merged_and_filtered": {', '"total_reads": 75', '"total_bases": 13683'] + def read1_lines = [ "@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/1", + "CCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTC", + "AAAAAEAEEAEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEE/EAEEEEEE/6EEEEEEEEEEAEEAEEE/EE/AEEAEEEEEAEEEA/EEAAEAE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { read_merged_lines.each { read_merged_line -> + { assert path(process.out.reads_merged.get(0).get(1)).linesGzip.contains(read_merged_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged_adapterlist") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/fastp/adapters.fasta", checkIfExists: true) + save_trimmed_fail = false + save_merged = true + + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "
"] + def log_text = [ "Merged and filtered:", + "total reads: 75", + "total bases: 13683"] + def json_text = ['"merged_and_filtered": {', '"total_reads": 75', '"total_bases": 13683',"--adapter_fasta"] + def read1_lines = ["@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/1", + "CCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTC", + "AAAAAEAEEAEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEE/EAEEEEEE/6EEEEEEEEEEAEEAEEE/EE/AEEAEEEEEAEEEA/EEAAEAE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { read_merged_lines.each { read_merged_line -> + { assert path(process.out.reads_merged.get(0).get(1)).linesGzip.contains(read_merged_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } +} diff --git a/modules/nf-core/fastp/tests/main.nf.test.snap b/modules/nf-core/fastp/tests/main.nf.test.snap new file mode 100644 index 00000000..0fa68c7d --- /dev/null +++ b/modules/nf-core/fastp/tests/main.nf.test.snap @@ -0,0 +1,52 @@ +{ + "fastp test_fastp_interleaved_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,168f516f7bd4b7b6c32da7cba87299a4" + ] + ] + ], + "timestamp": "2023-10-17T11:04:45.794175881" + }, + "test_fastp_single_end_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,c852d7a6dba5819e4ac8d9673bedcacc" + ] + ] + ], + "timestamp": "2023-10-17T11:04:10.566343705" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "timestamp": "2023-10-17T11:04:10.582076024" + }, + "test_fastp_single_end_trim_fail_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,9a7ee180f000e8d00c7fb67f06293eb5" + ] + ] + ], + "timestamp": "2023-10-17T11:05:00.379878948" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastp/tests/nextflow.config b/modules/nf-core/fastp/tests/nextflow.config new file mode 100644 index 00000000..0f7849ad --- /dev/null +++ b/modules/nf-core/fastp/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + + withName: FASTP { + ext.args = "--interleaved_in" + } +} diff --git a/modules/nf-core/fastp/tests/tags.yml b/modules/nf-core/fastp/tests/tags.yml new file mode 100644 index 00000000..c1afcce7 --- /dev/null +++ b/modules/nf-core/fastp/tests/tags.yml @@ -0,0 +1,2 @@ +fastp: + - modules/nf-core/fastp/** diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml new file mode 100644 index 00000000..1787b38a --- /dev/null +++ b/modules/nf-core/fastqc/environment.yml @@ -0,0 +1,7 @@ +name: fastqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 249f9064..50e59f2b 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ 
-2,10 +2,10 @@ process FASTQC { tag "$meta.id" label 'process_medium' - conda "bioconda::fastqc=0.11.9" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'biocontainers/fastqc:0.11.9--0' }" + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" input: tuple val(meta), path(reads) diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml index 4da5bb5a..ee5507e0 100644 --- a/modules/nf-core/fastqc/meta.yml +++ b/modules/nf-core/fastqc/meta.yml @@ -50,3 +50,8 @@ authors: - "@grst" - "@ewels" - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@grst" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test index 3961de60..6437a144 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -1,13 +1,18 @@ nextflow_process { name "Test Process FASTQC" - script "modules/nf-core/fastqc/main.nf" + script "../main.nf" process "FASTQC" + tag "modules" + tag "modules_nfcore" tag "fastqc" test("Single-Read") { when { + params { + outdir = "$outputDir" + } process { """ input[0] = [ @@ -21,12 +26,16 @@ nextflow_process { } then { - assert process.success - assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" - assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") - assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" + assertAll ( + { assert process.success }, + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. + // looks like this:
<div id="header_filename">Mon 2 Oct 2023<br/>test.gz</div>
+ // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 + { assert process.out.html.get(0).get(1) ==~ ".*/test_fastqc.html" }, + { assert path(process.out.html.get(0).get(1)).getText().contains("File typeConventional base calls") }, + { assert snapshot(process.out.versions).match("versions") }, + { assert process.out.zip.get(0).get(1) ==~ ".*/test_fastqc.zip" } + ) } - } - } diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap new file mode 100644 index 00000000..636a32ce --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -0,0 +1,10 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "timestamp": "2023-10-09T23:40:54+0000" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml new file mode 100644 index 00000000..7834294b --- /dev/null +++ b/modules/nf-core/fastqc/tests/tags.yml @@ -0,0 +1,2 @@ +fastqc: + - modules/nf-core/fastqc/** diff --git a/modules/nf-core/gatk4/bedtointervallist/environment.yml b/modules/nf-core/gatk4/bedtointervallist/environment.yml new file mode 100644 index 00000000..e7cb4280 --- /dev/null +++ b/modules/nf-core/gatk4/bedtointervallist/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_bedtointervallist +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/bedtointervallist/main.nf b/modules/nf-core/gatk4/bedtointervallist/main.nf index 24968c38..88b24b1a 100644 --- a/modules/nf-core/gatk4/bedtointervallist/main.nf +++ b/modules/nf-core/gatk4/bedtointervallist/main.nf @@ -2,7 +2,7 @@ process GATK4_BEDTOINTERVALLIST { tag "$meta.id" label 'process_medium' - conda "bioconda::gatk4=4.4.0.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" diff --git a/modules/nf-core/gatk4/bedtointervallist/meta.yml b/modules/nf-core/gatk4/bedtointervallist/meta.yml index 83617a7c..187da885 100644 --- a/modules/nf-core/gatk4/bedtointervallist/meta.yml +++ b/modules/nf-core/gatk4/bedtointervallist/meta.yml @@ -46,3 +46,6 @@ output: authors: - "@kevinmenden" - "@ramprasadn" +maintainers: + - "@kevinmenden" + - "@ramprasadn" diff --git a/modules/nf-core/gatk4/createsequencedictionary/environment.yml b/modules/nf-core/gatk4/createsequencedictionary/environment.yml new file mode 100644 index 00000000..db663e14 --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_createsequencedictionary +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/createsequencedictionary/main.nf b/modules/nf-core/gatk4/createsequencedictionary/main.nf index 3e4efdd9..b47ad162 100644 --- a/modules/nf-core/gatk4/createsequencedictionary/main.nf +++ b/modules/nf-core/gatk4/createsequencedictionary/main.nf @@ -2,7 +2,7 @@ process GATK4_CREATESEQUENCEDICTIONARY { tag "$fasta" label 'process_medium' - conda "bioconda::gatk4=4.4.0.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" diff --git a/modules/nf-core/gatk4/createsequencedictionary/meta.yml b/modules/nf-core/gatk4/createsequencedictionary/meta.yml index 9b8b8c89..f9d70be0 100644 --- a/modules/nf-core/gatk4/createsequencedictionary/meta.yml +++ b/modules/nf-core/gatk4/createsequencedictionary/meta.yml @@ -15,7 +15,6 @@ tools: documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s doi: 10.1158/1538-7445.AM2017-3590 licence: ["Apache-2.0"] - input: - meta: type: map @@ -38,3 +37,6 @@ output: authors: - "@maxulysse" - "@ramprasadn" +maintainers: + - "@maxulysse" + - "@ramprasadn" diff --git a/modules/nf-core/gatk4/markduplicates/environment.yml b/modules/nf-core/gatk4/markduplicates/environment.yml new file mode 100644 index 00000000..9adad104 --- /dev/null +++ b/modules/nf-core/gatk4/markduplicates/environment.yml @@ -0,0 +1,8 @@ +name: gatk4_markduplicates +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 + - bioconda::samtools=1.17 diff --git a/modules/nf-core/gatk4/markduplicates/main.nf b/modules/nf-core/gatk4/markduplicates/main.nf index 59e52a3d..564b86d3 100644 --- a/modules/nf-core/gatk4/markduplicates/main.nf +++ b/modules/nf-core/gatk4/markduplicates/main.nf @@ -2,7 +2,7 @@ process GATK4_MARKDUPLICATES { tag "$meta.id" label 'process_medium' - conda "bioconda::gatk4=4.4.0.0 bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-d9e7bad0f7fbc8f4458d5c3ab7ffaaf0235b59fb:f857e2d6cc88d35580d01cf39e0959a68b83c1d9-0': 'biocontainers/mulled-v2-d9e7bad0f7fbc8f4458d5c3ab7ffaaf0235b59fb:f857e2d6cc88d35580d01cf39e0959a68b83c1d9-0' }" diff --git a/modules/nf-core/gatk4/markduplicates/meta.yml b/modules/nf-core/gatk4/markduplicates/meta.yml index d3e75505..b0f09d4b 100644 --- a/modules/nf-core/gatk4/markduplicates/meta.yml +++ b/modules/nf-core/gatk4/markduplicates/meta.yml @@ -7,16 +7,12 @@ keywords: - sort tools: - gatk4: - description: - Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools - with a primary focus on variant discovery and genotyping. Its powerful processing engine - and high-performance computing features make it capable of taking on projects of any size. + description: Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools with a primary focus on variant discovery and genotyping. Its powerful processing engine and high-performance computing features make it capable of taking on projects of any size. 
homepage: https://gatk.broadinstitute.org/hc/en-us documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard- tool_dev_url: https://github.com/broadinstitute/gatk doi: 10.1158/1538-7445.AM2017-3590 licence: ["MIT"] - input: - meta: type: map @@ -35,7 +31,6 @@ input: type: file description: Fasta index file pattern: "*.{fai}" - output: - meta: type: map @@ -66,8 +61,11 @@ output: type: file description: Duplicate metrics file generated by GATK pattern: "*.{metrics.txt}" - authors: - "@ajodeh-juma" - "@FriederikeHanssen" - "@maxulysse" +maintainers: + - "@ajodeh-juma" + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml new file mode 100644 index 00000000..d2a9f21a --- /dev/null +++ b/modules/nf-core/multiqc/environment.yml @@ -0,0 +1,7 @@ +name: multiqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.17 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 65d7dd0d..2bbc3983 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -1,10 +1,10 @@ process MULTIQC { label 'process_single' - conda "bioconda::multiqc=1.15" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : - 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.17--pyhdfd78af_0' : + 'biocontainers/multiqc:1.17--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index f93b5ee5..f1aa660e 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,5 +1,5 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json -name: MultiQC +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: multiqc description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: - QC @@ -13,7 +13,6 @@ tools: homepage: https://multiqc.info/ documentation: https://multiqc.info/docs/ licence: ["GPL-3.0-or-later"] - input: - multiqc_files: type: file @@ -31,7 +30,6 @@ input: type: file description: Optional logo file for MultiQC pattern: "*.{png}" - output: - report: type: file @@ -54,3 +52,8 @@ authors: - "@bunop" - "@drpatelh" - "@jfy133" +maintainers: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/picard/collectinsertsizemetrics/environment.yml b/modules/nf-core/picard/collectinsertsizemetrics/environment.yml new file mode 100644 index 00000000..5c85f872 --- /dev/null +++ b/modules/nf-core/picard/collectinsertsizemetrics/environment.yml @@ -0,0 +1,7 @@ +name: picard_collectinsertsizemetrics +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::picard=3.1.0 diff --git a/modules/nf-core/picard/collectinsertsizemetrics/main.nf b/modules/nf-core/picard/collectinsertsizemetrics/main.nf index 1d538fae..48e4d2ad 100644 --- a/modules/nf-core/picard/collectinsertsizemetrics/main.nf +++ b/modules/nf-core/picard/collectinsertsizemetrics/main.nf @@ -2,10 +2,10 @@ process PICARD_COLLECTINSERTSIZEMETRICS { tag "$meta.id" label 'process_single' - conda "bioconda::picard=3.0.0" + 
conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : - 'biocontainers/picard:3.0.0--hdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/picard:3.1.0--hdfd78af_0' : + 'biocontainers/picard:3.1.0--hdfd78af_0' }" input: tuple val(meta), path(bam) diff --git a/modules/nf-core/picard/collectinsertsizemetrics/meta.yml b/modules/nf-core/picard/collectinsertsizemetrics/meta.yml index e611bdd4..efd5abe0 100644 --- a/modules/nf-core/picard/collectinsertsizemetrics/meta.yml +++ b/modules/nf-core/picard/collectinsertsizemetrics/meta.yml @@ -6,7 +6,6 @@ keywords: - insert - statistics - bam - tools: - "picard": description: "Java tools for working with NGS data in the BAM format" @@ -14,7 +13,6 @@ tools: documentation: "https://broadinstitute.github.io/picard/" tool_dev_url: "https://github.com/broadinstitute/picard" licence: "['MIT']" - input: - meta: type: map @@ -25,7 +23,6 @@ input: type: file description: BAM/CRAM/SAM file pattern: "*.{bam,cram,sam}" - output: - meta: type: map @@ -46,3 +43,5 @@ output: pattern: "*.txt" authors: - "@FerriolCalvet" +maintainers: + - "@FerriolCalvet" diff --git a/modules/nf-core/picard/collectwgsmetrics/environment.yml b/modules/nf-core/picard/collectwgsmetrics/environment.yml new file mode 100644 index 00000000..8adda491 --- /dev/null +++ b/modules/nf-core/picard/collectwgsmetrics/environment.yml @@ -0,0 +1,8 @@ +name: picard_collectwgsmetrics +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::picard=3.1.0 + - r::r-base diff --git a/modules/nf-core/picard/collectwgsmetrics/main.nf b/modules/nf-core/picard/collectwgsmetrics/main.nf index 1d59334c..67aa5b5e 100644 --- a/modules/nf-core/picard/collectwgsmetrics/main.nf +++ b/modules/nf-core/picard/collectwgsmetrics/main.nf @@ -2,10 +2,10 @@ process PICARD_COLLECTWGSMETRICS { tag "$meta.id" label 'process_single' - conda "bioconda::picard=3.0.0 r::r-base" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' : - 'biocontainers/picard:3.0.0--hdfd78af_1' }" + 'https://depot.galaxyproject.org/singularity/picard:3.1.0--hdfd78af_0' : + 'biocontainers/picard:3.1.0--hdfd78af_0' }" input: tuple val(meta), path(bam), path(bai) diff --git a/modules/nf-core/picard/collectwgsmetrics/meta.yml b/modules/nf-core/picard/collectwgsmetrics/meta.yml index 19906f08..5576ef92 100644 --- a/modules/nf-core/picard/collectwgsmetrics/meta.yml +++ b/modules/nf-core/picard/collectwgsmetrics/meta.yml @@ -68,3 +68,8 @@ authors: - "@flowuenne" - "@lassefolkersen" - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@flowuenne" + - "@lassefolkersen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/faidx/environment.yml b/modules/nf-core/samtools/faidx/environment.yml new file mode 100644 index 00000000..73badedb --- /dev/null +++ b/modules/nf-core/samtools/faidx/environment.yml @@ -0,0 +1,7 @@ +name: samtools_faidx +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf index 59ed3088..3aa98822 100644 --- a/modules/nf-core/samtools/faidx/main.nf +++ b/modules/nf-core/samtools/faidx/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_FAIDX { tag "$fasta" label 'process_single' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml index 957b25e5..e189af28 100644 --- a/modules/nf-core/samtools/faidx/meta.yml +++ b/modules/nf-core/samtools/faidx/meta.yml @@ -55,3 +55,7 @@ authors: - "@drpatelh" - "@ewels" - "@phue" +maintainers: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/samtools/index/environment.yml b/modules/nf-core/samtools/index/environment.yml new file mode 100644 index 00000000..3c6f95b2 --- /dev/null +++ b/modules/nf-core/samtools/index/environment.yml @@ -0,0 +1,7 @@ +name: samtools_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf index 0b20aa4b..256bd7c4 100644 --- a/modules/nf-core/samtools/index/main.nf +++ b/modules/nf-core/samtools/index/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_INDEX { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml index 8bd2fa6f..01a4ee03 100644 --- a/modules/nf-core/samtools/index/meta.yml +++ b/modules/nf-core/samtools/index/meta.yml @@ -51,3 +51,7 @@ authors: - "@drpatelh" - "@ewels" - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/sort/environment.yml b/modules/nf-core/samtools/sort/environment.yml new file mode 100644 index 00000000..508659f0 --- /dev/null +++ b/modules/nf-core/samtools/sort/environment.yml @@ -0,0 +1,7 @@ +name: samtools_sort +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf index 2b7753fd..60f0c634 100644 --- a/modules/nf-core/samtools/sort/main.nf +++ b/modules/nf-core/samtools/sort/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_SORT { tag "$meta.id" label 'process_medium' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml index 07328431..2200de72 100644 --- a/modules/nf-core/samtools/sort/meta.yml +++ b/modules/nf-core/samtools/sort/meta.yml @@ -46,3 +46,6 @@ output: authors: - "@drpatelh" - "@ewels" +maintainers: + - "@drpatelh" + - "@ewels" diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test b/modules/nf-core/samtools/sort/tests/main.nf.test new file mode 100644 index 00000000..1f72f3b9 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test @@ -0,0 +1,70 @@ +nextflow_process { + + name "Test Process SAMTOOLS_SORT" + script "../main.nf" + process "SAMTOOLS_SORT" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/sort" + + test("test_samtools_sort") { + + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + [ + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_samtools_sort_stub") { + + config "./nextflow.config" + options "-stub-run" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + [ + file(params.test_data['sarscov2']['illumina']['test_paired_end_bam'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test.snap b/modules/nf-core/samtools/sort/tests/main.nf.test.snap new file mode 100644 index 00000000..a43566da --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test.snap @@ -0,0 +1,39 @@ +{ + "test_samtools_sort": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,a29570e7607d217c2fa4d75829e09cd7" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,46f7a36082fa1f68285fe30d689244e8" + ], + "bam": [ + [ + { + "id": 
"test", + "single_end": false + }, + "test.sorted.bam:md5,a29570e7607d217c2fa4d75829e09cd7" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,46f7a36082fa1f68285fe30d689244e8" + ] + } + ], + "timestamp": "2023-10-17T17:21:46.5427968" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/sort/tests/nextflow.config b/modules/nf-core/samtools/sort/tests/nextflow.config new file mode 100644 index 00000000..d0f35086 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_SORT { + ext.prefix = { "${meta.id}.sorted" } + } + +} diff --git a/modules/nf-core/samtools/sort/tests/tags.yml b/modules/nf-core/samtools/sort/tests/tags.yml new file mode 100644 index 00000000..cd63ea20 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/tags.yml @@ -0,0 +1,3 @@ +samtools/sort: + - modules/nf-core/samtools/sort/** + - tests/modules/nf-core/samtools/sort/** diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml new file mode 100644 index 00000000..141e7bd8 --- /dev/null +++ b/modules/nf-core/samtools/view/environment.yml @@ -0,0 +1,7 @@ +name: samtools_view +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf index cb91facf..ddf3f88a 100644 --- a/modules/nf-core/samtools/view/main.nf +++ b/modules/nf-core/samtools/view/main.nf @@ -2,7 +2,7 @@ process SAMTOOLS_VIEW { tag "$meta.id" label 'process_low' - conda "bioconda::samtools=1.17" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : 'biocontainers/samtools:1.17--h00cdaf9_0' }" diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml index 3b05450b..3dadafae 100644 --- a/modules/nf-core/samtools/view/meta.yml +++ b/modules/nf-core/samtools/view/meta.yml @@ -82,3 +82,8 @@ authors: - "@joseespinosa" - "@FriederikeHanssen" - "@priyanka-surana" +maintainers: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/star/align/environment.yml b/modules/nf-core/star/align/environment.yml new file mode 100644 index 00000000..6db20988 --- /dev/null +++ b/modules/nf-core/star/align/environment.yml @@ -0,0 +1,9 @@ +name: star_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::star=2.7.10a + - bioconda::samtools=1.16.1 + - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/align/main.nf b/modules/nf-core/star/align/main.nf index d0e20384..fa645a6d 100644 --- a/modules/nf-core/star/align/main.nf +++ b/modules/nf-core/star/align/main.nf @@ -2,7 +2,7 @@ process STAR_ALIGN { tag "$meta.id" label 'process_high' - conda "bioconda::star=2.7.10a bioconda::samtools=1.16.1 conda-forge::gawk=5.1.0" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
        'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' :
        'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }"
diff --git a/modules/nf-core/star/align/meta.yml b/modules/nf-core/star/align/meta.yml
index 3d8fed0c..e80dbb7d 100644
--- a/modules/nf-core/star/align/meta.yml
+++ b/modules/nf-core/star/align/meta.yml
@@ -52,7 +52,6 @@ input:
   - seq_center:
       type: string
       description: Sequencing center
-
 output:
   - bam:
       type: file
@@ -106,8 +105,11 @@ output:
       type: file
       description: STAR output bedGraph format file(s) (optional)
       pattern: "*.bg"
-
 authors:
   - "@kevinmenden"
   - "@drpatelh"
   - "@praveenraj2018"
+maintainers:
+  - "@kevinmenden"
+  - "@drpatelh"
+  - "@praveenraj2018"
diff --git a/modules/nf-core/star/genomegenerate/environment.yml b/modules/nf-core/star/genomegenerate/environment.yml
new file mode 100644
index 00000000..0b35ff51
--- /dev/null
+++ b/modules/nf-core/star/genomegenerate/environment.yml
@@ -0,0 +1,9 @@
+name: star_genomegenerate
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::star=2.7.10a
+  - bioconda::samtools=1.16.1
+  - conda-forge::gawk=5.1.0
diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf
index 43424042..473e62a6 100644
--- a/modules/nf-core/star/genomegenerate/main.nf
+++ b/modules/nf-core/star/genomegenerate/main.nf
@@ -2,7 +2,7 @@ process STAR_GENOMEGENERATE {
     tag "$fasta"
     label 'process_high'
 
-    conda "bioconda::star=2.7.10a bioconda::samtools=1.16.1 conda-forge::gawk=5.1.0"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' :
        'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:1df389393721fc66f3fd8778ad938ac711951107-0' }"
diff --git a/modules/nf-core/star/genomegenerate/meta.yml b/modules/nf-core/star/genomegenerate/meta.yml
index eba2d9cf..1061e1b8 100644
--- a/modules/nf-core/star/genomegenerate/meta.yml
+++ b/modules/nf-core/star/genomegenerate/meta.yml
@@ -31,7 +31,6 @@ input:
   - gtf:
       type: file
       description: GTF file of the reference genome
-
 output:
   - meta:
       type: map
@@ -46,7 +45,9 @@ output:
       type: file
       description: File containing software versions
       pattern: "versions.yml"
-
 authors:
   - "@kevinmenden"
   - "@drpatelh"
+maintainers:
+  - "@kevinmenden"
+  - "@drpatelh"
diff --git a/modules/nf-core/stringtie/merge/environment.yml b/modules/nf-core/stringtie/merge/environment.yml
new file mode 100644
index 00000000..9914b202
--- /dev/null
+++ b/modules/nf-core/stringtie/merge/environment.yml
@@ -0,0 +1,7 @@
+name: stringtie_merge
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::stringtie=2.2.1
diff --git a/modules/nf-core/stringtie/merge/main.nf b/modules/nf-core/stringtie/merge/main.nf
index 12224f78..c2568219 100644
--- a/modules/nf-core/stringtie/merge/main.nf
+++ b/modules/nf-core/stringtie/merge/main.nf
@@ -2,7 +2,7 @@ process STRINGTIE_MERGE {
     label 'process_medium'
 
     // Note: 2.7X indices incompatible with AWS iGenomes.
-    conda "bioconda::stringtie=2.2.1"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/stringtie:2.2.1--hecb563c_2' :
        'biocontainers/stringtie:2.2.1--hecb563c_2' }"
diff --git a/modules/nf-core/stringtie/merge/meta.yml b/modules/nf-core/stringtie/merge/meta.yml
index 2e9784fe..5d02d678 100644
--- a/modules/nf-core/stringtie/merge/meta.yml
+++ b/modules/nf-core/stringtie/merge/meta.yml
@@ -32,6 +32,7 @@ output:
       type: file
       description: File containing software versions
       pattern: "versions.yml"
-
 authors:
   - "@yuukiiwa"
+maintainers:
+  - "@yuukiiwa"
diff --git a/modules/nf-core/stringtie/stringtie/environment.yml b/modules/nf-core/stringtie/stringtie/environment.yml
new file mode 100644
index 00000000..7a0eccdb
--- /dev/null
+++ b/modules/nf-core/stringtie/stringtie/environment.yml
@@ -0,0 +1,7 @@
+name: stringtie_stringtie
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::stringtie=2.2.1
diff --git a/modules/nf-core/stringtie/stringtie/main.nf b/modules/nf-core/stringtie/stringtie/main.nf
index d0f8b563..6e25ba27 100644
--- a/modules/nf-core/stringtie/stringtie/main.nf
+++ b/modules/nf-core/stringtie/stringtie/main.nf
@@ -2,7 +2,7 @@ process STRINGTIE_STRINGTIE {
     tag "$meta.id"
     label 'process_medium'
 
-    conda "bioconda::stringtie=2.2.1"
+    conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/stringtie:2.2.1--hecb563c_2' :
        'biocontainers/stringtie:2.2.1--hecb563c_2' }"
diff --git a/modules/nf-core/stringtie/stringtie/meta.yml b/modules/nf-core/stringtie/stringtie/meta.yml
index 75518470..d8ebdd88 100644
--- a/modules/nf-core/stringtie/stringtie/meta.yml
+++ b/modules/nf-core/stringtie/stringtie/meta.yml
@@ -5,7 +5,6 @@ keywords:
   - assembly
   - quantification
   - gtf
-
 tools:
   - stringtie2:
       description: |
@@ -55,3 +54,5 @@ output:
       pattern: "versions.yml"
 authors:
   - "@drpatelh"
+maintainers:
+  - "@drpatelh"
From ef9f57a869e79446397e22c260aa9c80cab962ab Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Wed, 15 Nov 2023 10:40:39 +0100
Subject: [PATCH 42/45] move compression out of vcf_collect

---
 bin/vcf_collect.py                | 5 ++---
 modules/local/vcf_collect/main.nf | 3 ++-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index 4d677de0..8e15ccfc 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -8,7 +8,6 @@
 import ast
 import numpy as np
 import csv
-import gzip
 
 logger = logging.getLogger()
 
@@ -344,9 +343,9 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None:
             "FORMAT",
             "Sample",
         ]
-    ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE, compression="gzip")
+    ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE)
 
-    with gzip.open(out_file, "r+") as f:
+    with open(out_file, "r+") as f:
         content = f.read()
         f.seek(0, 0)
         f.write(header.rstrip("\r\n") + "\n" + content)
diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf
index 42f94c40..1b8e56fe 100644
--- a/modules/local/vcf_collect/main.nf
+++ b/modules/local/vcf_collect/main.nf
@@ -22,7 +22,8 @@ process VCF_COLLECT {
     script:
     def prefix = task.ext.prefix ?: "${meta.id}"
     """
-    vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf.gz
+    vcf_collect.py --fusioninspector $fusioninspector_tsv --fusionreport $fusionreport_report --fusioninspector_gtf $fusioninspector_gtf_tsv --hgnc $hgnc_ref --sample ${prefix} --out ${prefix}_fusion_data.vcf
+    gzip ${prefix}_fusion_data.vcf
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
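Background for PATCH 42: the previous revision asked pandas to gzip the table (compression="gzip" in to_csv) and then reopened the same file with gzip.open(out_file, "r+") to prepend the VCF header. The gzip module treats "r+" as a read-only mode, so the in-place header write fails on a compressed stream; writing plain text and compressing afterwards with gzip in the process script keeps the prepend step valid. A minimal sketch of the resulting pattern, simplified from write_vcf above (names are illustrative):

    import csv
    import pandas as pd

    def write_vcf_plain(df: pd.DataFrame, header: str, out_file: str) -> None:
        # Write the records as uncompressed TSV; compression now happens
        # afterwards, outside Python (the `gzip` call in the module script).
        df.to_csv(out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE)
        # Prepending the header only works on a plain-text file: read it back,
        # seek to the start, and rewrite header + content in place.
        with open(out_file, "r+") as f:
            content = f.read()
            f.seek(0, 0)
            f.write(header.rstrip("\r\n") + "\n" + content)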
From f33a5e9f36271d07c11cb28b762a2c3859aa7302 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Wed, 15 Nov 2023 13:25:59 +0100
Subject: [PATCH 43/45] fix ,/;

---
 bin/vcf_collect.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index 8e15ccfc..57cfbcaa 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -319,7 +319,7 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
             f"TRANSCRIPT_ID_A={row['CDS_LEFT_ID']};TRANSCRIPT_ID_B={row['CDS_RIGHT_ID']};"
             f"TRANSCRIPT_VERSION_A={row['Left_transcript_version']};TRANSCRIPT_VERSION_B={row['Right_transcript_version']};"
             f"HGNC_ID_A={row['Left_hgnc_id']};HGNC_ID_B={row['Right_hgnc_id']};"
-            f"EXON_NUMBER_A={row['Left_exon_number']},EXON_NUMBER_B={row['Right_exon_number']};"
+            f"EXON_NUMBER_A={row['Left_exon_number']};EXON_NUMBER_B={row['Right_exon_number']};"
             f"ANNOTATIONS={row['annots']}"
         )
         df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}"
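The one-character change in PATCH 43 matters because the VCF specification delimits INFO entries with semicolons, while commas separate multiple values within one key. With the old comma, EXON_NUMBER_B was parsed as a second value of EXON_NUMBER_A rather than as its own key. A toy illustration, assuming a naive key=value parser:

    # Semicolons separate INFO keys; commas separate values within a key.
    info = "EXON_NUMBER_A=3,EXON_NUMBER_B=5;ANNOTATIONS=[]"
    keys = [entry.split("=")[0] for entry in info.split(";")]
    print(keys)  # ['EXON_NUMBER_A', 'ANNOTATIONS'] -- EXON_NUMBER_B is swallowed into the first value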
From a8b07438c70907de25960cfbf934e1b175e82cd2 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Thu, 16 Nov 2023 21:35:59 +0100
Subject: [PATCH 44/45] fix values display vcf

---
 bin/vcf_collect.py                | 13 ++++++++++++-
 modules/local/vcf_collect/main.nf |  2 +-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index 57cfbcaa..dbbd384e 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -275,7 +275,7 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
         concatenate_columns, axis=1
     )
     fusion_report.columns = fusion_report.columns.str.upper()
-    fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ", ".join(x))
+    fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x))
     fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True)
 
     return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(
@@ -297,6 +297,17 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
     df["INFO"] = ""
     df["Sample"] = ""
     df["Strand1"] = df["Strand1"].astype(str)
+    df["JunctionReadCount"] = df["JunctionReadCount"].fillna(0).astype(int).astype(str)
+    df["SpanningFragCount"] = df["SpanningFragCount"].fillna(0).astype(int).astype(str)
+    df["FFPM"] = df["FFPM"].fillna(0).astype(float).astype(str)
+    df["ChromosomeA"] = df["ChromosomeA"].fillna(0).astype(int).astype(str)
+    df["ChromosomeB"] = df["ChromosomeB"].fillna(0).astype(int).astype(str)
+    df["Left_hgnc_id"] = df["Left_hgnc_id"].fillna(0).astype(int).astype(str)
+    df["Right_hgnc_id"] = df["Right_hgnc_id"].fillna(0).astype(int).astype(str)
+    df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str)
+    df["Right_exon_number"] = df["Right_exon_number"].fillna(0).astype(int).astype(str)
+    df["Left_transcript_version"] = df["Left_transcript_version"].fillna(0).astype(int).astype(str)
+    df["Right_transcript_version"] = df["Right_transcript_version"].fillna(0).astype(int).astype(str)
 
     for index, row in df.iterrows():
         if row["Strand1"] == "nan":
diff --git a/modules/local/vcf_collect/main.nf b/modules/local/vcf_collect/main.nf
index 1b8e56fe..2af4a777 100644
--- a/modules/local/vcf_collect/main.nf
+++ b/modules/local/vcf_collect/main.nf
@@ -2,7 +2,7 @@ process VCF_COLLECT {
     tag "$meta.id"
     label 'process_single'
 
-    conda "conda-forge::python=3.8.3"
+    conda "conda-forge::pandas=1.5.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
        'quay.io/biocontainers/pandas:1.5.2' }"
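The fillna/astype chains added in PATCH 44 pin down how pandas prints these columns: any integer column that picked up NaN during the outer join is promoted to float64, so read counts would render as "14.0" or "nan" in the VCF body. Filling with 0 and casting through int back to str restores clean integer display. A minimal demonstration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"JunctionReadCount": [14, np.nan]})
    print(df["JunctionReadCount"].astype(str).tolist())
    # ['14.0', 'nan'] -- a single NaN promotes the whole column to float64

    fixed = df["JunctionReadCount"].fillna(0).astype(int).astype(str)
    print(fixed.tolist())
    # ['14', '0']

(PATCH 45 below drops the int cast for ChromosomeA/B again, presumably because chromosome names such as X and Y are not integers.)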
all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])] + all_df = all_df[ + ((all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])) + | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0)) + ] all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Right_exon_number"}) + all_df = all_df[ [ "FUSION", @@ -300,8 +321,8 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: df["JunctionReadCount"] = df["JunctionReadCount"].fillna(0).astype(int).astype(str) df["SpanningFragCount"] = df["SpanningFragCount"].fillna(0).astype(int).astype(str) df["FFPM"] = df["FFPM"].fillna(0).astype(float).astype(str) - df["ChromosomeA"] = df["ChromosomeA"].fillna(0).astype(int).astype(str) - df["ChromosomeB"] = df["ChromosomeB"].fillna(0).astype(int).astype(str) + df["ChromosomeA"] = df["ChromosomeA"].fillna(0).astype(str) + df["ChromosomeB"] = df["ChromosomeB"].fillna(0).astype(str) df["Left_hgnc_id"] = df["Left_hgnc_id"].fillna(0).astype(int).astype(str) df["Right_hgnc_id"] = df["Right_hgnc_id"].fillna(0).astype(int).astype(str) df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str) @@ -334,6 +355,7 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: f"ANNOTATIONS={row['annots']}" ) df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}" + return df @@ -362,13 +384,13 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: f.write(header.rstrip("\r\n") + "\n" + content) -def build_hgcn_dataframe(file: str) -> pd.DataFrame: +def build_hgnc_dataframe(file: str) -> pd.DataFrame: """ Build a DataFrame from HGNC input file, extracting 'hgnc_id' and 'ensembl_gene_id' columns. """ df = pd.read_csv(file, sep="\t", low_memory=False) df["hgnc_id"] = df["hgnc_id"].str.replace("HGNC:", "") - return df[["hgnc_id", "ensembl_gene_id"]].dropna() + return df[["hgnc_id", "ensembl_gene_id", "symbol"]].dropna() def build_gtf_dataframe(file: str) -> pd.DataFrame: