From ec3e795c4604e25ecada43f2bb3671c989a4e08c Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 4 Mar 2024 09:34:38 +0100
Subject: [PATCH 1/4] update collectrnaseqmetrics memory requirement

---
 bin/check_samplesheet.py                      |  4 +-
 bin/vcf_collect.py                            | 60 ++++++++++++++-----
 .../local/picard/collectrnaseqmetrics/main.nf | 14 ++---
 3 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 57cf8e6b..218dd802 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -123,7 +123,9 @@ def _validate_fastq_format(self, filename):
     def _validate_strandedness(self, row):
         """Assert that the strandedness given is one of unstranded/forward/reverse"""
         if row[self._strandedness] not in self.VALID_STRANDEDNESSES:
-            raise AssertionError(f"Strandedness must be one of {', '.join(self.VALID_STRANDEDNESSES)}")
+            raise AssertionError(
+                f"Strandedness must be one of {', '.join(self.VALID_STRANDEDNESSES)}"
+            )
 
     def validate_unique_samples(self):
         """
diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index 7e988c1e..61283d3b 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -65,7 +65,9 @@ def vcf_collect(
     gtf_df = build_gtf_dataframe(gtf)
     all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id")
 
-    all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int)
+    all_df[["PosA", "orig_start", "orig_end"]] = (
+        all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int)
+    )
 
     all_df = all_df[
         ((all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"]))
@@ -75,7 +77,9 @@ def vcf_collect(
     all_df.replace("", np.nan, inplace=True)
     all_df = all_df.drop_duplicates()
 
-    all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan)
+    all_df[["exon_number", "transcript_version"]] = all_df[
+        ["exon_number", "transcript_version"]
+    ].replace(0, np.nan)
     # Fill non-empty values within each group for 'exon_number' and 'transcript_version'
     all_df["exon_number"] = all_df.groupby("PosA")["exon_number"].transform(
         lambda x: x.fillna(method="ffill").fillna(method="bfill")
@@ -116,8 +120,12 @@ def vcf_collect(
     ].drop_duplicates()
     all_df["CDS_RIGHT_ID"] = all_df["CDS_RIGHT_ID"].astype("str")
     all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id")
-    all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(0)
-    all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype(int)
+    all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(
+        0
+    )
+    all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype(
+        int
+    )
     all_df = all_df[
         ((all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"]))
         | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0))
@@ -126,7 +134,9 @@ def vcf_collect(
     all_df[["PosA", "PosB"]] = all_df[["PosA", "PosB"]].replace(0, np.nan)
     all_df = all_df.replace("", np.nan)
 
-    all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan)
+    all_df[["exon_number", "transcript_version"]] = all_df[
+        ["exon_number", "transcript_version"]
+    ].replace(0, np.nan)
     # Fill non-empty values within each group for 'exon_number' and 'transcript_version'
     all_df["exon_number"] = all_df.groupby("PosB")["exon_number"].transform(
         lambda x: x.fillna(method="ffill").fillna(method="bfill")
@@ -212,7 +222,9 @@ def parse_args(argv=None):
         type=Path,
         help="HGNC database.",
     )
-    parser.add_argument("--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample")
+    parser.add_argument(
+        "--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample"
+    )
     parser.add_argument(
         "--out",
         metavar="OUT",
@@ -280,7 +292,11 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame:
         df["annots"] = (
             df["annots"]
             .apply(convert_to_list)
-            .apply(lambda x: ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else "")
+            .apply(
+                lambda x: (
+                    ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else ""
+                )
+            )
         )
     else:
         for i in [
@@ -359,9 +375,9 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
     fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x))
     fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True)
 
-    return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(
-        ["FUSION"]
-    )
+    return fusion_report[
+        ["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]
+    ].set_index(["FUSION"])
 
 
 def read_fusionreport_csv(file: str) -> pd.DataFrame:
@@ -370,7 +386,9 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame:
     for column in columns_to_iterate:
         if column not in df.columns:
             df[column] = ""
-    df[["starfusion", "arriba", "fusioncatcher"]] = df[["starfusion", "arriba", "fusioncatcher"]].astype("str")
+    df[["starfusion", "arriba", "fusioncatcher"]] = df[
+        ["starfusion", "arriba", "fusioncatcher"]
+    ].astype("str")
     for index, row in df.iterrows():
         for column in columns_to_iterate:
             cell_value = row[column]
@@ -398,7 +416,9 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame:
     df[["GeneA", "GeneB"]] = df["Fusion"].str.split("--", expand=True)
     df = df.set_index("Fusion")
     df.to_csv("tmp.csv")
-    return df[["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"]]
+    return df[
+        ["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"]
+    ]
 
 
 def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
@@ -425,7 +445,9 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
     df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str)
     df["Right_exon_number"] = df["Right_exon_number"].fillna(0).astype(int).astype(str)
     df["Left_transcript_version"] = df["Left_transcript_version"].fillna(0).astype(int).astype(str)
-    df["Right_transcript_version"] = df["Right_transcript_version"].fillna(0).astype(int).astype(str)
+    df["Right_transcript_version"] = (
+        df["Right_transcript_version"].fillna(0).astype(int).astype(str)
+    )
     df["PosA"] = df["PosA"].fillna(0).astype(int).astype(str)
     df["PosB"] = df["PosB"].fillna(0).astype(int).astype(str)
     df["PROT_FUSION_TYPE"] = df["PROT_FUSION_TYPE"].replace(".", "nan")
@@ -452,7 +474,9 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
             f"EXON_NUMBER_A={row['Left_exon_number']};EXON_NUMBER_B={row['Right_exon_number']};"
             f"ANNOTATIONS={row['annots']}"
        )
-        df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}"
+        df.loc[index, "Sample"] = (
+            f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}"
+        )
     return df
 
 
@@ -497,7 +521,9 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame:
     """
     df = pd.read_csv(file, sep="\t")
     df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True)
-    df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(",", expand=True)
+    df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(
+        ",", expand=True
+    )
     return df[["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]]
 
 
@@ -511,7 +537,9 @@ def main(argv=None):
         or not args.fusionreport_csv
         or not args.hgnc
     ):
-        logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!")
+        logger.error(
+            f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!"
+        )
         sys.exit(2)
     vcf_collect(
         args.fusioninspector,
diff --git a/modules/local/picard/collectrnaseqmetrics/main.nf b/modules/local/picard/collectrnaseqmetrics/main.nf
index af0b8958..e633e563 100644
--- a/modules/local/picard/collectrnaseqmetrics/main.nf
+++ b/modules/local/picard/collectrnaseqmetrics/main.nf
@@ -2,10 +2,10 @@ process PICARD_COLLECTRNASEQMETRICS {
     tag "$meta.id"
     label 'process_medium'
 
-    conda "bioconda::picard=3.0.0 r::r-base"
+    conda "bioconda::picard=3.1.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' :
-        'biocontainers/picard:3.0.0--hdfd78af_1' }"
+        'https://depot.galaxyproject.org/singularity/picard:3.1.0--hdfd78af_0' :
+        'biocontainers/picard:3.1.0--hdfd78af_0' }"
 
     input:
     tuple val(meta), path(bam), path(bai)
@@ -33,15 +33,15 @@ process PICARD_COLLECTRNASEQMETRICS {
     def rrna = rrna_intervals == [] ? '' : "--RIBOSOMAL_INTERVALS ${rrna_intervals}"
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
-    def avail_mem = 3
+    def avail_mem = 3072
     if (!task.memory) {
         log.info '[Picard CollectRnaMetrics] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
     } else {
-        avail_mem = task.memory.giga
-    }
+        avail_mem = (task.memory.mega*0.8).intValue()
+        }
     """
     picard \\
-        -Xmx${avail_mem}g \\
+        -Xmx${avail_mem}M \\
         CollectRnaSeqMetrics \\
         --TMP_DIR ./tmp \\
         ${strandedness} \\

From 0980e9fab64b6492914ec2c90f3248e7245d7116 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 4 Mar 2024 09:36:46 +0100
Subject: [PATCH 2/4] update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 58e8c618..547839a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed
 
 - Update to nf-tools 2.11.1 [#457] (https://github.com/nf-core/rnafusion/pull/457)
+- Update picard collectrnaseqmetrics memory requirements to 0.8x what is provided [#474](https://github.com/nf-core/rnafusion/pull/474)
 
 ### Fixed
 

From 2cea1a4179fd76185fe471a9398267269fc915ce Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 4 Mar 2024 11:19:01 +0100
Subject: [PATCH 3/4] first step fix

---
 bin/vcf_collect.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index 61283d3b..00e5fdb8 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -352,7 +352,8 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
     with open(fusionreport_file) as f:
         from_html = [line.split('rows": [')[1] for line in f if 'name="fusion_list' in line]
         expression = ast.literal_eval(from_html[0].split('], "tool')[0])
-        fusion_report = pd.DataFrame.from_dict({k: [v] for k, v in expression.items()})
+        # print(expression)
+        fusion_report = pd.DataFrame(list(expression))
     if not "arriba" in fusion_report.columns:
         fusion_report["arriba"] = ""
     if not "fusioncatcher" in fusion_report.columns:
@@ -371,8 +372,10 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
     fusion_report["FOUND_IN"] = fusion_report[["arriba", "starfusion", "fusioncatcher"]].apply(
         concatenate_columns, axis=1
     )
+    print(fusion_report)
     fusion_report.columns = fusion_report.columns.str.upper()
-    fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x))
+    print(fusion_report["FOUND_DB"])
+    fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x) if len(x) > 0 else '')
     fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True)
 
     return fusion_report[

From 78a56b37cd276a08c8197162c55b2b586543be04 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Wed, 3 Apr 2024 12:14:38 +0200
Subject: [PATCH 4/4] align {

---
 modules/local/picard/collectrnaseqmetrics/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/picard/collectrnaseqmetrics/main.nf b/modules/local/picard/collectrnaseqmetrics/main.nf
index e633e563..ede593e0 100644
--- a/modules/local/picard/collectrnaseqmetrics/main.nf
+++ b/modules/local/picard/collectrnaseqmetrics/main.nf
@@ -38,7 +38,7 @@ process PICARD_COLLECTRNASEQMETRICS {
         log.info '[Picard CollectRnaMetrics] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
     } else {
         avail_mem = (task.memory.mega*0.8).intValue()
-        }
+    }
     """
     picard \\
         -Xmx${avail_mem}M \\
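
Note (illustrative sketch, not part of the patches above): the main.nf change stops handing the whole task
allocation to the Java heap (`-Xmx` from `task.memory.giga`) and instead passes 80% of the available
megabytes, so the JVM's total footprint stays under the process memory limit. A minimal Groovy sketch of
that arithmetic, assuming a hypothetical helper name `javaHeapMegabytes` and a plain Integer standing in
for Nextflow's `task.memory.mega`:

    // Sketch of the 0.8x heap heuristic used in the module above.
    // `javaHeapMegabytes` and `taskMemoryMega` are illustrative names, not pipeline code.
    int javaHeapMegabytes(Integer taskMemoryMega) {
        if (taskMemoryMega == null) {
            return 3072                              // same 3 GB fallback as the module
        }
        return (taskMemoryMega * 0.8).intValue()     // keep ~20% headroom for non-heap JVM use
    }

    assert javaHeapMegabytes(null) == 3072           // no task.memory set
    assert javaHeapMegabytes(8192) == 6553           // 8 GB allocation -> picard -Xmx6553M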