From ec3e795c4604e25ecada43f2bb3671c989a4e08c Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 4 Mar 2024 09:34:38 +0100
Subject: [PATCH 1/4] update collectrnaseqmetrics memory requirement

---
 bin/check_samplesheet.py                      |  4 +-
 bin/vcf_collect.py                            | 60 ++++++++++++++-----
 .../local/picard/collectrnaseqmetrics/main.nf | 14 ++---
 3 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 57cf8e6b..218dd802 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -123,7 +123,9 @@ def _validate_fastq_format(self, filename):
     def _validate_strandedness(self, row):
         """Assert that the strandedness given is one of unstranded/forward/reverse"""
         if row[self._strandedness] not in self.VALID_STRANDEDNESSES:
-            raise AssertionError(f"Strandedness must be one of {', '.join(self.VALID_STRANDEDNESSES)}")
+            raise AssertionError(
+                f"Strandedness must be one of {', '.join(self.VALID_STRANDEDNESSES)}"
+            )
 
     def validate_unique_samples(self):
         """
diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index 7e988c1e..61283d3b 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -65,7 +65,9 @@ def vcf_collect(
     gtf_df = build_gtf_dataframe(gtf)
     all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id")
 
-    all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int)
+    all_df[["PosA", "orig_start", "orig_end"]] = (
+        all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int)
+    )
 
     all_df = all_df[
         ((all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"]))
@@ -75,7 +77,9 @@ def vcf_collect(
     all_df.replace("", np.nan, inplace=True)
     all_df = all_df.drop_duplicates()
 
-    all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan)
+    all_df[["exon_number", "transcript_version"]] = all_df[
+        ["exon_number", "transcript_version"]
+    ].replace(0, np.nan)
     # Fill non-empty values within each group for 'exon_number' and 'transcript_version'
     all_df["exon_number"] = all_df.groupby("PosA")["exon_number"].transform(
         lambda x: x.fillna(method="ffill").fillna(method="bfill")
@@ -116,8 +120,12 @@ def vcf_collect(
     ].drop_duplicates()
     all_df["CDS_RIGHT_ID"] = all_df["CDS_RIGHT_ID"].astype("str")
     all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id")
-    all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(0)
-    all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype(int)
+    all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(
+        0
+    )
+    all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype(
+        int
+    )
     all_df = all_df[
         ((all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"]))
         | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0))
@@ -126,7 +134,9 @@ def vcf_collect(
     all_df[["PosA", "PosB"]] = all_df[["PosA", "PosB"]].replace(0, np.nan)
     all_df = all_df.replace("", np.nan)
 
-    all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan)
+    all_df[["exon_number", "transcript_version"]] = all_df[
+        ["exon_number", "transcript_version"]
+    ].replace(0, np.nan)
     # Fill non-empty values within each group for 'exon_number' and 'transcript_version'
     all_df["exon_number"] = all_df.groupby("PosB")["exon_number"].transform(
         lambda x: x.fillna(method="ffill").fillna(method="bfill")
@@ -212,7 +222,9 @@ def parse_args(argv=None):
         type=Path,
         help="HGNC database.",
     )
-    parser.add_argument("--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample")
+    parser.add_argument(
+        "--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample"
+    )
     parser.add_argument(
         "--out",
         metavar="OUT",
@@ -280,7 +292,11 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame:
         df["annots"] = (
             df["annots"]
             .apply(convert_to_list)
-            .apply(lambda x: ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else "")
+            .apply(
+                lambda x: (
+                    ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else ""
+                )
+            )
         )
     else:
         for i in [
@@ -359,9 +375,9 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
     fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x))
     fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True)
 
-    return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index(
-        ["FUSION"]
-    )
+    return fusion_report[
+        ["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]
+    ].set_index(["FUSION"])
 
 
 def read_fusionreport_csv(file: str) -> pd.DataFrame:
@@ -370,7 +386,9 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame:
     for column in columns_to_iterate:
         if column not in df.columns:
             df[column] = ""
-    df[["starfusion", "arriba", "fusioncatcher"]] = df[["starfusion", "arriba", "fusioncatcher"]].astype("str")
+    df[["starfusion", "arriba", "fusioncatcher"]] = df[
+        ["starfusion", "arriba", "fusioncatcher"]
+    ].astype("str")
     for index, row in df.iterrows():
         for column in columns_to_iterate:
             cell_value = row[column]
@@ -398,7 +416,9 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame:
     df[["GeneA", "GeneB"]] = df["Fusion"].str.split("--", expand=True)
     df = df.set_index("Fusion")
     df.to_csv("tmp.csv")
-    return df[["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"]]
+    return df[
+        ["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"]
+    ]
 
 
 def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
@@ -425,7 +445,9 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
     df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str)
     df["Right_exon_number"] = df["Right_exon_number"].fillna(0).astype(int).astype(str)
     df["Left_transcript_version"] = df["Left_transcript_version"].fillna(0).astype(int).astype(str)
-    df["Right_transcript_version"] = df["Right_transcript_version"].fillna(0).astype(int).astype(str)
+    df["Right_transcript_version"] = (
+        df["Right_transcript_version"].fillna(0).astype(int).astype(str)
+    )
     df["PosA"] = df["PosA"].fillna(0).astype(int).astype(str)
     df["PosB"] = df["PosB"].fillna(0).astype(int).astype(str)
     df["PROT_FUSION_TYPE"] = df["PROT_FUSION_TYPE"].replace(".", "nan")
@@ -452,7 +474,9 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame:
             f"EXON_NUMBER_A={row['Left_exon_number']};EXON_NUMBER_B={row['Right_exon_number']};"
             f"ANNOTATIONS={row['annots']}"
        )
-        df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}"
+        df.loc[index, "Sample"] = (
+            f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}"
+        )
     return df
 
 
@@ -497,7 +521,9 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame:
     """
     df = pd.read_csv(file, sep="\t")
     df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True)
-    df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(",", expand=True)
+    df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(
+        ",", expand=True
+    )
     return df[["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]]
 
 
@@ -511,7 +537,9 @@ def main(argv=None):
         or not args.fusionreport_csv
         or not args.hgnc
     ):
-        logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!")
+        logger.error(
+            f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!"
+        )
         sys.exit(2)
     vcf_collect(
         args.fusioninspector,
diff --git a/modules/local/picard/collectrnaseqmetrics/main.nf b/modules/local/picard/collectrnaseqmetrics/main.nf
index af0b8958..e633e563 100644
--- a/modules/local/picard/collectrnaseqmetrics/main.nf
+++ b/modules/local/picard/collectrnaseqmetrics/main.nf
@@ -2,10 +2,10 @@ process PICARD_COLLECTRNASEQMETRICS {
     tag "$meta.id"
     label 'process_medium'
 
-    conda "bioconda::picard=3.0.0 r::r-base"
+    conda "bioconda::picard=3.1.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/picard:3.0.0--hdfd78af_1' :
-        'biocontainers/picard:3.0.0--hdfd78af_1' }"
+        'https://depot.galaxyproject.org/singularity/picard:3.1.0--hdfd78af_0' :
+        'biocontainers/picard:3.1.0--hdfd78af_0' }"
 
     input:
     tuple val(meta), path(bam), path(bai)
@@ -33,15 +33,15 @@ process PICARD_COLLECTRNASEQMETRICS {
     def rrna = rrna_intervals == [] ? '' : "--RIBOSOMAL_INTERVALS ${rrna_intervals}"
     def args = task.ext.args ?: ''
     def prefix = task.ext.prefix ?: "${meta.id}"
-    def avail_mem = 3
+    def avail_mem = 3072
     if (!task.memory) {
         log.info '[Picard CollectRnaMetrics] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
     } else {
-        avail_mem = task.memory.giga
-    }
+        avail_mem = (task.memory.mega*0.8).intValue()
+        }
     """
     picard \\
-        -Xmx${avail_mem}g \\
+        -Xmx${avail_mem}M \\
         CollectRnaSeqMetrics \\
         --TMP_DIR ./tmp \\
         ${strandedness} \\

From 0980e9fab64b6492914ec2c90f3248e7245d7116 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 4 Mar 2024 09:36:46 +0100
Subject: [PATCH 2/4] update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 58e8c618..547839a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Changed
 
 - Update to nf-tools 2.11.1 [#457] (https://github.com/nf-core/rnafusion/pull/457)
+- Update picard collectrnaseqmetrics memory requirements to 0.8x what is provided [#474](https://github.com/nf-core/rnafusion/pull/474)
 
 ### Fixed
 

From 2cea1a4179fd76185fe471a9398267269fc915ce Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Mon, 4 Mar 2024 11:19:01 +0100
Subject: [PATCH 3/4] first step fix

---
 bin/vcf_collect.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py
index 61283d3b..00e5fdb8 100755
--- a/bin/vcf_collect.py
+++ b/bin/vcf_collect.py
@@ -352,7 +352,8 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
     with open(fusionreport_file) as f:
         from_html = [line.split('rows": [')[1] for line in f if 'name="fusion_list' in line]
         expression = ast.literal_eval(from_html[0].split('], "tool')[0])
-        fusion_report = pd.DataFrame.from_dict({k: [v] for k, v in expression.items()})
+        # print(expression)
+        fusion_report = pd.DataFrame(list(expression))
     if not "arriba" in fusion_report.columns:
         fusion_report["arriba"] = ""
     if not "fusioncatcher" in fusion_report.columns:
@@ -371,8 +372,10 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame:
     fusion_report["FOUND_IN"] = fusion_report[["arriba", "starfusion", "fusioncatcher"]].apply(
         concatenate_columns, axis=1
     )
+    print(fusion_report)
     fusion_report.columns = fusion_report.columns.str.upper()
-    fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x))
+    print(fusion_report["FOUND_DB"])
+    fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x) if len(x) > 0 else '')
     fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True)
 
     return fusion_report[

From 78a56b37cd276a08c8197162c55b2b586543be04 Mon Sep 17 00:00:00 2001
From: Annick Renevey <47788523+rannick@users.noreply.github.com>
Date: Wed, 3 Apr 2024 12:14:38 +0200
Subject: [PATCH 4/4] align {

---
 modules/local/picard/collectrnaseqmetrics/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/picard/collectrnaseqmetrics/main.nf b/modules/local/picard/collectrnaseqmetrics/main.nf
index e633e563..ede593e0 100644
--- a/modules/local/picard/collectrnaseqmetrics/main.nf
+++ b/modules/local/picard/collectrnaseqmetrics/main.nf
@@ -38,7 +38,7 @@ process PICARD_COLLECTRNASEQMETRICS {
         log.info '[Picard CollectRnaMetrics] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
     } else {
         avail_mem = (task.memory.mega*0.8).intValue()
-        }
+    }
     """
     picard \\
         -Xmx${avail_mem}M \\
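
Note (illustrative sketch, not part of the patches above): the main.nf change stops handing the whole task
allocation to the Java heap (`-Xmx` from `task.memory.giga`) and instead passes 80% of the available
megabytes, so the JVM's total footprint stays under the process memory limit. A minimal Groovy sketch of
that arithmetic, assuming a hypothetical helper name `javaHeapMegabytes` and a plain Integer standing in
for Nextflow's `task.memory.mega`:

    // Sketch of the 0.8x heap heuristic used in the module above.
    // `javaHeapMegabytes` and `taskMemoryMega` are illustrative names, not pipeline code.
    int javaHeapMegabytes(Integer taskMemoryMega) {
        if (taskMemoryMega == null) {
            return 3072                              // same 3 GB fallback as the module
        }
        return (taskMemoryMega * 0.8).intValue()     // keep ~20% headroom for non-heap JVM use
    }

    assert javaHeapMegabytes(null) == 3072           // no task.memory set
    assert javaHeapMegabytes(8192) == 6553           // 8 GB allocation -> picard -Xmx6553M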