From a66ec8479e159e24413e9786793744d58705e313 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 18 Mar 2024 14:10:23 +0100 Subject: [PATCH 1/3] bugfix for vcf --- bin/vcf_collect.py | 69 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 7e988c1e..5b73a92e 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -65,7 +65,9 @@ def vcf_collect( gtf_df = build_gtf_dataframe(gtf) all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id") - all_df[["PosA", "orig_start", "orig_end"]] = all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int) + all_df[["PosA", "orig_start", "orig_end"]] = ( + all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int) + ) all_df = all_df[ ((all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"])) @@ -75,7 +77,9 @@ def vcf_collect( all_df.replace("", np.nan, inplace=True) all_df = all_df.drop_duplicates() - all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan) + all_df[["exon_number", "transcript_version"]] = all_df[ + ["exon_number", "transcript_version"] + ].replace(0, np.nan) # Fill non-empty values within each group for 'exon_number' and 'transcript_version' all_df["exon_number"] = all_df.groupby("PosA")["exon_number"].transform( lambda x: x.fillna(method="ffill").fillna(method="bfill") @@ -116,8 +120,12 @@ def vcf_collect( ].drop_duplicates() all_df["CDS_RIGHT_ID"] = all_df["CDS_RIGHT_ID"].astype("str") all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id") - all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna(0) - all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype(int) + all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna( + 0 + ) + all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype( + int + ) all_df = all_df[ ((all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])) | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0)) @@ -126,7 +134,9 @@ def vcf_collect( all_df[["PosA", "PosB"]] = all_df[["PosA", "PosB"]].replace(0, np.nan) all_df = all_df.replace("", np.nan) - all_df[["exon_number", "transcript_version"]] = all_df[["exon_number", "transcript_version"]].replace(0, np.nan) + all_df[["exon_number", "transcript_version"]] = all_df[ + ["exon_number", "transcript_version"] + ].replace(0, np.nan) # Fill non-empty values within each group for 'exon_number' and 'transcript_version' all_df["exon_number"] = all_df.groupby("PosB")["exon_number"].transform( lambda x: x.fillna(method="ffill").fillna(method="bfill") @@ -212,7 +222,9 @@ def parse_args(argv=None): type=Path, help="HGNC database.", ) - parser.add_argument("--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample") + parser.add_argument( + "--sample", metavar="SAMPLE", type=Path, help="Sample name.", default="Sample" + ) parser.add_argument( "--out", metavar="OUT", @@ -280,7 +292,11 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: df["annots"] = ( df["annots"] .apply(convert_to_list) - .apply(lambda x: ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else "") + .apply( + lambda x: ( + ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else "" + ) + ) ) else: for i in [ @@ -334,9 +350,10 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame: Make all column headers uppercase. """ with open(fusionreport_file) as f: - from_html = [line.split('rows": [')[1] for line in f if 'name="fusion_list' in line] - expression = ast.literal_eval(from_html[0].split('], "tool')[0]) - fusion_report = pd.DataFrame.from_dict({k: [v] for k, v in expression.items()}) + from_html = [line.split('rows": ')[1] for line in f if 'name="fusion_list' in line] + tmp=str(from_html)[2:] + tmp2 = (tmp.split(', "tools": ')[0]) + fusion_report = pd.DataFrame(ast.literal_eval(tmp2)) if not "arriba" in fusion_report.columns: fusion_report["arriba"] = "" if not "fusioncatcher" in fusion_report.columns: @@ -356,12 +373,12 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame: concatenate_columns, axis=1 ) fusion_report.columns = fusion_report.columns.str.upper() - fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x)) + fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x) if len(x) > 0 else '') fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True) - return fusion_report[["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"]].set_index( - ["FUSION"] - ) + return fusion_report[ + ["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"] + ].set_index(["FUSION"]) def read_fusionreport_csv(file: str) -> pd.DataFrame: @@ -370,7 +387,9 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame: for column in columns_to_iterate: if column not in df.columns: df[column] = "" - df[["starfusion", "arriba", "fusioncatcher"]] = df[["starfusion", "arriba", "fusioncatcher"]].astype("str") + df[["starfusion", "arriba", "fusioncatcher"]] = df[ + ["starfusion", "arriba", "fusioncatcher"] + ].astype("str") for index, row in df.iterrows(): for column in columns_to_iterate: cell_value = row[column] @@ -398,7 +417,9 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame: df[["GeneA", "GeneB"]] = df["Fusion"].str.split("--", expand=True) df = df.set_index("Fusion") df.to_csv("tmp.csv") - return df[["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"]] + return df[ + ["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"] + ] def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: @@ -425,7 +446,9 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str) df["Right_exon_number"] = df["Right_exon_number"].fillna(0).astype(int).astype(str) df["Left_transcript_version"] = df["Left_transcript_version"].fillna(0).astype(int).astype(str) - df["Right_transcript_version"] = df["Right_transcript_version"].fillna(0).astype(int).astype(str) + df["Right_transcript_version"] = ( + df["Right_transcript_version"].fillna(0).astype(int).astype(str) + ) df["PosA"] = df["PosA"].fillna(0).astype(int).astype(str) df["PosB"] = df["PosB"].fillna(0).astype(int).astype(str) df["PROT_FUSION_TYPE"] = df["PROT_FUSION_TYPE"].replace(".", "nan") @@ -452,7 +475,9 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: f"EXON_NUMBER_A={row['Left_exon_number']};EXON_NUMBER_B={row['Right_exon_number']};" f"ANNOTATIONS={row['annots']}" ) - df.loc[index, "Sample"] = f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}" + df.loc[index, "Sample"] = ( + f"./1:{row['JunctionReadCount']}:{row['SpanningFragCount']}:{row['FFPM']}" + ) return df @@ -497,7 +522,9 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame: """ df = pd.read_csv(file, sep="\t") df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True) - df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split(",", expand=True) + df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split( + ",", expand=True + ) return df[["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]] @@ -511,7 +538,9 @@ def main(argv=None): or not args.fusionreport_csv or not args.hgnc ): - logger.error(f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!") + logger.error( + f"The given input file {args.fusioninspector} or {args.fusionreport} was not found!" + ) sys.exit(2) vcf_collect( args.fusioninspector, From 3ec309015f0ce12c0c5d678075b00bfe19a30db8 Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Mon, 18 Mar 2024 20:09:46 +0100 Subject: [PATCH 2/3] black without line limit --- bin/vcf_collect.py | 135 +++++++++++++++++++++++++++++++-------------- 1 file changed, 95 insertions(+), 40 deletions(-) diff --git a/bin/vcf_collect.py b/bin/vcf_collect.py index 5b73a92e..1decbe90 100755 --- a/bin/vcf_collect.py +++ b/bin/vcf_collect.py @@ -47,9 +47,14 @@ def vcf_collect( df_not_symbol = merged_df[merged_df["Left_ensembl_gene_id"].notna()] df_not_symbol = hgnc_df.merge( - df_not_symbol, how="right", left_on="ensembl_gene_id", right_on="Left_ensembl_gene_id" + df_not_symbol, + how="right", + left_on="ensembl_gene_id", + right_on="Left_ensembl_gene_id", + ) + df_symbol = hgnc_df.merge( + df_symbol, how="right", left_on="symbol", right_on="GeneA" ) - df_symbol = hgnc_df.merge(df_symbol, how="right", left_on="symbol", right_on="GeneA") df = pd.concat([df_not_symbol, df_symbol]) df = df.rename(columns={"hgnc_id": "Left_hgnc_id"}) @@ -57,20 +62,30 @@ def vcf_collect( df_not_symbol = df[df["Right_ensembl_gene_id"].notna()] df_not_symbol = hgnc_df.merge( - df_not_symbol, how="right", left_on="ensembl_gene_id", right_on="Right_ensembl_gene_id" + df_not_symbol, + how="right", + left_on="ensembl_gene_id", + right_on="Right_ensembl_gene_id", + ) + df_symbol = hgnc_df.merge( + df_symbol, how="right", left_on="symbol", right_on="GeneB" ) - df_symbol = hgnc_df.merge(df_symbol, how="right", left_on="symbol", right_on="GeneB") df = pd.concat([df_not_symbol, df_symbol]) df = df.rename(columns={"hgnc_id": "Right_hgnc_id"}) gtf_df = build_gtf_dataframe(gtf) - all_df = df.merge(gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id") + all_df = df.merge( + gtf_df, how="left", left_on="CDS_LEFT_ID", right_on="Transcript_id" + ) all_df[["PosA", "orig_start", "orig_end"]] = ( all_df[["PosA", "orig_start", "orig_end"]].fillna(0).astype(int) ) all_df = all_df[ - ((all_df["PosA"] >= all_df["orig_start"]) & (all_df["PosA"] <= all_df["orig_end"])) + ( + (all_df["PosA"] >= all_df["orig_start"]) + & (all_df["PosA"] <= all_df["orig_end"]) + ) | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0)) ] @@ -84,9 +99,9 @@ def vcf_collect( all_df["exon_number"] = all_df.groupby("PosA")["exon_number"].transform( lambda x: x.fillna(method="ffill").fillna(method="bfill") ) - all_df["transcript_version"] = all_df.groupby("PosA")["transcript_version"].transform( - lambda x: x.fillna(method="ffill").fillna(method="bfill") - ) + all_df["transcript_version"] = all_df.groupby("PosA")[ + "transcript_version" + ].transform(lambda x: x.fillna(method="ffill").fillna(method="bfill")) all_df = all_df.rename(columns={"transcript_version": "Left_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Left_exon_number"}) @@ -119,15 +134,20 @@ def vcf_collect( ] ].drop_duplicates() all_df["CDS_RIGHT_ID"] = all_df["CDS_RIGHT_ID"].astype("str") - all_df = all_df.merge(gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id") - all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].fillna( - 0 - ) - all_df[["PosB", "orig_start", "orig_end"]] = all_df[["PosB", "orig_start", "orig_end"]].astype( - int + all_df = all_df.merge( + gtf_df, how="left", left_on="CDS_RIGHT_ID", right_on="Transcript_id" ) + all_df[["PosB", "orig_start", "orig_end"]] = all_df[ + ["PosB", "orig_start", "orig_end"] + ].fillna(0) + all_df[["PosB", "orig_start", "orig_end"]] = all_df[ + ["PosB", "orig_start", "orig_end"] + ].astype(int) all_df = all_df[ - ((all_df["PosB"] >= all_df["orig_start"]) & (all_df["PosB"] <= all_df["orig_end"])) + ( + (all_df["PosB"] >= all_df["orig_start"]) + & (all_df["PosB"] <= all_df["orig_end"]) + ) | ((all_df["orig_start"] == 0) & (all_df["orig_end"] == 0)) ] @@ -141,9 +161,9 @@ def vcf_collect( all_df["exon_number"] = all_df.groupby("PosB")["exon_number"].transform( lambda x: x.fillna(method="ffill").fillna(method="bfill") ) - all_df["transcript_version"] = all_df.groupby("PosB")["transcript_version"].transform( - lambda x: x.fillna(method="ffill").fillna(method="bfill") - ) + all_df["transcript_version"] = all_df.groupby("PosB")[ + "transcript_version" + ].transform(lambda x: x.fillna(method="ffill").fillna(method="bfill")) all_df = all_df.rename(columns={"transcript_version": "Right_transcript_version"}) all_df = all_df.rename(columns={"exon_number": "Right_exon_number"}) @@ -285,16 +305,26 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: df = pd.read_csv(file, sep="\t") df = df.rename(columns={"#FusionName": "FUSION"}) if not (df.empty): - df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split(":", expand=True) - df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split(":", expand=True) - df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split("^", expand=True) - df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split("^", expand=True) + df[["ChromosomeA", "PosA", "Strand1"]] = df["LeftBreakpoint"].str.split( + ":", expand=True + ) + df[["ChromosomeB", "PosB", "Strand2"]] = df["RightBreakpoint"].str.split( + ":", expand=True + ) + df[["LeftGeneName", "Left_ensembl_gene_id"]] = df["LeftGene"].str.split( + "^", expand=True + ) + df[["RightGeneName", "Right_ensembl_gene_id"]] = df["RightGene"].str.split( + "^", expand=True + ) df["annots"] = ( df["annots"] .apply(convert_to_list) .apply( lambda x: ( - ",".join(map(str, x)) if isinstance(x, list) else str(x) if pd.notna(x) else "" + ",".join(map(str, x)) + if isinstance(x, list) + else str(x) if pd.notna(x) else "" ) ) ) @@ -320,7 +350,9 @@ def build_fusioninspector_dataframe(file: str) -> pd.DataFrame: return df.set_index(["FUSION"]) -def replace_value_with_column_name(row: pd.Series, value_to_replace: str, column_name: str) -> str: +def replace_value_with_column_name( + row: pd.Series, value_to_replace: str, column_name: str +) -> str: """ Replace a specific value in a row with the corresponding column name. """ @@ -350,9 +382,11 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame: Make all column headers uppercase. """ with open(fusionreport_file) as f: - from_html = [line.split('rows": ')[1] for line in f if 'name="fusion_list' in line] - tmp=str(from_html)[2:] - tmp2 = (tmp.split(', "tools": ')[0]) + from_html = [ + line.split('rows": ')[1] for line in f if 'name="fusion_list' in line + ] + tmp = str(from_html)[2:] + tmp2 = tmp.split(', "tools": ')[0] fusion_report = pd.DataFrame(ast.literal_eval(tmp2)) if not "arriba" in fusion_report.columns: fusion_report["arriba"] = "" @@ -369,12 +403,16 @@ def read_build_fusionreport(fusionreport_file: str) -> pd.DataFrame: fusion_report["starfusion"] = fusion_report[["starfusion"]].apply( replace_value_with_column_name, args=("true", "starfusion"), axis=1 ) - fusion_report["FOUND_IN"] = fusion_report[["arriba", "starfusion", "fusioncatcher"]].apply( - concatenate_columns, axis=1 - ) + fusion_report["FOUND_IN"] = fusion_report[ + ["arriba", "starfusion", "fusioncatcher"] + ].apply(concatenate_columns, axis=1) fusion_report.columns = fusion_report.columns.str.upper() - fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply(lambda x: ",".join(x) if len(x) > 0 else '') - fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split("--", expand=True) + fusion_report["FOUND_DB"] = fusion_report["FOUND_DB"].apply( + lambda x: ",".join(x) if len(x) > 0 else "" + ) + fusion_report[["GeneA", "GeneB"]] = fusion_report["FUSION"].str.split( + "--", expand=True + ) return fusion_report[ ["FUSION", "GeneA", "GeneB", "TOOLS_HITS", "SCORE", "FOUND_DB", "FOUND_IN"] @@ -418,7 +456,16 @@ def read_fusionreport_csv(file: str) -> pd.DataFrame: df = df.set_index("Fusion") df.to_csv("tmp.csv") return df[ - ["GeneA", "GeneB", "ChromosomeA", "PosA", "StrandA", "ChromosomeB", "PosB", "StrandB"] + [ + "GeneA", + "GeneB", + "ChromosomeA", + "PosA", + "StrandA", + "ChromosomeB", + "PosB", + "StrandB", + ] ] @@ -445,7 +492,9 @@ def column_manipulation(df: pd.DataFrame) -> pd.DataFrame: df["Right_hgnc_id"] = df["Right_hgnc_id"].fillna(0).astype(int).astype(str) df["Left_exon_number"] = df["Left_exon_number"].fillna(0).astype(int).astype(str) df["Right_exon_number"] = df["Right_exon_number"].fillna(0).astype(int).astype(str) - df["Left_transcript_version"] = df["Left_transcript_version"].fillna(0).astype(int).astype(str) + df["Left_transcript_version"] = ( + df["Left_transcript_version"].fillna(0).astype(int).astype(str) + ) df["Right_transcript_version"] = ( df["Right_transcript_version"].fillna(0).astype(int).astype(str) ) @@ -499,7 +548,9 @@ def write_vcf(df_to_print: pd.DataFrame, header: str, out_file: str) -> None: "FORMAT", "Sample", ] - ].to_csv(path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE) + ].to_csv( + path_or_buf=out_file, sep="\t", header=None, index=False, quoting=csv.QUOTE_NONE + ) with open(out_file, "r+") as f: content = f.read() @@ -521,11 +572,15 @@ def build_gtf_dataframe(file: str) -> pd.DataFrame: Build a DataFrame from GTF file converted in TSV, extracting relevant columns. """ df = pd.read_csv(file, sep="\t") - df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split("^", expand=True) - df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df["orig_coord_info"].str.split( - ",", expand=True + df[["fusion_dump", "Transcript_id"]] = df["transcript_id"].str.split( + "^", expand=True ) - return df[["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"]] + df[["orig_chromosome", "orig_start", "orig_end", "orig_dir"]] = df[ + "orig_coord_info" + ].str.split(",", expand=True) + return df[ + ["Transcript_id", "transcript_version", "exon_number", "orig_start", "orig_end"] + ] def main(argv=None): From 2b6ff48b95e7b471fb7c5559e2a0a9ea1f8d91ec Mon Sep 17 00:00:00 2001 From: Annick Renevey <47788523+rannick@users.noreply.github.com> Date: Tue, 19 Mar 2024 13:44:48 +0100 Subject: [PATCH 3/3] update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58e8c618..5ce3a28a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - fix bug when using parameter "whitelist" [#466](https://github.com/nf-core/rnafusion/pull/466) - fix VCF_COLLECT handling when a tool is absent from FUSIONREPORT report [#458](https://github.com/nf-core/rnafusion/pull/458) - fix VCF_COLLECT when fusioninspector output is empty but fusionreport is not [#465](https://github.com/nf-core/rnafusion/pull/465) +- fix VCF_COLLECT bug [#481](https://github.com/nf-core/rnafusion/pull/481) ### Removed