From c0682c56512f13775c8049b9ea16941e304f7d2c Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 7 May 2024 21:25:40 -0600 Subject: [PATCH 01/33] Clean-up `process_consequences` and fix to handle `tc.lof` missingness correctly --- gnomad/utils/vep.py | 102 ++++++++++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 31 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index aaa2d7926..2a0d74ca2 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -286,7 +286,7 @@ def add_most_severe_consequence_to_consequence( def process_consequences( - mt: Union[hl.MatrixTable, hl.Table], + t: Union[hl.MatrixTable, hl.Table], vep_root: str = "vep", penalize_flags: bool = True, csq_order: Optional[List[str]] = None, @@ -297,12 +297,24 @@ def process_consequences( `most_severe_consequence` is the worst consequence for a transcript. + Each transcript consequence is annotated with a `csq_score` which is a combination + of the index of the consequence's `most_severe_consequence` in `csq_order` and a + boost for loss-of-function consequences, and polyphen predictions if `has_polyphen` + is True. + + The score adjustment is as follows: + - lof == 'HC' & NO lof_flags (-1000 if penalize_flags, -500 if not) + - lof == 'HC' & lof_flags (-500) + - lof == 'OS' (-20) + - lof == 'LC' (-10) + - everything else (0) + .. note:: From gnomAD v4.0 on, the PolyPhen annotation was removed from the VEP Struct in the release HTs. When using this function with gnomAD v4.0 or later, set `has_polyphen` to False. - :param mt: Input Table or MatrixTable. + :param t: Input Table or MatrixTable. :param vep_root: Root for VEP annotation (probably "vep"). :param penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them as equal to HC. @@ -316,56 +328,84 @@ def process_consequences( if csq_order is None: csq_order = CSQ_ORDER csqs = hl.literal(csq_order) + + # Assign a score to each consequence based on the order in csq_order. csq_dict = hl.literal(dict(zip(csq_order, range(len(csq_order))))) - def _csq_score(tc: hl.expr.StructExpression) -> int: - return csq_dict[tc.most_severe_consequence] + def _find_worst_transcript_consequence( + tcl: hl.expr.ArrayExpression, + ) -> hl.expr.StructExpression: + """ + Find the worst transcript consequence in an array of transcript consequences. - flag_score = 500 - no_flag_score = flag_score * (1 + penalize_flags) + :param tcl: Array of transcript consequences. + :return: Worst transcript consequence. + """ + flag = 500 + no_flag = flag * (1 + penalize_flags) - def _csq_score_modifier(tc: hl.expr.StructExpression) -> float: - modifier = _csq_score(tc) - flag_condition = (tc.lof == "HC") & (tc.lof_flags != "") - modifier -= hl.if_else(flag_condition, flag_score, no_flag_score) - modifier -= hl.if_else(tc.lof == "OS", 20, 0) - modifier -= hl.if_else(tc.lof == "LC", 10, 0) - if has_polyphen: - modifier -= ( - hl.case() - .when(tc.polyphen_prediction == "probably_damaging", 0.5) - .when(tc.polyphen_prediction == "possibly_damaging", 0.25) - .when(tc.polyphen_prediction == "benign", 0.1) + # Score each consequence based on the order in csq_order. + score_expr = tcl.map( + lambda tc: csq_dict[csqs.find(lambda x: x == tc.most_severe_consequence)] + ) + + # Determine the score adjustment based on the consequence's LOF and LOF flags. + sub_expr = tcl.map( + lambda tc: ( + hl.case(missing_false=True) + .when((tc.lof == "HC") & (tc.lof_flags == ""), no_flag) + .when((tc.lof == "HC") & (tc.lof_flags != ""), flag) + .when(tc.lof == "OS", 20) + .when(tc.lof == "LC", 10) .default(0) ) - return modifier + ) - def find_worst_transcript_consequence( - tcl: hl.expr.ArrayExpression, - ) -> hl.expr.StructExpression: - tcl = tcl.map( - lambda tc: tc.annotate(csq_score=_csq_score(tc) - _csq_score_modifier(tc)) + # If requested, determine the score adjustment based on the consequence's + # PolyPhen prediction. + if has_polyphen: + polyphen_sub_expr = tcl.map( + lambda tc: ( + hl.case(missing_false=True) + .when(tc.polyphen_prediction == "probably_damaging", 0.5) + .when(tc.polyphen_prediction == "possibly_damaging", 0.25) + .when(tc.polyphen_prediction == "benign", 0.1) + .default(0) + ) + ) + sub_expr = hl.map(lambda s, ps: s + ps, sub_expr, polyphen_sub_expr) + + # Calculate the final consequence score. + tcl = hl.map( + lambda tc, s, ss: tc.annotate(csq_score=s - ss), tcl, score_expr, sub_expr ) + + # Return the worst consequence based on the calculated score. return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0]) - transcript_csqs = mt[vep_root].transcript_consequences.map( + # Annotate each transcript consequence with the 'most_severe_consequence'. + transcript_csqs = t[vep_root].transcript_consequences.map( add_most_severe_consequence_to_consequence ) + # Group transcript consequences by gene and find the worst consequence for each. gene_dict = transcript_csqs.group_by(lambda tc: tc.gene_symbol) - worst_csq_gene = gene_dict.map_values(find_worst_transcript_consequence).values() + worst_csq_gene = gene_dict.map_values(_find_worst_transcript_consequence).values() sorted_scores = hl.sorted(worst_csq_gene, key=lambda tc: tc.csq_score) + # Filter transcript consequences to only include canonical transcripts and find the + # worst consequence for each gene. canonical = transcript_csqs.filter(lambda csq: csq.canonical == 1) gene_canonical_dict = canonical.group_by(lambda tc: tc.gene_symbol) worst_csq_gene_canonical = gene_canonical_dict.map_values( - find_worst_transcript_consequence + _find_worst_transcript_consequence ).values() sorted_canonical_scores = hl.sorted( worst_csq_gene_canonical, key=lambda tc: tc.csq_score ) - vep_data = mt[vep_root].annotate( + # Annotate the HT/MT with the worst consequence for each gene and variant. + vep_data = t[vep_root].annotate( transcript_consequences=transcript_csqs, worst_consequence_term=csqs.find( lambda c: transcript_csqs.map( @@ -383,9 +423,9 @@ def find_worst_transcript_consequence( ) return ( - mt.annotate_rows(**{vep_root: vep_data}) - if isinstance(mt, hl.MatrixTable) - else mt.annotate(**{vep_root: vep_data}) + t.annotate_rows(**{vep_root: vep_data}) + if isinstance(t, hl.MatrixTable) + else t.annotate(**{vep_root: vep_data}) ) From 6d9a0ead7f6b52eff903958accec8ba77fb8e90b Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 7 May 2024 21:33:40 -0600 Subject: [PATCH 02/33] Adds the following fixes to `process_consequences`: - Allow lof_flags to be missing in addition to lof_flags == "" for lof == "HC" to have no flag penalty - Pass `csq_order` to `add_most_severe_consequence_to_consequence` --- gnomad/utils/vep.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 2a0d74ca2..f52bcde2d 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -271,14 +271,22 @@ def vep_or_lookup_vep( def add_most_severe_consequence_to_consequence( tc: hl.expr.StructExpression, + csq_order: Optional[List[str]] = None, ) -> hl.expr.StructExpression: """ Add most_severe_consequence annotation to transcript consequences. - This is for a given transcript, as there are often multiple annotations for a single transcript: - e.g. splice_region_variant&intron_variant -> splice_region_variant + This is for a given transcript, as there are often multiple annotations for a single + transcript: e.g. splice_region_variant&intron_variant -> splice_region_variant + + :param tc: Transcript consequence struct to annotate. + :param csq_order: Optional list indicating the order of VEP consequences, sorted + from high to low impact. Default is None, which uses the value of the + `CSQ_ORDER` global. """ - csqs = hl.literal(CSQ_ORDER) + if csq_order is None: + csq_order = CSQ_ORDER + csqs = hl.literal(csq_order) return tc.annotate( most_severe_consequence=csqs.find(lambda c: tc.consequence_terms.contains(c)) @@ -353,7 +361,7 @@ def _find_worst_transcript_consequence( sub_expr = tcl.map( lambda tc: ( hl.case(missing_false=True) - .when((tc.lof == "HC") & (tc.lof_flags == ""), no_flag) + .when((tc.lof == "HC") & hl.or_else(tc.lof_flags == "", True), no_flag) .when((tc.lof == "HC") & (tc.lof_flags != ""), flag) .when(tc.lof == "OS", 20) .when(tc.lof == "LC", 10) @@ -385,7 +393,7 @@ def _find_worst_transcript_consequence( # Annotate each transcript consequence with the 'most_severe_consequence'. transcript_csqs = t[vep_root].transcript_consequences.map( - add_most_severe_consequence_to_consequence + lambda tc: add_most_severe_consequence_to_consequence(tc, csq_order) ) # Group transcript consequences by gene and find the worst consequence for each. From 66882823da1989a88f7e99deab52c2bb6b4609ae Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 7 May 2024 21:33:40 -0600 Subject: [PATCH 03/33] Updates to `get_most_severe_consequence_for_summary` to make it possible to use in `default_generate_gene_lof_matrix` --- gnomad/utils/vep.py | 185 ++++++++++++++++++++++++++++++++------------ 1 file changed, 137 insertions(+), 48 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index f52bcde2d..eae0e3ffe 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -269,28 +269,52 @@ def vep_or_lookup_vep( return vep_ht.union(revep_ht) -def add_most_severe_consequence_to_consequence( - tc: hl.expr.StructExpression, +def get_most_severe_consequence_expr( + csq_expr: hl.expr.CollectionExpression, csq_order: Optional[List[str]] = None, -) -> hl.expr.StructExpression: +) -> hl.expr.StringExpression: """ - Add most_severe_consequence annotation to transcript consequences. + Get the most severe consequence from a collection of consequences. This is for a given transcript, as there are often multiple annotations for a single transcript: e.g. splice_region_variant&intron_variant -> splice_region_variant - :param tc: Transcript consequence struct to annotate. + :param csq_expr: CollectionExpression of consequences. :param csq_order: Optional list indicating the order of VEP consequences, sorted - from high to low impact. Default is None, which uses the value of the - `CSQ_ORDER` global. + from high to low impact. Default is None, which uses the value of the + `CSQ_ORDER` global. + :return: Most severe consequence in `csq_expr`. """ if csq_order is None: csq_order = CSQ_ORDER csqs = hl.literal(csq_order) - return tc.annotate( - most_severe_consequence=csqs.find(lambda c: tc.consequence_terms.contains(c)) - ) + return csqs.find(lambda c: csq_expr.contains(c)) + + +def add_most_severe_consequence_to_consequence( + tc: Union[hl.expr.StructExpression, hl.expr.ArrayExpression], + csq_order: Optional[List[str]] = None, +) -> Union[hl.expr.StructExpression, hl.expr.ArrayExpression]: + """ + Add a `most_severe_consequence` field to a transcript consequence or array of transcript consequences. + + For a single transcript consequence, `tc` should be a StructExpression with a + `consequence_terms` field. For an array of transcript consequences, `tc` should be + an ArrayExpression of StructExpressions with a `consequence_terms` field. + + :param tc: Transcript consequence or array of transcript consequences to annotate. + :param csq_order: Optional list indicating the order of VEP consequences, sorted + from high to low impact. Default is None, which uses the value of the + `CSQ_ORDER` global. + :return: Transcript consequence or array of transcript consequences annotated with + the most severe consequence. + """ + csq = lambda x: get_most_severe_consequence_expr(x.consequence_terms, csq_order) + if isinstance(tc, hl.expr.StructExpression): + return tc.annotate(most_severe_consequence=csq(tc)) + else: + return tc.map(lambda x: x.annotate(most_severe_consequence=csq(x))) def process_consequences( @@ -688,6 +712,8 @@ def get_most_severe_consequence_for_summary( ht: hl.Table, csq_order: List[str] = CSQ_ORDER, loftee_labels: List[str] = LOFTEE_LABELS, + by_gene: bool = False, + include_transcript_csqs=False, ) -> hl.Table: """ Prepare a hail Table for summary statistics generation. @@ -698,67 +724,130 @@ def get_most_severe_consequence_for_summary( - lof: Whether the variant is a loss-of-function variant - no_lof_flags: Whether the variant has any LOFTEE flags (True if no flags) - Assumes input Table is annotated with VEP and that VEP annotations have been filtered to canonical transcripts. + Assumes input Table is annotated with VEP and that VEP annotations have been + filtered to canonical transcripts. :param ht: Input Table. - :param csq_order: Order of VEP consequences, sorted from high to low impact. Default is CSQ_ORDER. - :param loftee_labels: Annotations added by LOFTEE. Default is LOFTEE_LABELS. + :param csq_order: Order of VEP consequences, sorted from high to low impact. + Default is CSQ_ORDER. + :param loftee_labels: Annotations added by LOFTEE, sorted from high to low impact. + Default is LOFTEE_LABELS. + :param by_gene: Whether to group by gene. Default is False. + :param include_transcript_csqs: Whether to include all transcript consequences for + the most severe consequence. Default is False. :return: Table annotated with VEP summary annotations. """ + # Get type of transcript_consequences field for later use with hl.missing. + tc = ht.vep.transcript_consequences.map( + lambda x: x.annotate(most_severe_consequence=hl.missing(hl.tstr)) + ) + tc_dtype = tc.dtype def _get_most_severe_csq( - csq_list: hl.expr.ArrayExpression, protein_coding: bool + csq_list: hl.expr.ArrayExpression, + protein_coding: bool = False, + include_tc: bool = False, ) -> hl.expr.StructExpression: """ Process VEP consequences to generate summary annotations. :param csq_list: VEP consequences list to be processed. - :param protein_coding: Whether variant is in a protein-coding transcript. + :param protein_coding: Whether to filter to only protein-coding transcripts. + Default is False. + :param include_tc: Whether to include all transcript consequences for the most + severe consequence. Default is False. :return: Struct containing summary annotations. """ - lof = hl.null(hl.tstr) - no_lof_flags = hl.null(hl.tbool) + lof = hl.missing(hl.tstr) + no_lof_flags = hl.missing(hl.tbool) if protein_coding: - all_lofs = csq_list.map(lambda x: x.lof) - lof = hl.literal(loftee_labels).find(lambda x: all_lofs.contains(x)) - csq_list = hl.if_else( - hl.is_defined(lof), csq_list.filter(lambda x: x.lof == lof), csq_list + # Filter to only protein-coding transcripts. + csq_list = csq_list.filter(lambda x: x.biotype == "protein_coding") + # Get the highest impact LOFTEE label. + lof = get_most_severe_consequence_expr( + csq_list.map(lambda x: x.lof), csq_order=loftee_labels ) - no_lof_flags = hl.or_missing( - hl.is_defined(lof), - csq_list.any(lambda x: (x.lof == lof) & hl.is_missing(x.lof_flags)), + # Filter to only consequences with the highest impact LOFTEE label. + lof_csq = hl.or_missing( + hl.is_defined(lof), csq_list.filter(lambda x: x.lof == lof) ) - all_csq_terms = csq_list.flatmap(lambda x: x.consequence_terms) - most_severe_csq = hl.literal(csq_order).find( - lambda x: all_csq_terms.contains(x) - ) - return hl.struct( - most_severe_csq=most_severe_csq, + # Check if any of the lof consequences have no lof_flags. + no_lof_flags = lof_csq.any( + lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "") + ) + csq_list = hl.coalesce(lof_csq, csq_list) + + # Add most_severe_consequence to each consequence. + csq_list = add_most_severe_consequence_to_consequence(csq_list, csq_order) + + # Get the most severe consequence of all consequences in the list. + result = hl.struct( + most_severe_csq=get_most_severe_consequence_expr( + csq_list.map(lambda x: x.most_severe_consequence), csq_order=csq_order + ), protein_coding=protein_coding, lof=lof, no_lof_flags=no_lof_flags, ) - protein_coding = ht.vep.transcript_consequences.filter( - lambda x: x.biotype == "protein_coding" - ) - return ht.annotate( - **hl.case(missing_false=True) - .when(hl.len(protein_coding) > 0, _get_most_severe_csq(protein_coding, True)) - .when( - hl.len(ht.vep.transcript_consequences) > 0, - _get_most_severe_csq(ht.vep.transcript_consequences, False), - ) - .when( - hl.len(ht.vep.regulatory_feature_consequences) > 0, - _get_most_severe_csq(ht.vep.regulatory_feature_consequences, False), + # Add an annotation indicating all transcript consequences that match the most + # severe consequence. If transcript consequences are requested, but the + # current consequence list is not for transcript_consequences, include the + # annotation, but set values to missing. + if include_transcript_csqs: + tc_expr = hl.missing(tc_dtype) + if include_tc: + tc_expr = csq_list.filter( + lambda x: x.most_severe_consequence == result.most_severe_csq + ) + # Rename most_severe_consequence to most_severe_csq for consistency. + tc_expr = tc_expr.map( + lambda x: x.rename({"most_severe_consequence": "most_severe_csq"}) + ) + result = result.annotate(transcript_consequences=tc_expr) + + return hl.or_missing(hl.len(csq_list) > 0, result) + + def _get_most_severe_csq_multi( + csqs: hl.ArrayExpression, + csq_lists: List[hl.ArrayExpression] = [], + ) -> hl.expr.StructExpression: + """ + Process multiple VEP consequences lists to determine the most severe consequence. + + First, filter to only protein-coding transcripts and determine the most severe + consequence. If no protein-coding transcripts are present, determine the most + severe consequence for all transcripts. If additional VEP consequences lists are + provided, process those lists in the order they are provided. + + :param csqs: VEP transcript consequences array expression. + :param csq_lists: List of additional VEP consequences lists to be processed in + the order they should be considered for the most severe consequence. + :return: Struct containing summary annotations for the most severe consequence. + """ + return hl.coalesce( + _get_most_severe_csq(csqs, protein_coding=True, include_tc=True), + _get_most_severe_csq(csqs, include_tc=True), + *[_get_most_severe_csq(c) for c in csq_lists], ) - .when( - hl.len(ht.vep.motif_feature_consequences) > 0, - _get_most_severe_csq(ht.vep.motif_feature_consequences, False), + + # Annotate the Table with the most severe consequence by gene or variant. + if by_gene: + return ht.annotate( + worst_csq_by_gene=tc.group_by(lambda x: (x.gene_id, x.gene_symbol)) + .map_values(lambda x: _get_most_severe_csq_multi(x)) + .items() + .map(lambda x: x[1].annotate(gene_id=x[0][0], gene_symbol=x[0][1])) ) - .default(_get_most_severe_csq(ht.vep.intergenic_consequences, False)) - ) + else: + # If most severe consequence is requested for each variant, include additional + # VEP consequence lists. + csq_lists = [ + ht.vep.regulatory_feature_consequences, + ht.vep.motif_feature_consequences, + ht.vep.intergenic_consequences, + ] + return ht.annotate(**_get_most_severe_csq_multi(tc, csq_lists)) def filter_vep_transcript_csqs( From d1c2f8af4d9c70d17b063a41f8e9ad858f566dd7 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Wed, 8 May 2024 16:21:41 -0600 Subject: [PATCH 04/33] Modify `process_consequences` to use `get_most_severe_consequence_for_summary` --- gnomad/utils/vep.py | 492 +++++++++++++++++++++++++++----------------- 1 file changed, 298 insertions(+), 194 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index eae0e3ffe..a8f7b9a94 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -4,7 +4,7 @@ import logging import os import subprocess -from typing import Callable, List, Optional, Union +from typing import Callable, Dict, List, Optional, Tuple, Union import hail as hl @@ -323,24 +323,13 @@ def process_consequences( penalize_flags: bool = True, csq_order: Optional[List[str]] = None, has_polyphen: bool = True, + prioritize_protein_coding: bool = False, ) -> Union[hl.MatrixTable, hl.Table]: """ Add most_severe_consequence into [vep_root].transcript_consequences, and worst_csq_by_gene, any_lof into [vep_root]. `most_severe_consequence` is the worst consequence for a transcript. - Each transcript consequence is annotated with a `csq_score` which is a combination - of the index of the consequence's `most_severe_consequence` in `csq_order` and a - boost for loss-of-function consequences, and polyphen predictions if `has_polyphen` - is True. - - The score adjustment is as follows: - - lof == 'HC' & NO lof_flags (-1000 if penalize_flags, -500 if not) - - lof == 'HC' & lof_flags (-500) - - lof == 'OS' (-20) - - lof == 'LC' (-10) - - everything else (0) - .. note:: From gnomAD v4.0 on, the PolyPhen annotation was removed from the VEP Struct in the release HTs. When using this function with gnomAD v4.0 or later, @@ -355,14 +344,14 @@ def process_consequences( `CSQ_ORDER` global. :param has_polyphen: Whether the input VEP Struct has a PolyPhen annotation which will be used to modify the consequence score. Default is True. - :return: MT with better formatted consequences. + :param prioritize_protein_coding: Whether to prioritize protein-coding transcripts + when determining the worst consequence. Default is False. + :return: HT/MT with better formatted consequences. """ - if csq_order is None: - csq_order = CSQ_ORDER - csqs = hl.literal(csq_order) - - # Assign a score to each consequence based on the order in csq_order. - csq_dict = hl.literal(dict(zip(csq_order, range(len(csq_order))))) + # If has_polyphen is True, set the order of PolyPhen consequences. + polyphen_order = None + if has_polyphen: + polyphen_order = ["probably_damaging", "possibly_damaging", "benign"] def _find_worst_transcript_consequence( tcl: hl.expr.ArrayExpression, @@ -373,85 +362,61 @@ def _find_worst_transcript_consequence( :param tcl: Array of transcript consequences. :return: Worst transcript consequence. """ - flag = 500 - no_flag = flag * (1 + penalize_flags) - - # Score each consequence based on the order in csq_order. - score_expr = tcl.map( - lambda tc: csq_dict[csqs.find(lambda x: x == tc.most_severe_consequence)] + ms_csq = get_most_severe_consequence_for_summary( + csq_expr_dict={"transcript_consequences": tcl}, + csq_order=csq_order, + vep_root=vep_root, + include_transcript_csqs=True, + prioritize_protein_coding=prioritize_protein_coding, + csq_list_order=["transcript_consequences"], + add_order_by_csq_list={"transcript_consequences": polyphen_order}, ) + tcl = ms_csq.transcript_consequences - # Determine the score adjustment based on the consequence's LOF and LOF flags. - sub_expr = tcl.map( - lambda tc: ( - hl.case(missing_false=True) - .when((tc.lof == "HC") & hl.or_else(tc.lof_flags == "", True), no_flag) - .when((tc.lof == "HC") & (tc.lof_flags != ""), flag) - .when(tc.lof == "OS", 20) - .when(tc.lof == "LC", 10) - .default(0) + # Penalize LOFTEE flagged variants. + if penalize_flags: + no_flags = tcl.filter( + lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "") ) - ) - - # If requested, determine the score adjustment based on the consequence's - # PolyPhen prediction. - if has_polyphen: - polyphen_sub_expr = tcl.map( - lambda tc: ( - hl.case(missing_false=True) - .when(tc.polyphen_prediction == "probably_damaging", 0.5) - .when(tc.polyphen_prediction == "possibly_damaging", 0.25) - .when(tc.polyphen_prediction == "benign", 0.1) - .default(0) - ) + tcl = hl.if_else( + (ms_csq.lof == "HC") & ~ms_csq.no_lof_flags & (hl.len(no_flags) > 0), + no_flags, + tcl, ) - sub_expr = hl.map(lambda s, ps: s + ps, sub_expr, polyphen_sub_expr) - - # Calculate the final consequence score. - tcl = hl.map( - lambda tc, s, ss: tc.annotate(csq_score=s - ss), tcl, score_expr, sub_expr - ) # Return the worst consequence based on the calculated score. - return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0]) + return hl.or_missing(hl.len(tcl) > 0, tcl[0]) # Annotate each transcript consequence with the 'most_severe_consequence'. - transcript_csqs = t[vep_root].transcript_consequences.map( + csqs = t[vep_root].transcript_consequences.map( lambda tc: add_most_severe_consequence_to_consequence(tc, csq_order) ) # Group transcript consequences by gene and find the worst consequence for each. - gene_dict = transcript_csqs.group_by(lambda tc: tc.gene_symbol) - worst_csq_gene = gene_dict.map_values(_find_worst_transcript_consequence).values() - sorted_scores = hl.sorted(worst_csq_gene, key=lambda tc: tc.csq_score) - - # Filter transcript consequences to only include canonical transcripts and find the - # worst consequence for each gene. - canonical = transcript_csqs.filter(lambda csq: csq.canonical == 1) - gene_canonical_dict = canonical.group_by(lambda tc: tc.gene_symbol) - worst_csq_gene_canonical = gene_canonical_dict.map_values( - _find_worst_transcript_consequence - ).values() - sorted_canonical_scores = hl.sorted( - worst_csq_gene_canonical, key=lambda tc: tc.csq_score + gene_csqs = ( + csqs.group_by(lambda tc: tc.gene_symbol) + .map_values(_find_worst_transcript_consequence) + .values() + ) + + # Filter transcript consequences to only include canonical transcripts. + canonical = csqs.filter(lambda csq: csq.canonical == 1) + gene_canonical = ( + canonical.group_by(lambda tc: tc.gene_symbol) + .map_values(_find_worst_transcript_consequence) + .values() ) # Annotate the HT/MT with the worst consequence for each gene and variant. vep_data = t[vep_root].annotate( - transcript_consequences=transcript_csqs, - worst_consequence_term=csqs.find( - lambda c: transcript_csqs.map( - lambda csq: csq.most_severe_consequence - ).contains(c) - ), - worst_csq_by_gene=sorted_scores, - worst_csq_for_variant=hl.or_missing( - hl.len(sorted_scores) > 0, sorted_scores[0] - ), - worst_csq_by_gene_canonical=sorted_canonical_scores, - worst_csq_for_variant_canonical=hl.or_missing( - hl.len(sorted_canonical_scores) > 0, sorted_canonical_scores[0] + transcript_consequences=csqs, + worst_consequence_term=get_most_severe_consequence_expr( + csqs.map(lambda csq: csq.most_severe_consequence), csq_order ), + worst_csq_by_gene=gene_csqs, + worst_csq_for_variant=_find_worst_transcript_consequence(csqs), + worst_csq_by_gene_canonical=gene_canonical, + worst_csq_for_variant_canonical=_find_worst_transcript_consequence(canonical), ) return ( @@ -708,146 +673,285 @@ def get_csq_from_struct( return hl.or_missing(hl.len(csq) > 0, csq) +def filter_to_most_severe_transcript_csqs( + csq_list: hl.expr.ArrayExpression, + csq_order: List[str] = CSQ_ORDER, + filter_protein_coding: bool = False, + prioritize_loftee: bool = False, + loftee_labels: List[str] = LOFTEE_LABELS, + additional_order: Tuple[str, List[str]] = None, +) -> hl.expr.StructExpression: + """ + Filter a list of VEP consequences to all entries that have the most severe consequence. + + If `filter_protein_coding` is True, filter to only protein-coding transcripts before + determining the most severe consequence. If `prioritize_loftee` is True, prioritize + LOFTEE consequences by filtering to only LOFTEE consequences and determining the + most severe consequence. If `additional_order` is provided, additional ordering is + applied to the consequences in the list. + + .. note:: + + - If you have multiple lists of consequences and want to determine the most + severe consequence across all lists, consider using + `get_most_severe_consequence_for_summary`. + + - If you want to group consequences by gene and determine the most severe + consequence for each gene, consider using `process_consequences`. + + :param csq_list: ArrayExpression of VEP consequences. + :param csq_order: List indicating the order of VEP consequences, sorted from high to + low impact. Default is the value of the `CSQ_ORDER` global. + :param filter_protein_coding: Whether to filter to only protein-coding transcripts + before determining the most severe consequence. Default is False. + :param prioritize_loftee: Whether to prioritize LOFTEE consequences. Default is + False. + :param loftee_labels: List of LOFTEE labels in order of priority, sorted from high + to low impact. Default is the value of the `LOFTEE_LABELS` global. + :param additional_order: Tuple indicating the additional ordering to apply to the + consequences in the list. The first element is the name of the consequences list + and the second element is the order of consequences, sorted from high to low + impact. Default is None. + :return: StructExpression with the most severe consequence and the list of + consequences that match the most severe consequence. + """ + if filter_protein_coding: + csq_list = csq_list.filter(lambda x: x.biotype == "protein_coding") + + def _get_most_severe_csq( + csq_list: hl.expr.ArrayExpression, + csq_field: str = "most_severe_consequence", + csq_order: List[str] = csq_order, + ) -> Tuple[hl.expr.StringExpression, hl.expr.ArrayExpression]: + """ + Get the most severe consequence from a list of consequences and consequence order. + + :param csq_list: ArrayExpression of VEP consequences. + :param csq_field: Field to use for the most severe consequence. Default is + 'most_severe_consequence'. + :param csq_order: List indicating the order of VEP consequences, sorted from + high to low impact. + :return: Tuple containing the most severe consequence and the list of + consequences that match the most severe consequence. + """ + # Get the highest impact csq label. + ms_csq = get_most_severe_consequence_expr( + csq_list.map(lambda x: x[csq_field]), csq_order=csq_order + ) + # Filter to only consequences with the highest impact csq label. + csq_list = hl.or_missing( + hl.is_defined(ms_csq), csq_list.filter(lambda x: x[csq_field] == ms_csq) + ) + + return ms_csq, csq_list + + lof = hl.missing(hl.tstr) + no_lof_flags = hl.missing(hl.tbool) + if prioritize_loftee: + lof, lof_csq = _get_most_severe_csq( + csq_list, csq_field="lof", csq_order=loftee_labels + ) + # Check if any of the lof consequences have no lof_flags. + no_lof_flags = lof_csq.any( + lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "") + ) + # If there are no lof consequences, set the consequence list to the original + # list. + csq_list = hl.coalesce(lof_csq, csq_list) + + # Add most_severe_consequence to each consequence. + csq_list = add_most_severe_consequence_to_consequence(csq_list, csq_order) + + # Get the most severe consequence of all consequences in the list. + ms_csq, csq_list = _get_most_severe_csq(csq_list, csq_order=csq_order) + result = hl.struct( + most_severe_consequence=ms_csq, + protein_coding=filter_protein_coding, + lof=lof, + no_lof_flags=no_lof_flags, + ) + + if additional_order is not None: + # Get the highest impact consequences from the additional ordering. + _, add_csq_expr = _get_most_severe_csq( + csq_list, additional_order[0], additional_order[1] + ) + # If there are consequences from the additional ordering, set the consequence + # list to the additional ordering, otherwise keep the original list. + csq_list = hl.coalesce(add_csq_expr, csq_list) + + result = result.annotate(consequences=csq_list) + + return hl.or_missing(hl.len(csq_list) > 0, result) + + def get_most_severe_consequence_for_summary( - ht: hl.Table, + ht: Optional[hl.Table] = None, + csq_expr_dict: Optional[Dict[str, hl.expr.ArrayExpression]] = None, csq_order: List[str] = CSQ_ORDER, loftee_labels: List[str] = LOFTEE_LABELS, - by_gene: bool = False, - include_transcript_csqs=False, + vep_root: str = "vep", + include_transcript_csqs: bool = False, + prioritize_protein_coding: bool = True, + csq_list_order: Union[List[str], Tuple[str]] = ( + "transcript_consequences", + "regulatory_feature_consequences", + "motif_feature_consequences", + "intergenic_consequences", + ), + add_order_by_csq_list: Dict[str, List[str]] = None, ) -> hl.Table: """ - Prepare a hail Table for summary statistics generation. + Process multiple VEP consequences lists to determine the most severe consequence. + + Useful for generating summary annotations for VEP consequences. Adds the following annotations: - - most_severe_csq: Most severe consequence for variant - - protein_coding: Whether the variant is present on a protein-coding transcript - - lof: Whether the variant is a loss-of-function variant - - no_lof_flags: Whether the variant has any LOFTEE flags (True if no flags) + - most_severe_csq: Most severe consequence for variant. + - protein_coding: Whether the variant is present on a protein-coding transcript. + - lof: Whether the variant is a loss-of-function variant. + - no_lof_flags: Whether the variant has any LOFTEE flags (True if no flags). + + If `include_transcript_csqs` is True, an additional annotation is added: + - transcript_consequences: All transcript consequences for the most severe + consequence. + + If `prioritize_protein_coding` is True and "transcript_consequences" is in + `csq_list_order`, protein-coding transcripts are prioritized by filtering to only + protein-coding transcripts and determining the most severe consequence. If no + protein-coding transcripts are present, determine the most severe consequence for + all transcripts. If additional VEP consequences lists are requested, process those + lists in the order they appear in `csq_list_order`. + + If `add_order_by_csq_list` is provided, additional ordering is applied to the + consequences in the list. The key is the name of the consequences list and the value + is the order of consequences, sorted from high to low impact. An example use of this + parameter is to prioritize PolyPhen consequences for protein-coding transcripts. + + .. note:: + + Assumes input Table is annotated with VEP and that VEP annotations have been + filtered to canonical transcripts if wanted. - Assumes input Table is annotated with VEP and that VEP annotations have been - filtered to canonical transcripts. - :param ht: Input Table. + :param csqs: VEP transcript consequences array expression. + :param csq_lists: List of additional VEP consequences lists to be processed in + the order they should be considered for the most severe consequence. + :return: Struct containing summary annotations for the most severe consequence. + + :param ht: Input Table with VEP annotations. One of `ht` or `csq_expr_dict` must be + provided. Default is None. + :param csq_expr_dict: Dictionary of VEP consequences lists to be processed. One of + `ht` or `csq_expr_dict` must be provided. Default is None. :param csq_order: Order of VEP consequences, sorted from high to low impact. Default is CSQ_ORDER. :param loftee_labels: Annotations added by LOFTEE, sorted from high to low impact. Default is LOFTEE_LABELS. - :param by_gene: Whether to group by gene. Default is False. + :param vep_root: Name used for VEP annotation. Default is 'vep'. :param include_transcript_csqs: Whether to include all transcript consequences for the most severe consequence. Default is False. + :param prioritize_protein_coding: Whether to prioritize protein-coding transcripts + when determining the worst consequence. Default is True. + :param csq_list_order: Order of VEP consequences lists to be processed. Default is + ('transcript_consequences', 'regulatory_feature_consequences', + 'motif_feature_consequences', 'intergenic_consequences'). + :param add_order_by_csq_list: Dictionary of additional ordering for VEP consequences + lists. The key is the name of the consequences list and the value is the order + of consequences, sorted from high to low impact. Default is None. :return: Table annotated with VEP summary annotations. """ - # Get type of transcript_consequences field for later use with hl.missing. - tc = ht.vep.transcript_consequences.map( - lambda x: x.annotate(most_severe_consequence=hl.missing(hl.tstr)) - ) - tc_dtype = tc.dtype + if ht is None and csq_expr_dict is None: + raise ValueError("Either `ht` or `csq_expr_dict` must be provided!") + + if csq_expr_dict is None: + vep_expr = ht[vep_root] + else: + vep_expr = csq_expr_dict + + if add_order_by_csq_list is None: + add_order_by_csq_list = {} def _get_most_severe_csq( - csq_list: hl.expr.ArrayExpression, + csq_list: hl.ArrayExpression, protein_coding: bool = False, - include_tc: bool = False, - ) -> hl.expr.StructExpression: - """ - Process VEP consequences to generate summary annotations. - - :param csq_list: VEP consequences list to be processed. - :param protein_coding: Whether to filter to only protein-coding transcripts. - Default is False. - :param include_tc: Whether to include all transcript consequences for the most - severe consequence. Default is False. - :return: Struct containing summary annotations. - """ - lof = hl.missing(hl.tstr) - no_lof_flags = hl.missing(hl.tbool) - if protein_coding: - # Filter to only protein-coding transcripts. - csq_list = csq_list.filter(lambda x: x.biotype == "protein_coding") - # Get the highest impact LOFTEE label. - lof = get_most_severe_consequence_expr( - csq_list.map(lambda x: x.lof), csq_order=loftee_labels - ) - # Filter to only consequences with the highest impact LOFTEE label. - lof_csq = hl.or_missing( - hl.is_defined(lof), csq_list.filter(lambda x: x.lof == lof) - ) - # Check if any of the lof consequences have no lof_flags. - no_lof_flags = lof_csq.any( - lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "") - ) - csq_list = hl.coalesce(lof_csq, csq_list) - - # Add most_severe_consequence to each consequence. - csq_list = add_most_severe_consequence_to_consequence(csq_list, csq_order) - - # Get the most severe consequence of all consequences in the list. - result = hl.struct( - most_severe_csq=get_most_severe_consequence_expr( - csq_list.map(lambda x: x.most_severe_consequence), csq_order=csq_order - ), - protein_coding=protein_coding, - lof=lof, - no_lof_flags=no_lof_flags, + include_csqs: bool = False, + additional_order: Optional[List[str]] = None, + ) -> hl.StructExpression: + csq_list = filter_to_most_severe_transcript_csqs( + csq_list, + csq_order=csq_order, + filter_protein_coding=protein_coding, + prioritize_loftee=protein_coding, + loftee_labels=loftee_labels, + additional_order=additional_order, ) - # Add an annotation indicating all transcript consequences that match the most - # severe consequence. If transcript consequences are requested, but the - # current consequence list is not for transcript_consequences, include the - # annotation, but set values to missing. - if include_transcript_csqs: - tc_expr = hl.missing(tc_dtype) - if include_tc: - tc_expr = csq_list.filter( - lambda x: x.most_severe_consequence == result.most_severe_csq - ) - # Rename most_severe_consequence to most_severe_csq for consistency. - tc_expr = tc_expr.map( - lambda x: x.rename({"most_severe_consequence": "most_severe_csq"}) - ) - result = result.annotate(transcript_consequences=tc_expr) + # Drop the consequences field if not requested. + if not include_csqs: + csq_list = csq_list.drop("consequences") - return hl.or_missing(hl.len(csq_list) > 0, result) + return csq_list - def _get_most_severe_csq_multi( - csqs: hl.ArrayExpression, - csq_lists: List[hl.ArrayExpression] = [], - ) -> hl.expr.StructExpression: - """ - Process multiple VEP consequences lists to determine the most severe consequence. - - First, filter to only protein-coding transcripts and determine the most severe - consequence. If no protein-coding transcripts are present, determine the most - severe consequence for all transcripts. If additional VEP consequences lists are - provided, process those lists in the order they are provided. - - :param csqs: VEP transcript consequences array expression. - :param csq_lists: List of additional VEP consequences lists to be processed in - the order they should be considered for the most severe consequence. - :return: Struct containing summary annotations for the most severe consequence. - """ - return hl.coalesce( - _get_most_severe_csq(csqs, protein_coding=True, include_tc=True), - _get_most_severe_csq(csqs, include_tc=True), - *[_get_most_severe_csq(c) for c in csq_lists], + # Get type of transcript_consequences field for use with hl.missing for other + # consequence lists. + tc_dtype = None + if include_transcript_csqs and "transcript_consequences" in csq_list_order: + tc_dtype = ( + vep_expr["transcript_consequences"] + .map(lambda x: x.annotate(most_severe_consequence=hl.missing(hl.tstr))) + .dtype ) - # Annotate the Table with the most severe consequence by gene or variant. - if by_gene: - return ht.annotate( - worst_csq_by_gene=tc.group_by(lambda x: (x.gene_id, x.gene_symbol)) - .map_values(lambda x: _get_most_severe_csq_multi(x)) - .items() - .map(lambda x: x[1].annotate(gene_id=x[0][0], gene_symbol=x[0][1])) + # Get the most severe consequence for each VEP consequences list. + ms_csqs_list = [] + for c in csq_list_order: + if c not in vep_expr: + logger.warning(f"VEP consequences list %s not found in input!", c) + continue + csqs = vep_expr[c] + is_tc = c == "transcript_consequences" + ms_csqs = _get_most_severe_csq( + csqs, + # Only include transcript consequences if requested and the current list is + # for transcript consequences. + include_csqs=include_transcript_csqs and is_tc, + additional_order=add_order_by_csq_list.get(c), ) + + # If prioritizing protein-coding transcripts, get the most severe consequence + # for protein-coding transcripts and coalesce with the current most severe + # transcript consequence. + if is_tc and prioritize_protein_coding: + ms_csqs = hl.coalesce( + _get_most_severe_csq( + csqs, + protein_coding=True, + include_csqs=include_transcript_csqs, + additional_order=add_order_by_csq_list.get(c), + ), + ms_csqs, + ) + + # If the current list is not for transcript consequences, annotate with missing + # for transcript_consequences if transcript consequences are requested. + if tc_dtype is not None: + ms_csqs = ms_csqs.transmute( + transcript_consequences=( + ms_csqs.consequences if is_tc else hl.missing(tc_dtype) + ) + ) + ms_csqs_list.append(ms_csqs) + + ms_csqs = hl.coalesce(*ms_csqs_list) + + # Rename most_severe_consequence to most_severe_csq for consistency with older + # version of code. + ms_csqs = ms_csqs.rename({"most_severe_consequence": "most_severe_csq"}) + + if csq_expr_dict is None: + return ht.annotate(**ms_csqs) else: - # If most severe consequence is requested for each variant, include additional - # VEP consequence lists. - csq_lists = [ - ht.vep.regulatory_feature_consequences, - ht.vep.motif_feature_consequences, - ht.vep.intergenic_consequences, - ] - return ht.annotate(**_get_most_severe_csq_multi(tc, csq_lists)) + return ms_csqs def filter_vep_transcript_csqs( From be6c4b37df4d005a7a34d9a0136dc088b3feb0cb Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Wed, 12 Jun 2024 09:49:46 -0600 Subject: [PATCH 05/33] deprecate get_most_severe_consequence_for_summary --- gnomad/utils/vep.py | 84 +++++++++++++++++++++++++-------------------- requirements.txt | 1 + 2 files changed, 47 insertions(+), 38 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 6774914e7..f8a2d7649 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -7,6 +7,7 @@ from typing import Callable, Dict, List, Optional, Tuple, Union import hail as hl +from deprecated import deprecated from gnomad.resources.resource_utils import VersionedTableResource from gnomad.utils.filtering import combine_functions @@ -363,10 +364,9 @@ def _find_worst_transcript_consequence( :param tcl: Array of transcript consequences. :return: Worst transcript consequence. """ - ms_csq = get_most_severe_consequence_for_summary( - csq_expr_dict={"transcript_consequences": tcl}, + ms_csq = get_most_severe_csq_from_multiple_csq_lists( + hl.struct(transcript_consequences=tcl), csq_order=csq_order, - vep_root=vep_root, include_transcript_csqs=True, prioritize_protein_coding=prioritize_protein_coding, csq_list_order=["transcript_consequences"], @@ -695,7 +695,7 @@ def filter_to_most_severe_transcript_csqs( - If you have multiple lists of consequences and want to determine the most severe consequence across all lists, consider using - `get_most_severe_consequence_for_summary`. + `get_most_severe_csq_from_multiple_csq_lists`. - If you want to group consequences by gene and determine the most severe consequence for each gene, consider using `process_consequences`. @@ -786,12 +786,45 @@ def _get_most_severe_csq( return hl.or_missing(hl.len(csq_list) > 0, result) +@deprecated(reason="Replaced by get_most_severe_csq_from_multiple_csq_lists") def get_most_severe_consequence_for_summary( - ht: Optional[hl.Table] = None, - csq_expr_dict: Optional[Dict[str, hl.expr.ArrayExpression]] = None, + ht: hl.Table, + csq_order: List[str] = CSQ_ORDER, + loftee_labels: List[str] = LOFTEE_LABELS, +) -> hl.Table: + """ + Use `get_most_severe_csq_from_multiple_csq_lists` instead, this function is deprecated. + + Prepare a hail Table for summary statistics generation. + + Adds the following annotations: + - most_severe_csq: Most severe consequence for variant + - protein_coding: Whether the variant is present on a protein-coding transcript + - lof: Whether the variant is a loss-of-function variant + - no_lof_flags: Whether the variant has any LOFTEE flags (True if no flags) + + Assumes input Table is annotated with VEP and that VEP annotations have been filtered to canonical transcripts. + + :param ht: Input Table. + :param csq_order: Order of VEP consequences, sorted from high to low impact. Default is CSQ_ORDER. + :param loftee_labels: Annotations added by LOFTEE. Default is LOFTEE_LABELS. + :return: Table annotated with VEP summary annotations. + """ + csq_expr = get_most_severe_csq_from_multiple_csq_lists( + ht.vep, csq_order=csq_order, loftee_labels=loftee_labels + ) + + # Rename most_severe_consequence to most_severe_csq for consistency with older + # version of code. + csq_expr = csq_expr.rename({"most_severe_consequence": "most_severe_csq"}) + + return ht.annotate(**csq_expr) + + +def get_most_severe_csq_from_multiple_csq_lists( + csq_expr: hl.expr.StructExpression, csq_order: List[str] = CSQ_ORDER, loftee_labels: List[str] = LOFTEE_LABELS, - vep_root: str = "vep", include_transcript_csqs: bool = False, prioritize_protein_coding: bool = True, csq_list_order: Union[List[str], Tuple[str]] = ( @@ -834,21 +867,12 @@ def get_most_severe_consequence_for_summary( Assumes input Table is annotated with VEP and that VEP annotations have been filtered to canonical transcripts if wanted. - - :param csqs: VEP transcript consequences array expression. - :param csq_lists: List of additional VEP consequences lists to be processed in - the order they should be considered for the most severe consequence. - :return: Struct containing summary annotations for the most severe consequence. - - :param ht: Input Table with VEP annotations. One of `ht` or `csq_expr_dict` must be - provided. Default is None. - :param csq_expr_dict: Dictionary of VEP consequences lists to be processed. One of - `ht` or `csq_expr_dict` must be provided. Default is None. + :param csq_expr: StructExpression of VEP consequences to get the most severe + consequence from. :param csq_order: Order of VEP consequences, sorted from high to low impact. Default is CSQ_ORDER. :param loftee_labels: Annotations added by LOFTEE, sorted from high to low impact. Default is LOFTEE_LABELS. - :param vep_root: Name used for VEP annotation. Default is 'vep'. :param include_transcript_csqs: Whether to include all transcript consequences for the most severe consequence. Default is False. :param prioritize_protein_coding: Whether to prioritize protein-coding transcripts @@ -861,14 +885,6 @@ def get_most_severe_consequence_for_summary( of consequences, sorted from high to low impact. Default is None. :return: Table annotated with VEP summary annotations. """ - if ht is None and csq_expr_dict is None: - raise ValueError("Either `ht` or `csq_expr_dict` must be provided!") - - if csq_expr_dict is None: - vep_expr = ht[vep_root] - else: - vep_expr = csq_expr_dict - if add_order_by_csq_list is None: add_order_by_csq_list = {} @@ -898,7 +914,7 @@ def _get_most_severe_csq( tc_dtype = None if include_transcript_csqs and "transcript_consequences" in csq_list_order: tc_dtype = ( - vep_expr["transcript_consequences"] + csq_expr["transcript_consequences"] .map(lambda x: x.annotate(most_severe_consequence=hl.missing(hl.tstr))) .dtype ) @@ -906,10 +922,11 @@ def _get_most_severe_csq( # Get the most severe consequence for each VEP consequences list. ms_csqs_list = [] for c in csq_list_order: - if c not in vep_expr: + if c not in csq_expr: logger.warning(f"VEP consequences list %s not found in input!", c) continue - csqs = vep_expr[c] + + csqs = csq_expr[c] is_tc = c == "transcript_consequences" ms_csqs = _get_most_severe_csq( csqs, @@ -945,15 +962,6 @@ def _get_most_severe_csq( ms_csqs = hl.coalesce(*ms_csqs_list) - # Rename most_severe_consequence to most_severe_csq for consistency with older - # version of code. - ms_csqs = ms_csqs.rename({"most_severe_consequence": "most_severe_csq"}) - - if csq_expr_dict is None: - return ht.annotate(**ms_csqs) - else: - return ms_csqs - def filter_vep_transcript_csqs( t: Union[hl.Table, hl.MatrixTable], diff --git a/requirements.txt b/requirements.txt index f30acc1d6..563a61d1b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ annoy +deprecated ga4gh.vrs[extras]==0.8.4 hail hdbscan From e40a74d09cbfc0c098c7fb759e0374b5714098cf Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Wed, 12 Jun 2024 09:50:29 -0600 Subject: [PATCH 06/33] Pull out `prioritize_loftee_hc_no_flags` from `process_consequences` --- gnomad/utils/vep.py | 55 ++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index f8a2d7649..33a135ead 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -138,6 +138,11 @@ Set containing loss-of-function consequence strings. """ +POLYPHEN_ORDER = ["probably_damaging", "possibly_damaging", "benign"] +""" +Order of PolyPhen predictions from most to least severe. +""" + def get_vep_help(vep_config_path: Optional[str] = None): """ @@ -272,7 +277,7 @@ def vep_or_lookup_vep( def get_most_severe_consequence_expr( - csq_expr: hl.expr.CollectionExpression, + csq_expr: hl.expr.ArrayExpression, csq_order: Optional[List[str]] = None, ) -> hl.expr.StringExpression: """ @@ -281,7 +286,7 @@ def get_most_severe_consequence_expr( This is for a given transcript, as there are often multiple annotations for a single transcript: e.g. splice_region_variant&intron_variant -> splice_region_variant - :param csq_expr: CollectionExpression of consequences. + :param csq_expr: ArrayExpression of consequences. :param csq_order: Optional list indicating the order of VEP consequences, sorted from high to low impact. Default is None, which uses the value of the `CSQ_ORDER` global. @@ -319,6 +324,36 @@ def add_most_severe_consequence_to_consequence( return tc.map(lambda x: x.annotate(most_severe_consequence=csq(x))) +def prioritize_loftee_hc_no_flags( + most_severe_csq: Union[hl.Table, hl.expr.StructExpression], +) -> hl.expr.StructExpression: + """ + Prioritize LOFTEE HC LOF consequences with no LOF flags. + + Given the result of `get_most_severe_csq_from_multiple_csq_lists`, this function + will filter the transcript consequences to only include those with no LOF flags if + the most severe consequence is a LOFTEE HC LOF and there are transcript consequences + with no LOF flags. + + :param most_severe_csq: Table or StructExpression with most severe consequence + information. This should be the result of + `get_most_severe_csq_from_multiple_csq_lists`. + :return: StructExpression with HC LOF consequences with no LOF flags if they exist, + otherwise all transcript consequences. + """ + tcl = most_severe_csq.transcript_consequences + + # Filter transcript consequences to only consequences that have no LOF flags. + no_flags = tcl.filter(lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "")) + # If the most severe consequence is a LOFTEE HC LOF and there are transcript + # consequences with no LOF flags, return only those transcripts. + return hl.if_else( + (most_severe_csq.lof == "HC") & (hl.len(no_flags) > 0), + no_flags, + tcl, + ) + + def process_consequences( t: Union[hl.MatrixTable, hl.Table], vep_root: str = "vep", @@ -333,6 +368,7 @@ def process_consequences( `most_severe_consequence` is the worst consequence for a transcript. .. note:: + From gnomAD v4.0 on, the PolyPhen annotation was removed from the VEP Struct in the release HTs. When using this function with gnomAD v4.0 or later, set `has_polyphen` to False. @@ -351,9 +387,7 @@ def process_consequences( :return: HT/MT with better formatted consequences. """ # If has_polyphen is True, set the order of PolyPhen consequences. - polyphen_order = None - if has_polyphen: - polyphen_order = ["probably_damaging", "possibly_damaging", "benign"] + polyphen_order = POLYPHEN_ORDER if has_polyphen else None def _find_worst_transcript_consequence( tcl: hl.expr.ArrayExpression, @@ -375,17 +409,8 @@ def _find_worst_transcript_consequence( tcl = ms_csq.transcript_consequences # Penalize LOFTEE flagged variants. - if penalize_flags: - no_flags = tcl.filter( - lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "") - ) - tcl = hl.if_else( - (ms_csq.lof == "HC") & ~ms_csq.no_lof_flags & (hl.len(no_flags) > 0), - no_flags, - tcl, - ) + tcl = prioritize_loftee_hc_no_flags(ms_csq) if penalize_flags else tcl - # Return the worst consequence based on the calculated score. return hl.or_missing(hl.len(tcl) > 0, tcl[0]) # Annotate each transcript consequence with the 'most_severe_consequence'. From 2bf94f15b6a5cc524750ef3eb18077ab31ebec93 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Wed, 12 Jun 2024 12:22:01 -0600 Subject: [PATCH 07/33] Make add_most_severe_csq_to_tc_within_vep_root and add_most_severe_consequence_to_consequence more flexible --- gnomad/utils/vep.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 33a135ead..5b952f986 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -302,6 +302,7 @@ def get_most_severe_consequence_expr( def add_most_severe_consequence_to_consequence( tc: Union[hl.expr.StructExpression, hl.expr.ArrayExpression], csq_order: Optional[List[str]] = None, + most_severe_csq_field: str = "most_severe_consequence", ) -> Union[hl.expr.StructExpression, hl.expr.ArrayExpression]: """ Add a `most_severe_consequence` field to a transcript consequence or array of transcript consequences. @@ -314,14 +315,16 @@ def add_most_severe_consequence_to_consequence( :param csq_order: Optional list indicating the order of VEP consequences, sorted from high to low impact. Default is None, which uses the value of the `CSQ_ORDER` global. + :param most_severe_csq_field: Field name to use for most severe consequence. Default + is 'most_severe_consequence'. :return: Transcript consequence or array of transcript consequences annotated with the most severe consequence. """ csq = lambda x: get_most_severe_consequence_expr(x.consequence_terms, csq_order) if isinstance(tc, hl.expr.StructExpression): - return tc.annotate(most_severe_consequence=csq(tc)) + return tc.annotate(**{most_severe_csq_field: csq(tc)}) else: - return tc.map(lambda x: x.annotate(most_severe_consequence=csq(x))) + return tc.map(lambda x: x.annotate(**{most_severe_csq_field: csq(x)})) def prioritize_loftee_hc_no_flags( @@ -1102,24 +1105,41 @@ def filter_vep_transcript_csqs( def add_most_severe_csq_to_tc_within_vep_root( - t: Union[hl.Table, hl.MatrixTable], vep_root: str = "vep" + t: Union[hl.Table, hl.MatrixTable], + vep_root: str = "vep", + csq_field: str = "transcript_consequences", + most_severe_csq_field: str = "most_severe_consequence", + csq_order: Optional[List[str]] = None, ) -> Union[hl.Table, hl.MatrixTable]: """ - Add most_severe_consequence annotation to 'transcript_consequences' within the vep root annotation. + Add `most_severe_csq_field` annotation to `csq_field` within the `vep_root` annotation. :param t: Input Table or MatrixTable. :param vep_root: Root for vep annotation (probably vep). + :param csq_field: Field name of VEP consequences ArrayExpression within `vep_root` + to add most severe consequence to. Default is 'transcript_consequences'. + :param most_severe_csq_field: Field name to use for most severe consequence. Default + is 'most_severe_consequence'. + :param csq_order: Optional list indicating the order of VEP consequences, sorted + from high to low impact. Default is None, which uses the value of the + `CSQ_ORDER` global. :return: Table or MatrixTable with most_severe_consequence annotation added. """ - annotation = t[vep_root].annotate( - transcript_consequences=t[vep_root].transcript_consequences.map( - add_most_severe_consequence_to_consequence - ) + vep_expr = t[vep_root] + csq_expr = vep_expr[csq_field] + vep_expr = vep_expr.annotate( + **{ + csq_field: add_most_severe_consequence_to_consequence( + csq_expr, + csq_order=csq_order, + most_severe_csq_field=most_severe_csq_field, + ) + } ) return ( - t.annotate_rows(**{vep_root: annotation}) + t.annotate_rows(**{vep_root: vep_expr}) if isinstance(t, hl.MatrixTable) - else t.annotate(**{vep_root: annotation}) + else t.annotate(**{vep_root: vep_expr}) ) From f0450466e38fa5c12763b1ff834de8339f5a78bb Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Wed, 12 Jun 2024 15:26:02 -0600 Subject: [PATCH 08/33] Clean-up vep consequence functions --- gnomad/utils/vep.py | 280 +++++++++++++++++++++++++++----------------- 1 file changed, 173 insertions(+), 107 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 5b952f986..dca17068d 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -308,8 +308,9 @@ def add_most_severe_consequence_to_consequence( Add a `most_severe_consequence` field to a transcript consequence or array of transcript consequences. For a single transcript consequence, `tc` should be a StructExpression with a - `consequence_terms` field. For an array of transcript consequences, `tc` should be - an ArrayExpression of StructExpressions with a `consequence_terms` field. + `consequence_terms` field, e.g. Struct(consequence_terms=['missense_variant']). + For an array of transcript consequences, `tc` should be an ArrayExpression of + StructExpressions with a `consequence_terms` field. :param tc: Transcript consequence or array of transcript consequences to annotate. :param csq_order: Optional list indicating the order of VEP consequences, sorted @@ -412,7 +413,7 @@ def _find_worst_transcript_consequence( tcl = ms_csq.transcript_consequences # Penalize LOFTEE flagged variants. - tcl = prioritize_loftee_hc_no_flags(ms_csq) if penalize_flags else tcl + tcl = _prioritize_loftee_hc_no_flags(ms_csq) if penalize_flags else tcl return hl.or_missing(hl.len(tcl) > 0, tcl[0]) @@ -702,116 +703,112 @@ def get_csq_from_struct( return hl.or_missing(hl.len(csq) > 0, csq) -def filter_to_most_severe_transcript_csqs( - csq_list: hl.expr.ArrayExpression, +def filter_to_most_severe_consequences( + csq_expr: hl.expr.ArrayExpression, csq_order: List[str] = CSQ_ORDER, - filter_protein_coding: bool = False, - prioritize_loftee: bool = False, - loftee_labels: List[str] = LOFTEE_LABELS, - additional_order: Tuple[str, List[str]] = None, -) -> hl.expr.StructExpression: + most_severe_csq_field: str = "most_severe_consequence", +) -> hl.expr.ArrayExpression: """ - Filter a list of VEP consequences to all entries that have the most severe consequence. + Filter an array of VEP consequences to all entries that have the most severe consequence. - If `filter_protein_coding` is True, filter to only protein-coding transcripts before - determining the most severe consequence. If `prioritize_loftee` is True, prioritize - LOFTEE consequences by filtering to only LOFTEE consequences and determining the - most severe consequence. If `additional_order` is provided, additional ordering is - applied to the consequences in the list. + This function expects that all entries in the `csq_list` are already annotated with + the most severe consequence using `add_most_severe_consequence_to_consequence` or + `add_most_severe_csq_to_tc_within_vep_root`. .. note:: - If you have multiple lists of consequences and want to determine the most severe consequence across all lists, consider using - `get_most_severe_csq_from_multiple_csq_lists`. + `get_most_severe_consequence_for_summary`. - If you want to group consequences by gene and determine the most severe consequence for each gene, consider using `process_consequences`. - :param csq_list: ArrayExpression of VEP consequences. + :param csq_expr: ArrayExpression of VEP consequences. :param csq_order: List indicating the order of VEP consequences, sorted from high to low impact. Default is the value of the `CSQ_ORDER` global. - :param filter_protein_coding: Whether to filter to only protein-coding transcripts - before determining the most severe consequence. Default is False. - :param prioritize_loftee: Whether to prioritize LOFTEE consequences. Default is - False. - :param loftee_labels: List of LOFTEE labels in order of priority, sorted from high - to low impact. Default is the value of the `LOFTEE_LABELS` global. - :param additional_order: Tuple indicating the additional ordering to apply to the - consequences in the list. The first element is the name of the consequences list - and the second element is the order of consequences, sorted from high to low - impact. Default is None. - :return: StructExpression with the most severe consequence and the list of - consequences that match the most severe consequence. + :param most_severe_csq_field: Field containing the most severe consequence for each + consequence in `csq_list`. Default is 'most_severe_consequence'. + :return: ArrayExpression with of the consequences that match the most severe + consequence. """ - if filter_protein_coding: - csq_list = csq_list.filter(lambda x: x.biotype == "protein_coding") + # Get the highest impact csq label. + ms_csq = get_most_severe_consequence_expr( + csq_expr.map(lambda x: x[most_severe_csq_field]), csq_order=csq_order + ) - def _get_most_severe_csq( - csq_list: hl.expr.ArrayExpression, - csq_field: str = "most_severe_consequence", - csq_order: List[str] = csq_order, - ) -> Tuple[hl.expr.StringExpression, hl.expr.ArrayExpression]: - """ - Get the most severe consequence from a list of consequences and consequence order. + # Filter to only consequences with the highest impact csq label, and return missing + # if the most severe consequence is missing. + return hl.or_missing( + hl.is_defined(ms_csq), + csq_expr.filter(lambda x: x[most_severe_csq_field] == ms_csq), + ) - :param csq_list: ArrayExpression of VEP consequences. - :param csq_field: Field to use for the most severe consequence. Default is - 'most_severe_consequence'. - :param csq_order: List indicating the order of VEP consequences, sorted from - high to low impact. - :return: Tuple containing the most severe consequence and the list of - consequences that match the most severe consequence. - """ - # Get the highest impact csq label. - ms_csq = get_most_severe_consequence_expr( - csq_list.map(lambda x: x[csq_field]), csq_order=csq_order - ) - # Filter to only consequences with the highest impact csq label. - csq_list = hl.or_missing( - hl.is_defined(ms_csq), csq_list.filter(lambda x: x[csq_field] == ms_csq) - ) - return ms_csq, csq_list +# TODO: Can add this to `filter_vep_transcript_csqs`, but need to also make that +# function work with just an array of transcript consequences. +def filter_vep_consequences_by_loftee( + csq_expr: hl.expr.ArrayExpression, + loftee_labels: Optional[List[str]] = None, + no_lof_flags: bool = False, + keep: bool = True, +) -> hl.expr.StructExpression: + """ + Filter VEP transcript consequences by LOFTEE. - lof = hl.missing(hl.tstr) - no_lof_flags = hl.missing(hl.tbool) - if prioritize_loftee: - lof, lof_csq = _get_most_severe_csq( - csq_list, csq_field="lof", csq_order=loftee_labels - ) - # Check if any of the lof consequences have no lof_flags. - no_lof_flags = lof_csq.any( + :param csq_expr: ArrayExpression of VEP consequences with LOFTEE annotations. + :param loftee_labels: List of LOFTEE labels to filter to. Default is None, which + filters to all LOFTEE labels. + :param no_lof_flags: Whether to filter to consequences with no LOFTEE flags. + Default is False. + :param keep: Whether to keep the consequences that match the filter criteria. + Default is True. + :return: StructExpression with the filtered consequences. + """ + filter_criteria = [lambda csq: True] + + if loftee_labels: + logger.info("Filtering to LOFTEE labels: %s...", loftee_labels) + filter_criteria.append(lambda x: hl.set(loftee_labels).contains(x.lof)) + + if no_lof_flags: + logger.info("Filtering to consequences with no LOFTEE flags...") + filter_criteria.append( lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "") ) - # If there are no lof consequences, set the consequence list to the original - # list. - csq_list = hl.coalesce(lof_csq, csq_list) - - # Add most_severe_consequence to each consequence. - csq_list = add_most_severe_consequence_to_consequence(csq_list, csq_order) - - # Get the most severe consequence of all consequences in the list. - ms_csq, csq_list = _get_most_severe_csq(csq_list, csq_order=csq_order) - result = hl.struct( - most_severe_consequence=ms_csq, - protein_coding=filter_protein_coding, - lof=lof, - no_lof_flags=no_lof_flags, - ) - if additional_order is not None: - # Get the highest impact consequences from the additional ordering. - _, add_csq_expr = _get_most_severe_csq( - csq_list, additional_order[0], additional_order[1] - ) - # If there are consequences from the additional ordering, set the consequence - # list to the additional ordering, otherwise keep the original list. - csq_list = hl.coalesce(add_csq_expr, csq_list) + return csq_expr.filter(lambda x: combine_functions(filter_criteria, x), keep=keep) + + +def _prioritize_loftee_hc_no_flags( + most_severe_csq: Union[hl.Table, hl.expr.StructExpression], +) -> hl.expr.StructExpression: + """ + Prioritize LOFTEE HC LOF consequences with no LOF flags. + + Given the result of `get_most_severe_csq_from_multiple_csq_lists`, this function + will filter the transcript consequences to only include those with no LOF flags if + the most severe consequence is a LOFTEE HC LOF and there are transcript consequences + with no LOF flags. - result = result.annotate(consequences=csq_list) + :param most_severe_csq: Table or StructExpression with most severe consequence + information. This should be the result of + `get_most_severe_csq_from_multiple_csq_lists`. + :return: StructExpression with HC LOF consequences with no LOF flags if they exist, + otherwise all transcript consequences. + """ + tcl = most_severe_csq.transcript_consequences - return hl.or_missing(hl.len(csq_list) > 0, result) + # Filter transcript consequences to only consequences that have no LOF flags. + no_flags = filter_vep_consequences_by_loftee(tcl, no_lof_flags=True) + + # If the most severe consequence is a LOFTEE HC LOF and there are transcript + # consequences with no LOF flags, return only those transcripts. + return hl.if_else( + (most_severe_csq.lof == "HC") & (hl.len(no_flags) > 0), + no_flags, + tcl, + ) @deprecated(reason="Replaced by get_most_severe_csq_from_multiple_csq_lists") @@ -850,7 +847,7 @@ def get_most_severe_consequence_for_summary( def get_most_severe_csq_from_multiple_csq_lists( - csq_expr: hl.expr.StructExpression, + vep_expr: hl.expr.StructExpression, csq_order: List[str] = CSQ_ORDER, loftee_labels: List[str] = LOFTEE_LABELS, include_transcript_csqs: bool = False, @@ -895,7 +892,7 @@ def get_most_severe_csq_from_multiple_csq_lists( Assumes input Table is annotated with VEP and that VEP annotations have been filtered to canonical transcripts if wanted. - :param csq_expr: StructExpression of VEP consequences to get the most severe + :param vep_expr: StructExpression of VEP consequences to get the most severe consequence from. :param csq_order: Order of VEP consequences, sorted from high to low impact. Default is CSQ_ORDER. @@ -920,17 +917,79 @@ def _get_most_severe_csq( csq_list: hl.ArrayExpression, protein_coding: bool = False, include_csqs: bool = False, + prioritize_loftee: bool = False, additional_order: Optional[List[str]] = None, ) -> hl.StructExpression: - csq_list = filter_to_most_severe_transcript_csqs( - csq_list, - csq_order=csq_order, - filter_protein_coding=protein_coding, - prioritize_loftee=protein_coding, - loftee_labels=loftee_labels, - additional_order=additional_order, + """ + Filter a list of consequences to those that have the most severe consequence. + + If `protein_coding` is True, filter to only protein-coding transcripts before + determining the most severe consequence. If `prioritize_loftee` is True, + prioritize LOFTEE consequences by filtering to only LOFTEE consequences and + determining the most severe consequence. If `additional_order` is provided, + additional ordering is applied to the consequences in the list. + + :param csq_list: ArrayExpression of VEP consequences. + :param protein_coding: Whether to filter to only protein-coding transcripts + before determining the most severe consequence. Default is False. + :param include_csqs: Whether to include all transcript consequences for the most + severe consequence. Default is False. + :param prioritize_loftee: Whether to prioritize LOFTEE consequences. Default is + False. + :param additional_order: Tuple indicating the additional ordering to apply to + the consequences in the list. The first element is the name of the + consequences list and the second element is the order of consequences, + sorted from high to low impact. Default is None. + :return: StructExpression with the most severe consequence and the list of + consequences that match the most severe consequence. + """ + if protein_coding: + csq_list = csq_list.filter(lambda x: x.biotype == "protein_coding") + + lof = hl.missing(hl.tstr) + no_lof_flags = hl.missing(hl.tbool) + if prioritize_loftee: + lof_csq = filter_to_most_severe_consequences(csq_list, loftee_labels, "lof") + lof = lof_csq[0].lof + + # Check if any of the lof consequences have no lof_flags. + no_lof_flags = ( + hl.len(filter_vep_consequences_by_loftee(lof_csq, no_lof_flags=True)) + > 0 + ) + + # If there are no lof consequences, set the consequence list to the original + # list. + csq_list = hl.coalesce(lof_csq, csq_list) + + # Add most_severe_consequence to each consequence. + csq_list = add_most_severe_consequence_to_consequence(csq_list, csq_order) + + # Get the most severe consequence of all consequences in the list. + csq_list = filter_to_most_severe_consequences(csq_list, csq_order=csq_order) + ms_csq = csq_list[0].most_severe_consequence + result = hl.struct( + most_severe_consequence=ms_csq, + protein_coding=protein_coding, + lof=lof, + no_lof_flags=no_lof_flags, ) + if additional_order is not None: + # Get the highest impact consequences from the additional ordering. + add_csq_expr = filter_to_most_severe_consequences( + csq_list, + most_severe_csq_field=additional_order[0], + csq_order=additional_order[1], + ) + # If there are consequences from the additional ordering, set the + # consequence list to the additional ordering, otherwise keep the original + # list. + csq_list = hl.coalesce(add_csq_expr, csq_list) + + result = result.annotate(consequences=csq_list) + csq_list = hl.or_missing(hl.len(csq_list) > 0, result) + # Drop the consequences field if not requested. if not include_csqs: csq_list = csq_list.drop("consequences") @@ -942,7 +1001,7 @@ def _get_most_severe_csq( tc_dtype = None if include_transcript_csqs and "transcript_consequences" in csq_list_order: tc_dtype = ( - csq_expr["transcript_consequences"] + vep_expr["transcript_consequences"] .map(lambda x: x.annotate(most_severe_consequence=hl.missing(hl.tstr))) .dtype ) @@ -950,16 +1009,16 @@ def _get_most_severe_csq( # Get the most severe consequence for each VEP consequences list. ms_csqs_list = [] for c in csq_list_order: - if c not in csq_expr: + if c not in vep_expr: logger.warning(f"VEP consequences list %s not found in input!", c) continue - - csqs = csq_expr[c] + csqs = vep_expr[c] is_tc = c == "transcript_consequences" ms_csqs = _get_most_severe_csq( csqs, # Only include transcript consequences if requested and the current list is # for transcript consequences. + prioritize_loftee=True if is_tc else False, include_csqs=include_transcript_csqs and is_tc, additional_order=add_order_by_csq_list.get(c), ) @@ -972,6 +1031,7 @@ def _get_most_severe_csq( _get_most_severe_csq( csqs, protein_coding=True, + prioritize_loftee=True, include_csqs=include_transcript_csqs, additional_order=add_order_by_csq_list.get(c), ), @@ -981,15 +1041,21 @@ def _get_most_severe_csq( # If the current list is not for transcript consequences, annotate with missing # for transcript_consequences if transcript consequences are requested. if tc_dtype is not None: - ms_csqs = ms_csqs.transmute( - transcript_consequences=( - ms_csqs.consequences if is_tc else hl.missing(tc_dtype) - ) - ) + tc_expr = ms_csqs.consequences if is_tc else hl.missing(tc_dtype) + ms_csqs = ms_csqs.annotate(consequences=tc_expr) + ms_csqs_list.append(ms_csqs) ms_csqs = hl.coalesce(*ms_csqs_list) + # Rename most_severe_consequence to most_severe_csq for consistency with older + # version of code. + rename_map = {"most_severe_consequence": "most_severe_csq"} + if tc_dtype is not None: + rename_map["consequences"] = "transcript_consequences" + + return ms_csqs.rename(rename_map) + def filter_vep_transcript_csqs( t: Union[hl.Table, hl.MatrixTable], From 7939af85b017300563c06bd235d3edc8b6a068d5 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Wed, 12 Jun 2024 15:27:48 -0600 Subject: [PATCH 09/33] Remove duplicate version of prioritize_loftee_hc_no_flags --- gnomad/utils/vep.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index dca17068d..376675a3a 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -328,36 +328,6 @@ def add_most_severe_consequence_to_consequence( return tc.map(lambda x: x.annotate(**{most_severe_csq_field: csq(x)})) -def prioritize_loftee_hc_no_flags( - most_severe_csq: Union[hl.Table, hl.expr.StructExpression], -) -> hl.expr.StructExpression: - """ - Prioritize LOFTEE HC LOF consequences with no LOF flags. - - Given the result of `get_most_severe_csq_from_multiple_csq_lists`, this function - will filter the transcript consequences to only include those with no LOF flags if - the most severe consequence is a LOFTEE HC LOF and there are transcript consequences - with no LOF flags. - - :param most_severe_csq: Table or StructExpression with most severe consequence - information. This should be the result of - `get_most_severe_csq_from_multiple_csq_lists`. - :return: StructExpression with HC LOF consequences with no LOF flags if they exist, - otherwise all transcript consequences. - """ - tcl = most_severe_csq.transcript_consequences - - # Filter transcript consequences to only consequences that have no LOF flags. - no_flags = tcl.filter(lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "")) - # If the most severe consequence is a LOFTEE HC LOF and there are transcript - # consequences with no LOF flags, return only those transcripts. - return hl.if_else( - (most_severe_csq.lof == "HC") & (hl.len(no_flags) > 0), - no_flags, - tcl, - ) - - def process_consequences( t: Union[hl.MatrixTable, hl.Table], vep_root: str = "vep", From c6122034d00dbefc707d188666458871d118d86e Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 13 Jun 2024 11:17:05 -0600 Subject: [PATCH 10/33] Add prioritize_loftee_no_flags to get_most_severe_csq_from_multiple_csq_lists and make use of it in process_consequences --- gnomad/utils/vep.py | 122 ++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 78 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 376675a3a..75a4f9d3b 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -364,7 +364,7 @@ def process_consequences( polyphen_order = POLYPHEN_ORDER if has_polyphen else None def _find_worst_transcript_consequence( - tcl: hl.expr.ArrayExpression, + tc_expr: hl.expr.ArrayExpression, ) -> hl.expr.StructExpression: """ Find the worst transcript consequence in an array of transcript consequences. @@ -372,25 +372,21 @@ def _find_worst_transcript_consequence( :param tcl: Array of transcript consequences. :return: Worst transcript consequence. """ - ms_csq = get_most_severe_csq_from_multiple_csq_lists( - hl.struct(transcript_consequences=tcl), + tc_expr = get_most_severe_csq_from_multiple_csq_lists( + hl.struct(transcript_consequences=tc_expr), csq_order=csq_order, include_transcript_csqs=True, prioritize_protein_coding=prioritize_protein_coding, + prioritize_loftee_no_flags=penalize_flags, csq_list_order=["transcript_consequences"], add_order_by_csq_list={"transcript_consequences": polyphen_order}, - ) - tcl = ms_csq.transcript_consequences - - # Penalize LOFTEE flagged variants. - tcl = _prioritize_loftee_hc_no_flags(ms_csq) if penalize_flags else tcl + ).transcript_consequences - return hl.or_missing(hl.len(tcl) > 0, tcl[0]) + return hl.or_missing(hl.len(tc_expr) > 0, tc_expr[0]) # Annotate each transcript consequence with the 'most_severe_consequence'. - csqs = t[vep_root].transcript_consequences.map( - lambda tc: add_most_severe_consequence_to_consequence(tc, csq_order) - ) + csqs = t[vep_root].transcript_consequences + csqs = add_most_severe_consequence_to_consequence(csqs, csq_order) # Group transcript consequences by gene and find the worst consequence for each. gene_csqs = ( @@ -750,37 +746,6 @@ def filter_vep_consequences_by_loftee( return csq_expr.filter(lambda x: combine_functions(filter_criteria, x), keep=keep) -def _prioritize_loftee_hc_no_flags( - most_severe_csq: Union[hl.Table, hl.expr.StructExpression], -) -> hl.expr.StructExpression: - """ - Prioritize LOFTEE HC LOF consequences with no LOF flags. - - Given the result of `get_most_severe_csq_from_multiple_csq_lists`, this function - will filter the transcript consequences to only include those with no LOF flags if - the most severe consequence is a LOFTEE HC LOF and there are transcript consequences - with no LOF flags. - - :param most_severe_csq: Table or StructExpression with most severe consequence - information. This should be the result of - `get_most_severe_csq_from_multiple_csq_lists`. - :return: StructExpression with HC LOF consequences with no LOF flags if they exist, - otherwise all transcript consequences. - """ - tcl = most_severe_csq.transcript_consequences - - # Filter transcript consequences to only consequences that have no LOF flags. - no_flags = filter_vep_consequences_by_loftee(tcl, no_lof_flags=True) - - # If the most severe consequence is a LOFTEE HC LOF and there are transcript - # consequences with no LOF flags, return only those transcripts. - return hl.if_else( - (most_severe_csq.lof == "HC") & (hl.len(no_flags) > 0), - no_flags, - tcl, - ) - - @deprecated(reason="Replaced by get_most_severe_csq_from_multiple_csq_lists") def get_most_severe_consequence_for_summary( ht: hl.Table, @@ -822,6 +787,7 @@ def get_most_severe_csq_from_multiple_csq_lists( loftee_labels: List[str] = LOFTEE_LABELS, include_transcript_csqs: bool = False, prioritize_protein_coding: bool = True, + prioritize_loftee_no_flags: bool = False, csq_list_order: Union[List[str], Tuple[str]] = ( "transcript_consequences", "regulatory_feature_consequences", @@ -872,6 +838,8 @@ def get_most_severe_csq_from_multiple_csq_lists( the most severe consequence. Default is False. :param prioritize_protein_coding: Whether to prioritize protein-coding transcripts when determining the worst consequence. Default is True. + :param prioritize_loftee_no_flags: Whether to prioritize LOFTEE consequences with no + flags over those with flags. Default is False. :param csq_list_order: Order of VEP consequences lists to be processed. Default is ('transcript_consequences', 'regulatory_feature_consequences', 'motif_feature_consequences', 'intergenic_consequences'). @@ -888,7 +856,7 @@ def _get_most_severe_csq( protein_coding: bool = False, include_csqs: bool = False, prioritize_loftee: bool = False, - additional_order: Optional[List[str]] = None, + additional_order: Optional[Tuple[str, List[str]]] = None, ) -> hl.StructExpression: """ Filter a list of consequences to those that have the most severe consequence. @@ -913,36 +881,35 @@ def _get_most_severe_csq( :return: StructExpression with the most severe consequence and the list of consequences that match the most severe consequence. """ + lof = hl.missing(hl.tstr) + no_lof_flags = hl.missing(hl.tbool) + if protein_coding: csq_list = csq_list.filter(lambda x: x.biotype == "protein_coding") - lof = hl.missing(hl.tstr) - no_lof_flags = hl.missing(hl.tbool) if prioritize_loftee: lof_csq = filter_to_most_severe_consequences(csq_list, loftee_labels, "lof") - lof = lof_csq[0].lof - - # Check if any of the lof consequences have no lof_flags. - no_lof_flags = ( - hl.len(filter_vep_consequences_by_loftee(lof_csq, no_lof_flags=True)) - > 0 + lof_csq_no_flag = filter_vep_consequences_by_loftee( + lof_csq, no_lof_flags=True ) - # If there are no lof consequences, set the consequence list to the original - # list. + # Check if any of the lof consequences have no lof flags. + no_lof_flags = hl.len(lof_csq_no_flag) > 0 + + # If requested and there are lof consequences without flags, use those. + if prioritize_loftee_no_flags: + lof_csq = hl.if_else(no_lof_flags, lof_csq_no_flag, lof_csq) + + lof = lof_csq[0].lof + + # If there are no lof consequences, use the original consequences list. csq_list = hl.coalesce(lof_csq, csq_list) - # Add most_severe_consequence to each consequence. - csq_list = add_most_severe_consequence_to_consequence(csq_list, csq_order) - - # Get the most severe consequence of all consequences in the list. - csq_list = filter_to_most_severe_consequences(csq_list, csq_order=csq_order) - ms_csq = csq_list[0].most_severe_consequence - result = hl.struct( - most_severe_consequence=ms_csq, - protein_coding=protein_coding, - lof=lof, - no_lof_flags=no_lof_flags, + # Add most_severe_consequence to each consequence and get the most severe + # consequence of all consequences in the list. + csq_list = filter_to_most_severe_consequences( + add_most_severe_consequence_to_consequence(csq_list, csq_order), + csq_order=csq_order, ) if additional_order is not None: @@ -957,14 +924,16 @@ def _get_most_severe_csq( # list. csq_list = hl.coalesce(add_csq_expr, csq_list) - result = result.annotate(consequences=csq_list) - csq_list = hl.or_missing(hl.len(csq_list) > 0, result) - - # Drop the consequences field if not requested. - if not include_csqs: - csq_list = csq_list.drop("consequences") - - return csq_list + return hl.or_missing( + hl.len(csq_list) > 0, + hl.struct( + most_severe_consequence=csq_list[0].most_severe_consequence, + protein_coding=protein_coding, + lof=lof, + no_lof_flags=no_lof_flags, + **({"consequences": csq_list} if include_csqs else {}), + ), + ) # Get type of transcript_consequences field for use with hl.missing for other # consequence lists. @@ -1018,13 +987,10 @@ def _get_most_severe_csq( ms_csqs = hl.coalesce(*ms_csqs_list) - # Rename most_severe_consequence to most_severe_csq for consistency with older - # version of code. - rename_map = {"most_severe_consequence": "most_severe_csq"} if tc_dtype is not None: - rename_map["consequences"] = "transcript_consequences" + ms_csqs = ms_csqs.rename({"consequences": "transcript_consequences"}) - return ms_csqs.rename(rename_map) + return ms_csqs def filter_vep_transcript_csqs( From 059b012f1abbec2608f340f72defe4a9c00f28af Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 13 Jun 2024 12:52:26 -0600 Subject: [PATCH 11/33] Add tests for vep functions used in this PR --- tests/test_vep.py | 201 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 201 insertions(+) create mode 100644 tests/test_vep.py diff --git a/tests/test_vep.py b/tests/test_vep.py new file mode 100644 index 000000000..88985b3da --- /dev/null +++ b/tests/test_vep.py @@ -0,0 +1,201 @@ +"""Tests for the gnomAD VEP utility functions.""" +import hail as hl +from gnomad.utils.vep import ( + add_most_severe_consequence_to_consequence, + get_most_severe_consequence_expr, + add_most_severe_csq_to_tc_within_vep_root, + filter_to_most_severe_consequences, + CSQ_ORDER, + filter_vep_consequences_by_loftee, + get_most_severe_csq_from_multiple_csq_lists, + process_consequences, +) + + +def test_get_most_severe_consequence_expr(): + """Test the get_most_severe_consequence_expr function.""" + # Create a mock ArrayExpression of consequences + csq_expr = hl.literal(['splice_region_variant', 'intron_variant']) + + # Test with default csq_order + result = get_most_severe_consequence_expr(csq_expr) + assert result.value == 'splice_region_variant', "Expected 'splice_region_variant'" + + # Test with custom csq_order + custom_csq_order = ['intron_variant', 'splice_region_variant'] + result = get_most_severe_consequence_expr(csq_expr, custom_csq_order) + assert result.value == 'intron_variant', "Expected 'intron_variant'" + + # Test with csq_expr that contains a consequence not in csq_order + csq_expr = hl.literal(['non_existent_consequence']) + result = get_most_severe_consequence_expr(csq_expr) + assert result.value is None, "Expected None for non-existent consequence" + + +def test_add_most_severe_consequence_to_consequence(): + """Test the add_most_severe_consequence_to_consequence function.""" + # Create a mock StructExpression of transcript consequence + tc_expr = hl.struct(consequence_terms=['missense_variant', 'synonymous_variant']) + + # Test with default csq_order + result = add_most_severe_consequence_to_consequence(tc_expr) + assert result.most_severe_consequence == get_most_severe_consequence_expr(tc_expr.consequence_terms), "Expected 'missense_variant'" + + # Test with custom csq_order + custom_csq_order = ['synonymous_variant', 'missense_variant'] + result = add_most_severe_consequence_to_consequence(tc_expr, custom_csq_order) + assert result.most_severe_consequence == get_most_severe_consequence_expr(tc_expr.consequence_terms, custom_csq_order), "Expected 'synonymous_variant'" + + # Test with csq_expr that contains a consequence not in csq_order + tc_expr = hl.struct(consequence_terms=['non_existent_consequence']) + result = add_most_severe_consequence_to_consequence(tc_expr) + assert result.most_severe_consequence == get_most_severe_consequence_expr(tc_expr.consequence_terms), "Expected None for non-existent consequence" + + +def test_add_most_severe_csq_to_tc_within_vep_root(): + """Test the add_most_severe_csq_to_tc_within_vep_root function.""" + # Create a mock MatrixTable with vep.transcript_consequences + mt = hl.utils.range_matrix_table(1, 1) + mt = mt.annotate_rows(vep=hl.struct(transcript_consequences=[hl.struct(consequence_terms=['missense_variant', 'synonymous_variant'])])) + + # Apply the function + result_mt = add_most_severe_csq_to_tc_within_vep_root(mt) + + # Check that the most_severe_consequence field is added + assert 'most_severe_consequence' in result_mt.vep.transcript_consequences[0] + + # Check that the most_severe_consequence is correct + assert result_mt.vep.transcript_consequences[0].most_severe_consequence == 'missense_variant' + + +def test_filter_to_most_severe_consequences(): + """Test the filter_to_most_severe_consequences function.""" + # Create a mock ArrayExpression of VEP consequences + csq_expr = hl.literal([ + hl.struct(most_severe_consequence='missense_variant'), + hl.struct(most_severe_consequence='synonymous_variant'), + hl.struct(most_severe_consequence='missense_variant') + ]) + + # Apply the function + result = filter_to_most_severe_consequences( + csq_expr, CSQ_ORDER, 'most_severe_consequence' + ) + + # Check that the result only contains the most severe consequences + assert len(result) == 2, "Expected 2 consequences" + assert all(csq.most_severe_consequence == 'missense_variant' for csq in + result), "Expected 'missense_variant'" + + +def test_filter_vep_consequences_by_loftee(): + """Test the filter_vep_consequences_by_loftee function.""" + # Create a mock ArrayExpression of VEP consequences with LOFTEE annotations + csq_expr = hl.literal([ + hl.struct(lof='HC', lof_flags=None), + hl.struct(lof='HC', lof_flags=''), + hl.struct(lof='LC', lof_flags='flag1'), + hl.struct(lof='OS', lof_flags=None), + hl.struct(lof=None, lof_flags='flag2') + ]) + + # Test with default parameters + result = filter_vep_consequences_by_loftee(csq_expr) + assert len(result) == 6, "Expected 6 consequences" + + # Test with loftee_labels + result = filter_vep_consequences_by_loftee(csq_expr, loftee_labels=['HC']) + assert len(result) == 2, "Expected 2 consequences" + assert result[0].lof == 'HC', "Expected 'HC'" + + # Test with no_lof_flags + result = filter_vep_consequences_by_loftee(csq_expr, no_lof_flags=True) + assert len(result) == 3, "Expected 3 consequences" + assert all(csq.lof_flags is None for csq in result), "Expected no LOFTEE flags" + + # Test with loftee_labels and no_lof_flags + result = filter_vep_consequences_by_loftee(csq_expr, loftee_labels=['HC'], + no_lof_flags=True) + assert len(result) == 1, "Expected 1 consequence" + assert result[0].lof == 'HC', "Expected 'HC'" + assert all(csq.lof_flags is None for csq in result), "Expected no LOFTEE flags" + + # Test with keep=False + result = filter_vep_consequences_by_loftee(csq_expr, loftee_labels=['HC'], + keep=False) + assert len(result) == 3, "Expected 3 consequences" + assert all(csq.lof != 'HC' for csq in result), "Expected no 'HC' consequences" + + +def test_get_most_severe_csq_from_multiple_csq_lists(): + """Test the get_most_severe_csq_from_multiple_csq_lists function.""" + # Create a mock VEP expression + vep_expr = hl.struct( + transcript_consequences=[ + hl.struct( + biotype='protein_coding', + lof='HC', + most_severe_consequence='missense_variant' + ), + hl.struct( + biotype='protein_coding', + lof='HC', + most_severe_consequence='synonymous_variant' + ) + ], + intergenic_consequences=[ + hl.struct( + biotype='intergenic', + lof=None, + most_severe_consequence='intergenic_variant' + ) + ] + ) + + # Test the function + result = get_most_severe_csq_from_multiple_csq_lists(vep_expr) + + # Check the most_severe_csq + assert result.most_severe_csq == 'missense_variant', "Expected 'missense_variant'" + + # Check the protein_coding + assert result.protein_coding == True, "Expected True" + + # Check the lof + assert result.lof == 'HC', "Expected 'HC'" + + # Check the no_lof_flags + assert result.no_lof_flags == False, "Expected False" + + # Check the transcript_consequences + assert len(result.transcript_consequences) == 2, "Expected 2 transcript consequences" + assert result.transcript_consequences[0].most_severe_consequence == 'missense_variant', "Expected 'missense_variant' for the first transcript consequence" + assert result.transcript_consequences[1].most_severe_consequence == 'synonymous_variant', "Expected 'synonymous_variant' for the second transcript consequence" + + +def test_process_consequences(): + """Test the process_consequences function.""" + # Create a mock MatrixTable with vep.transcript_consequences + mt = hl.utils.range_matrix_table(1, 1) + mt = mt.annotate_rows(vep=hl.struct(transcript_consequences=[hl.struct(consequence_terms=['missense_variant', 'synonymous_variant'])])) + + # Apply the function + result_mt = process_consequences(mt) + + # Check that the most_severe_consequence field is added + assert 'most_severe_consequence' in result_mt.vep.transcript_consequences[0] + + # Check that the most_severe_consequence is correct + assert result_mt.vep.transcript_consequences[0].most_severe_consequence == 'missense_variant' + + # Check that the worst_csq_by_gene field is added + assert 'worst_csq_by_gene' in result_mt.vep + + # Check that the worst_csq_for_variant field is added + assert 'worst_csq_for_variant' in result_mt.vep + + # Check that the worst_csq_by_gene_canonical field is added + assert 'worst_csq_by_gene_canonical' in result_mt.vep + + # Check that the worst_csq_for_variant_canonical field is added + assert 'worst_csq_for_variant_canonical' in result_mt.vep From aa67cc37420c117e86ee76f21e1e03dee4b24489 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 10:41:37 -0600 Subject: [PATCH 12/33] Move some of the parts in `get_most_severe_csq_from_multiple_csq_lists` to `filter_to_most_severe_consequences` and clean them up --- gnomad/utils/vep.py | 376 +++++++++++++++++++++++--------------------- 1 file changed, 197 insertions(+), 179 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 75a4f9d3b..1f36fb6c1 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -369,13 +369,13 @@ def _find_worst_transcript_consequence( """ Find the worst transcript consequence in an array of transcript consequences. - :param tcl: Array of transcript consequences. + :param tc_expr: Array of transcript consequences. :return: Worst transcript consequence. """ tc_expr = get_most_severe_csq_from_multiple_csq_lists( hl.struct(transcript_consequences=tc_expr), csq_order=csq_order, - include_transcript_csqs=True, + include_csqs=True, prioritize_protein_coding=prioritize_protein_coding, prioritize_loftee_no_flags=penalize_flags, csq_list_order=["transcript_consequences"], @@ -671,44 +671,144 @@ def get_csq_from_struct( def filter_to_most_severe_consequences( csq_expr: hl.expr.ArrayExpression, - csq_order: List[str] = CSQ_ORDER, - most_severe_csq_field: str = "most_severe_consequence", -) -> hl.expr.ArrayExpression: + csq_order: Optional[List[str]] = None, + loftee_labels: Optional[List[str]] = None, + filter_to_protein_coding: bool = False, + prioritize_loftee: bool = False, + prioritize_loftee_no_flags: bool = False, + additional_order_field: Optional[str] = None, + additional_order: Optional[List[str]] = None, +) -> hl.StructExpression: """ Filter an array of VEP consequences to all entries that have the most severe consequence. - This function expects that all entries in the `csq_list` are already annotated with - the most severe consequence using `add_most_severe_consequence_to_consequence` or - `add_most_severe_csq_to_tc_within_vep_root`. + Returns a struct with the following annotations: + + - most_severe_consequence: Most severe consequence for variant. + - lof: Whether the variant is a loss-of-function variant. + - no_lof_flags: Whether the variant has any LOFTEE flags (True if no flags). + - consequences: Array of consequences that match the most severe consequence. .. note:: - If you have multiple lists of consequences and want to determine the most severe consequence across all lists, consider using - `get_most_severe_consequence_for_summary`. + `get_most_severe_csq_from_multiple_csq_lists`. - If you want to group consequences by gene and determine the most severe consequence for each gene, consider using `process_consequences`. - :param csq_expr: ArrayExpression of VEP consequences. + If `filter_to_protein_coding` is True, filter to only protein-coding transcripts + before determining the most severe consequence. + + If `prioritize_loftee` is True, prioritize consequences with LOFTEE annotations, in + the order of `loftee_labels`, over those without LOFTEE annotations. If + `prioritize_loftee_no_flags` is True, prioritize LOFTEE consequences with no flags + over those with flags. + + If `additional_order` is provided, additional ordering is applied to the + consequences in the list after any of the above prioritization. An example use of + this parameter is to prioritize by PolyPhen predictions. + + :param csq_expr: ArrayExpression of VEP consequences to filter. :param csq_order: List indicating the order of VEP consequences, sorted from high to - low impact. Default is the value of the `CSQ_ORDER` global. - :param most_severe_csq_field: Field containing the most severe consequence for each - consequence in `csq_list`. Default is 'most_severe_consequence'. + low impact. Default is None, which uses the value of the `CSQ_ORDER` global. + :param loftee_labels: Annotations added by LOFTEE, sorted from high to low impact. + Default is None, which uses the value of the `LOFTEE_LABELS` global. + :param filter_to_protein_coding: Whether to filter to only protein-coding + transcripts before determining the most severe consequence. Default is False. + :param prioritize_loftee: Whether to prioritize LOFTEE consequences. Default is + False. + :param prioritize_loftee_no_flags: Whether to prioritize LOFTEE consequences with no + flags over those with flags. Default is False. + :param additional_order_field: Field name of the consequence annotation to use for + additional ordering to apply to the consequences in the list. Default is None. + :param additional_order: The ordering to use for prioritizing consequences in the + `additional_order_field`. Default is None. :return: ArrayExpression with of the consequences that match the most severe consequence. """ - # Get the highest impact csq label. - ms_csq = get_most_severe_consequence_expr( - csq_expr.map(lambda x: x[most_severe_csq_field]), csq_order=csq_order - ) + if ((additional_order_field is None) + (additional_order is None)) == 1: + raise ValueError( + "If `additional_order_field` is provided, `additional_order` must also be" + " provided and vice versa." + ) - # Filter to only consequences with the highest impact csq label, and return missing - # if the most severe consequence is missing. - return hl.or_missing( - hl.is_defined(ms_csq), - csq_expr.filter(lambda x: x[most_severe_csq_field] == ms_csq), - ) + loftee_labels = loftee_labels or LOFTEE_LABELS + + if filter_to_protein_coding: + logger.info("Filtering to protein-coding transcripts...") + csq_expr = csq_expr.filter(lambda x: x.biotype == "protein_coding") + csq_expr = hl.or_missing(hl.len(csq_expr) > 0, csq_expr) + + def _filter_to_most_severe( + expr: hl.expr.ArrayExpression, + field: str, + order: Optional[List[str]] = None, + ) -> Tuple[hl.expr.StringExpression, hl.expr.ArrayExpression]: + """ + Filter to the most severe consequence in a list of consequences. + + :param expr: ArrayExpression of consequences to filter. + :param field: Field name of the consequence annotation. + :param order: List indicating the order of VEP consequences, sorted from high to + low impact. Default is None, which uses the value of the `CSQ_ORDER` global. + :return: The most severe consequence and an ArrayExpression of consequences that + match the most severe consequence. + """ + ms_type = expr.dtype.element_type[field] + is_collection = isinstance(ms_type, hl.tarray) or isinstance(ms_type, hl.tset) + + # Get the highest impact csq label. + expr_map = expr.flatmap if is_collection else expr.map + filter_func = lambda x, ms: x.contains(ms) if is_collection else x == ms + ms_csq = get_most_severe_consequence_expr( + expr_map(lambda x: x[field]), csq_order=order + ) + + # Filter to only consequences with the highest impact csq label, and return + # missing if the most severe consequence is missing. + return ms_csq, hl.or_missing( + (hl.len(expr) > 0) & hl.is_defined(ms_csq), + expr.filter(lambda x: filter_func(x[field], ms_csq)), + ) + + # Initialize the lof struct with missing values. + lof_expr = hl.struct(lof=hl.missing(hl.tstr), no_lof_flags=hl.missing(hl.tbool)) + + # If requested, filter to the most severe LOFTEE consequences, and determine if + # there are any LOFTEE consequences without flags. + if prioritize_loftee: + lof, lof_csq = _filter_to_most_severe(csq_expr, "lof", loftee_labels) + lof_csq_no_flag = filter_vep_consequences_by_loftee(lof_csq, no_lof_flags=True) + no_lof_flags = hl.len(lof_csq_no_flag) > 0 + + # If requested and there are lof consequences without flags, use those. + if prioritize_loftee_no_flags: + lof_csq = hl.if_else(no_lof_flags, lof_csq_no_flag, lof_csq) + + # If there are no lof consequences, use the original consequences list. + csq_expr = hl.coalesce(lof_csq, csq_expr) + lof_expr = lof_expr.annotate(lof=lof, no_lof_flags=no_lof_flags) + + # Get the most severe consequence and the consequences that match the most severe + # consequence. + ms_csq, csq_expr = _filter_to_most_severe(csq_expr, "consequence_terms", csq_order) + + # If additional ordering is provided, filter to the most severe consequence with the + # highest impact from the additional ordering. + if additional_order is not None: + # Get the highest impact consequences from the additional ordering. + _, add_csq_expr = _filter_to_most_severe( + csq_expr, additional_order_field, additional_order + ) + + # If there are consequences from the additional ordering, set the + # consequence list to the additional ordering, otherwise keep the original + # list. + csq_expr = hl.coalesce(add_csq_expr, csq_expr) + + return hl.struct(most_severe_consequence=ms_csq, **lof_expr, consequences=csq_expr) # TODO: Can add this to `filter_vep_transcript_csqs`, but need to also make that @@ -783,10 +883,11 @@ def get_most_severe_consequence_for_summary( def get_most_severe_csq_from_multiple_csq_lists( vep_expr: hl.expr.StructExpression, - csq_order: List[str] = CSQ_ORDER, - loftee_labels: List[str] = LOFTEE_LABELS, - include_transcript_csqs: bool = False, + csq_order: Optional[List[str]] = None, + loftee_labels: Optional[List[str]] = None, + include_csqs: bool = False, prioritize_protein_coding: bool = True, + prioritize_loftee: bool = True, prioritize_loftee_no_flags: bool = False, csq_list_order: Union[List[str], Tuple[str]] = ( "transcript_consequences", @@ -799,17 +900,20 @@ def get_most_severe_csq_from_multiple_csq_lists( """ Process multiple VEP consequences lists to determine the most severe consequence. - Useful for generating summary annotations for VEP consequences. - Adds the following annotations: - - most_severe_csq: Most severe consequence for variant. + - most_severe_consequence: Most severe consequence for variant. - protein_coding: Whether the variant is present on a protein-coding transcript. - lof: Whether the variant is a loss-of-function variant. - no_lof_flags: Whether the variant has any LOFTEE flags (True if no flags). - If `include_transcript_csqs` is True, an additional annotation is added: - - transcript_consequences: All transcript consequences for the most severe - consequence. + .. note:: + + Assumes input Table is annotated with VEP and that VEP annotations have been + filtered to canonical transcripts if wanted. + + If `include_csqs` is True, additional annotations are added for each VEP + consequences list in `csq_list_order`, with the consequences that match the most + severe consequence term. If `prioritize_protein_coding` is True and "transcript_consequences" is in `csq_list_order`, protein-coding transcripts are prioritized by filtering to only @@ -823,23 +927,25 @@ def get_most_severe_csq_from_multiple_csq_lists( is the order of consequences, sorted from high to low impact. An example use of this parameter is to prioritize PolyPhen consequences for protein-coding transcripts. - .. note:: - - Assumes input Table is annotated with VEP and that VEP annotations have been - filtered to canonical transcripts if wanted. + If `prioritize_loftee` is True, prioritize consequences with LOFTEE annotations, in + the order of `loftee_labels`, over those without LOFTEE annotations. If + `prioritize_loftee_no_flags` is True, prioritize LOFTEE consequences with no flags + over those with flags. :param vep_expr: StructExpression of VEP consequences to get the most severe consequence from. - :param csq_order: Order of VEP consequences, sorted from high to low impact. - Default is CSQ_ORDER. + :param csq_order: Order of VEP consequences, sorted from high to low impact. Default + is None, which uses the value of the `CSQ_ORDER` global. :param loftee_labels: Annotations added by LOFTEE, sorted from high to low impact. - Default is LOFTEE_LABELS. - :param include_transcript_csqs: Whether to include all transcript consequences for - the most severe consequence. Default is False. + Default is None, which uses the value of the `LOFTEE_LABELS` global. + :param include_csqs: Whether to include all consequences for the most severe + consequence. Default is False. :param prioritize_protein_coding: Whether to prioritize protein-coding transcripts when determining the worst consequence. Default is True. - :param prioritize_loftee_no_flags: Whether to prioritize LOFTEE consequences with no - flags over those with flags. Default is False. + :param prioritize_loftee: Whether to prioritize consequences with LOFTEE annotations + over those without. Default is False. + :param prioritize_loftee_no_flags: Whether to prioritize LOFTEE annotated + consequences with no flags over those with flags. Default is False. :param csq_list_order: Order of VEP consequences lists to be processed. Default is ('transcript_consequences', 'regulatory_feature_consequences', 'motif_feature_consequences', 'intergenic_consequences'). @@ -848,149 +954,61 @@ def get_most_severe_csq_from_multiple_csq_lists( of consequences, sorted from high to low impact. Default is None. :return: Table annotated with VEP summary annotations. """ - if add_order_by_csq_list is None: - add_order_by_csq_list = {} - - def _get_most_severe_csq( - csq_list: hl.ArrayExpression, - protein_coding: bool = False, - include_csqs: bool = False, - prioritize_loftee: bool = False, - additional_order: Optional[Tuple[str, List[str]]] = None, - ) -> hl.StructExpression: - """ - Filter a list of consequences to those that have the most severe consequence. - - If `protein_coding` is True, filter to only protein-coding transcripts before - determining the most severe consequence. If `prioritize_loftee` is True, - prioritize LOFTEE consequences by filtering to only LOFTEE consequences and - determining the most severe consequence. If `additional_order` is provided, - additional ordering is applied to the consequences in the list. - - :param csq_list: ArrayExpression of VEP consequences. - :param protein_coding: Whether to filter to only protein-coding transcripts - before determining the most severe consequence. Default is False. - :param include_csqs: Whether to include all transcript consequences for the most - severe consequence. Default is False. - :param prioritize_loftee: Whether to prioritize LOFTEE consequences. Default is - False. - :param additional_order: Tuple indicating the additional ordering to apply to - the consequences in the list. The first element is the name of the - consequences list and the second element is the order of consequences, - sorted from high to low impact. Default is None. - :return: StructExpression with the most severe consequence and the list of - consequences that match the most severe consequence. - """ - lof = hl.missing(hl.tstr) - no_lof_flags = hl.missing(hl.tbool) - - if protein_coding: - csq_list = csq_list.filter(lambda x: x.biotype == "protein_coding") - - if prioritize_loftee: - lof_csq = filter_to_most_severe_consequences(csq_list, loftee_labels, "lof") - lof_csq_no_flag = filter_vep_consequences_by_loftee( - lof_csq, no_lof_flags=True - ) - - # Check if any of the lof consequences have no lof flags. - no_lof_flags = hl.len(lof_csq_no_flag) > 0 - - # If requested and there are lof consequences without flags, use those. - if prioritize_loftee_no_flags: - lof_csq = hl.if_else(no_lof_flags, lof_csq_no_flag, lof_csq) - - lof = lof_csq[0].lof + add_order_by_csq_list = add_order_by_csq_list or {} + loftee_labels = loftee_labels or LOFTEE_LABELS - # If there are no lof consequences, use the original consequences list. - csq_list = hl.coalesce(lof_csq, csq_list) - - # Add most_severe_consequence to each consequence and get the most severe - # consequence of all consequences in the list. - csq_list = filter_to_most_severe_consequences( - add_most_severe_consequence_to_consequence(csq_list, csq_order), - csq_order=csq_order, - ) - - if additional_order is not None: - # Get the highest impact consequences from the additional ordering. - add_csq_expr = filter_to_most_severe_consequences( - csq_list, - most_severe_csq_field=additional_order[0], - csq_order=additional_order[1], - ) - # If there are consequences from the additional ordering, set the - # consequence list to the additional ordering, otherwise keep the original - # list. - csq_list = hl.coalesce(add_csq_expr, csq_list) - - return hl.or_missing( - hl.len(csq_list) > 0, - hl.struct( - most_severe_consequence=csq_list[0].most_severe_consequence, - protein_coding=protein_coding, - lof=lof, - no_lof_flags=no_lof_flags, - **({"consequences": csq_list} if include_csqs else {}), - ), - ) - - # Get type of transcript_consequences field for use with hl.missing for other - # consequence lists. - tc_dtype = None - if include_transcript_csqs and "transcript_consequences" in csq_list_order: - tc_dtype = ( - vep_expr["transcript_consequences"] - .map(lambda x: x.annotate(most_severe_consequence=hl.missing(hl.tstr))) - .dtype - ) + # Create a struct with missing values for each VEP consequences list. + ms_csq_list_expr = hl.struct( + **{c: hl.missing(vep_expr[c].dtype) for c in csq_list_order if c in vep_expr} + ) - # Get the most severe consequence for each VEP consequences list. - ms_csqs_list = [] - for c in csq_list_order: - if c not in vep_expr: - logger.warning(f"VEP consequences list %s not found in input!", c) - continue - csqs = vep_expr[c] - is_tc = c == "transcript_consequences" - ms_csqs = _get_most_severe_csq( - csqs, - # Only include transcript consequences if requested and the current list is - # for transcript consequences. - prioritize_loftee=True if is_tc else False, - include_csqs=include_transcript_csqs and is_tc, - additional_order=add_order_by_csq_list.get(c), - ) + # Build the case expression to determine the most severe consequence. + ms_csq_expr = hl.case(missing_false=True) + for csq_list in csq_list_order: + if csq_list not in vep_expr: + logger.warning(f"VEP consequences list %s not found in input!", csq_list) + + is_tc = csq_list == "transcript_consequences" + csq_expr = vep_expr[csq_list] + + # Set the base arguments for filtering to the most severe consequence using + # filter_to_most_severe_consequences. + base_args = { + "csq_order": csq_order, + "loftee_labels": loftee_labels, + "prioritize_loftee": True if (prioritize_loftee and is_tc) else False, + "prioritize_loftee_no_flags": prioritize_loftee_no_flags, + "additional_order": add_order_by_csq_list.get(csq_list), + } + ms_expr = filter_to_most_severe_consequences(csq_expr, **base_args) # If prioritizing protein-coding transcripts, get the most severe consequence - # for protein-coding transcripts and coalesce with the current most severe - # transcript consequence. - if is_tc and prioritize_protein_coding: - ms_csqs = hl.coalesce( - _get_most_severe_csq( - csqs, - protein_coding=True, - prioritize_loftee=True, - include_csqs=include_transcript_csqs, - additional_order=add_order_by_csq_list.get(c), - ), - ms_csqs, - ) - - # If the current list is not for transcript consequences, annotate with missing - # for transcript_consequences if transcript consequences are requested. - if tc_dtype is not None: - tc_expr = ms_csqs.consequences if is_tc else hl.missing(tc_dtype) - ms_csqs = ms_csqs.annotate(consequences=tc_expr) - - ms_csqs_list.append(ms_csqs) + # for protein-coding transcripts. + if prioritize_protein_coding: + ms_expr = ms_expr.annotate(protein_coding=hl.missing(hl.tbool)) + if is_tc: + pc_ms_expr = filter_to_most_severe_consequences( + csq_expr, filter_to_protein_coding=True, **base_args + ) + ms_expr = hl.if_else( + hl.is_defined(pc_ms_expr.most_severe_consequence), + pc_ms_expr.annotate(protein_coding=True), + ms_expr.annotate(protein_coding=False), + ) - ms_csqs = hl.coalesce(*ms_csqs_list) + # Annotate the current consequence list with the consequences that match the + # most severe consequence term. + if include_csqs: + ms_expr = ms_expr.annotate( + **ms_csq_list_expr.annotate(**{csq_list: ms_expr.consequences}) + ) + ms_expr = ms_expr.drop("consequences") - if tc_dtype is not None: - ms_csqs = ms_csqs.rename({"consequences": "transcript_consequences"}) + # If the length of the consequence list is not 0, set the most severe + # consequence to the most severe consequence for the current list. + ms_csq_expr = ms_csq_expr.when(hl.len(csq_expr) > 0, ms_expr) - return ms_csqs + return ms_csq_expr.or_missing() def filter_vep_transcript_csqs( From d6f8d21f90300a73ddd021779f6decc9db1c73d5 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 10:43:09 -0600 Subject: [PATCH 13/33] Small cleanup --- gnomad/utils/vep.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 1f36fb6c1..1e6690301 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -292,11 +292,8 @@ def get_most_severe_consequence_expr( `CSQ_ORDER` global. :return: Most severe consequence in `csq_expr`. """ - if csq_order is None: - csq_order = CSQ_ORDER - csqs = hl.literal(csq_order) - - return csqs.find(lambda c: csq_expr.contains(c)) + csq_order = csq_order or CSQ_ORDER + return hl.literal(csq_order).find(lambda c: csq_expr.contains(c)) def add_most_severe_consequence_to_consequence( @@ -385,8 +382,9 @@ def _find_worst_transcript_consequence( return hl.or_missing(hl.len(tc_expr) > 0, tc_expr[0]) # Annotate each transcript consequence with the 'most_severe_consequence'. - csqs = t[vep_root].transcript_consequences - csqs = add_most_severe_consequence_to_consequence(csqs, csq_order) + csqs = add_most_severe_consequence_to_consequence( + t[vep_root].transcript_consequences, csq_order + ) # Group transcript consequences by gene and find the worst consequence for each. gene_csqs = ( From cce80d1edee5b8970bbe9bfd179e26da486d2401 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 10:43:33 -0600 Subject: [PATCH 14/33] Fix the use of keep --- gnomad/utils/vep.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 1e6690301..009c3209e 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -841,7 +841,10 @@ def filter_vep_consequences_by_loftee( lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "") ) - return csq_expr.filter(lambda x: combine_functions(filter_criteria, x), keep=keep) + if keep: + return csq_expr.filter(lambda x: combine_functions(filter_criteria, x)) + else: + return csq_expr.filter(lambda x: ~combine_functions(filter_criteria, x)) @deprecated(reason="Replaced by get_most_severe_csq_from_multiple_csq_lists") From 9de076abf1033b2dcf52eca7ba510b9ec75f8af3 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 10:59:26 -0600 Subject: [PATCH 15/33] Change to use `get_most_severe_csq_from_multiple_csq_lists` instead of `get_most_severe_consequence_for_summary` --- gnomad/assessment/summary_stats.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py index cdfa54d6e..90479ff3e 100644 --- a/gnomad/assessment/summary_stats.py +++ b/gnomad/assessment/summary_stats.py @@ -12,7 +12,7 @@ add_most_severe_consequence_to_consequence, filter_vep_to_canonical_transcripts, filter_vep_to_mane_select_transcripts, - get_most_severe_consequence_for_summary, + get_most_severe_csq_from_multiple_csq_lists, process_consequences, ) @@ -231,11 +231,11 @@ def get_summary_counts( logger.info("Filtering to mane select transcripts...") ht = filter_vep_to_mane_select_transcripts(ht) - logger.info("Getting VEP summary annotations...") - ht = get_most_severe_consequence_for_summary(ht) - - logger.info("Annotating with frequency bin information...") - ht = ht.annotate(freq_bin=freq_bin_expr(ht[freq_field], index)) + logger.info("Annotating with VEP summary and frequency bin information...") + ht = ht.annotate( + freq_bin=freq_bin_expr(ht[freq_field], index), + **get_most_severe_csq_from_multiple_csq_lists(ht.vep), + ) logger.info( "Annotating HT globals with total counts/total allele counts per variant" @@ -248,7 +248,7 @@ def get_summary_counts( ht.alleles, ht.lof, ht.no_lof_flags, - ht.most_severe_csq, + ht.most_severe_consequence, prefix_str="total_", ) ) @@ -259,7 +259,7 @@ def get_summary_counts( ht[freq_field][index].AC, ht.lof, ht.no_lof_flags, - ht.most_severe_csq, + ht.most_severe_consequence, ) ) ) @@ -272,7 +272,7 @@ def get_summary_counts( ht.alleles, ht.lof, ht.no_lof_flags, - ht.most_severe_csq, + ht.most_severe_consequence, ) ) @@ -489,7 +489,7 @@ def _create_filter_by_csq( if not isinstance(csq_set, hl.expr.CollectionExpression): csq_set = hl.set(csq_set) - return csq_set.contains(t.most_severe_csq) + return csq_set.contains(t.most_severe_consequence) # Set up filters for specific consequences or sets of consequences. csq_filters = { @@ -1033,11 +1033,11 @@ def default_generate_gene_lof_summary( ) if filter_loftee: - lof_ht = get_most_severe_consequence_for_summary(mt.rows()) + lof_expr = get_most_severe_csq_from_multiple_csq_lists(mt.vep) mt = mt.filter_rows( - hl.is_defined(lof_ht[mt.row_key].lof) - & (lof_ht[mt.row_key].lof == "HC") - & (lof_ht[mt.row_key].no_lof_flags) + hl.is_defined(lof_expr.lof) + & (lof_expr.lof == "HC") + & (lof_expr.no_lof_flags) ) ht = mt.annotate_rows( From 2b9aaedff397e9b934df26505d45004e3bdbc1d2 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 12:21:20 -0600 Subject: [PATCH 16/33] use `filter_vep_transcript_csqs_expr` for loftee filter --- gnomad/utils/vep.py | 54 +++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 39 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index f896815b1..4f351a518 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -784,7 +784,7 @@ def _filter_to_most_severe( # there are any LOFTEE consequences without flags. if prioritize_loftee: lof, lof_csq = _filter_to_most_severe(csq_expr, "lof", loftee_labels) - lof_csq_no_flag = filter_vep_consequences_by_loftee(lof_csq, no_lof_flags=True) + lof_csq_no_flag = filter_vep_transcript_csqs_expr(lof_csq, no_lof_flags=True) no_lof_flags = hl.len(lof_csq_no_flag) > 0 # If requested and there are lof consequences without flags, use those. @@ -815,44 +815,6 @@ def _filter_to_most_severe( return hl.struct(most_severe_consequence=ms_csq, **lof_expr, consequences=csq_expr) -# TODO: Can add this to `filter_vep_transcript_csqs`, but need to also make that -# function work with just an array of transcript consequences. -def filter_vep_consequences_by_loftee( - csq_expr: hl.expr.ArrayExpression, - loftee_labels: Optional[List[str]] = None, - no_lof_flags: bool = False, - keep: bool = True, -) -> hl.expr.StructExpression: - """ - Filter VEP transcript consequences by LOFTEE. - - :param csq_expr: ArrayExpression of VEP consequences with LOFTEE annotations. - :param loftee_labels: List of LOFTEE labels to filter to. Default is None, which - filters to all LOFTEE labels. - :param no_lof_flags: Whether to filter to consequences with no LOFTEE flags. - Default is False. - :param keep: Whether to keep the consequences that match the filter criteria. - Default is True. - :return: StructExpression with the filtered consequences. - """ - filter_criteria = [lambda csq: True] - - if loftee_labels: - logger.info("Filtering to LOFTEE labels: %s...", loftee_labels) - filter_criteria.append(lambda x: hl.set(loftee_labels).contains(x.lof)) - - if no_lof_flags: - logger.info("Filtering to consequences with no LOFTEE flags...") - filter_criteria.append( - lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "") - ) - - if keep: - return csq_expr.filter(lambda x: combine_functions(filter_criteria, x)) - else: - return csq_expr.filter(lambda x: ~combine_functions(filter_criteria, x)) - - @deprecated(reason="Replaced by get_most_severe_csq_from_multiple_csq_lists") def get_most_severe_consequence_for_summary( ht: hl.Table, @@ -1074,6 +1036,8 @@ def filter_vep_transcript_csqs_expr( mane_select: bool = False, ensembl_only: bool = True, protein_coding: bool = False, + loftee_labels: Optional[List[str]] = None, + no_lof_flags: bool = False, csqs: List[str] = None, keep_csqs: bool = True, genes: Optional[List[str]] = None, @@ -1105,6 +1069,10 @@ def filter_vep_transcript_csqs_expr( Emsembl. Default is True. :param protein_coding: Whether to filter to only protein-coding transcripts. Default is False. + :param loftee_labels: List of LOFTEE labels to filter to. Default is None, which + filters to all LOFTEE labels. + :param no_lof_flags: Whether to filter to consequences with no LOFTEE flags. + Default is False. :param csqs: Optional list of consequence terms to filter to. Transcript consequences are filtered to those where 'most_severe_consequence' is in the list of consequence terms `csqs`. Default is None. @@ -1148,6 +1116,14 @@ def filter_vep_transcript_csqs_expr( if protein_coding: logger.info("Filtering to protein coding transcripts...") criteria.append(lambda csq: csq.biotype == "protein_coding") + if loftee_labels: + logger.info( + "Filtering to consequences with LOFTEE labels: %s...", loftee_labels + ) + criteria.append(lambda x: hl.set(loftee_labels).contains(x.lof)) + if no_lof_flags: + logger.info("Filtering to consequences with no LOFTEE flags...") + criteria.append(lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "")) if genes is not None: logger.info("Filtering to genes of interest...") genes = hl.literal(genes) From fdc41d6fad02749f4bda02769dd6ff0aa506f32c Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 12:48:02 -0600 Subject: [PATCH 17/33] Use filter_vep_transcript_csqs_expr for protein coding filter --- gnomad/utils/vep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 6e7d32e90..04a34ce4c 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -742,7 +742,7 @@ def filter_to_most_severe_consequences( if filter_to_protein_coding: logger.info("Filtering to protein-coding transcripts...") - csq_expr = csq_expr.filter(lambda x: x.biotype == "protein_coding") + csq_expr = filter_vep_transcript_csqs_expr(csq_expr, protein_coding=True) csq_expr = hl.or_missing(hl.len(csq_expr) > 0, csq_expr) def _filter_to_most_severe( From 6c4e3405aad805a1c7db04cbba70e9d953b077c3 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 13:29:41 -0600 Subject: [PATCH 18/33] Remove process_consequence changes --- gnomad/utils/vep.py | 128 ++++++++++++++++++++++++++++++-------------- tests/test_vep.py | 28 ---------- 2 files changed, 89 insertions(+), 67 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 04a34ce4c..86cdd17e1 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -337,13 +337,24 @@ def process_consequences( penalize_flags: bool = True, csq_order: Optional[List[str]] = None, has_polyphen: bool = True, - prioritize_protein_coding: bool = False, ) -> Union[hl.MatrixTable, hl.Table]: """ Add most_severe_consequence into [vep_root].transcript_consequences, and worst_csq_by_gene, any_lof into [vep_root]. `most_severe_consequence` is the worst consequence for a transcript. + Each transcript consequence is annotated with a `csq_score` which is a combination + of the index of the consequence's `most_severe_consequence` in `csq_order` and an + extra deduction for loss-of-function consequences, and polyphen predictions if + `has_polyphen` is True. Lower scores translate to higher severity. + + The score adjustment is as follows: + - lof == 'HC' & NO lof_flags (-1000 if penalize_flags, -500 if not) + - lof == 'HC' & lof_flags (-500) + - lof == 'OS' (-20) + - lof == 'LC' (-10) + - everything else (0) + .. note:: From gnomAD v4.0 on, the PolyPhen annotation was removed from the VEP Struct @@ -359,64 +370,103 @@ def process_consequences( `CSQ_ORDER` global. :param has_polyphen: Whether the input VEP Struct has a PolyPhen annotation which will be used to modify the consequence score. Default is True. - :param prioritize_protein_coding: Whether to prioritize protein-coding transcripts - when determining the worst consequence. Default is False. - :return: HT/MT with better formatted consequences. + :return: MT with better formatted consequences. """ - # If has_polyphen is True, set the order of PolyPhen consequences. - polyphen_order = POLYPHEN_ORDER if has_polyphen else None + if csq_order is None: + csq_order = CSQ_ORDER + csqs = hl.literal(csq_order) + + # Assign a score to each consequence based on the order in csq_order. + csq_dict = hl.literal(dict(zip(csq_order, range(len(csq_order))))) def _find_worst_transcript_consequence( - tc_expr: hl.expr.ArrayExpression, + tcl: hl.expr.ArrayExpression, ) -> hl.expr.StructExpression: """ Find the worst transcript consequence in an array of transcript consequences. - :param tc_expr: Array of transcript consequences. + :param tcl: Array of transcript consequences. :return: Worst transcript consequence. """ - tc_expr = get_most_severe_csq_from_multiple_csq_lists( - hl.struct(transcript_consequences=tc_expr), - csq_order=csq_order, - include_csqs=True, - prioritize_protein_coding=prioritize_protein_coding, - prioritize_loftee_no_flags=penalize_flags, - csq_list_order=["transcript_consequences"], - add_order_by_csq_list={"transcript_consequences": polyphen_order}, - ).transcript_consequences - - return hl.or_missing(hl.len(tc_expr) > 0, tc_expr[0]) + flag = 500 + no_flag = flag * (1 + penalize_flags) + + # Score each consequence based on the order in csq_order. + score_expr = tcl.map( + lambda tc: csq_dict[csqs.find(lambda x: x == tc.most_severe_consequence)] + ) + + # Determine the score adjustment based on the consequence's LOF and LOF flags. + sub_expr = tcl.map( + lambda tc: ( + hl.case(missing_false=True) + .when((tc.lof == "HC") & hl.or_else(tc.lof_flags == "", True), no_flag) + .when((tc.lof == "HC") & (tc.lof_flags != ""), flag) + .when(tc.lof == "OS", 20) + .when(tc.lof == "LC", 10) + .default(0) + ) + ) + + # If requested, determine the score adjustment based on the consequence's + # PolyPhen prediction. + if has_polyphen: + polyphen_sub_expr = tcl.map( + lambda tc: ( + hl.case(missing_false=True) + .when(tc.polyphen_prediction == "probably_damaging", 0.5) + .when(tc.polyphen_prediction == "possibly_damaging", 0.25) + .when(tc.polyphen_prediction == "benign", 0.1) + .default(0) + ) + ) + sub_expr = hl.map(lambda s, ps: s + ps, sub_expr, polyphen_sub_expr) + + # Calculate the final consequence score. + tcl = hl.map( + lambda tc, s, ss: tc.annotate(csq_score=s - ss), tcl, score_expr, sub_expr + ) + + # Return the worst consequence based on the calculated score. + return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0]) # Annotate each transcript consequence with the 'most_severe_consequence'. - csqs = add_most_severe_consequence_to_consequence( + transcript_csqs = add_most_severe_consequence_to_consequence( t[vep_root].transcript_consequences, csq_order ) # Group transcript consequences by gene and find the worst consequence for each. - gene_csqs = ( - csqs.group_by(lambda tc: tc.gene_symbol) - .map_values(_find_worst_transcript_consequence) - .values() - ) - - # Filter transcript consequences to only include canonical transcripts. - canonical = csqs.filter(lambda csq: csq.canonical == 1) - gene_canonical = ( - canonical.group_by(lambda tc: tc.gene_symbol) - .map_values(_find_worst_transcript_consequence) - .values() + gene_dict = transcript_csqs.group_by(lambda tc: tc.gene_symbol) + worst_csq_gene = gene_dict.map_values(_find_worst_transcript_consequence).values() + sorted_scores = hl.sorted(worst_csq_gene, key=lambda tc: tc.csq_score) + + # Filter transcript consequences to only include canonical transcripts and find the + # worst consequence for each gene. + canonical = transcript_csqs.filter(lambda csq: csq.canonical == 1) + gene_canonical_dict = canonical.group_by(lambda tc: tc.gene_symbol) + worst_csq_gene_canonical = gene_canonical_dict.map_values( + _find_worst_transcript_consequence + ).values() + sorted_canonical_scores = hl.sorted( + worst_csq_gene_canonical, key=lambda tc: tc.csq_score ) # Annotate the HT/MT with the worst consequence for each gene and variant. vep_data = t[vep_root].annotate( - transcript_consequences=csqs, - worst_consequence_term=get_most_severe_consequence_expr( - csqs.map(lambda csq: csq.most_severe_consequence), csq_order + transcript_consequences=transcript_csqs, + worst_consequence_term=csqs.find( + lambda c: transcript_csqs.map( + lambda csq: csq.most_severe_consequence + ).contains(c) + ), + worst_csq_by_gene=sorted_scores, + worst_csq_for_variant=hl.or_missing( + hl.len(sorted_scores) > 0, sorted_scores[0] + ), + worst_csq_by_gene_canonical=sorted_canonical_scores, + worst_csq_for_variant_canonical=hl.or_missing( + hl.len(sorted_canonical_scores) > 0, sorted_canonical_scores[0] ), - worst_csq_by_gene=gene_csqs, - worst_csq_for_variant=_find_worst_transcript_consequence(csqs), - worst_csq_by_gene_canonical=gene_canonical, - worst_csq_for_variant_canonical=_find_worst_transcript_consequence(canonical), ) return ( diff --git a/tests/test_vep.py b/tests/test_vep.py index 88985b3da..beffe08ea 100644 --- a/tests/test_vep.py +++ b/tests/test_vep.py @@ -171,31 +171,3 @@ def test_get_most_severe_csq_from_multiple_csq_lists(): assert len(result.transcript_consequences) == 2, "Expected 2 transcript consequences" assert result.transcript_consequences[0].most_severe_consequence == 'missense_variant', "Expected 'missense_variant' for the first transcript consequence" assert result.transcript_consequences[1].most_severe_consequence == 'synonymous_variant', "Expected 'synonymous_variant' for the second transcript consequence" - - -def test_process_consequences(): - """Test the process_consequences function.""" - # Create a mock MatrixTable with vep.transcript_consequences - mt = hl.utils.range_matrix_table(1, 1) - mt = mt.annotate_rows(vep=hl.struct(transcript_consequences=[hl.struct(consequence_terms=['missense_variant', 'synonymous_variant'])])) - - # Apply the function - result_mt = process_consequences(mt) - - # Check that the most_severe_consequence field is added - assert 'most_severe_consequence' in result_mt.vep.transcript_consequences[0] - - # Check that the most_severe_consequence is correct - assert result_mt.vep.transcript_consequences[0].most_severe_consequence == 'missense_variant' - - # Check that the worst_csq_by_gene field is added - assert 'worst_csq_by_gene' in result_mt.vep - - # Check that the worst_csq_for_variant field is added - assert 'worst_csq_for_variant' in result_mt.vep - - # Check that the worst_csq_by_gene_canonical field is added - assert 'worst_csq_by_gene_canonical' in result_mt.vep - - # Check that the worst_csq_for_variant_canonical field is added - assert 'worst_csq_for_variant_canonical' in result_mt.vep From 7d7ff8128f4d2d751108348435159c4099371bb1 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 13:33:06 -0600 Subject: [PATCH 19/33] Move vep tests into utils directory --- tests/{ => utils}/test_vep.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{ => utils}/test_vep.py (100%) diff --git a/tests/test_vep.py b/tests/utils/test_vep.py similarity index 100% rename from tests/test_vep.py rename to tests/utils/test_vep.py From e1b542a04c689809fc3e97948827cbe8bc43624f Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 14:51:15 -0600 Subject: [PATCH 20/33] Fix tests --- .pre-commit-config.yaml | 6 +- gnomad/utils/vep.py | 1 + tests/utils/test_vep.py | 151 ++++++++++++++++++++++++---------------- 3 files changed, 96 insertions(+), 62 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 22ca75faa..4ebe97335 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,13 +11,13 @@ repos: hooks: - id: black language_version: python3 - files: gnomad + files: gnomad tests - repo: https://github.com/pre-commit/mirrors-autopep8 rev: v2.0.2 # This should be kept in sync with the version in requirements-dev.in hooks: - id: autopep8 args: ["--exit-code", "--in-place"] - files: gnomad + files: gnomad tests - repo: https://github.com/pycqa/pydocstyle rev: 6.3.0 # This should be kept in sync with the version in requirements-dev.in hooks: @@ -27,4 +27,4 @@ repos: hooks: - id: isort args: ["--profile", "black", "--filter-files"] - files: gnomad + files: gnomad tests diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 86cdd17e1..902ab359f 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -986,6 +986,7 @@ def get_most_severe_csq_from_multiple_csq_lists( for csq_list in csq_list_order: if csq_list not in vep_expr: logger.warning(f"VEP consequences list %s not found in input!", csq_list) + continue is_tc = csq_list == "transcript_consequences" csq_expr = vep_expr[csq_list] diff --git a/tests/utils/test_vep.py b/tests/utils/test_vep.py index beffe08ea..8787c21e7 100644 --- a/tests/utils/test_vep.py +++ b/tests/utils/test_vep.py @@ -6,9 +6,8 @@ add_most_severe_csq_to_tc_within_vep_root, filter_to_most_severe_consequences, CSQ_ORDER, - filter_vep_consequences_by_loftee, get_most_severe_csq_from_multiple_csq_lists, - process_consequences, + filter_vep_transcript_csqs_expr, ) @@ -19,17 +18,17 @@ def test_get_most_severe_consequence_expr(): # Test with default csq_order result = get_most_severe_consequence_expr(csq_expr) - assert result.value == 'splice_region_variant', "Expected 'splice_region_variant'" + assert hl.eval(result) == 'splice_region_variant', "Expected 'splice_region_variant'" # Test with custom csq_order custom_csq_order = ['intron_variant', 'splice_region_variant'] result = get_most_severe_consequence_expr(csq_expr, custom_csq_order) - assert result.value == 'intron_variant', "Expected 'intron_variant'" + assert hl.eval(result) == 'intron_variant', "Expected 'intron_variant'" # Test with csq_expr that contains a consequence not in csq_order csq_expr = hl.literal(['non_existent_consequence']) result = get_most_severe_consequence_expr(csq_expr) - assert result.value is None, "Expected None for non-existent consequence" + assert hl.eval(result) is None, "Expected None for non-existent consequence" def test_add_most_severe_consequence_to_consequence(): @@ -39,17 +38,17 @@ def test_add_most_severe_consequence_to_consequence(): # Test with default csq_order result = add_most_severe_consequence_to_consequence(tc_expr) - assert result.most_severe_consequence == get_most_severe_consequence_expr(tc_expr.consequence_terms), "Expected 'missense_variant'" + assert hl.eval(result.most_severe_consequence) == hl.eval(get_most_severe_consequence_expr(tc_expr.consequence_terms)), "Expected 'missense_variant'" # Test with custom csq_order custom_csq_order = ['synonymous_variant', 'missense_variant'] result = add_most_severe_consequence_to_consequence(tc_expr, custom_csq_order) - assert result.most_severe_consequence == get_most_severe_consequence_expr(tc_expr.consequence_terms, custom_csq_order), "Expected 'synonymous_variant'" + assert hl.eval(result.most_severe_consequence) == hl.eval(get_most_severe_consequence_expr(tc_expr.consequence_terms, custom_csq_order)), "Expected 'synonymous_variant'" # Test with csq_expr that contains a consequence not in csq_order tc_expr = hl.struct(consequence_terms=['non_existent_consequence']) result = add_most_severe_consequence_to_consequence(tc_expr) - assert result.most_severe_consequence == get_most_severe_consequence_expr(tc_expr.consequence_terms), "Expected None for non-existent consequence" + assert hl.eval(result.most_severe_consequence) == hl.eval(get_most_severe_consequence_expr(tc_expr.consequence_terms)), "Expected None for non-existent consequence" def test_add_most_severe_csq_to_tc_within_vep_root(): @@ -62,69 +61,77 @@ def test_add_most_severe_csq_to_tc_within_vep_root(): result_mt = add_most_severe_csq_to_tc_within_vep_root(mt) # Check that the most_severe_consequence field is added - assert 'most_severe_consequence' in result_mt.vep.transcript_consequences[0] + assert 'most_severe_consequence' in result_mt.vep.transcript_consequences[0].keys(), "Expected 'most_severe_consequence' field" # Check that the most_severe_consequence is correct - assert result_mt.vep.transcript_consequences[0].most_severe_consequence == 'missense_variant' + assert result_mt.vep.transcript_consequences[0].most_severe_consequence.take(1)[0] == 'missense_variant' def test_filter_to_most_severe_consequences(): """Test the filter_to_most_severe_consequences function.""" - # Create a mock ArrayExpression of VEP consequences - csq_expr = hl.literal([ - hl.struct(most_severe_consequence='missense_variant'), - hl.struct(most_severe_consequence='synonymous_variant'), - hl.struct(most_severe_consequence='missense_variant') + # Create a mock array of VEP consequences + mock_csq_expr = hl.literal([ + hl.struct( + consequence_terms=['missense_variant', 'synonymous_variant'], + lof='HC', + lof_flags='HC', + protein_coding=1 + ), + hl.struct( + consequence_terms=['splice_region_variant', 'synonymous_variant'], + lof='LC', + lof_flags='HC', + protein_coding=1 + ), + hl.struct( + consequence_terms=['frameshift_variant', 'synonymous_variant'], + lof='HC', + lof_flags='', + protein_coding=1 + ) ]) - # Apply the function - result = filter_to_most_severe_consequences( - csq_expr, CSQ_ORDER, 'most_severe_consequence' - ) + # Call the function with the mock data + result = filter_to_most_severe_consequences(mock_csq_expr, prioritize_loftee=True) - # Check that the result only contains the most severe consequences - assert len(result) == 2, "Expected 2 consequences" - assert all(csq.most_severe_consequence == 'missense_variant' for csq in - result), "Expected 'missense_variant'" + # Assert that the output is as expected + assert hl.eval(result.most_severe_consequence) == 'frameshift_variant', "Expected 'frameshift_variant'" + assert hl.eval(result.lof) == 'HC', "Expected 'HC'" + assert hl.eval(result.no_lof_flags) + assert hl.eval(hl.len(result.consequences)) == 1, "Expected 1 consequence" + assert hl.eval(result.consequences[0].consequence_terms) == ['frameshift_variant', 'synonymous_variant'], "Expected ['frameshift_variant', 'synonymous_variant']" -def test_filter_vep_consequences_by_loftee(): +def test_filter_vep_transcript_csqs_expr_loftee(): """Test the filter_vep_consequences_by_loftee function.""" # Create a mock ArrayExpression of VEP consequences with LOFTEE annotations csq_expr = hl.literal([ - hl.struct(lof='HC', lof_flags=None), + hl.struct(lof='HC', lof_flags=hl.missing(hl.tstr)), hl.struct(lof='HC', lof_flags=''), hl.struct(lof='LC', lof_flags='flag1'), - hl.struct(lof='OS', lof_flags=None), - hl.struct(lof=None, lof_flags='flag2') + hl.struct(lof='OS', lof_flags=hl.missing(hl.tstr)), + hl.struct(lof=hl.missing(hl.tstr), lof_flags='flag2') ]) # Test with default parameters - result = filter_vep_consequences_by_loftee(csq_expr) - assert len(result) == 6, "Expected 6 consequences" + result = filter_vep_transcript_csqs_expr(csq_expr) + assert hl.eval(hl.len(result)) == 5, "Expected 5 consequences" # Test with loftee_labels - result = filter_vep_consequences_by_loftee(csq_expr, loftee_labels=['HC']) - assert len(result) == 2, "Expected 2 consequences" - assert result[0].lof == 'HC', "Expected 'HC'" + result = filter_vep_transcript_csqs_expr(csq_expr, loftee_labels=['HC']) + assert hl.eval(hl.len(result)) == 2, "Expected 2 consequences" + assert hl.eval(result[0].lof) == 'HC', "Expected 'HC'" # Test with no_lof_flags - result = filter_vep_consequences_by_loftee(csq_expr, no_lof_flags=True) - assert len(result) == 3, "Expected 3 consequences" - assert all(csq.lof_flags is None for csq in result), "Expected no LOFTEE flags" + result = filter_vep_transcript_csqs_expr(csq_expr, no_lof_flags=True) + assert hl.eval(hl.len(result)) == 3, "Expected 3 consequences" + assert hl.eval(hl.all(result.map(lambda csq: hl.is_missing(csq.lof_flags) | (csq.lof_flags == '')))), "Expected no LOFTEE flags" # Test with loftee_labels and no_lof_flags - result = filter_vep_consequences_by_loftee(csq_expr, loftee_labels=['HC'], - no_lof_flags=True) - assert len(result) == 1, "Expected 1 consequence" - assert result[0].lof == 'HC', "Expected 'HC'" - assert all(csq.lof_flags is None for csq in result), "Expected no LOFTEE flags" - - # Test with keep=False - result = filter_vep_consequences_by_loftee(csq_expr, loftee_labels=['HC'], - keep=False) - assert len(result) == 3, "Expected 3 consequences" - assert all(csq.lof != 'HC' for csq in result), "Expected no 'HC' consequences" + result = filter_vep_transcript_csqs_expr(csq_expr, loftee_labels=['HC'], no_lof_flags=True) + assert hl.eval(hl.len(result)) == 2, "Expected 2 consequence" + assert hl.eval(result[0].lof) == 'HC', "Expected 'HC'" + assert hl.eval(hl.all(result.map(lambda csq: hl.is_missing(csq.lof_flags) | (csq.lof_flags == '')))), "Expected no LOFTEE flags" def test_get_most_severe_csq_from_multiple_csq_lists(): @@ -135,19 +142,28 @@ def test_get_most_severe_csq_from_multiple_csq_lists(): hl.struct( biotype='protein_coding', lof='HC', - most_severe_consequence='missense_variant' + lof_flags='flag1', + consequence_terms='missense_variant' ), hl.struct( biotype='protein_coding', lof='HC', - most_severe_consequence='synonymous_variant' - ) + lof_flags=hl.missing(hl.tstr), + consequence_terms='synonymous_variant' + ), + hl.struct( + biotype='protein_coding', + lof='HC', + lof_flags=hl.missing(hl.tstr), + consequence_terms='missense_variant' + ), ], intergenic_consequences=[ hl.struct( biotype='intergenic', - lof=None, - most_severe_consequence='intergenic_variant' + lof=hl.missing(hl.tstr), + lof_flags=hl.missing(hl.tstr), + consequence_terms='intergenic_variant' ) ] ) @@ -155,19 +171,36 @@ def test_get_most_severe_csq_from_multiple_csq_lists(): # Test the function result = get_most_severe_csq_from_multiple_csq_lists(vep_expr) - # Check the most_severe_csq - assert result.most_severe_csq == 'missense_variant', "Expected 'missense_variant'" + # Check the most_severe_consequence + assert hl.eval(result.most_severe_consequence) == 'missense_variant', "Expected 'missense_variant'" # Check the protein_coding - assert result.protein_coding == True, "Expected True" + assert hl.eval(result.protein_coding), "Expected True" # Check the lof - assert result.lof == 'HC', "Expected 'HC'" + assert hl.eval(result.lof) == 'HC', "Expected 'HC'" # Check the no_lof_flags - assert result.no_lof_flags == False, "Expected False" + assert hl.eval(result.no_lof_flags), "Expected True" # Check the transcript_consequences - assert len(result.transcript_consequences) == 2, "Expected 2 transcript consequences" - assert result.transcript_consequences[0].most_severe_consequence == 'missense_variant', "Expected 'missense_variant' for the first transcript consequence" - assert result.transcript_consequences[1].most_severe_consequence == 'synonymous_variant', "Expected 'synonymous_variant' for the second transcript consequence" + result = get_most_severe_csq_from_multiple_csq_lists(vep_expr, include_csqs=True) + assert hl.eval(hl.len(result.transcript_consequences)) == 2, "Expected 2 transcript consequences" + + # Create a mock VEP expression + vep_expr = hl.struct( + transcript_consequences=[ + hl.struct( + biotype='protein_coding', + lof='HC', + lof_flags='flag1', + consequence_terms='missense_variant' + ) + ] + ) + + # Test the function + result = get_most_severe_csq_from_multiple_csq_lists(vep_expr) + + # Check the no_lof_flags + assert hl.eval(result.no_lof_flags) == False, "Expected False" From b458199963531d9ff7e4315d5e1abd042e6617c2 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 14:59:45 -0600 Subject: [PATCH 21/33] formatting of tests --- tests/utils/test_vep.py | 188 ++++++++++++++++++++++++---------------- 1 file changed, 115 insertions(+), 73 deletions(-) diff --git a/tests/utils/test_vep.py b/tests/utils/test_vep.py index 8787c21e7..c09a16869 100644 --- a/tests/utils/test_vep.py +++ b/tests/utils/test_vep.py @@ -1,32 +1,34 @@ """Tests for the gnomAD VEP utility functions.""" import hail as hl + from gnomad.utils.vep import ( add_most_severe_consequence_to_consequence, - get_most_severe_consequence_expr, add_most_severe_csq_to_tc_within_vep_root, filter_to_most_severe_consequences, - CSQ_ORDER, - get_most_severe_csq_from_multiple_csq_lists, filter_vep_transcript_csqs_expr, + get_most_severe_consequence_expr, + get_most_severe_csq_from_multiple_csq_lists, ) def test_get_most_severe_consequence_expr(): """Test the get_most_severe_consequence_expr function.""" # Create a mock ArrayExpression of consequences - csq_expr = hl.literal(['splice_region_variant', 'intron_variant']) + csq_expr = hl.literal(["splice_region_variant", "intron_variant"]) # Test with default csq_order result = get_most_severe_consequence_expr(csq_expr) - assert hl.eval(result) == 'splice_region_variant', "Expected 'splice_region_variant'" + assert ( + hl.eval(result) == "splice_region_variant" + ), "Expected 'splice_region_variant'" # Test with custom csq_order - custom_csq_order = ['intron_variant', 'splice_region_variant'] + custom_csq_order = ["intron_variant", "splice_region_variant"] result = get_most_severe_consequence_expr(csq_expr, custom_csq_order) - assert hl.eval(result) == 'intron_variant', "Expected 'intron_variant'" + assert hl.eval(result) == "intron_variant", "Expected 'intron_variant'" # Test with csq_expr that contains a consequence not in csq_order - csq_expr = hl.literal(['non_existent_consequence']) + csq_expr = hl.literal(["non_existent_consequence"]) result = get_most_severe_consequence_expr(csq_expr) assert hl.eval(result) is None, "Expected None for non-existent consequence" @@ -34,104 +36,140 @@ def test_get_most_severe_consequence_expr(): def test_add_most_severe_consequence_to_consequence(): """Test the add_most_severe_consequence_to_consequence function.""" # Create a mock StructExpression of transcript consequence - tc_expr = hl.struct(consequence_terms=['missense_variant', 'synonymous_variant']) + tc_expr = hl.struct(consequence_terms=["missense_variant", "synonymous_variant"]) # Test with default csq_order result = add_most_severe_consequence_to_consequence(tc_expr) - assert hl.eval(result.most_severe_consequence) == hl.eval(get_most_severe_consequence_expr(tc_expr.consequence_terms)), "Expected 'missense_variant'" + assert hl.eval(result.most_severe_consequence) == hl.eval( + get_most_severe_consequence_expr(tc_expr.consequence_terms) + ), "Expected 'missense_variant'" # Test with custom csq_order - custom_csq_order = ['synonymous_variant', 'missense_variant'] + custom_csq_order = ["synonymous_variant", "missense_variant"] result = add_most_severe_consequence_to_consequence(tc_expr, custom_csq_order) - assert hl.eval(result.most_severe_consequence) == hl.eval(get_most_severe_consequence_expr(tc_expr.consequence_terms, custom_csq_order)), "Expected 'synonymous_variant'" + assert hl.eval(result.most_severe_consequence) == hl.eval( + get_most_severe_consequence_expr(tc_expr.consequence_terms, custom_csq_order) + ), "Expected 'synonymous_variant'" # Test with csq_expr that contains a consequence not in csq_order - tc_expr = hl.struct(consequence_terms=['non_existent_consequence']) + tc_expr = hl.struct(consequence_terms=["non_existent_consequence"]) result = add_most_severe_consequence_to_consequence(tc_expr) - assert hl.eval(result.most_severe_consequence) == hl.eval(get_most_severe_consequence_expr(tc_expr.consequence_terms)), "Expected None for non-existent consequence" + assert hl.eval(result.most_severe_consequence) == hl.eval( + get_most_severe_consequence_expr(tc_expr.consequence_terms) + ), "Expected None for non-existent consequence" def test_add_most_severe_csq_to_tc_within_vep_root(): """Test the add_most_severe_csq_to_tc_within_vep_root function.""" # Create a mock MatrixTable with vep.transcript_consequences mt = hl.utils.range_matrix_table(1, 1) - mt = mt.annotate_rows(vep=hl.struct(transcript_consequences=[hl.struct(consequence_terms=['missense_variant', 'synonymous_variant'])])) + mt = mt.annotate_rows( + vep=hl.struct( + transcript_consequences=[ + hl.struct(consequence_terms=["missense_variant", "synonymous_variant"]) + ] + ) + ) # Apply the function result_mt = add_most_severe_csq_to_tc_within_vep_root(mt) # Check that the most_severe_consequence field is added - assert 'most_severe_consequence' in result_mt.vep.transcript_consequences[0].keys(), "Expected 'most_severe_consequence' field" + assert ( + "most_severe_consequence" in result_mt.vep.transcript_consequences[0].keys() + ), "Expected 'most_severe_consequence' field" # Check that the most_severe_consequence is correct - assert result_mt.vep.transcript_consequences[0].most_severe_consequence.take(1)[0] == 'missense_variant' + assert ( + result_mt.vep.transcript_consequences[0].most_severe_consequence.take(1)[0] + == "missense_variant" + ) def test_filter_to_most_severe_consequences(): """Test the filter_to_most_severe_consequences function.""" # Create a mock array of VEP consequences - mock_csq_expr = hl.literal([ - hl.struct( - consequence_terms=['missense_variant', 'synonymous_variant'], - lof='HC', - lof_flags='HC', - protein_coding=1 - ), - hl.struct( - consequence_terms=['splice_region_variant', 'synonymous_variant'], - lof='LC', - lof_flags='HC', - protein_coding=1 - ), - hl.struct( - consequence_terms=['frameshift_variant', 'synonymous_variant'], - lof='HC', - lof_flags='', - protein_coding=1 - ) - ]) + mock_csq_expr = hl.literal( + [ + hl.struct( + consequence_terms=["missense_variant", "synonymous_variant"], + lof="HC", + lof_flags="HC", + protein_coding=1, + ), + hl.struct( + consequence_terms=["splice_region_variant", "synonymous_variant"], + lof="LC", + lof_flags="HC", + protein_coding=1, + ), + hl.struct( + consequence_terms=["frameshift_variant", "synonymous_variant"], + lof="HC", + lof_flags="", + protein_coding=1, + ), + ] + ) # Call the function with the mock data result = filter_to_most_severe_consequences(mock_csq_expr, prioritize_loftee=True) # Assert that the output is as expected - assert hl.eval(result.most_severe_consequence) == 'frameshift_variant', "Expected 'frameshift_variant'" - assert hl.eval(result.lof) == 'HC', "Expected 'HC'" + assert ( + hl.eval(result.most_severe_consequence) == "frameshift_variant" + ), "Expected 'frameshift_variant'" + assert hl.eval(result.lof) == "HC", "Expected 'HC'" assert hl.eval(result.no_lof_flags) assert hl.eval(hl.len(result.consequences)) == 1, "Expected 1 consequence" - assert hl.eval(result.consequences[0].consequence_terms) == ['frameshift_variant', 'synonymous_variant'], "Expected ['frameshift_variant', 'synonymous_variant']" + assert hl.eval(result.consequences[0].consequence_terms) == [ + "frameshift_variant", + "synonymous_variant", + ], "Expected ['frameshift_variant', 'synonymous_variant']" def test_filter_vep_transcript_csqs_expr_loftee(): """Test the filter_vep_consequences_by_loftee function.""" # Create a mock ArrayExpression of VEP consequences with LOFTEE annotations - csq_expr = hl.literal([ - hl.struct(lof='HC', lof_flags=hl.missing(hl.tstr)), - hl.struct(lof='HC', lof_flags=''), - hl.struct(lof='LC', lof_flags='flag1'), - hl.struct(lof='OS', lof_flags=hl.missing(hl.tstr)), - hl.struct(lof=hl.missing(hl.tstr), lof_flags='flag2') - ]) + csq_expr = hl.literal( + [ + hl.struct(lof="HC", lof_flags=hl.missing(hl.tstr)), + hl.struct(lof="HC", lof_flags=""), + hl.struct(lof="LC", lof_flags="flag1"), + hl.struct(lof="OS", lof_flags=hl.missing(hl.tstr)), + hl.struct(lof=hl.missing(hl.tstr), lof_flags="flag2"), + ] + ) # Test with default parameters result = filter_vep_transcript_csqs_expr(csq_expr) assert hl.eval(hl.len(result)) == 5, "Expected 5 consequences" # Test with loftee_labels - result = filter_vep_transcript_csqs_expr(csq_expr, loftee_labels=['HC']) + result = filter_vep_transcript_csqs_expr(csq_expr, loftee_labels=["HC"]) assert hl.eval(hl.len(result)) == 2, "Expected 2 consequences" - assert hl.eval(result[0].lof) == 'HC', "Expected 'HC'" + assert hl.eval(result[0].lof) == "HC", "Expected 'HC'" # Test with no_lof_flags result = filter_vep_transcript_csqs_expr(csq_expr, no_lof_flags=True) assert hl.eval(hl.len(result)) == 3, "Expected 3 consequences" - assert hl.eval(hl.all(result.map(lambda csq: hl.is_missing(csq.lof_flags) | (csq.lof_flags == '')))), "Expected no LOFTEE flags" + assert hl.eval( + hl.all( + result.map(lambda csq: hl.is_missing(csq.lof_flags) | (csq.lof_flags == "")) + ) + ), "Expected no LOFTEE flags" # Test with loftee_labels and no_lof_flags - result = filter_vep_transcript_csqs_expr(csq_expr, loftee_labels=['HC'], no_lof_flags=True) + result = filter_vep_transcript_csqs_expr( + csq_expr, loftee_labels=["HC"], no_lof_flags=True + ) assert hl.eval(hl.len(result)) == 2, "Expected 2 consequence" - assert hl.eval(result[0].lof) == 'HC', "Expected 'HC'" - assert hl.eval(hl.all(result.map(lambda csq: hl.is_missing(csq.lof_flags) | (csq.lof_flags == '')))), "Expected no LOFTEE flags" + assert hl.eval(result[0].lof) == "HC", "Expected 'HC'" + assert hl.eval( + hl.all( + result.map(lambda csq: hl.is_missing(csq.lof_flags) | (csq.lof_flags == "")) + ) + ), "Expected no LOFTEE flags" def test_get_most_severe_csq_from_multiple_csq_lists(): @@ -140,61 +178,65 @@ def test_get_most_severe_csq_from_multiple_csq_lists(): vep_expr = hl.struct( transcript_consequences=[ hl.struct( - biotype='protein_coding', - lof='HC', - lof_flags='flag1', - consequence_terms='missense_variant' + biotype="protein_coding", + lof="HC", + lof_flags="flag1", + consequence_terms="missense_variant", ), hl.struct( - biotype='protein_coding', - lof='HC', + biotype="protein_coding", + lof="HC", lof_flags=hl.missing(hl.tstr), - consequence_terms='synonymous_variant' + consequence_terms="synonymous_variant", ), hl.struct( - biotype='protein_coding', - lof='HC', + biotype="protein_coding", + lof="HC", lof_flags=hl.missing(hl.tstr), - consequence_terms='missense_variant' + consequence_terms="missense_variant", ), ], intergenic_consequences=[ hl.struct( - biotype='intergenic', + biotype="intergenic", lof=hl.missing(hl.tstr), lof_flags=hl.missing(hl.tstr), - consequence_terms='intergenic_variant' + consequence_terms="intergenic_variant", ) - ] + ], ) # Test the function result = get_most_severe_csq_from_multiple_csq_lists(vep_expr) # Check the most_severe_consequence - assert hl.eval(result.most_severe_consequence) == 'missense_variant', "Expected 'missense_variant'" + assert ( + hl.eval(result.most_severe_consequence) == "missense_variant" + ), "Expected 'missense_variant'" # Check the protein_coding assert hl.eval(result.protein_coding), "Expected True" # Check the lof - assert hl.eval(result.lof) == 'HC', "Expected 'HC'" + assert hl.eval(result.lof) == "HC", "Expected 'HC'" # Check the no_lof_flags assert hl.eval(result.no_lof_flags), "Expected True" # Check the transcript_consequences result = get_most_severe_csq_from_multiple_csq_lists(vep_expr, include_csqs=True) - assert hl.eval(hl.len(result.transcript_consequences)) == 2, "Expected 2 transcript consequences" + assert ( + hl.eval(hl.len(result.transcript_consequences)) == 2 + ), "Expected 2 transcript consequences" # Create a mock VEP expression vep_expr = hl.struct( transcript_consequences=[ hl.struct( - biotype='protein_coding', - lof='HC', - lof_flags='flag1', - consequence_terms='missense_variant' + biotype="protein_coding", + lof="HC", + lof_flags="flag1", + consequence_terms="missense_variant", ) ] ) From aca10f5e2f27f903e20b416852202a8629995d62 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 15:02:13 -0600 Subject: [PATCH 22/33] Move POLYPHEN ORDER to a different PR --- gnomad/utils/vep.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 902ab359f..47fa1c20b 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -144,11 +144,6 @@ Set containing loss-of-function consequence strings. """ -POLYPHEN_ORDER = ["probably_damaging", "possibly_damaging", "benign"] -""" -Order of PolyPhen predictions from most to least severe. -""" - def get_vep_help(vep_config_path: Optional[str] = None): """ From 06883087d2a290eb26f96fae1922a353e40ce9dd Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 15:16:33 -0600 Subject: [PATCH 23/33] Add extra newline --- tests/utils/test_vep.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/utils/test_vep.py b/tests/utils/test_vep.py index c09a16869..32128f660 100644 --- a/tests/utils/test_vep.py +++ b/tests/utils/test_vep.py @@ -1,4 +1,5 @@ """Tests for the gnomAD VEP utility functions.""" + import hail as hl from gnomad.utils.vep import ( From 9cab3009df6ff218d1772afc5a173099d6999ae4 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 15:28:51 -0600 Subject: [PATCH 24/33] Remove unneeded f-string --- gnomad/utils/vep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 47fa1c20b..50c53d303 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -980,7 +980,7 @@ def get_most_severe_csq_from_multiple_csq_lists( ms_csq_expr = hl.case(missing_false=True) for csq_list in csq_list_order: if csq_list not in vep_expr: - logger.warning(f"VEP consequences list %s not found in input!", csq_list) + logger.warning("VEP consequences list %s not found in input!", csq_list) continue is_tc = csq_list == "transcript_consequences" From e4d5cef2e9c96b10fb9bee0f5da69d3424d15271 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 25 Jun 2024 15:35:53 -0600 Subject: [PATCH 25/33] Change `filter_vep_transcript_csqs_expr` to set `csq_expr` as missing if it's empty after filtering --- gnomad/utils/vep.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 50c53d303..5e9dd8a02 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -788,7 +788,6 @@ def filter_to_most_severe_consequences( if filter_to_protein_coding: logger.info("Filtering to protein-coding transcripts...") csq_expr = filter_vep_transcript_csqs_expr(csq_expr, protein_coding=True) - csq_expr = hl.or_missing(hl.len(csq_expr) > 0, csq_expr) def _filter_to_most_severe( expr: hl.expr.ArrayExpression, @@ -1113,6 +1112,8 @@ def filter_vep_transcript_csqs_expr( is not already annotated on the `csq_expr` elements, the most severe consequence will be added to the `csq_expr` for filtering. + If `csq_expr` is empty after filtering, the expression will be set to missing. + :param csq_expr: ArrayExpression of VEP transcript consequences. :param synonymous: Whether to filter to variants where the most severe consequence is 'synonymous_variant'. Default is False. @@ -1194,7 +1195,9 @@ def filter_vep_transcript_csqs_expr( if len(criteria) == 1: logger.warning("No changes have been made to input transcript consequences!") - return csq_expr.filter(lambda x: combine_functions(criteria, x)) + csq_expr = csq_expr.filter(lambda x: combine_functions(criteria, x)) + + return hl.or_missing(hl.len(csq_expr) > 0, csq_expr) def add_most_severe_csq_to_tc_within_vep_root( From a67df8202faadd46fea451563bd1a4dc215e07d5 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 21 Oct 2024 23:29:46 -0600 Subject: [PATCH 26/33] Address reviewer comments --- gnomad/utils/vep.py | 206 +++++++++++++++++++++------------------- tests/utils/test_vep.py | 109 +++++++++++++-------- 2 files changed, 176 insertions(+), 139 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 5e9dd8a02..998f27afd 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -722,7 +722,7 @@ def filter_to_most_severe_consequences( csq_expr: hl.expr.ArrayExpression, csq_order: Optional[List[str]] = None, loftee_labels: Optional[List[str]] = None, - filter_to_protein_coding: bool = False, + prioritize_protein_coding: bool = False, prioritize_loftee: bool = False, prioritize_loftee_no_flags: bool = False, additional_order_field: Optional[str] = None, @@ -740,15 +740,18 @@ def filter_to_most_severe_consequences( .. note:: - - If you have multiple lists of consequences and want to determine the most - severe consequence across all lists, consider using + - If you have multiple lists of consequences (such as lists of both + 'transcript_consequences' and 'intergenic_consequences') and want to + determine the most severe consequence across all lists, consider using `get_most_severe_csq_from_multiple_csq_lists`. - If you want to group consequences by gene and determine the most severe consequence for each gene, consider using `process_consequences`. - If `filter_to_protein_coding` is True, filter to only protein-coding transcripts - before determining the most severe consequence. + If `prioritize_protein_coding` is True, protein-coding transcripts are prioritized + by filtering to only protein-coding transcripts and determining the + most severe consequence. If no protein-coding transcripts are present, determine + the most severe consequence for all transcripts. If `prioritize_loftee` is True, prioritize consequences with LOFTEE annotations, in the order of `loftee_labels`, over those without LOFTEE annotations. If @@ -764,8 +767,8 @@ def filter_to_most_severe_consequences( low impact. Default is None, which uses the value of the `CSQ_ORDER` global. :param loftee_labels: Annotations added by LOFTEE, sorted from high to low impact. Default is None, which uses the value of the `LOFTEE_LABELS` global. - :param filter_to_protein_coding: Whether to filter to only protein-coding - transcripts before determining the most severe consequence. Default is False. + :param prioritize_protein_coding: Whether to prioritize protein-coding transcripts + when determining the worst consequence. Default is False. :param prioritize_loftee: Whether to prioritize LOFTEE consequences. Default is False. :param prioritize_loftee_no_flags: Whether to prioritize LOFTEE consequences with no @@ -777,86 +780,93 @@ def filter_to_most_severe_consequences( :return: ArrayExpression with of the consequences that match the most severe consequence. """ + # Get the dtype of the csq_expr ArrayExpression elements + csq_type = csq_expr.dtype.element_type + if ((additional_order_field is None) + (additional_order is None)) == 1: raise ValueError( "If `additional_order_field` is provided, `additional_order` must also be" " provided and vice versa." ) - loftee_labels = loftee_labels or LOFTEE_LABELS - - if filter_to_protein_coding: - logger.info("Filtering to protein-coding transcripts...") - csq_expr = filter_vep_transcript_csqs_expr(csq_expr, protein_coding=True) + if additional_order_field and additional_order_field not in csq_type.fields: + raise ValueError("Additional order field not found in consequence type.") - def _filter_to_most_severe( - expr: hl.expr.ArrayExpression, - field: str, - order: Optional[List[str]] = None, - ) -> Tuple[hl.expr.StringExpression, hl.expr.ArrayExpression]: - """ - Filter to the most severe consequence in a list of consequences. - - :param expr: ArrayExpression of consequences to filter. - :param field: Field name of the consequence annotation. - :param order: List indicating the order of VEP consequences, sorted from high to - low impact. Default is None, which uses the value of the `CSQ_ORDER` global. - :return: The most severe consequence and an ArrayExpression of consequences that - match the most severe consequence. - """ - ms_type = expr.dtype.element_type[field] - is_collection = isinstance(ms_type, hl.tarray) or isinstance(ms_type, hl.tset) - - # Get the highest impact csq label. - expr_map = expr.flatmap if is_collection else expr.map - filter_func = lambda x, ms: x.contains(ms) if is_collection else x == ms - ms_csq = get_most_severe_consequence_expr( - expr_map(lambda x: x[field]), csq_order=order - ) + # Define the order of fields to prioritize by, based on specified parameters. + priority_order = ( + (["protein_coding"] if prioritize_protein_coding else []) + + (["lof"] if prioritize_loftee else []) + + (["no_lof_flags"] if prioritize_loftee_no_flags else []) + + ["most_severe_consequence"] + ) - # Filter to only consequences with the highest impact csq label, and return - # missing if the most severe consequence is missing. - return ms_csq, hl.or_missing( - (hl.len(expr) > 0) & hl.is_defined(ms_csq), - expr.filter(lambda x: filter_func(x[field], ms_csq)), - ) + # Define the impact ordering of VEP consequences and LOFTEE labels. If not provided, + # use the globals CSQ_ORDER (set in get_most_severe_consequence_expr) and + # LOFTEE_LABELS. + loftee_labels = loftee_labels or LOFTEE_LABELS + term_order = {"most_severe_consequence": csq_order, "lof": loftee_labels} + + # Add the additional order field to the priority order and term order if provided. + if additional_order_field: + priority_order.append(additional_order_field) + term_order[additional_order_field] = additional_order + + # Define initial result and current expression. + result = {} + order_result_fields = ["most_severe_consequence", "lof"] + curr_expr = hl.or_missing(hl.len(csq_expr) > 0, csq_expr) + + for curr_field in priority_order: + order = term_order.get(curr_field) + if order is None and curr_field != "most_severe_consequence": + # If there is no order specified for the current field, then the field is + # used as a parameter to filter_vep_transcript_csqs_expr and if there are + # any consequences remaining, the result is set to True. + curr_expr = filter_vep_transcript_csqs_expr(curr_expr, **{curr_field: True}) + result[curr_field] = hl.len(curr_expr) > 0 + else: + # Handle the case where the current field is a collection of consequences + # each with a 'consequence_terms' field (e.g. transcript_consequences) that + # need to be flattened to determine the most severe consequence. + f = curr_field if curr_field in csq_type.fields else "consequence_terms" + if isinstance(csq_type[f], hl.tarray) or isinstance(csq_type[f], hl.tset): + f_map = csq_expr.flatmap + f_func = lambda x, csq: x.contains(csq) + else: + f_map = csq_expr.map + f_func = lambda x, csq: x == csq + + # Get the most severe (highest impact) consequence for the current field. + ms_csq_expr = get_most_severe_consequence_expr(f_map(lambda x: x[f]), order) + + # Filter to only elements that contain the most severe (highest impact) + # consequence for the current field, and return missing if the most severe + # consequence is missing. + curr_expr = hl.or_missing( + hl.is_defined(ms_csq_expr), + csq_expr.filter(lambda x: f_func(x[f], ms_csq_expr)) + ) - # Initialize the lof struct with missing values. - lof_expr = hl.struct(lof=hl.missing(hl.tstr), no_lof_flags=hl.missing(hl.tbool)) - - # If requested, filter to the most severe LOFTEE consequences, and determine if - # there are any LOFTEE consequences without flags. - if prioritize_loftee: - lof, lof_csq = _filter_to_most_severe(csq_expr, "lof", loftee_labels) - lof_csq_no_flag = filter_vep_transcript_csqs_expr(lof_csq, no_lof_flags=True) - no_lof_flags = hl.len(lof_csq_no_flag) > 0 - - # If requested and there are lof consequences without flags, use those. - if prioritize_loftee_no_flags: - lof_csq = hl.if_else(no_lof_flags, lof_csq_no_flag, lof_csq) - - # If there are no lof consequences, use the original consequences list. - csq_expr = hl.coalesce(lof_csq, csq_expr) - lof_expr = lof_expr.annotate(lof=lof, no_lof_flags=no_lof_flags) - - # Get the most severe consequence and the consequences that match the most severe - # consequence. - ms_csq, csq_expr = _filter_to_most_severe(csq_expr, "consequence_terms", csq_order) - - # If additional ordering is provided, filter to the most severe consequence with the - # highest impact from the additional ordering. - if additional_order is not None: - # Get the highest impact consequences from the additional ordering. - _, add_csq_expr = _filter_to_most_severe( - csq_expr, additional_order_field, additional_order - ) + # Add the most severe consequence to the result if the field is in the order + # result fields. When there is no most severe consequence and the current + # field has a result expression, the result is kept as the existing result + # value. + if curr_field in order_result_fields: + if curr_field in result: + ms_csq_expr = hl.or_else(ms_csq_expr, result[curr_field]) + result[curr_field] = ms_csq_expr + + if curr_field == "lof": + result["no_lof_flags"] = hl.any( + curr_expr.map( + lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "") + ) + ) - # If there are consequences from the additional ordering, set the - # consequence list to the additional ordering, otherwise keep the original - # list. - csq_expr = hl.coalesce(add_csq_expr, csq_expr) + curr_expr = hl.or_missing(hl.len(curr_expr) > 0, curr_expr) + csq_expr = hl.or_else(curr_expr, csq_expr) - return hl.struct(most_severe_consequence=ms_csq, **lof_expr, consequences=csq_expr) + return hl.struct(**result, consequences=csq_expr) @deprecated(reason="Replaced by get_most_severe_csq_from_multiple_csq_lists") @@ -908,7 +918,7 @@ def get_most_severe_csq_from_multiple_csq_lists( "motif_feature_consequences", "intergenic_consequences", ), - add_order_by_csq_list: Dict[str, List[str]] = None, + add_order_by_csq_list: Dict[str, Tuple[str, List[str]]] = None, ) -> hl.Table: """ Process multiple VEP consequences lists to determine the most severe consequence. @@ -970,6 +980,14 @@ def get_most_severe_csq_from_multiple_csq_lists( add_order_by_csq_list = add_order_by_csq_list or {} loftee_labels = loftee_labels or LOFTEE_LABELS + result = { + **({"protein_coding": hl.tbool} if prioritize_protein_coding else {}), + **({"lof": hl.tstr} if prioritize_loftee else {}), + **({"no_lof_flags": hl.tbool} if prioritize_loftee or prioritize_protein_coding else {}), + "most_severe_consequence": hl.tstr, + } + result = hl.struct(**{k: hl.missing(v) for k, v in result.items()}) + # Create a struct with missing values for each VEP consequences list. ms_csq_list_expr = hl.struct( **{c: hl.missing(vep_expr[c].dtype) for c in csq_list_order if c in vep_expr} @@ -987,28 +1005,18 @@ def get_most_severe_csq_from_multiple_csq_lists( # Set the base arguments for filtering to the most severe consequence using # filter_to_most_severe_consequences. + add_order = add_order_by_csq_list.get(csq_list) base_args = { "csq_order": csq_order, "loftee_labels": loftee_labels, + "prioritize_protein_coding": True if (prioritize_protein_coding and is_tc) else False, "prioritize_loftee": True if (prioritize_loftee and is_tc) else False, "prioritize_loftee_no_flags": prioritize_loftee_no_flags, - "additional_order": add_order_by_csq_list.get(csq_list), + "additional_order_field": add_order[0] if add_order else None, + "additional_order": add_order[1] if add_order else None, } ms_expr = filter_to_most_severe_consequences(csq_expr, **base_args) - - # If prioritizing protein-coding transcripts, get the most severe consequence - # for protein-coding transcripts. - if prioritize_protein_coding: - ms_expr = ms_expr.annotate(protein_coding=hl.missing(hl.tbool)) - if is_tc: - pc_ms_expr = filter_to_most_severe_consequences( - csq_expr, filter_to_protein_coding=True, **base_args - ) - ms_expr = hl.if_else( - hl.is_defined(pc_ms_expr.most_severe_consequence), - pc_ms_expr.annotate(protein_coding=True), - ms_expr.annotate(protein_coding=False), - ) + ms_expr = result.annotate(**ms_expr) # Annotate the current consequence list with the consequences that match the # most severe consequence term. @@ -1112,8 +1120,6 @@ def filter_vep_transcript_csqs_expr( is not already annotated on the `csq_expr` elements, the most severe consequence will be added to the `csq_expr` for filtering. - If `csq_expr` is empty after filtering, the expression will be set to missing. - :param csq_expr: ArrayExpression of VEP transcript consequences. :param synonymous: Whether to filter to variants where the most severe consequence is 'synonymous_variant'. Default is False. @@ -1146,12 +1152,13 @@ def filter_vep_transcript_csqs_expr( criteria to apply to the VEP transcript consequences. :return: ArrayExpression of filtered VEP transcript consequences. """ + csq_fields = csq_expr.dtype.element_type.fields criteria = [lambda csq: True] if synonymous: logger.info("Filtering to most severe consequence of synonymous_variant...") csqs = ["synonymous_variant"] if csqs is not None: - if "most_severe_consequence" not in csq_expr.dtype.element_type.fields: + if "most_severe_consequence" not in csq_fields: logger.info("Adding most_severe_consequence annotation...") csq_expr = add_most_severe_consequence_to_consequence(csq_expr) @@ -1179,7 +1186,10 @@ def filter_vep_transcript_csqs_expr( criteria.append(lambda x: hl.set(loftee_labels).contains(x.lof)) if no_lof_flags: logger.info("Filtering to consequences with no LOFTEE flags...") - criteria.append(lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "")) + if "lof_flags" in csq_fields: + criteria.append(lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "")) + else: + logger.warning("'lof_flags' not present in consequence struct, no consequences are filtered based on LOFTEE flags") if genes is not None: logger.info("Filtering to genes of interest...") genes = hl.literal(genes) @@ -1195,9 +1205,7 @@ def filter_vep_transcript_csqs_expr( if len(criteria) == 1: logger.warning("No changes have been made to input transcript consequences!") - csq_expr = csq_expr.filter(lambda x: combine_functions(criteria, x)) - - return hl.or_missing(hl.len(csq_expr) > 0, csq_expr) + return csq_expr.filter(lambda x: combine_functions(criteria, x)) def add_most_severe_csq_to_tc_within_vep_root( diff --git a/tests/utils/test_vep.py b/tests/utils/test_vep.py index 32128f660..ef4ee2c8b 100644 --- a/tests/utils/test_vep.py +++ b/tests/utils/test_vep.py @@ -14,21 +14,21 @@ def test_get_most_severe_consequence_expr(): """Test the get_most_severe_consequence_expr function.""" - # Create a mock ArrayExpression of consequences + # Create a mock ArrayExpression of consequences. csq_expr = hl.literal(["splice_region_variant", "intron_variant"]) - # Test with default csq_order + # Test with default csq_order. result = get_most_severe_consequence_expr(csq_expr) assert ( hl.eval(result) == "splice_region_variant" ), "Expected 'splice_region_variant'" - # Test with custom csq_order + # Test with custom csq_order. custom_csq_order = ["intron_variant", "splice_region_variant"] result = get_most_severe_consequence_expr(csq_expr, custom_csq_order) assert hl.eval(result) == "intron_variant", "Expected 'intron_variant'" - # Test with csq_expr that contains a consequence not in csq_order + # Test with csq_expr that contains a consequence not in csq_order. csq_expr = hl.literal(["non_existent_consequence"]) result = get_most_severe_consequence_expr(csq_expr) assert hl.eval(result) is None, "Expected None for non-existent consequence" @@ -36,23 +36,23 @@ def test_get_most_severe_consequence_expr(): def test_add_most_severe_consequence_to_consequence(): """Test the add_most_severe_consequence_to_consequence function.""" - # Create a mock StructExpression of transcript consequence + # Create a mock StructExpression of transcript consequence. tc_expr = hl.struct(consequence_terms=["missense_variant", "synonymous_variant"]) - # Test with default csq_order + # Test with default csq_order. result = add_most_severe_consequence_to_consequence(tc_expr) assert hl.eval(result.most_severe_consequence) == hl.eval( get_most_severe_consequence_expr(tc_expr.consequence_terms) ), "Expected 'missense_variant'" - # Test with custom csq_order + # Test with custom csq_order. custom_csq_order = ["synonymous_variant", "missense_variant"] result = add_most_severe_consequence_to_consequence(tc_expr, custom_csq_order) assert hl.eval(result.most_severe_consequence) == hl.eval( get_most_severe_consequence_expr(tc_expr.consequence_terms, custom_csq_order) ), "Expected 'synonymous_variant'" - # Test with csq_expr that contains a consequence not in csq_order + # Test with csq_expr that contains a consequence not in csq_order. tc_expr = hl.struct(consequence_terms=["non_existent_consequence"]) result = add_most_severe_consequence_to_consequence(tc_expr) assert hl.eval(result.most_severe_consequence) == hl.eval( @@ -62,7 +62,7 @@ def test_add_most_severe_consequence_to_consequence(): def test_add_most_severe_csq_to_tc_within_vep_root(): """Test the add_most_severe_csq_to_tc_within_vep_root function.""" - # Create a mock MatrixTable with vep.transcript_consequences + # Create a mock MatrixTable with vep.transcript_consequences. mt = hl.utils.range_matrix_table(1, 1) mt = mt.annotate_rows( vep=hl.struct( @@ -72,36 +72,36 @@ def test_add_most_severe_csq_to_tc_within_vep_root(): ) ) - # Apply the function + # Apply the function. result_mt = add_most_severe_csq_to_tc_within_vep_root(mt) - # Check that the most_severe_consequence field is added + # Check that the most_severe_consequence field is added. assert ( "most_severe_consequence" in result_mt.vep.transcript_consequences[0].keys() ), "Expected 'most_severe_consequence' field" - # Check that the most_severe_consequence is correct + # Check that the most_severe_consequence is correct. assert ( result_mt.vep.transcript_consequences[0].most_severe_consequence.take(1)[0] == "missense_variant" - ) + ), "Expected 'missense_variant'" def test_filter_to_most_severe_consequences(): """Test the filter_to_most_severe_consequences function.""" - # Create a mock array of VEP consequences + # Create a mock array of VEP consequences. mock_csq_expr = hl.literal( [ hl.struct( consequence_terms=["missense_variant", "synonymous_variant"], lof="HC", - lof_flags="HC", + lof_flags="flag1", protein_coding=1, ), hl.struct( consequence_terms=["splice_region_variant", "synonymous_variant"], lof="LC", - lof_flags="HC", + lof_flags="flag1", protein_coding=1, ), hl.struct( @@ -113,15 +113,15 @@ def test_filter_to_most_severe_consequences(): ] ) - # Call the function with the mock data + # Call the function with the mock data. result = filter_to_most_severe_consequences(mock_csq_expr, prioritize_loftee=True) - # Assert that the output is as expected + # Assert that the output is as expected. assert ( hl.eval(result.most_severe_consequence) == "frameshift_variant" ), "Expected 'frameshift_variant'" assert hl.eval(result.lof) == "HC", "Expected 'HC'" - assert hl.eval(result.no_lof_flags) + assert hl.eval(result.no_lof_flags), "Expected True" assert hl.eval(hl.len(result.consequences)) == 1, "Expected 1 consequence" assert hl.eval(result.consequences[0].consequence_terms) == [ "frameshift_variant", @@ -131,7 +131,7 @@ def test_filter_to_most_severe_consequences(): def test_filter_vep_transcript_csqs_expr_loftee(): """Test the filter_vep_consequences_by_loftee function.""" - # Create a mock ArrayExpression of VEP consequences with LOFTEE annotations + # Create a mock ArrayExpression of VEP consequences with LOFTEE annotations. csq_expr = hl.literal( [ hl.struct(lof="HC", lof_flags=hl.missing(hl.tstr)), @@ -142,16 +142,16 @@ def test_filter_vep_transcript_csqs_expr_loftee(): ] ) - # Test with default parameters + # Test with default parameters. result = filter_vep_transcript_csqs_expr(csq_expr) assert hl.eval(hl.len(result)) == 5, "Expected 5 consequences" - # Test with loftee_labels + # Test with loftee_labels. result = filter_vep_transcript_csqs_expr(csq_expr, loftee_labels=["HC"]) assert hl.eval(hl.len(result)) == 2, "Expected 2 consequences" assert hl.eval(result[0].lof) == "HC", "Expected 'HC'" - # Test with no_lof_flags + # Test with no_lof_flags. result = filter_vep_transcript_csqs_expr(csq_expr, no_lof_flags=True) assert hl.eval(hl.len(result)) == 3, "Expected 3 consequences" assert hl.eval( @@ -160,7 +160,7 @@ def test_filter_vep_transcript_csqs_expr_loftee(): ) ), "Expected no LOFTEE flags" - # Test with loftee_labels and no_lof_flags + # Test with loftee_labels and no_lof_flags. result = filter_vep_transcript_csqs_expr( csq_expr, loftee_labels=["HC"], no_lof_flags=True ) @@ -175,62 +175,91 @@ def test_filter_vep_transcript_csqs_expr_loftee(): def test_get_most_severe_csq_from_multiple_csq_lists(): """Test the get_most_severe_csq_from_multiple_csq_lists function.""" - # Create a mock VEP expression + # Create a mock VEP expression. vep_expr = hl.struct( transcript_consequences=[ hl.struct( biotype="protein_coding", lof="HC", lof_flags="flag1", - consequence_terms="missense_variant", + consequence_terms="splice_acceptor_variant", + ), + hl.struct( + biotype="protein_coding", + lof="HC", + lof_flags="flag1", + consequence_terms="splice_acceptor_variant", ), hl.struct( biotype="protein_coding", lof="HC", lof_flags=hl.missing(hl.tstr), - consequence_terms="synonymous_variant", + consequence_terms="stop_lost", ), hl.struct( biotype="protein_coding", lof="HC", lof_flags=hl.missing(hl.tstr), + consequence_terms="stop_gained", + ), + hl.struct( + biotype="protein_coding", + lof=hl.missing(hl.tstr), + lof_flags=hl.missing(hl.tstr), consequence_terms="missense_variant", ), ], intergenic_consequences=[ hl.struct( biotype="intergenic", - lof=hl.missing(hl.tstr), - lof_flags=hl.missing(hl.tstr), consequence_terms="intergenic_variant", ) ], ) - # Test the function + # Test the function. result = get_most_severe_csq_from_multiple_csq_lists(vep_expr) - # Check the most_severe_consequence + # Check the most_severe_consequence. assert ( - hl.eval(result.most_severe_consequence) == "missense_variant" - ), "Expected 'missense_variant'" + hl.eval(result.most_severe_consequence) == "splice_acceptor_variant" + ), "Expected 'splice_acceptor_variant'" + + # Check the protein_coding. + assert hl.eval(result.protein_coding), "Expected True" + + # Check the lof. + assert hl.eval(result.lof) == "HC", "Expected 'HC'" + + # Check the no_lof_flags. + assert hl.eval(result.no_lof_flags), "Expected True" + + # Check the prioritize_loftee_no_flags. + result = get_most_severe_csq_from_multiple_csq_lists( + vep_expr, prioritize_loftee_no_flags=True + ) + + # Check the most_severe_consequence. + assert ( + hl.eval(result.most_severe_consequence) == "stop_gained" + ), "Expected 'stop_gained'" - # Check the protein_coding + # Check the protein_coding. assert hl.eval(result.protein_coding), "Expected True" - # Check the lof + # Check the lof. assert hl.eval(result.lof) == "HC", "Expected 'HC'" - # Check the no_lof_flags + # Check the no_lof_flags. assert hl.eval(result.no_lof_flags), "Expected True" - # Check the transcript_consequences + # Check the transcript_consequences. result = get_most_severe_csq_from_multiple_csq_lists(vep_expr, include_csqs=True) assert ( hl.eval(hl.len(result.transcript_consequences)) == 2 ), "Expected 2 transcript consequences" - # Create a mock VEP expression + # Create a mock VEP expression. vep_expr = hl.struct( transcript_consequences=[ hl.struct( @@ -242,8 +271,8 @@ def test_get_most_severe_csq_from_multiple_csq_lists(): ] ) - # Test the function + # Test the function. result = get_most_severe_csq_from_multiple_csq_lists(vep_expr) - # Check the no_lof_flags + # Check the no_lof_flags. assert hl.eval(result.no_lof_flags) == False, "Expected False" From aa9df9a49cab666fa55e974a3355c75854c8e48c Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 21 Oct 2024 23:30:38 -0600 Subject: [PATCH 27/33] format --- gnomad/utils/vep.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 998f27afd..78df513a0 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -844,7 +844,7 @@ def filter_to_most_severe_consequences( # consequence is missing. curr_expr = hl.or_missing( hl.is_defined(ms_csq_expr), - csq_expr.filter(lambda x: f_func(x[f], ms_csq_expr)) + csq_expr.filter(lambda x: f_func(x[f], ms_csq_expr)), ) # Add the most severe consequence to the result if the field is in the order @@ -983,7 +983,11 @@ def get_most_severe_csq_from_multiple_csq_lists( result = { **({"protein_coding": hl.tbool} if prioritize_protein_coding else {}), **({"lof": hl.tstr} if prioritize_loftee else {}), - **({"no_lof_flags": hl.tbool} if prioritize_loftee or prioritize_protein_coding else {}), + **( + {"no_lof_flags": hl.tbool} + if prioritize_loftee or prioritize_protein_coding + else {} + ), "most_severe_consequence": hl.tstr, } result = hl.struct(**{k: hl.missing(v) for k, v in result.items()}) @@ -1009,7 +1013,9 @@ def get_most_severe_csq_from_multiple_csq_lists( base_args = { "csq_order": csq_order, "loftee_labels": loftee_labels, - "prioritize_protein_coding": True if (prioritize_protein_coding and is_tc) else False, + "prioritize_protein_coding": ( + True if (prioritize_protein_coding and is_tc) else False + ), "prioritize_loftee": True if (prioritize_loftee and is_tc) else False, "prioritize_loftee_no_flags": prioritize_loftee_no_flags, "additional_order_field": add_order[0] if add_order else None, @@ -1189,7 +1195,9 @@ def filter_vep_transcript_csqs_expr( if "lof_flags" in csq_fields: criteria.append(lambda x: hl.is_missing(x.lof_flags) | (x.lof_flags == "")) else: - logger.warning("'lof_flags' not present in consequence struct, no consequences are filtered based on LOFTEE flags") + logger.warning( + "'lof_flags' not present in consequence struct, no consequences are filtered based on LOFTEE flags" + ) if genes is not None: logger.info("Filtering to genes of interest...") genes = hl.literal(genes) From 5ecdeac827b6302ff652ede31bfee26c99d62562 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 21 Oct 2024 23:33:50 -0600 Subject: [PATCH 28/33] Small docstring change --- gnomad/utils/vep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 78df513a0..88db13737 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -932,7 +932,7 @@ def get_most_severe_csq_from_multiple_csq_lists( .. note:: Assumes input Table is annotated with VEP and that VEP annotations have been - filtered to canonical transcripts if wanted. + filtered to canonical or MANE Select transcripts if wanted. If `include_csqs` is True, additional annotations are added for each VEP consequences list in `csq_list_order`, with the consequences that match the most From e1c9ef728b8d0905ca5944d7aad9da82176f6393 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 21 Oct 2024 23:35:59 -0600 Subject: [PATCH 29/33] Add check for is_tc --- gnomad/utils/vep.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 88db13737..8d318d943 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -1017,7 +1017,9 @@ def get_most_severe_csq_from_multiple_csq_lists( True if (prioritize_protein_coding and is_tc) else False ), "prioritize_loftee": True if (prioritize_loftee and is_tc) else False, - "prioritize_loftee_no_flags": prioritize_loftee_no_flags, + "prioritize_loftee_no_flags": ( + True if (prioritize_loftee_no_flags and is_tc) else False + ), "additional_order_field": add_order[0] if add_order else None, "additional_order": add_order[1] if add_order else None, } From fbda89ba40d00e5bda83a395c647dbd0fb990924 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Mon, 21 Oct 2024 23:37:00 -0600 Subject: [PATCH 30/33] Change docstring default to correct value --- gnomad/utils/vep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 8d318d943..a7b34886e 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -966,7 +966,7 @@ def get_most_severe_csq_from_multiple_csq_lists( :param prioritize_protein_coding: Whether to prioritize protein-coding transcripts when determining the worst consequence. Default is True. :param prioritize_loftee: Whether to prioritize consequences with LOFTEE annotations - over those without. Default is False. + over those without. Default is True. :param prioritize_loftee_no_flags: Whether to prioritize LOFTEE annotated consequences with no flags over those with flags. Default is False. :param csq_list_order: Order of VEP consequences lists to be processed. Default is From 8ab6fb4db5bcd6a45daf66d6a49925307c48d7cf Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Tue, 22 Oct 2024 00:25:24 -0600 Subject: [PATCH 31/33] Change to get around pylint error --- gnomad/utils/vep.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index a7b34886e..475d760fa 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -818,11 +818,14 @@ def filter_to_most_severe_consequences( for curr_field in priority_order: order = term_order.get(curr_field) - if order is None and curr_field != "most_severe_consequence": + # Added the below line to get around the pylint error: Unexpected keyword + # argument 'most_severe_consequence' in function call. + f_param = curr_field if curr_field not in order_result_fields else None + if f_param is not None and order is None: # If there is no order specified for the current field, then the field is # used as a parameter to filter_vep_transcript_csqs_expr and if there are # any consequences remaining, the result is set to True. - curr_expr = filter_vep_transcript_csqs_expr(curr_expr, **{curr_field: True}) + curr_expr = filter_vep_transcript_csqs_expr(curr_expr, **{f_param: True}) result[curr_field] = hl.len(curr_expr) > 0 else: # Handle the case where the current field is a collection of consequences From 141fecf9e55f38c17be9a9d036b1c2fbd7995ffa Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Wed, 23 Oct 2024 22:31:04 -0600 Subject: [PATCH 32/33] Clean up test_vep.py to add additional tests and improve the structure of the tests --- tests/utils/test_vep.py | 797 ++++++++++++++++++++++++++++------------ 1 file changed, 561 insertions(+), 236 deletions(-) diff --git a/tests/utils/test_vep.py b/tests/utils/test_vep.py index ef4ee2c8b..d9e76b6dc 100644 --- a/tests/utils/test_vep.py +++ b/tests/utils/test_vep.py @@ -1,6 +1,9 @@ """Tests for the gnomAD VEP utility functions.""" +from typing import Any, Dict, List, Optional + import hail as hl +import pytest from gnomad.utils.vep import ( add_most_severe_consequence_to_consequence, @@ -12,267 +15,589 @@ ) -def test_get_most_severe_consequence_expr(): +class TestGetMostSevereConsequenceExpr: """Test the get_most_severe_consequence_expr function.""" - # Create a mock ArrayExpression of consequences. - csq_expr = hl.literal(["splice_region_variant", "intron_variant"]) - - # Test with default csq_order. - result = get_most_severe_consequence_expr(csq_expr) - assert ( - hl.eval(result) == "splice_region_variant" - ), "Expected 'splice_region_variant'" - - # Test with custom csq_order. - custom_csq_order = ["intron_variant", "splice_region_variant"] - result = get_most_severe_consequence_expr(csq_expr, custom_csq_order) - assert hl.eval(result) == "intron_variant", "Expected 'intron_variant'" - - # Test with csq_expr that contains a consequence not in csq_order. - csq_expr = hl.literal(["non_existent_consequence"]) - result = get_most_severe_consequence_expr(csq_expr) - assert hl.eval(result) is None, "Expected None for non-existent consequence" - - -def test_add_most_severe_consequence_to_consequence(): - """Test the add_most_severe_consequence_to_consequence function.""" - # Create a mock StructExpression of transcript consequence. - tc_expr = hl.struct(consequence_terms=["missense_variant", "synonymous_variant"]) - - # Test with default csq_order. - result = add_most_severe_consequence_to_consequence(tc_expr) - assert hl.eval(result.most_severe_consequence) == hl.eval( - get_most_severe_consequence_expr(tc_expr.consequence_terms) - ), "Expected 'missense_variant'" - - # Test with custom csq_order. - custom_csq_order = ["synonymous_variant", "missense_variant"] - result = add_most_severe_consequence_to_consequence(tc_expr, custom_csq_order) - assert hl.eval(result.most_severe_consequence) == hl.eval( - get_most_severe_consequence_expr(tc_expr.consequence_terms, custom_csq_order) - ), "Expected 'synonymous_variant'" - - # Test with csq_expr that contains a consequence not in csq_order. - tc_expr = hl.struct(consequence_terms=["non_existent_consequence"]) - result = add_most_severe_consequence_to_consequence(tc_expr) - assert hl.eval(result.most_severe_consequence) == hl.eval( - get_most_severe_consequence_expr(tc_expr.consequence_terms) - ), "Expected None for non-existent consequence" - - -def test_add_most_severe_csq_to_tc_within_vep_root(): - """Test the add_most_severe_csq_to_tc_within_vep_root function.""" - # Create a mock MatrixTable with vep.transcript_consequences. - mt = hl.utils.range_matrix_table(1, 1) - mt = mt.annotate_rows( - vep=hl.struct( - transcript_consequences=[ - hl.struct(consequence_terms=["missense_variant", "synonymous_variant"]) - ] - ) - ) - - # Apply the function. - result_mt = add_most_severe_csq_to_tc_within_vep_root(mt) - # Check that the most_severe_consequence field is added. - assert ( - "most_severe_consequence" in result_mt.vep.transcript_consequences[0].keys() - ), "Expected 'most_severe_consequence' field" + @pytest.mark.parametrize( + "csq_expr, custom_csq_order, expected", + [ + # Test with default csq_order. + ( + hl.literal(["splice_region_variant", "intron_variant"]), + None, + "splice_region_variant", + ), + # Test with custom csq_order. + ( + hl.literal(["splice_region_variant", "intron_variant"]), + ["intron_variant", "splice_region_variant"], + "intron_variant", + ), + # Test with csq_expr that contains a consequence not in csq_order. + (hl.literal(["non_existent_consequence"]), None, None), + # Test with empty csq_expr. + (hl.empty_array(hl.tstr), None, None), + ], + ) + def test_get_most_severe_consequence_expr( + self, + csq_expr: hl.expr.ArrayExpression, + custom_csq_order: List[str], + expected: str, + ) -> None: + """ + Test get_most_severe_consequence_expr with various parameters. + + :param csq_expr: The consequence terms to evaluate. + :param custom_csq_order: The custom consequence order to use. + :param expected: The expected most severe consequence. + :return: None. + """ + result = get_most_severe_consequence_expr(csq_expr, custom_csq_order) + assert hl.eval(result) == expected, f"Expected {expected}" + + +class TestAddMostSevereConsequenceToConsequence: + """Tests for the add_most_severe_consequence_to_consequence function.""" + + @pytest.mark.parametrize( + "tc_expr, custom_csq_order, expected", + [ + # Test with default csq_order. + ( + hl.struct(consequence_terms=["missense_variant", "synonymous_variant"]), + None, + "missense_variant", + ), + # Test with custom csq_order. + ( + hl.struct(consequence_terms=["missense_variant", "synonymous_variant"]), + ["synonymous_variant", "missense_variant"], + "synonymous_variant", + ), + # Test with csq_expr that contains a consequence not in csq_order. + (hl.struct(consequence_terms=["non_existent_consequence"]), None, None), + ], + ) + def test_add_most_severe_consequence_to_consequence( + self, + tc_expr: hl.expr.StructExpression, + custom_csq_order: List[str], + expected: str, + ) -> None: + """ + Test add_most_severe_consequence_to_consequence with various parameters. + + :param tc_expr: The transcript consequence to evaluate. + :param custom_csq_order: The custom consequence order to use. + :param expected: The expected most severe consequence. + :return: None. + """ + result = add_most_severe_consequence_to_consequence(tc_expr, custom_csq_order) + assert hl.eval(result.most_severe_consequence) == hl.eval( + get_most_severe_consequence_expr( + tc_expr.consequence_terms, custom_csq_order + ) + ), f"Expected {expected}" - # Check that the most_severe_consequence is correct. - assert ( - result_mt.vep.transcript_consequences[0].most_severe_consequence.take(1)[0] - == "missense_variant" - ), "Expected 'missense_variant'" +class TestAddMostSevereCsqToTcWithinVepRoot: + """Tests for the add_most_severe_csq_to_tc_within_vep_root function.""" -def test_filter_to_most_severe_consequences(): - """Test the filter_to_most_severe_consequences function.""" - # Create a mock array of VEP consequences. - mock_csq_expr = hl.literal( + @pytest.mark.parametrize( + "transcript_consequences, expected_most_severe", [ - hl.struct( - consequence_terms=["missense_variant", "synonymous_variant"], - lof="HC", - lof_flags="flag1", - protein_coding=1, + # Test with multiple consequence terms of different severity. + ( + [ + hl.struct( + consequence_terms=["missense_variant", "synonymous_variant"] + ) + ], + "missense_variant", ), - hl.struct( - consequence_terms=["splice_region_variant", "synonymous_variant"], - lof="LC", - lof_flags="flag1", - protein_coding=1, + # Test with a single consequence term. + ( + [hl.struct(consequence_terms=["synonymous_variant"])], + "synonymous_variant", ), - hl.struct( - consequence_terms=["frameshift_variant", "synonymous_variant"], - lof="HC", - lof_flags="", - protein_coding=1, + # Test with multiple consequence terms of the same severity. + ( + [ + hl.struct( + consequence_terms=["synonymous_variant", "synonymous_variant"] + ) + ], + "synonymous_variant", ), - ] + # Test with a consequence term not in the default order. + ([hl.struct(consequence_terms=["non_existent_consequence"])], None), + ], ) + def test_add_most_severe_csq_to_tc_within_vep_root( + self, + transcript_consequences: List[hl.expr.StructExpression], + expected_most_severe: str, + ) -> None: + """ + Test the add_most_severe_csq_to_tc_within_vep_root function. + + :param transcript_consequences: List of transcript consequences to annotate. + :param expected_most_severe: The expected most severe consequence. + :return: None. + """ + # Create a mock MatrixTable with vep.transcript_consequences. + mt = hl.utils.range_matrix_table(1, 1) + mt = mt.annotate_rows( + vep=hl.struct(transcript_consequences=transcript_consequences) + ) - # Call the function with the mock data. - result = filter_to_most_severe_consequences(mock_csq_expr, prioritize_loftee=True) - - # Assert that the output is as expected. - assert ( - hl.eval(result.most_severe_consequence) == "frameshift_variant" - ), "Expected 'frameshift_variant'" - assert hl.eval(result.lof) == "HC", "Expected 'HC'" - assert hl.eval(result.no_lof_flags), "Expected True" - assert hl.eval(hl.len(result.consequences)) == 1, "Expected 1 consequence" - assert hl.eval(result.consequences[0].consequence_terms) == [ - "frameshift_variant", - "synonymous_variant", - ], "Expected ['frameshift_variant', 'synonymous_variant']" - - -def test_filter_vep_transcript_csqs_expr_loftee(): - """Test the filter_vep_consequences_by_loftee function.""" - # Create a mock ArrayExpression of VEP consequences with LOFTEE annotations. - csq_expr = hl.literal( - [ - hl.struct(lof="HC", lof_flags=hl.missing(hl.tstr)), - hl.struct(lof="HC", lof_flags=""), - hl.struct(lof="LC", lof_flags="flag1"), - hl.struct(lof="OS", lof_flags=hl.missing(hl.tstr)), - hl.struct(lof=hl.missing(hl.tstr), lof_flags="flag2"), + # Apply the function. + result_mt = add_most_severe_csq_to_tc_within_vep_root(mt) + + # Check that the most_severe_consequence is correct. + assert ( + result_mt.vep.transcript_consequences[0].most_severe_consequence.take(1)[0] + == expected_most_severe + ), f"Expected '{expected_most_severe}'" + + +class TestFilterToMostSevereConsequences: + """Tests for the filter_to_most_severe_consequences function.""" + + @pytest.fixture + def mock_csq_expr(self) -> hl.expr.ArrayExpression: + """Fixture to create a mock array of VEP consequences.""" + + def _build_csq_struct( + csq: str, + protein_coding: bool, + lof: str, + no_lof_flags: bool, + polyphen_prediction: Optional[str] = None, + ) -> hl.Struct: + """ + Build a mock VEP consequence struct. + + :param csq: The consequence term. + :param protein_coding: Whether the consequence is protein coding. + :param lof: The LOF value. + :param no_lof_flags: Whether the consequence has no LOF flags. + :param polyphen_prediction: The PolyPhen prediction. + :return: The mock VEP consequence struct. + """ + return hl.Struct( + biotype="protein_coding" if protein_coding else "not_protein_coding", + lof=lof, + lof_flags=hl.missing(hl.tstr) if no_lof_flags else "flag1", + consequence_terms=[csq], + polyphen_prediction=( + hl.missing(hl.tstr) + if polyphen_prediction is None + else polyphen_prediction + ), + ) + + struct_order = [ + ["stop_gained", True, "HC", True, None], + ["stop_lost", True, "HC", True, None], + ["splice_acceptor_variant", True, "HC", False, "benign"], + ["splice_acceptor_variant", True, "HC", False, "possibly_damaging"], + ["splice_acceptor_variant", True, "LC", True, None], + ["splice_acceptor_variant", True, "LC", True, "possibly_damaging"], + ["splice_acceptor_variant", True, "LC", False, "possibly_damaging"], + ["stop_gained", False, "HC", True, None], + ["stop_gained", False, "HC", True, "probably_damaging"], + ["splice_acceptor_variant", False, "HC", False, "possibly_damaging"], + ["splice_acceptor_variant", False, "LC", True, "probably_damaging"], + ["splice_acceptor_variant", False, "LC", True, None], + ["splice_acceptor_variant", False, "LC", True, "possibly_damaging"], + ["splice_acceptor_variant", False, "LC", False, "probably_damaging"], ] + + return hl.literal([_build_csq_struct(*p) for p in struct_order]) + + polyphen_order = ["probably_damaging", "possibly_damaging", "benign"] + polyphen_params = ["polyphen_prediction", polyphen_order] + + @pytest.mark.parametrize( + "prioritize_protein_coding, prioritize_loftee, prioritize_loftee_no_flags, additional_order_field, additional_order, expected_most_severe_csq, expected_polyphen_prediction", + [ + (False, False, False, None, None, None, None), + (True, False, False, None, None, None, None), + (False, True, False, None, None, None, None), + (False, False, True, None, None, None, None), + (False, False, False, *polyphen_params, None, None), + (True, True, False, None, None, None, None), + (True, False, True, None, None, None, None), + (True, False, False, *polyphen_params, None, "possibly_damaging"), + (False, True, True, None, None, "stop_gained", None), + (False, True, False, *polyphen_params, None, "possibly_damaging"), + (False, False, True, *polyphen_params, None, None), + (True, True, True, None, None, "stop_gained", None), + (True, True, False, *polyphen_params, None, "possibly_damaging"), + (True, False, True, *polyphen_params, None, "possibly_damaging"), + (False, True, True, *polyphen_params, "stop_gained", None), + # Need to figure out class too large error + (True, True, True, *polyphen_params, "stop_gained", "possibly_damaging"), + ], ) + def test_filter_to_most_severe_consequences( + self, + mock_csq_expr: hl.expr.ArrayExpression, + prioritize_protein_coding: bool, + prioritize_loftee: bool, + prioritize_loftee_no_flags: bool, + additional_order_field: str, + additional_order: List[str], + expected_most_severe_csq: str, + expected_polyphen_prediction: str, + ) -> None: + """ + Test the filter_to_most_severe_consequences function. + + :param mock_csq_expr: The mock VEP consequence expression. + :param prioritize_protein_coding: Whether to prioritize protein coding. + :param prioritize_loftee: Whether to prioritize LOFTEE. + :param prioritize_loftee_no_flags: Whether to prioritize LOFTEE no flags. + :param additional_order_field: The additional order field to use. + :param additional_order: The additional order to use. + :param expected_most_severe_csq: The expected most severe consequence. + :param expected_polyphen_prediction: The expected PolyPhen prediction. + :return: None. + """ + result = filter_to_most_severe_consequences( + mock_csq_expr, + prioritize_protein_coding=prioritize_protein_coding, + prioritize_loftee=prioritize_loftee, + prioritize_loftee_no_flags=prioritize_loftee_no_flags, + additional_order_field=additional_order_field, + additional_order=additional_order, + ) - # Test with default parameters. - result = filter_vep_transcript_csqs_expr(csq_expr) - assert hl.eval(hl.len(result)) == 5, "Expected 5 consequences" - - # Test with loftee_labels. - result = filter_vep_transcript_csqs_expr(csq_expr, loftee_labels=["HC"]) - assert hl.eval(hl.len(result)) == 2, "Expected 2 consequences" - assert hl.eval(result[0].lof) == "HC", "Expected 'HC'" - - # Test with no_lof_flags. - result = filter_vep_transcript_csqs_expr(csq_expr, no_lof_flags=True) - assert hl.eval(hl.len(result)) == 3, "Expected 3 consequences" - assert hl.eval( - hl.all( - result.map(lambda csq: hl.is_missing(csq.lof_flags) | (csq.lof_flags == "")) + expected_dict = hl.Struct( + protein_coding=True, + lof="HC", + most_severe_consequence="splice_acceptor_variant", + no_lof_flags=True, ) - ), "Expected no LOFTEE flags" - # Test with loftee_labels and no_lof_flags. - result = filter_vep_transcript_csqs_expr( - csq_expr, loftee_labels=["HC"], no_lof_flags=True - ) - assert hl.eval(hl.len(result)) == 2, "Expected 2 consequence" - assert hl.eval(result[0].lof) == "HC", "Expected 'HC'" - assert hl.eval( - hl.all( - result.map(lambda csq: hl.is_missing(csq.lof_flags) | (csq.lof_flags == "")) + def _get_csq_structs( + csq: str, + protein_coding: Optional[bool] = None, + lof: Optional[str] = None, + no_lof_flags: Optional[bool] = None, + polyphen_prediction: Optional[str] = None, + ) -> List[Dict[str, Any]]: + """ + Get the expected consequence structs. + + :param csq: The consequence term to filter by. + :param protein_coding: Whether to filter by protein coding. + :param lof: The LOF value to filter by. + :param no_lof_flags: Whether to filter by no LOF flags. + :param polyphen_prediction: The PolyPhen prediction to filter by. + :return: The expected consequence structs. + """ + + def _get_csq_criteria(s): + keep = s.consequence_terms.contains(csq) + if protein_coding: + keep &= s.biotype == "protein_coding" + if lof: + keep &= s.lof == lof + if no_lof_flags: + keep &= hl.is_missing(s.lof_flags) + if polyphen_prediction: + keep &= s.polyphen_prediction == polyphen_prediction + + return keep + + return hl.eval(mock_csq_expr.filter(_get_csq_criteria)) + + expected_polyphen_prediction = ( + expected_polyphen_prediction or "probably_damaging" ) - ), "Expected no LOFTEE flags" - - -def test_get_most_severe_csq_from_multiple_csq_lists(): - """Test the get_most_severe_csq_from_multiple_csq_lists function.""" - # Create a mock VEP expression. - vep_expr = hl.struct( - transcript_consequences=[ - hl.struct( - biotype="protein_coding", - lof="HC", - lof_flags="flag1", - consequence_terms="splice_acceptor_variant", - ), - hl.struct( - biotype="protein_coding", - lof="HC", - lof_flags="flag1", - consequence_terms="splice_acceptor_variant", - ), - hl.struct( - biotype="protein_coding", - lof="HC", - lof_flags=hl.missing(hl.tstr), - consequence_terms="stop_lost", - ), - hl.struct( - biotype="protein_coding", - lof="HC", - lof_flags=hl.missing(hl.tstr), - consequence_terms="stop_gained", - ), - hl.struct( - biotype="protein_coding", - lof=hl.missing(hl.tstr), - lof_flags=hl.missing(hl.tstr), - consequence_terms="missense_variant", + expected_most_severe_csq = expected_most_severe_csq or "splice_acceptor_variant" + add_ms = expected_most_severe_csq != "splice_acceptor_variant" + + expected_select = ( + (["protein_coding"] if prioritize_protein_coding else []) + + (["lof"] if prioritize_loftee else []) + + ( + ["no_lof_flags"] + if prioritize_loftee_no_flags or prioritize_loftee + else [] + ) + + (["most_severe_consequence"] if not add_ms else []) + ) + expected = expected_dict.select( + *expected_select, + **({"most_severe_consequence": expected_most_severe_csq} if add_ms else {}), + consequences=_get_csq_structs( + expected_most_severe_csq, + protein_coding=prioritize_protein_coding, + lof="HC" if prioritize_loftee else None, + no_lof_flags=prioritize_loftee_no_flags, + polyphen_prediction=( + expected_polyphen_prediction + if additional_order_field == "polyphen_prediction" + else None + ), ), - ], - intergenic_consequences=[ - hl.struct( - biotype="intergenic", - consequence_terms="intergenic_variant", + ) + assert hl.eval(result) == expected, f"Expected '{expected}'" + + +class TestFilterVepTranscriptCsqsExprLoftee: + """Tests for the filter_vep_transcript_csqs_expr function.""" + + @pytest.fixture + def csq_expr(self) -> hl.expr.ArrayExpression: + """Fixture to create a mock array of VEP consequences with LOFTEE annotations.""" + return hl.literal( + [ + hl.struct(lof="HC", lof_flags=hl.missing(hl.tstr)), + hl.struct(lof="HC", lof_flags=""), + hl.struct(lof="LC", lof_flags="flag1"), + hl.struct(lof="OS", lof_flags=hl.missing(hl.tstr)), + hl.struct(lof=hl.missing(hl.tstr), lof_flags="flag2"), + ] + ) + + @staticmethod + def check_length(result: hl.expr.ArrayExpression, expected_len: int) -> None: + """ + Check the length of the result. + + :param result: The result to check. + :param expected_len: The expected length. + :return: None. + """ + assert ( + hl.eval(hl.len(result)) == expected_len + ), f"Expected {expected_len} consequences" + + @staticmethod + def check_lof(result: hl.expr.ArrayExpression, expected_lof: str) -> None: + """ + Check the LOF annotation. + + :param result: The result to check. + :param expected_lof: The expected LOF annotation. + :return: None. + """ + assert hl.eval(result[0].lof) == expected_lof, f"Expected '{expected_lof}'" + + @staticmethod + def check_no_lof_flags(result: hl.expr.ArrayExpression) -> None: + """ + Check the no LOF flags value. + + :param result: The result to check. + :return: None. + """ + assert hl.eval( + hl.all( + result.map( + lambda csq: hl.is_missing(csq.lof_flags) | (csq.lof_flags == "") + ) ) + ), "Expected no LOFTEE flags" + + @pytest.mark.parametrize( + "loftee_labels, no_lof_flags, expected_len, expected_lof, expected_no_lof_flags", + [ + (None, None, 5, None, None), + (["HC"], None, 2, "HC", None), + (None, True, 3, None, True), + (["HC"], True, 2, "HC", True), ], ) + def test_filter_vep_transcript_csqs_expr_loftee( + self, + csq_expr: hl.expr.ArrayExpression, + loftee_labels: list, + no_lof_flags: bool, + expected_len: int, + expected_lof: str, + expected_no_lof_flags: bool, + ) -> None: + """ + Test the filter_vep_transcript_csqs_expr function. + + :param csq_expr: The VEP consequence expression to filter. + :param loftee_labels: The LOFTEE labels to filter by. + :param no_lof_flags: Whether to filter by no LOF flags. + :param expected_len: The expected length of the result. + :param expected_lof: The expected LOF value. + :param expected_no_lof_flags: The expected no LOF flags value. + :return: None. + """ + result = filter_vep_transcript_csqs_expr( + csq_expr, loftee_labels=loftee_labels, no_lof_flags=no_lof_flags + ) + self.check_length(result, expected_len) + if expected_lof is not None: + self.check_lof(result, expected_lof) + if expected_no_lof_flags is not None: + self.check_no_lof_flags(result) - # Test the function. - result = get_most_severe_csq_from_multiple_csq_lists(vep_expr) - - # Check the most_severe_consequence. - assert ( - hl.eval(result.most_severe_consequence) == "splice_acceptor_variant" - ), "Expected 'splice_acceptor_variant'" - - # Check the protein_coding. - assert hl.eval(result.protein_coding), "Expected True" - # Check the lof. - assert hl.eval(result.lof) == "HC", "Expected 'HC'" +class TestGetMostSevereCsqFromMultipleCsqLists: + """Tests for the get_most_severe_csq_from_multiple_csq_lists function.""" - # Check the no_lof_flags. - assert hl.eval(result.no_lof_flags), "Expected True" + @pytest.fixture + def vep_expr(self) -> hl.expr.StructExpression: + """Fixture to create a mock VEP expression.""" + return hl.struct( + transcript_consequences=[ + hl.struct( + biotype="protein_coding", + lof="HC", + lof_flags="flag1", + consequence_terms=["splice_acceptor_variant"], + ), + hl.struct( + biotype="protein_coding", + lof="HC", + lof_flags="flag1", + consequence_terms=["splice_acceptor_variant"], + ), + hl.struct( + biotype="protein_coding", + lof="HC", + lof_flags=hl.missing(hl.tstr), + consequence_terms=["stop_lost"], + ), + hl.struct( + biotype="protein_coding", + lof="HC", + lof_flags=hl.missing(hl.tstr), + consequence_terms=["stop_gained"], + ), + hl.struct( + biotype="protein_coding", + lof=hl.missing(hl.tstr), + lof_flags=hl.missing(hl.tstr), + consequence_terms=["missense_variant"], + ), + ], + intergenic_consequences=[ + hl.struct( + biotype="intergenic", + consequence_terms=["intergenic_variant"], + ) + ], + ) - # Check the prioritize_loftee_no_flags. - result = get_most_severe_csq_from_multiple_csq_lists( - vep_expr, prioritize_loftee_no_flags=True + @pytest.mark.parametrize( + "prioritize_loftee_no_flags, include_csqs, expected_most_severe, expected_protein_coding, expected_lof, expected_no_lof_flags, expected_transcript_consequences_len", + [ + (False, False, "splice_acceptor_variant", True, "HC", True, None), + (True, False, "stop_gained", True, "HC", True, None), + (False, True, "splice_acceptor_variant", True, "HC", True, 2), + (True, True, "stop_gained", True, "HC", True, 1), + ], ) - - # Check the most_severe_consequence. - assert ( - hl.eval(result.most_severe_consequence) == "stop_gained" - ), "Expected 'stop_gained'" - - # Check the protein_coding. - assert hl.eval(result.protein_coding), "Expected True" - - # Check the lof. - assert hl.eval(result.lof) == "HC", "Expected 'HC'" - - # Check the no_lof_flags. - assert hl.eval(result.no_lof_flags), "Expected True" - - # Check the transcript_consequences. - result = get_most_severe_csq_from_multiple_csq_lists(vep_expr, include_csqs=True) - assert ( - hl.eval(hl.len(result.transcript_consequences)) == 2 - ), "Expected 2 transcript consequences" - - # Create a mock VEP expression. - vep_expr = hl.struct( - transcript_consequences=[ - hl.struct( - biotype="protein_coding", - lof="HC", - lof_flags="flag1", - consequence_terms="missense_variant", + def test_get_most_severe_csq_from_multiple_csq_lists( + self, + vep_expr: hl.expr.StructExpression, + prioritize_loftee_no_flags: bool, + include_csqs: bool, + expected_most_severe: str, + expected_protein_coding: bool, + expected_lof: str, + expected_no_lof_flags: bool, + expected_transcript_consequences_len: Optional[int], + ) -> None: + """ + Test the get_most_severe_csq_from_multiple_csq_lists function. + + :param vep_expr: The VEP expression to test. + :param prioritize_loftee_no_flags: Whether to prioritize LOFTEE no flags. + :param include_csqs: Whether to include consequences. + :param expected_most_severe: The expected most severe consequence. + :param expected_protein_coding: The expected protein coding value. + :param expected_lof: The expected LOF value. + :param expected_no_lof_flags: The expected no LOF flags value. + :param expected_transcript_consequences_len: The expected length of transcript + consequences. + :return: None. + """ + result = get_most_severe_csq_from_multiple_csq_lists( + vep_expr, + prioritize_loftee_no_flags=prioritize_loftee_no_flags, + include_csqs=include_csqs, + ) + self.check_most_severe_consequence(result, expected_most_severe) + self.check_protein_coding(result, expected_protein_coding) + self.check_lof(result, expected_lof) + self.check_no_lof_flags(result, expected_no_lof_flags) + if include_csqs: + self.check_transcript_consequences_len( + result, expected_transcript_consequences_len ) - ] - ) - - # Test the function. - result = get_most_severe_csq_from_multiple_csq_lists(vep_expr) - # Check the no_lof_flags. - assert hl.eval(result.no_lof_flags) == False, "Expected False" + @staticmethod + def check_most_severe_consequence( + result: hl.expr.StructExpression, expected: str + ) -> None: + """ + Check the most severe consequence. + + :param result: The result to check. + :param expected: The expected most severe consequence. + :return: None. + """ + assert ( + hl.eval(result.most_severe_consequence) == expected + ), f"Expected '{expected}'" + + @staticmethod + def check_protein_coding(result: hl.expr.StructExpression, expected: bool) -> None: + """ + Check the protein coding value. + + :param result: The result to check. + :param expected: The expected protein coding value. + :return: None. + """ + assert hl.eval(result.protein_coding) == expected, f"Expected '{expected}'" + + @staticmethod + def check_lof(result: hl.expr.StructExpression, expected: str) -> None: + """ + Check the LOF value. + + :param result: The result to check. + :param expected: The expected LOF value. + :return: None. + """ + assert hl.eval(result.lof) == expected, f"Expected '{expected}'" + + @staticmethod + def check_no_lof_flags(result: hl.expr.StructExpression, expected: bool) -> None: + """ + Check the no LOF flags value. + + :param result: The result to check. + :param expected: The expected no LOF flags value. + :return: None. + """ + assert hl.eval(result.no_lof_flags) == expected, f"Expected '{expected}'" + + @staticmethod + def check_transcript_consequences_len( + result: hl.expr.StructExpression, expected: int + ) -> None: + """ + Check the length of transcript consequences. + + :param result: The result to check. + :param expected: The expected length. + :return: None. + """ + assert ( + hl.eval(hl.len(result.transcript_consequences)) == expected + ), f"Expected {expected} transcript consequences" From 8e2562167826708217d054fbbc1f0044e48870eb Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Wed, 23 Oct 2024 22:31:36 -0600 Subject: [PATCH 33/33] Fix incorrect return types --- gnomad/utils/vep.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 475d760fa..f6c994e1d 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -922,11 +922,11 @@ def get_most_severe_csq_from_multiple_csq_lists( "intergenic_consequences", ), add_order_by_csq_list: Dict[str, Tuple[str, List[str]]] = None, -) -> hl.Table: +) -> hl.expr.StructExpression: """ Process multiple VEP consequences lists to determine the most severe consequence. - Adds the following annotations: + Returns a struct expression with the following annotations: - most_severe_consequence: Most severe consequence for variant. - protein_coding: Whether the variant is present on a protein-coding transcript. - lof: Whether the variant is a loss-of-function variant. @@ -978,7 +978,7 @@ def get_most_severe_csq_from_multiple_csq_lists( :param add_order_by_csq_list: Dictionary of additional ordering for VEP consequences lists. The key is the name of the consequences list and the value is the order of consequences, sorted from high to low impact. Default is None. - :return: Table annotated with VEP summary annotations. + :return: StructExpression with the most severe consequence and additional annotations. """ add_order_by_csq_list = add_order_by_csq_list or {} loftee_labels = loftee_labels or LOFTEE_LABELS @@ -1121,7 +1121,7 @@ def filter_vep_transcript_csqs_expr( keep_genes: bool = True, match_by_gene_symbol: bool = False, additional_filtering_criteria: Optional[List[Callable]] = None, -) -> Union[hl.Table, hl.MatrixTable]: +) -> hl.expr.ArrayExpression: """ Filter VEP transcript consequences based on specified criteria, and optionally filter to variants where transcript consequences is not empty after filtering.