Skip to content

Commit

Permalink
feat!: Change breakpoint validation function (#397)
Browse files Browse the repository at this point in the history
  • Loading branch information
jarbesfeld authored Jan 27, 2025
1 parent a43932f commit ef342d4
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 21 deletions.
34 changes: 14 additions & 20 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -865,14 +865,14 @@ async def _genomic_to_tx_segment(
if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE:
genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue

# Validate that the breakpoint occurs on a transcript given a gene
coordinate_check = await self._validate_gene_coordinates(
pos=genomic_pos, genomic_ac=genomic_ac, gene=gene
# Validate that the breakpoint occurs between the first and last exon for the selected transcript
coordinate_check = await self._validate_genomic_breakpoint(
pos=genomic_pos, genomic_ac=genomic_ac, tx_ac=transcript
)
if not coordinate_check:
return GenomicTxSeg(
errors=[
f"{genomic_pos} on {genomic_ac} does not occur within the exons for {gene}"
f"{genomic_pos} on {genomic_ac} does not occur within the exons for {transcript}"
]
)

Expand Down Expand Up @@ -943,38 +943,32 @@ async def _get_grch38_pos(
)
return liftover_data[1] if liftover_data else None

async def _validate_gene_coordinates(
async def _validate_genomic_breakpoint(
self,
pos: int,
genomic_ac: str,
gene: str,
tx_ac: str,
) -> bool:
"""Validate that a genomic coordinate falls within the first and last exon
given a gene and accession
for a transcript on a given accession
:param pos: Genomic position on ``genomic_ac``
:param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
:param gene: A valid, case-sensitive HGNC gene symbol
:param tx_ac: A transcript accession, e.g. ``"NM_152263.3"``
:return: ``True`` if the coordinate falls within the first and last exon
for the gene, ``False`` if not
for the transcript, ``False`` if not
"""
query = f"""
WITH tx_boundaries AS (
SELECT
tx_ac,
hgnc,
MIN(alt_start_i) as min_start,
MAX(alt_end_i) as max_end
SELECT
MIN(alt_start_i) AS min_start,
MAX(alt_end_i) AS max_end
FROM {self.uta_db.schema}.tx_exon_aln_v
WHERE hgnc = '{gene}'
WHERE tx_ac = '{tx_ac}'
AND alt_ac = '{genomic_ac}'
GROUP BY tx_ac, hgnc
)
SELECT DISTINCT hgnc
FROM tx_boundaries
SELECT * FROM tx_boundaries
WHERE {pos} between tx_boundaries.min_start and tx_boundaries.max_end
ORDER BY hgnc
LIMIT 1;
""" # noqa: S608
results = await self.uta_db.execute_query(query)
return bool(results)
Expand Down
2 changes: 1 addition & 1 deletion tests/mappers/test_exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -1516,7 +1516,7 @@ async def test_invalid(test_egc_mapper):
)
genomic_tx_seg_service_checks(resp, is_valid=False)
assert resp.errors == [
"9999999999998 on NC_000001.11 does not occur within the exons for TPM3"
"9999999999998 on NC_000001.11 does not occur within the exons for NM_152263.3"
]

# Must supply either gene or transcript
Expand Down

0 comments on commit ef342d4

Please sign in to comment.