Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Uncertain ranges #699

Open
wants to merge 31 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
5054739
Support indefinite and definite uncertain ranges, and unbalanced unce…
theferrit32 Sep 20, 2023
1f624ad
Rebuild hgvs grammar due to whitespace changes
theferrit32 Sep 20, 2023
2bcccc2
Merge branch 'main' into 225-uncertain-ranges
andreasprlic Dec 11, 2023
722b142
Merge branch '712-fix-dev-install' into 225-uncertain-ranges
andreasprlic Dec 11, 2023
c15c552
feat(test): minor addition to test to make sure the breakpoints are u…
andreasprlic Dec 11, 2023
8a4341a
feat(g_to_c): this adds support for g_to_c of uncertain coordinates. …
andreasprlic Dec 12, 2023
1983654
Merge branch 'main' into 225-uncertain-ranges
andreasprlic Dec 12, 2023
2e4ae22
update CODEOWNERS to @biocommons/maintainers
reece Jan 19, 2024
82d7331
use shared stale action configuration
reece Jan 31, 2024
a045165
import and standardize issue templates from biocommons.example
Jan 31, 2024
43a556c
add standardized github labels and update action
Jan 31, 2024
3ea0c5c
expose clearer name for label sync action
Jan 31, 2024
1e2a433
remove .github/ISSUE_TEMPLATE (in order to use templates in .github r…
Feb 1, 2024
0c4c5d7
migrate to endbug/label-sync with biocommons-wide label config
Feb 1, 2024
4a44582
#688 - remove __future__ usage
davmlaw Feb 7, 2024
6056847
#695 - Remove top level code environment scripts from modules
davmlaw Feb 9, 2024
cce1c6f
update stale action to use stale.yml from worfklow-template
Feb 13, 2024
261245a
feat(imprecise hgvs_c): adding support to create imprecise hgvs_c s.
andreasprlic Feb 20, 2024
52adbe9
Merge branch 'main' into 225-uncertain-ranges
andreasprlic Feb 20, 2024
609244a
feat(imprecise g_to_c): g_to_c now works. c_to_g not working yet. Req…
andreasprlic Feb 21, 2024
0364f37
feat(cleanup): removing broken c_to_g unit test for imprecise events.
andreasprlic Feb 24, 2024
4c7ecbf
Simplify parser to use None for single-digit intervals. Fixed format …
theferrit32 Mar 11, 2024
cec05d3
Merge branch 'main' into 225-uncertain-ranges
andreasprlic Dec 24, 2024
f591a1a
Merge branch 'main' into 225-uncertain-ranges
andreasprlic Jan 20, 2025
8ddecd1
updating test cache
andreasprlic Jan 20, 2025
99e0060
adding a unit test for the examples from #225
andreasprlic Jan 20, 2025
053f686
installing pytest now for CI
andreasprlic Jan 20, 2025
f370655
installing pytest-cov for CI
andreasprlic Jan 21, 2025
b6bf4b6
updating test cache
andreasprlic Jan 21, 2025
1a857f9
more meddling with the CI
andreasprlic Jan 21, 2025
cd949db
fixing grammar based on PR feedback.
andreasprlic Jan 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 14 additions & 8 deletions src/hgvs/_data/hgvs.pymeta
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# variant specification. The subset is limited to those
# rules that define sequence variants precisely. It does not currently
# cover rules for translocations or conversions.

# The basic structure of a HGVS sequence variant is:
# <ac>:<type>.<posedit>
# where <ac> is a sequence accession, <type> determines the sequence
Expand All @@ -26,7 +26,7 @@ r_variant = accn:ac opt_gene_expr:gene ':' 'r':type '.' r_posedit:posedit -> hgv

############################################################################
## HGVS Position -- e.g., NM_01234.5:c.22+6 (without an edit)
# This is unofficial syntax
# This is unofficial syntax

hgvs_position = g_hgvs_position | m_hgvs_position | c_hgvs_position | n_hgvs_position | r_hgvs_position | p_hgvs_position

Expand Down Expand Up @@ -65,7 +65,7 @@ r_posedit = (r_interval:pos rna_edit:edit -> hgvs.posedit.PosEdit(pos=pos,edit=e
p_posedit = (p_interval:pos pro_edit:edit -> hgvs.posedit.PosEdit(pos=pos,edit=edit))
| ('(' p_interval:pos pro_edit:edit ')' -> hgvs.posedit.PosEdit(pos=pos,edit=edit, uncertain=True))
| p_posedit_special
p_posedit_special =
p_posedit_special =
'=':x -> hgvs.posedit.PosEdit(pos=None,edit=x,uncertain=False)
| '(' '=':x ')' -> hgvs.posedit.PosEdit(pos=None,edit=x,uncertain=True)
| '0':x '?' -> hgvs.posedit.PosEdit(pos=None,edit=x,uncertain=True)
Expand Down Expand Up @@ -122,20 +122,26 @@ pro_ident = '=' -> hgvs.edit.AARefAlt(ref='',alt=''

# potentially indefinite/uncertain intervals
c_interval = def_c_interval | '(' def_c_interval:iv ')' -> iv._set_uncertain()
g_interval = def_g_interval | '(' def_g_interval:iv ')' -> iv._set_uncertain()
g_interval = uncertain_g_interval:iv | ('(' def_g_interval:iv ')' -> iv._set_uncertain()) | def_g_interval
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm confused by this. The second alternative parse (with parentheses) is for an uncertain g_interval.

m_interval = def_m_interval | '(' def_m_interval:iv ')' -> iv._set_uncertain()
n_interval = def_n_interval | '(' def_n_interval:iv ')' -> iv._set_uncertain()
p_interval = def_p_interval | '(' def_p_interval:iv ')' -> iv._set_uncertain()
r_interval = def_r_interval | '(' def_r_interval:iv ')' -> iv._set_uncertain()

# definite intervals
def_g_interval = (g_pos:start '_' g_pos:end -> hgvs.location.Interval(start,end)) | (g_pos:start -> hgvs.location.Interval(start,copy.deepcopy(start)))
def_m_interval = (m_pos:start '_' m_pos:end -> hgvs.location.Interval(start,end)) | (m_pos:start -> hgvs.location.Interval(start,copy.deepcopy(start)))
def_p_interval = (p_pos:start '_' p_pos:end -> hgvs.location.Interval(start,end)) | (p_pos:start -> hgvs.location.Interval(start,copy.deepcopy(start)))
def_r_interval = (r_pos:start '_' r_pos:end -> hgvs.location.Interval(start,end)) | (r_pos:start -> hgvs.location.Interval(start,copy.deepcopy(start)))
def_g_interval = (g_pos:start '_' g_pos:end -> hgvs.location.Interval(start,end)) | (g_pos:start -> hgvs.location.Interval(start,None))
def_m_interval = (m_pos:start '_' m_pos:end -> hgvs.location.Interval(start,end)) | (m_pos:start -> hgvs.location.Interval(start,None))
def_p_interval = (p_pos:start '_' p_pos:end -> hgvs.location.Interval(start,end)) | (p_pos:start -> hgvs.location.Interval(start,None))
def_r_interval = (r_pos:start '_' r_pos:end -> hgvs.location.Interval(start,end)) | (r_pos:start -> hgvs.location.Interval(start,None))
def_c_interval = (c_pos:start '_' c_pos:end -> hgvs.location.BaseOffsetInterval(start,end)) | (c_pos:start -> hgvs.location.BaseOffsetInterval(start,copy.deepcopy(start)))
def_n_interval = (n_pos:start '_' n_pos:end -> hgvs.location.BaseOffsetInterval(start,end)) | (n_pos:start -> hgvs.location.BaseOffsetInterval(start,copy.deepcopy(start)))

# indefinite ranges
uncertain_g_interval = '(' def_g_interval:ivl_start ')' '_' '(' def_g_interval:ivl_end ')' -> hgvs.location.Interval(start=ivl_start._set_uncertain(), end=ivl_end._set_uncertain())
| def_g_interval:ivl_start '_' '(' def_g_interval:ivl_end ')' -> hgvs.location.Interval(start=ivl_start, end=ivl_end._set_uncertain())
| '(' def_g_interval:ivl_start ')' '_' def_g_interval:ivl_end -> hgvs.location.Interval(start=ivl_start._set_uncertain(), end=ivl_end)


# positions
c_pos = def_c_pos #| '(' def_c_pos:pos ')' -> pos._set_uncertain()
g_pos = def_g_pos #| '(' def_g_pos:pos ')' -> pos._set_uncertain()
Expand Down
48 changes: 38 additions & 10 deletions src/hgvs/alignmentmapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@
#


from __future__ import absolute_import, division, print_function, unicode_literals

from typing import Optional

from bioutils.coordinates import strand_int_to_pm
from six.moves import range

Expand All @@ -39,6 +43,7 @@
HGVSInvalidIntervalError,
HGVSUsageError,
)
from hgvs.location import Interval, BaseOffsetInterval
from hgvs.utils import build_tx_cigar
from hgvs.utils.cigarmapper import CIGARMapper

Expand Down Expand Up @@ -151,16 +156,29 @@ def __str__(self):
)
)

def g_to_n(self, g_interval, strict_bounds=None):
def g_to_n(self, g_interval: Interval, strict_bounds:Optional[bool]=None)->BaseOffsetInterval:
"""convert a genomic (g.) interval to a transcript cDNA (n.) interval"""

if strict_bounds is None:
strict_bounds = global_config.mapping.strict_bounds

grs, gre = (
g_interval.start.base - 1 - self.gc_offset,
g_interval.end.base - 1 - self.gc_offset,
)
# in case of uncertain ranges, we fall back to the inner (more confident) interval
if g_interval.start.uncertain:
grs = g_interval.start.end.base - 1 - self.gc_offset
else:
if isinstance(g_interval.start, Interval):
grs = g_interval.start.start.base - 1 - self.gc_offset
else:
grs = g_interval.start.base - 1 - self.gc_offset

if g_interval.end.uncertain:
gre = g_interval.end.start.base - 1 - self.gc_offset
else:
if isinstance(g_interval.end, Interval):
gre = g_interval.end.end.base - 1 - self.gc_offset
else:
gre = g_interval.end.base - 1 - self.gc_offset

# frs, fre = (f)orward (r)na (s)tart & (e)nd; forward w.r.t. genome
frs, frs_offset, frs_cigar = self.cigarmapper.map_ref_to_tgt(
pos=grs, end="start", strict_bounds=strict_bounds
Expand All @@ -174,17 +192,24 @@ def g_to_n(self, g_interval, strict_bounds=None):
frs_offset, fre_offset = -fre_offset, -frs_offset

# The returned interval would be uncertain when locating at alignment gaps
# or if the initial interval was uncertain
return hgvs.location.BaseOffsetInterval(
start=hgvs.location.BaseOffsetPosition(
base=_zbc_to_hgvs(frs), offset=frs_offset, datum=Datum.SEQ_START
base=_zbc_to_hgvs(frs),
offset=frs_offset,
datum=Datum.SEQ_START,
uncertain=g_interval.start.uncertain
),
end=hgvs.location.BaseOffsetPosition(
base=_zbc_to_hgvs(fre), offset=fre_offset, datum=Datum.SEQ_START
base=_zbc_to_hgvs(fre),
offset=fre_offset,
datum=Datum.SEQ_START,
uncertain=g_interval.end.uncertain
),
uncertain=frs_cigar in "DI" or fre_cigar in "DI",
)

def n_to_g(self, n_interval, strict_bounds=None):
def n_to_g(self, n_interval, strict_bounds=None) ->Interval:
"""convert a transcript (n.) interval to a genomic (g.) interval"""

if strict_bounds is None:
Expand Down Expand Up @@ -216,7 +241,7 @@ def n_to_g(self, n_interval, strict_bounds=None):
uncertain=grs_cigar in "DI" or gre_cigar in "DI",
)

def n_to_c(self, n_interval, strict_bounds=None):
def n_to_c(self, n_interval:Interval, strict_bounds:Optional[bool]=None):
"""convert a transcript cDNA (n.) interval to a transcript CDS (c.) interval"""

if strict_bounds is None:
Expand Down Expand Up @@ -246,7 +271,10 @@ def pos_n_to_c(pos):
else:
c = pos.base - self.cds_end_i
c_datum = Datum.CDS_END
return hgvs.location.BaseOffsetPosition(base=c, offset=pos.offset, datum=c_datum)
return hgvs.location.BaseOffsetPosition(base=c,
offset=pos.offset,
datum=c_datum,
uncertain=pos.uncertain)

c_interval = hgvs.location.BaseOffsetInterval(
start=pos_n_to_c(n_interval.start),
Expand Down
Loading
Loading