Skip to content

Commit

Permalink
Add skipna option to genomic_relationship #1076
Browse files Browse the repository at this point in the history
  • Loading branch information
timothymillar authored and tomwhite committed May 2, 2023
1 parent e28c0f7 commit a7340c0
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 5 deletions.
33 changes: 28 additions & 5 deletions sgkit/stats/grm.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,36 @@
from sgkit.utils import conditional_merge_datasets, create_dataset


def _grm_VanRaden(
call_dosage: ArrayLike,
ancestral_frequency: ArrayLike,
ploidy: int,
skipna: bool = False,
):
ancestral_dosage = ancestral_frequency * ploidy
M = call_dosage - ancestral_dosage[:, None]
if skipna:
nans = da.isnan(M)
M0 = da.where(nans, 0, M)
numerator = M0.T @ M0
AD = ~nans * ancestral_dosage[:, None]
AFC = ~nans * (1 - ancestral_frequency[:, None])
denominator = AD.T @ AFC
else:
numerator = M.T @ M
denominator = (ancestral_dosage * (1 - ancestral_frequency)).sum()
G = numerator / denominator
return G


def genomic_relationship(
ds: Dataset,
*,
call_dosage: Hashable = variables.call_dosage,
estimator: Optional[Literal["VanRaden"]] = None,
ancestral_frequency: Optional[Hashable] = None,
ploidy: Optional[int] = None,
skipna: bool = False,
merge: bool = True,
) -> Dataset:
"""Compute a genomic relationship matrix (AKA the GRM or G-matrix).
Expand Down Expand Up @@ -44,6 +67,10 @@ def genomic_relationship(
Ploidy level of all samples within the dataset.
By default this is inferred from the size of the "ploidy" dimension
of the dataset.
skipna
If True, missing (nan) values of 'call_dosage' will be skipped so
that the relationship between each pair of individuals is estimated
using only variants where both samples have non-nan values.
merge
If True (the default), merge the input dataset and the computed
output variables into a single dataset, otherwise return only
Expand Down Expand Up @@ -134,11 +161,7 @@ def genomic_relationship(
raise ValueError(
"The ancestral_frequency variable must have one value per variant"
)
ad = af * ploidy
M = cd - ad[:, None]
num = M.T @ M
denom = (ad * (1 - af)).sum()
G = num / denom
G = _grm_VanRaden(cd, af, ploidy=ploidy, skipna=skipna)

new_ds = create_dataset(
{
Expand Down
74 changes: 74 additions & 0 deletions sgkit/tests/test_grm.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,80 @@ def test_genomic_relationship__VanRaden_AGHmatrix_tetraploid(chunks):
np.testing.assert_array_almost_equal(actual, expect)


def test_genomic_relationship__VanRaden_skipna():
# Test that skipna option skips values in call_dosage
# such that the relationship between each pair of individuals
# is calculated using only the variants where neither sample
# has missing data.
# This should be equivalent to calculating the GRM using
# multiple subsets of the variants and using pairwise
# values from the larges subset of variants that doesn't
# result in a nan value.
nan = np.nan
dosage = np.array(
[
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 2.0, 0.0],
[1.0, 1.0, 1.0, 2.0, nan, 1.0, 1.0, 0.0, 1.0, 2.0],
[2.0, 2.0, 0.0, 0.0, nan, 1.0, 1.0, 1.0, 0.0, 1.0],
[1.0, 0.0, 0.0, 0.0, nan, 1.0, 1.0, 1.0, 1.0, 0.0],
[1.0, 0.0, 1.0, 1.0, nan, 2.0, 0.0, 1.0, 0.0, 2.0],
[2.0, 1.0, 1.0, 1.0, nan, 1.0, 2.0, nan, 0.0, 1.0],
[2.0, 0.0, 1.0, 1.0, nan, 2.0, 1.0, nan, 1.0, 1.0],
[1.0, 1.0, 1.0, 2.0, nan, 1.0, 2.0, nan, 1.0, 0.0],
[1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, nan, 1.0, 1.0],
[2.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, nan, 2.0, 1.0],
[1.0, 2.0, 2.0, 1.0, 2.0, 0.0, 1.0, nan, 1.0, 2.0],
[0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 0.0, nan, 1.0, 2.0],
[1.0, 2.0, 1.0, 2.0, 2.0, 0.0, 1.0, nan, 1.0, 0.0],
[0.0, 2.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0],
[1.0, 1.0, 2.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 2.0],
[2.0, 0.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0],
[1.0, 0.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 2.0, 1.0],
[2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0],
[1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 0.0, 2.0, 1.0, 2.0],
[1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
]
)
ds = xr.Dataset()
ds["call_dosage"] = ["variants", "samples"], dosage
ds["ancestral_frequency"] = "variants", np.ones(len(dosage)) / 2
# calculating without skipna will result in nans in the GRM
expect = sg.genomic_relationship(
ds,
call_dosage="call_dosage",
ancestral_frequency="ancestral_frequency",
estimator="VanRaden",
ploidy=2,
skipna=False,
).stat_genomic_relationship.values
assert np.isnan(expect).sum() > 0
# fill nan values using maximum subsets without missing data
idx_0 = ~np.isnan(dosage[:, 4])
idx_1 = ~np.isnan(dosage[:, 7])
idx_2 = np.logical_and(idx_0, idx_1)
for idx in [idx_0, idx_1, idx_2]:
sub = ds.sel(dict(variants=idx))
sub_expect = sg.genomic_relationship(
sub,
call_dosage="call_dosage",
ancestral_frequency="ancestral_frequency",
estimator="VanRaden",
ploidy=2,
skipna=False,
).stat_genomic_relationship.values
expect = np.where(np.isnan(expect), sub_expect, expect)
# calculate actual value using skipna=True
actual = sg.genomic_relationship(
ds,
call_dosage="call_dosage",
ancestral_frequency="ancestral_frequency",
estimator="VanRaden",
ploidy=2,
skipna=True,
).stat_genomic_relationship.values
np.testing.assert_array_equal(actual, expect)


@pytest.mark.parametrize("ploidy", [2, 4])
def test_genomic_relationship__detect_ploidy(ploidy):
ds = xr.Dataset()
Expand Down

0 comments on commit a7340c0

Please sign in to comment.