Skip to content

Commit

Permalink
Merge pull request #25 from jrm5100/master
Browse files Browse the repository at this point in the history
Version 0.10.0
  • Loading branch information
jrm5100 authored Jul 16, 2021
2 parents f9e71ca + 9c491a3 commit 553ce10
Show file tree
Hide file tree
Showing 8 changed files with 49 additions and 38 deletions.
5 changes: 5 additions & 0 deletions docs/release-history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
Release History
===============

v0.10.0 (2021-07-16)
--------------------

Change genotype scores to be a uint8 (255=missing) instead of float64, to save ~70% of memory usage

v0.9.1 (2021-07-13)
-------------------

Expand Down
8 changes: 5 additions & 3 deletions pandas_genomics/arrays/genotype_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@ def __init__(self, variant: Optional[Variant] = None):

# Data backing the GenotypeArray is stored as a numpy structured array
# An unsigned integer for each allele in the genotype indexing the list of possible alleles
# A float value for the genotype score (nan if missing)
# An unsigned integer for the genotype score (255 if missing)
self._record_type = np.dtype(
[
("allele_idxs", np.uint8, (self.variant.ploidy,)),
("gt_score", np.float64),
("gt_score", np.uint8),
]
)

Expand Down Expand Up @@ -665,7 +665,9 @@ def gt_scores(self):
"""
Return the genotype score for each genotype (as a float)
"""
return self._data["gt_score"]
scores = self._data["gt_score"].copy().astype("float")
scores[scores == MISSING_IDX] = np.nan
return scores

# Operations
# Note: genotypes are compared by first allele then second, using the order of alleles in the variant
Expand Down
3 changes: 1 addition & 2 deletions pandas_genomics/io/plink/from_plink.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,7 @@ def create_gt_array(num_samples, variant_gt_bytes, variant):
genotypes[het_gt] = (0, 1)
# Create GenotypeArray representation of the data
dtype = GenotypeDtype(variant)
scores = np.empty(num_samples)
scores[:] = np.nan
scores = np.ones(num_samples) * MISSING_IDX # Missing Scores
data = np.array(list(zip(genotypes, scores)), dtype=dtype._record_type)
gt_array = GenotypeArray(values=data, dtype=dtype)
return gt_array
6 changes: 5 additions & 1 deletion pandas_genomics/io/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,11 @@ def from_vcf(
allele_idxs = np.array(vcf_variant.genotypes)[:, :2]
allele_idxs = np.where(allele_idxs == -1, MISSING_IDX, allele_idxs)
gt_scores = vcf_variant.gt_quals
gt_scores = np.where(gt_scores == -1, np.nan, gt_scores)
# Convert genotype scores from float values to uint8 values
gt_scores = np.where(gt_scores > 254, 254, gt_scores) # Max Score
gt_scores = np.where(gt_scores < 0, 255, gt_scores) # Min Score (<0 is missing)
gt_scores = np.where(gt_scores == -1, 255, gt_scores) # Missing values
gt_scores = gt_scores.round().astype("uint8")
values = np.array(list(zip(allele_idxs, gt_scores)), dtype=dtype._record_type)

# Make the GenotypeArray
Expand Down
20 changes: 11 additions & 9 deletions pandas_genomics/scalars.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@
import uuid
from typing import Optional, List, Tuple, Union

MISSING_IDX = (
255 # Integer indicating a missing allele. Each variant must have 254 alleles max.
)
MISSING_IDX = 255 # Integer indicating a missing allele or genotype score. Each variant must have 254 alleles max and the maximum genotype score is 254.


class Variant:
Expand Down Expand Up @@ -359,7 +357,7 @@ class Genotype:
allele_idxs: List[int]
Alleles encoded as indexes into the variant allele list
score: int, optional
A quality score for the Genotype. No assumptions are made about the meaning.
A quality score for the Genotype between 0 and 254. 255 is treated as missing; values below 0 or above 255 raise a ValueError.
Examples
--------
Expand Down Expand Up @@ -398,9 +396,13 @@ def __init__(

self.variant = variant
self.allele_idxs = allele_idxs
self.score = None
if score is not None:
self.score = int(score)
score = int(score)
if score < 0 or score > 255:
raise ValueError("The score must be between 0 and 255, inclusive")
elif score == 255:
score = None
self.score = score

# Validate parameters
for a in self.allele_idxs:
Expand Down Expand Up @@ -502,8 +504,8 @@ def is_missing(self) -> bool:

@property
def _float_score(self):
"""Convenience method for storing score as a float"""
"""Convenience method for storing score as a uint8"""
if self.score is None:
return float("NaN")
return 255
else:
return float(self.score)
return self.score
38 changes: 19 additions & 19 deletions pandas_genomics/sim/biallelic_model_simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from numpy.random._generator import default_rng

from pandas_genomics.arrays import GenotypeArray, GenotypeDtype
from pandas_genomics.scalars import Variant
from pandas_genomics.scalars import Variant, MISSING_IDX


class SNPEffectEncodings(Enum):
Expand Down Expand Up @@ -450,15 +450,15 @@ def _get_snp1_gt_array(self, gt_table_idxs):
"""Assemble a GenotypeArray for SNP1 directly from genotype table indices"""
dtype = GenotypeDtype(self.snp1)
gt_table_data = (
((0, 0), np.nan),
((0, 1), np.nan),
((1, 1), np.nan),
((0, 0), np.nan),
((0, 1), np.nan),
((1, 1), np.nan),
((0, 0), np.nan),
((0, 1), np.nan),
((1, 1), np.nan),
((0, 0), MISSING_IDX),
((0, 1), MISSING_IDX),
((1, 1), MISSING_IDX),
((0, 0), MISSING_IDX),
((0, 1), MISSING_IDX),
((1, 1), MISSING_IDX),
((0, 0), MISSING_IDX),
((0, 1), MISSING_IDX),
((1, 1), MISSING_IDX),
)
data = np.array(
[gt_table_data[i] for i in gt_table_idxs], dtype=dtype._record_type
Expand All @@ -469,15 +469,15 @@ def _get_snp2_gt_array(self, gt_table_idxs):
"""Assemble a GenotypeArray for SNP2 directly from genotype table indices"""
dtype = GenotypeDtype(self.snp2)
gt_table_data = (
((0, 0), np.nan),
((0, 0), np.nan),
((0, 0), np.nan),
((0, 1), np.nan),
((0, 1), np.nan),
((0, 1), np.nan),
((1, 1), np.nan),
((1, 1), np.nan),
((1, 1), np.nan),
((0, 0), MISSING_IDX),
((0, 0), MISSING_IDX),
((0, 0), MISSING_IDX),
((0, 1), MISSING_IDX),
((0, 1), MISSING_IDX),
((0, 1), MISSING_IDX),
((1, 1), MISSING_IDX),
((1, 1), MISSING_IDX),
((1, 1), MISSING_IDX),
)
data = np.array(
[gt_table_data[i] for i in gt_table_idxs], dtype=dtype._record_type
Expand Down
5 changes: 2 additions & 3 deletions pandas_genomics/sim/random_gt.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np

from pandas_genomics.arrays import GenotypeArray, GenotypeDtype
from pandas_genomics.scalars import Variant
from pandas_genomics.scalars import Variant, MISSING_IDX


def generate_random_gt(
Expand Down Expand Up @@ -58,8 +58,7 @@ def generate_random_gt(

# Create GenotypeArray representation of the data
dtype = GenotypeDtype(variant)
scores = np.empty(n)
scores[:] = np.nan
scores = np.ones(n) * MISSING_IDX
data = np.array(list(zip(genotypes, scores)), dtype=dtype._record_type)
gt_array = GenotypeArray(values=data, dtype=dtype)

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pandas-genomics"
version = "0.9.1"
version = "0.10.0"
description = "Pandas ExtensionDtypes and ExtensionArray for working with genomics data"
license = "BSD-3-Clause"
authors = ["John McGuigan <[email protected]>"]
Expand Down

0 comments on commit 553ce10

Please sign in to comment.