Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Adds seed parameter to rarefy #321

Merged
merged 11 commits into from
Jan 24, 2025
17 changes: 14 additions & 3 deletions q2_feature_table/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,28 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------
from typing import Union

import biom
import numpy as np


def rarefy(table: biom.Table, sampling_depth: int,
with_replacement: bool = False) -> biom.Table:
def rarefy(table: biom.Table,
sampling_depth: int,
with_replacement: bool = False,
seed: Union[int, str] = 1
) -> biom.Table:
# Generate a random seed if seed = "random"
if seed == "random":
rng = np.random.default_rng()
seed = rng.integers(0, 2 ** 32 - 1)

if with_replacement:
table = table.filter(lambda v, i, m: v.sum() >= sampling_depth,
inplace=False, axis='sample')

table = table.subsample(sampling_depth, axis='sample', by_id=False,
with_replacement=with_replacement)
with_replacement=with_replacement, seed=seed)

if table.is_empty():
raise ValueError('The rarefied table contains no samples or features. '
Expand Down
9 changes: 7 additions & 2 deletions q2_feature_table/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
function=q2_feature_table.rarefy,
inputs={'table': FeatureTable[Frequency]},
parameters={'sampling_depth': Int % Range(1, None),
'with_replacement': Bool},
'with_replacement': Bool,
'seed': Int % Range(0, 2**32) | Str % Choices(["random"])},
outputs=[('rarefied_table', FeatureTable[Frequency])],
input_descriptions={'table': 'The feature table to be rarefied.'},
parameter_descriptions={
Expand All @@ -46,7 +47,11 @@
'included in the resulting table.'),
'with_replacement': ('Rarefy with replacement by sampling from the '
'multinomial distribution instead of rarefying '
'without replacement.')
'without replacement.'),
'seed': ('Set the seed for the subsampling. Using the same seed with '
'the same table will always lead to the same result. Using '
'"random", sets the seed to a random number. The random '
'seed will not be logged in provenance.')
},
output_descriptions={
'rarefied_table': 'The resulting rarefied feature table.'
Expand Down
12 changes: 10 additions & 2 deletions q2_feature_table/tests/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,24 @@

class RarefyTests(TestCase):

def test_rarefy(self):
def test_rarefy_random_seed(self):
VinzentRisch marked this conversation as resolved.
Show resolved Hide resolved
t = Table(np.array([[0, 1, 3], [1, 1, 2]]),
['O1', 'O2'],
['S1', 'S2', 'S3'])
a = rarefy(t, 2)
a = rarefy(t, 2, seed="random")
self.assertEqual(a.shape, (2, 2))
self.assertEqual(set(a.ids(axis='sample')), set(['S2', 'S3']))
self.assertEqual(set(a.ids(axis='observation')), set(['O1', 'O2']))
npt.assert_array_equal(a.sum(axis='sample'), np.array([2., 2.]))

def test_rarefy_seed_1(self):
    """Rarefy to depth 2 with seed=1 and check the pinned counts.

    A fixed integer seed is passed so the subsampled counts for the
    retained samples can be compared against exact expected values.
    """
    counts = np.array([[0, 1, 3], [1, 1, 2]])
    table = Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])

    rarefied = rarefy(table, 2, seed=1)

    # Both checked samples are expected to hold one count per feature.
    for sample_id in ('S2', 'S3'):
        observed = rarefied.data(sample_id, axis='sample').tolist()
        self.assertEqual(observed, [1, 1])

def test_rarefy_replacement(self):
t = Table(np.array([[0, 10, 30], [10, 10, 20]]),
['O1', 'O2'],
Expand Down
Loading