Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Adds seed parameter to rarefy #321

Merged
merged 11 commits into from
Jan 24, 2025
13 changes: 9 additions & 4 deletions q2_feature_table/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import biom

import os
Expand All @@ -15,13 +14,19 @@
from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ


def rarefy(table: biom.Table, sampling_depth: int,
with_replacement: bool = False) -> biom.Table:
def rarefy(table: biom.Table,
sampling_depth: int,
with_replacement: bool = False,
random_seed: int = None
) -> biom.Table:

if with_replacement:
table = table.filter(lambda v, i, m: v.sum() >= sampling_depth,
inplace=False, axis='sample')

table = table.subsample(sampling_depth, axis='sample', by_id=False,
with_replacement=with_replacement)
with_replacement=with_replacement,
seed=random_seed)

if table.is_empty():
raise ValueError('The rarefied table contains no samples or features. '
Expand Down
8 changes: 6 additions & 2 deletions q2_feature_table/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
function=q2_feature_table.rarefy,
inputs={'table': FeatureTable[Frequency]},
parameters={'sampling_depth': Int % Range(1, None),
'with_replacement': Bool},
'with_replacement': Bool,
'random_seed': Int % Range(0, None)},
outputs=[('rarefied_table', FeatureTable[Frequency])],
input_descriptions={'table': 'The feature table to be rarefied.'},
parameter_descriptions={
Expand All @@ -47,7 +48,10 @@
'included in the resulting table.'),
'with_replacement': ('Rarefy with replacement by sampling from the '
'multinomial distribution instead of rarefying '
'without replacement.')
'without replacement.'),
'random_seed': ('Set the seed for the subsampling. Using the same '
'seed with the same table will always lead to the '
'same result. Defaults to a random seed.')
},
output_descriptions={
'rarefied_table': 'The resulting rarefied feature table.'
Expand Down
35 changes: 30 additions & 5 deletions q2_feature_table/tests/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,40 @@

class RarefyTests(TestCase):

def test_rarefy(self):
def test_rarefy_random_seed_is_randomized(self):
t = Table(np.array([[0, 1, 3], [1, 1, 2]]),
['O1', 'O2'],
['S1', 'S2', 'S3'])
a = rarefy(t, 2)
self.assertEqual(a.shape, (2, 2))
self.assertEqual(set(a.ids(axis='sample')), set(['S2', 'S3']))
self.assertEqual(set(a.ids(axis='observation')), set(['O1', 'O2']))
npt.assert_array_equal(a.sum(axis='sample'), np.array([2., 2.]))
a_eq_b = []
n_iterations = 100
for i in range(n_iterations):
b = rarefy(t, 2)
self.assertEqual(b.shape, (2, 2))
self.assertEqual(set(b.ids(axis='sample')), set(['S2', 'S3']))
self.assertEqual(set(b.ids(axis='observation')), set(['O1', 'O2']))
npt.assert_array_equal(b.sum(axis='sample'), np.array([2., 2.]))
a_eq_b.append(a == b)
self.assertTrue(False in a_eq_b,
f"After {n_iterations} iterations, all resulting "
"feature tables are identical. It therefore seems "
"that a randomized seed is not being used.")

def test_rarefy_seed_is_not_randomized(self):
    """Rarefying repeatedly with one fixed seed must give equal tables."""
    counts = np.array([[0, 1, 3], [1, 1, 2]])
    t = Table(counts, ['O1', 'O2'], ['S1', 'S2', 'S3'])

    # Reference draw; every later draw with the same seed is compared
    # against it (reference on the left-hand side of ``==``).
    reference = rarefy(t, 2, random_seed=1)

    n_iterations = 100
    matches = [reference == rarefy(t, 2, random_seed=1)
               for _ in range(n_iterations)]

    failure_msg = (f"After {n_iterations} iterations, at least one "
                   "feature table differed from the others. It "
                   "therefore seems that a randomized seed is being "
                   "used.")
    # A single mismatch means the seed was not honoured.
    self.assertFalse(False in matches, failure_msg)

def test_rarefy_replacement(self):
t = Table(np.array([[0, 10, 30], [10, 10, 20]]),
Expand Down
Loading