From 95701eb162a4e82ff2b3473f82969a383cd9a04a Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 5 Dec 2024 17:43:08 +0100 Subject: [PATCH 1/9] added seed parameter --- q2_feature_table/_normalize.py | 13 +++++++++++-- q2_feature_table/plugin_setup.py | 9 +++++++-- q2_feature_table/tests/test_normalize.py | 12 ++++++++++-- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/q2_feature_table/_normalize.py b/q2_feature_table/_normalize.py index d9c2935..06caf63 100644 --- a/q2_feature_table/_normalize.py +++ b/q2_feature_table/_normalize.py @@ -5,17 +5,26 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +from typing import Union import biom +import numpy as np def rarefy(table: biom.Table, sampling_depth: int, - with_replacement: bool = False) -> biom.Table: + with_replacement: bool = False, seed: Union[int, str] = 1) -> biom.Table: + + # Generate a random seed if seed = "random" + if seed == "random": + rng = np.random.default_rng() + seed = rng.integers(0, 2 ** 32 - 1) + if with_replacement: table = table.filter(lambda v, i, m: v.sum() >= sampling_depth, inplace=False, axis='sample') + table = table.subsample(sampling_depth, axis='sample', by_id=False, - with_replacement=with_replacement) + with_replacement=with_replacement, seed=seed) if table.is_empty(): raise ValueError('The rarefied table contains no samples or features. ' diff --git a/q2_feature_table/plugin_setup.py b/q2_feature_table/plugin_setup.py index 569b4d3..de22993 100644 --- a/q2_feature_table/plugin_setup.py +++ b/q2_feature_table/plugin_setup.py @@ -36,7 +36,8 @@ function=q2_feature_table.rarefy, inputs={'table': FeatureTable[Frequency]}, parameters={'sampling_depth': Int % Range(1, None), - 'with_replacement': Bool}, + 'with_replacement': Bool, + 'seed': Int % Range(0, 2**32) | Str % Choices(["random"])}, outputs=[('rarefied_table', FeatureTable[Frequency])], input_descriptions={'table': 'The feature table to be rarefied.'}, parameter_descriptions={ @@ -46,7 +47,11 @@ 'included in the resulting table.'), 'with_replacement': ('Rarefy with replacement by sampling from the ' 'multinomial distribution instead of rarefying ' - 'without replacement.') + 'without replacement.'), + 'seed': ('Set the seed for the subsampling. Using the same seed with ' + 'the same table will always lead to the same result. Using ' + '"random", sets the seed to a random number. The random ' + 'seed will not be logged in provenance.') }, output_descriptions={ 'rarefied_table': 'The resulting rarefied feature table.' diff --git a/q2_feature_table/tests/test_normalize.py b/q2_feature_table/tests/test_normalize.py index d45ca94..f08140f 100644 --- a/q2_feature_table/tests/test_normalize.py +++ b/q2_feature_table/tests/test_normalize.py @@ -17,16 +17,24 @@ class RarefyTests(TestCase): - def test_rarefy(self): + def test_rarefy_random_seed(self): t = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'], ['S1', 'S2', 'S3']) - a = rarefy(t, 2) + a = rarefy(t, 2, seed="random") self.assertEqual(a.shape, (2, 2)) self.assertEqual(set(a.ids(axis='sample')), set(['S2', 'S3'])) self.assertEqual(set(a.ids(axis='observation')), set(['O1', 'O2'])) npt.assert_array_equal(a.sum(axis='sample'), np.array([2., 2.])) + def test_rarefy_seed_1(self): + t = Table(np.array([[0, 1, 3], [1, 1, 2]]), + ['O1', 'O2'], + ['S1', 'S2', 'S3']) + a = rarefy(t, 2, seed=1) + self.assertEqual(a.data('S2', axis='sample').tolist(), [1, 1]) + self.assertEqual(a.data('S3', axis='sample').tolist(), [1, 1]) + def test_rarefy_replacement(self): t = Table(np.array([[0, 10, 30], [10, 10, 20]]), ['O1', 'O2'], From 8769d04f7c3fbdec5ac63d4578a0c1db0130adb0 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 5 Dec 2024 17:47:08 +0100 Subject: [PATCH 2/9] lint --- q2_feature_table/_normalize.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/q2_feature_table/_normalize.py b/q2_feature_table/_normalize.py index 06caf63..ad47169 100644 --- a/q2_feature_table/_normalize.py +++ b/q2_feature_table/_normalize.py @@ -11,9 +11,11 @@ import numpy as np -def rarefy(table: biom.Table, sampling_depth: int, - with_replacement: bool = False, seed: Union[int, str] = 1) -> biom.Table: - +def rarefy(table: biom.Table, + sampling_depth: int, + with_replacement: bool = False, + seed: Union[int, str] = 1 + ) -> biom.Table: # Generate a random seed if seed = "random" if seed == "random": rng = np.random.default_rng() From 2cb39317172c83d0eaf7d5cbb1d59036a3671931 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 8 Jan 2025 13:53:21 +0100 Subject: [PATCH 3/9] changed random seed option --- q2_feature_table/_normalize.py | 14 +++++++------- q2_feature_table/plugin_setup.py | 9 ++++----- q2_feature_table/tests/test_normalize.py | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/q2_feature_table/_normalize.py b/q2_feature_table/_normalize.py index ad47169..93840a4 100644 --- a/q2_feature_table/_normalize.py +++ b/q2_feature_table/_normalize.py @@ -5,28 +5,28 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- +import random from typing import Union import biom -import numpy as np def rarefy(table: biom.Table, sampling_depth: int, with_replacement: bool = False, - seed: Union[int, str] = 1 + random_seed: float = None ) -> biom.Table: - # Generate a random seed if seed = "random" - if seed == "random": - rng = np.random.default_rng() - seed = rng.integers(0, 2 ** 32 - 1) + # Generate a random seed if seed is None + if random_seed is not None: + random.seed(random_seed) if with_replacement: table = table.filter(lambda v, i, m: v.sum() >= sampling_depth, inplace=False, axis='sample') table = table.subsample(sampling_depth, axis='sample', by_id=False, - with_replacement=with_replacement, seed=seed) + with_replacement=with_replacement, + seed=random.randint(0, 2**32 - 1)) if table.is_empty(): raise ValueError('The rarefied table contains no samples or features. ' diff --git a/q2_feature_table/plugin_setup.py b/q2_feature_table/plugin_setup.py index de22993..d5df09b 100644 --- a/q2_feature_table/plugin_setup.py +++ b/q2_feature_table/plugin_setup.py @@ -37,7 +37,7 @@ inputs={'table': FeatureTable[Frequency]}, parameters={'sampling_depth': Int % Range(1, None), 'with_replacement': Bool, - 'seed': Int % Range(0, 2**32) | Str % Choices(["random"])}, + 'random_seed': Int}, outputs=[('rarefied_table', FeatureTable[Frequency])], input_descriptions={'table': 'The feature table to be rarefied.'}, parameter_descriptions={ @@ -48,10 +48,9 @@ 'with_replacement': ('Rarefy with replacement by sampling from the ' 'multinomial distribution instead of rarefying ' 'without replacement.'), - 'seed': ('Set the seed for the subsampling. Using the same seed with ' - 'the same table will always lead to the same result. Using ' - '"random", sets the seed to a random number. The random ' - 'seed will not be logged in provenance.') + 'random_seed': ('Set the seed for the subsampling. Using the same seed with ' + 'the same table will always lead to the same result. Defaults ' + 'to a random seed') }, output_descriptions={ 'rarefied_table': 'The resulting rarefied feature table.' diff --git a/q2_feature_table/tests/test_normalize.py b/q2_feature_table/tests/test_normalize.py index f08140f..7df31a6 100644 --- a/q2_feature_table/tests/test_normalize.py +++ b/q2_feature_table/tests/test_normalize.py @@ -21,7 +21,7 @@ def test_rarefy_random_seed(self): t = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'], ['S1', 'S2', 'S3']) - a = rarefy(t, 2, seed="random") + a = rarefy(t, 2) self.assertEqual(a.shape, (2, 2)) self.assertEqual(set(a.ids(axis='sample')), set(['S2', 'S3'])) self.assertEqual(set(a.ids(axis='observation')), set(['O1', 'O2'])) @@ -31,7 +31,7 @@ def test_rarefy_seed_1(self): t = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'], ['S1', 'S2', 'S3']) - a = rarefy(t, 2, seed=1) + a = rarefy(t, 2, random_seed=1) self.assertEqual(a.data('S2', axis='sample').tolist(), [1, 1]) self.assertEqual(a.data('S3', axis='sample').tolist(), [1, 1]) From 31c8f6809badac42de49d5e45d0bad14a393b84e Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 8 Jan 2025 13:56:46 +0100 Subject: [PATCH 4/9] lint --- q2_feature_table/_normalize.py | 2 -- q2_feature_table/plugin_setup.py | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/q2_feature_table/_normalize.py b/q2_feature_table/_normalize.py index 93840a4..fb8c904 100644 --- a/q2_feature_table/_normalize.py +++ b/q2_feature_table/_normalize.py @@ -6,8 +6,6 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- import random -from typing import Union - import biom diff --git a/q2_feature_table/plugin_setup.py b/q2_feature_table/plugin_setup.py index d5df09b..451cbe6 100644 --- a/q2_feature_table/plugin_setup.py +++ b/q2_feature_table/plugin_setup.py @@ -48,9 +48,9 @@ 'with_replacement': ('Rarefy with replacement by sampling from the ' 'multinomial distribution instead of rarefying ' 'without replacement.'), - 'random_seed': ('Set the seed for the subsampling. Using the same seed with ' - 'the same table will always lead to the same result. Defaults ' - 'to a random seed') + 'random_seed': ('Set the seed for the subsampling. Using the same ' + 'seed with the same table will always lead to the ' + 'same result. Defaults to a random seed.') }, output_descriptions={ 'rarefied_table': 'The resulting rarefied feature table.' From 14727ec185f9787247e7e40925a29e068af47691 Mon Sep 17 00:00:00 2001 From: VinzentRisch <100149044+VinzentRisch@users.noreply.github.com> Date: Fri, 17 Jan 2025 09:56:14 +0100 Subject: [PATCH 5/9] Update q2_feature_table/_normalize.py Co-authored-by: Greg Caporaso <192372+gregcaporaso@users.noreply.github.com> --- q2_feature_table/_normalize.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/q2_feature_table/_normalize.py b/q2_feature_table/_normalize.py index fb8c904..b8ada1b 100644 --- a/q2_feature_table/_normalize.py +++ b/q2_feature_table/_normalize.py @@ -15,8 +15,7 @@ def rarefy(table: biom.Table, random_seed: float = None ) -> biom.Table: # Generate a random seed if seed is None - if random_seed is not None: - random.seed(random_seed) + random.seed(a=random_seed) if with_replacement: table = table.filter(lambda v, i, m: v.sum() >= sampling_depth, From 602ef5e0729e3fb700a6483bc3ebc06ba75d1034 Mon Sep 17 00:00:00 2001 From: VinzentRisch <100149044+VinzentRisch@users.noreply.github.com> Date: Fri, 17 Jan 2025 09:57:18 +0100 Subject: [PATCH 6/9] Update q2_feature_table/_normalize.py Co-authored-by: Greg Caporaso <192372+gregcaporaso@users.noreply.github.com> --- q2_feature_table/_normalize.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/q2_feature_table/_normalize.py b/q2_feature_table/_normalize.py index b8ada1b..e29a896 100644 --- a/q2_feature_table/_normalize.py +++ b/q2_feature_table/_normalize.py @@ -22,8 +22,7 @@ def rarefy(table: biom.Table, inplace=False, axis='sample') table = table.subsample(sampling_depth, axis='sample', by_id=False, - with_replacement=with_replacement, - seed=random.randint(0, 2**32 - 1)) + with_replacement=with_replacement) if table.is_empty(): raise ValueError('The rarefied table contains no samples or features. ' From 0df183944ef84502b6529e0382e592b4945d6285 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 17 Jan 2025 12:08:50 +0100 Subject: [PATCH 7/9] fixed random seed and added tests with iteration --- q2_feature_table/_normalize.py | 6 ++--- q2_feature_table/tests/test_normalize.py | 32 ++++++++++++++++++------ 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/q2_feature_table/_normalize.py b/q2_feature_table/_normalize.py index e29a896..350761b 100644 --- a/q2_feature_table/_normalize.py +++ b/q2_feature_table/_normalize.py @@ -5,7 +5,6 @@ # # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- -import random import biom @@ -14,15 +13,14 @@ def rarefy(table: biom.Table, with_replacement: bool = False, random_seed: float = None ) -> biom.Table: - # Generate a random seed if seed is None - random.seed(a=random_seed) if with_replacement: table = table.filter(lambda v, i, m: v.sum() >= sampling_depth, inplace=False, axis='sample') table = table.subsample(sampling_depth, axis='sample', by_id=False, - with_replacement=with_replacement) + with_replacement=with_replacement, + seed=random_seed) if table.is_empty(): raise ValueError('The rarefied table contains no samples or features. ' diff --git a/q2_feature_table/tests/test_normalize.py b/q2_feature_table/tests/test_normalize.py index 7df31a6..c0fedb3 100644 --- a/q2_feature_table/tests/test_normalize.py +++ b/q2_feature_table/tests/test_normalize.py @@ -17,23 +17,39 @@ class RarefyTests(TestCase): - def test_rarefy_random_seed(self): + def test_rarefy_random_seed_is_randomized(self): t = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'], ['S1', 'S2', 'S3']) a = rarefy(t, 2) - self.assertEqual(a.shape, (2, 2)) - self.assertEqual(set(a.ids(axis='sample')), set(['S2', 'S3'])) - self.assertEqual(set(a.ids(axis='observation')), set(['O1', 'O2'])) - npt.assert_array_equal(a.sum(axis='sample'), np.array([2., 2.])) + a_eq_b = [] + n_iterations = 100 + for i in range(n_iterations): + b = rarefy(t, 2) + self.assertEqual(b.shape, (2, 2)) + self.assertEqual(set(b.ids(axis='sample')), set(['S2', 'S3'])) + self.assertEqual(set(b.ids(axis='observation')), set(['O1', 'O2'])) + npt.assert_array_equal(b.sum(axis='sample'), np.array([2., 2.])) + a_eq_b.append(a == b) + self.assertTrue(False in a_eq_b, + f"After {n_iterations} iterations, all resulting feature " + "tables are identical. It therefore seems that a " + "randomized seed is not being used.") - def test_rarefy_seed_1(self): + def test_rarefy_seed_is_not_randomized(self): t = Table(np.array([[0, 1, 3], [1, 1, 2]]), ['O1', 'O2'], ['S1', 'S2', 'S3']) a = rarefy(t, 2, random_seed=1) - self.assertEqual(a.data('S2', axis='sample').tolist(), [1, 1]) - self.assertEqual(a.data('S3', axis='sample').tolist(), [1, 1]) + a_eq_b = [] + n_iterations = 100 + for i in range(n_iterations): + b = rarefy(t, 2, random_seed=1) + a_eq_b.append(a == b) + self.assertFalse(False in a_eq_b, + f"After {n_iterations} iterations, at least one feature " + "table differed from the others. It therefore seems that a " + "randomized seed is being used.") def test_rarefy_replacement(self): t = Table(np.array([[0, 10, 30], [10, 10, 20]]), From 3fa2548ec8c0b628fcc97b15f26f697484af2ec8 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 17 Jan 2025 12:12:13 +0100 Subject: [PATCH 8/9] changed random seed to int and added range --- q2_feature_table/_normalize.py | 2 +- q2_feature_table/plugin_setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/q2_feature_table/_normalize.py b/q2_feature_table/_normalize.py index 350761b..cbbd7ab 100644 --- a/q2_feature_table/_normalize.py +++ b/q2_feature_table/_normalize.py @@ -11,7 +11,7 @@ def rarefy(table: biom.Table, sampling_depth: int, with_replacement: bool = False, - random_seed: float = None + random_seed: int = None ) -> biom.Table: if with_replacement: diff --git a/q2_feature_table/plugin_setup.py b/q2_feature_table/plugin_setup.py index 451cbe6..cd7928f 100644 --- a/q2_feature_table/plugin_setup.py +++ b/q2_feature_table/plugin_setup.py @@ -37,7 +37,7 @@ inputs={'table': FeatureTable[Frequency]}, parameters={'sampling_depth': Int % Range(1, None), 'with_replacement': Bool, - 'random_seed': Int}, + 'random_seed': Int % Range(0, None)}, outputs=[('rarefied_table', FeatureTable[Frequency])], input_descriptions={'table': 'The feature table to be rarefied.'}, parameter_descriptions={ From 302f2445c5c19910a36966b35216a48242b4c276 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 17 Jan 2025 12:20:06 +0100 Subject: [PATCH 9/9] lint --- q2_feature_table/tests/test_normalize.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/q2_feature_table/tests/test_normalize.py b/q2_feature_table/tests/test_normalize.py index c0fedb3..cc80a96 100644 --- a/q2_feature_table/tests/test_normalize.py +++ b/q2_feature_table/tests/test_normalize.py @@ -32,9 +32,9 @@ def test_rarefy_random_seed_is_randomized(self): npt.assert_array_equal(b.sum(axis='sample'), np.array([2., 2.])) a_eq_b.append(a == b) self.assertTrue(False in a_eq_b, - f"After {n_iterations} iterations, all resulting feature " - "tables are identical. It therefore seems that a " - "randomized seed is not being used.") + f"After {n_iterations} iterations, all resulting " + "feature tables are identical. It therefore seems " + "that a randomized seed is not being used.") def test_rarefy_seed_is_not_randomized(self): t = Table(np.array([[0, 1, 3], [1, 1, 2]]), @@ -47,9 +47,10 @@ def test_rarefy_seed_is_not_randomized(self): b = rarefy(t, 2, random_seed=1) a_eq_b.append(a == b) self.assertFalse(False in a_eq_b, - f"After {n_iterations} iterations, at least one feature " - "table differed from the others. It therefore seems that a " - "randomized seed is being used.") + f"After {n_iterations} iterations, at least one " + "feature table differed from the others. It " + "therefore seems that a randomized seed is being " + "used.") def test_rarefy_replacement(self): t = Table(np.array([[0, 10, 30], [10, 10, 20]]),