BUG: samples dropped by rarefying are now handled by mismatched pairs (…

…#117)
qiime2 · Dec 12, 2024 · 7a88753 · 7a88753
1 parent 9275844
commit 7a88753
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 0 deletions.
diff --git a/q2_fmt/_peds.py b/q2_fmt/_peds.py
@@ -516,6 +516,10 @@ def pedf_permutation_test(table: pd.DataFrame, metadata: qiime2.Metadata,
     actual_pedf = pedf_df[['id', 'measure']].set_index('id')['measure']
 
     # Mismatch simulation:
+
+    # Filter out any samples that will be dropped by rarefying before we
+    # assign donor-recipient mismatches.
+    table = table[table.sum(axis=1) >= sampling_depth]
     recip_df = _create_recipient_table(used_references, metadata_df, table)
     donor_df = table[table.index.isin(used_references)]
     mismatched_series = \
@@ -529,10 +533,12 @@ def pedf_permutation_test(table: pd.DataFrame, metadata: qiime2.Metadata,
     simulated_recip_table, simulated_donor_table =\
         _create_duplicated_tables(simulated_mismatched_series, recip_df,
                                   donor_df)
+
     # Concatenating the recip and donor tables so the column count stays the
     # same after subsampling
     simulated_table = pd.concat([simulated_recip_table,
                                  simulated_donor_table])
+
     rarefied_simulated_table = _subsample(table=simulated_table,
                                           sampling_depth=sampling_depth)
 

diff --git a/q2_fmt/tests/test_engraftment.py b/q2_fmt/tests/test_engraftment.py
@@ -2014,6 +2014,41 @@ def test_peds_sim_stats_99_iters(self):
         self.assertEqual(count_less, exp_count_less)
         self.assertEqual(per_subject_p, exp_per_subject_p)
 
+    def test_samples_drop(self):
+        # This test has a very small chance of failing by random chance if,
+        # out of the 9 random samples, the present feature isn't subsampled
+        # at least once, but that is pretty unlikely.
+        metadata_df = pd.DataFrame({
+            'id': ['sample1', 'sample2', 'sample3',
+                   'donor1', 'donor2', 'donor3'],
+            'Ref': ['donor1', 'donor2', 'donor3', np.nan, np.nan,
+                    np.nan],
+            'subject': ['sub1', 'sub2', 'sub3', np.nan, np.nan,
+                        np.nan],
+            'group': [1, 1, 1, np.nan, np.nan,
+                      np.nan],
+            "Location": [np.nan, np.nan,
+                         np.nan, 'test', 'test',
+                         'test']}).set_index('id')
+
+        table_df = pd.DataFrame({
+            'id': ['sample1', 'sample2', 'sample3',
+                   'donor1', 'donor2', 'donor3'],
+            'Feature1': [10, 0, 0, 10, 0, 0],
+            'Feature2': [0, 1, 0, 0, 10, 0],
+            'Feature3': [0, 0, 10, 0, 0, 10]}).set_index('id')
+        metadata = Metadata(metadata_df)
+
+        peds, _, _ = pedf_permutation_test(metadata=metadata,
+                                           table=table_df,
+                                           time_column="group",
+                                           reference_column="Ref",
+                                           subject_column="subject",
+                                           num_resamples=999,
+                                           sampling_depth=9)
+
+        self.assertFalse(peds['id'].isin(['sample2']).any())
+
 
 class detect(TestBase):
     def test_baseline_donor_md(self):