Skip to content

Commit

Permalink
REF: Add more bumpers and tests to cc --to-baseline (#89)
Browse files Browse the repository at this point in the history
  • Loading branch information
cherman2 authored Aug 21, 2024
1 parent b8161de commit 7d7cb10
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 9 deletions.
41 changes: 33 additions & 8 deletions q2_fmt/_engraftment.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import pandas as pd
import itertools
import numpy as np

import qiime2

Expand All @@ -23,7 +24,6 @@ def cc(
group_timepoints = ctx.get_action('fmt', 'group_timepoints')

results = []

time_dist, ref_dist = group_timepoints(
diversity_measure=diversity_measure,
metadata=metadata, distance_to=distance_to,
Expand Down Expand Up @@ -70,10 +70,15 @@ def group_timepoints(

(is_beta, used_references, time_col, subject_col, group_col,
used_controls) = \
_data_filtering(diversity_measure, metadata, distance_to, time_column,
reference_column, group_column, subject_column,
control_column, filter_missing_references,
baseline_timepoint, where)
_data_filtering(diversity_measure=diversity_measure,
metadata=metadata, distance_to=distance_to,
time_column=time_column,
reference_column=reference_column,
group_column=group_column,
subject_column=subject_column,
control_column=control_column,
filter_missing_references=filter_missing_references,
baseline_timepoint=baseline_timepoint, where=where)

original_measure_name = diversity_measure.name
diversity_measure.name = 'measure'
Expand Down Expand Up @@ -233,18 +238,38 @@ def _get_series_from_col(md, col_name, param_name, expected_type=None,
temp_baseline_ref = []
reference_list = []
baseline_ref_df = pd.DataFrame()
for sub, samples in metadata.to_dataframe().groupby([subject_column]):
# All valid FMT samples have to have a time column
metadata = metadata.to_dataframe()[~time_col.isna()]
if float(baseline_timepoint) not in metadata[time_column].values:
raise AssertionError("The provided baseline timepoint"
f" {baseline_timepoint} was not"
f" found in `metadata` "
f" column {time_column}.")
for sub, samples in metadata.groupby([subject_column]):
reference = \
samples[samples[
time_column] == float(baseline_timepoint)].index.to_list()
if len(reference) != 1:
if len(reference) > 1:
raise ValueError("More than one baseline sample was found per"
" subject. Only one baseline sample can be"
" used as a reference. Please group baseline"
" replicates.")
elif len(reference) == 0:
# If there is no baseline for a subject, its samples are given a NaN
# reference. They will then either be dropped when
# filter-missing-references is set, or trigger an error telling the
# user that they need to pass filter-missing-references.
reference = [np.nan]
temp_baseline_ref = temp_baseline_ref + samples.index.to_list()
reference_list = \
reference_list + (reference * len(samples.index.to_list()))
# I don't see any way this can be hit given the assertion above, but
# I think it's a good check, so I am leaving it in.
if len(reference_list) == 0:
raise AssertionError("No baseline samples",
" were found in the metadata.",
" Please confirm that a valid",
" baseline timepoint was given.")
baseline_ref_df["sample_name"] = temp_baseline_ref
baseline_ref_df["relevant_baseline"] = reference_list
baseline_ref_df = \
Expand All @@ -257,7 +282,7 @@ def _get_series_from_col(md, col_name, param_name, expected_type=None,
# this is so the variables for distance to donor and distance to
# baseline have the same variable name
used_references = reference_col

metadata = qiime2.Metadata(metadata)
if used_references.isna().any():
if filter_missing_references:
used_references = used_references.dropna()
Expand Down
113 changes: 112 additions & 1 deletion q2_fmt/tests/test_engraftment.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,10 +622,121 @@ def test_no_baseline_duplicates(self):
" was found per subject.*"):
group_timepoints(diversity_measure=self.alpha,
metadata=self.md_alpha, distance_to='baseline',
baseline_timepoint="1",
baseline_timepoint="7",
time_column='days_post_transplant',
subject_column='subject')

def test_d2_baseline_alpha_missing_reference(self):
    """A subject without a baseline sample must raise a ``KeyError``.

    ``sub2`` only has samples at timepoints 2 and 3, so with
    ``baseline_timepoint=1`` it has no baseline reference. The call does
    not pass ``filter_missing_references``, and the test expects the
    'Missing references' ``KeyError``.
    """
    sample_ids = ['sample1', 'sample2', 'sample3', 'sample4']
    raw_columns = {
        'id': sample_ids,
        'subject': ['sub1', 'sub1', 'sub2', 'sub2'],
        'group': [1, 2, 2, 3],
    }
    md_baseline = Metadata(pd.DataFrame(raw_columns).set_index('id'))
    obs_feature = pd.Series(data=[1, 0, 1, 0], index=sample_ids)

    expected_message = 'Missing references for the associated sample data'
    with self.assertRaisesRegex(KeyError, expected_message):
        group_timepoints(
            diversity_measure=obs_feature,
            metadata=md_baseline,
            distance_to='baseline',
            time_column='group',
            subject_column='subject',
            baseline_timepoint=1,
        )

def test_d2_baseline_alpha_filt_missing_reference(self):
    """Subjects lacking a baseline are filtered out when requested.

    ``sub2`` has no sample at timepoint 1 and
    ``filter_missing_references=True`` is passed, so the expected output
    only contains ``sub1``: ``sample2`` in the timepoint frame and
    ``sample1`` (its baseline) in the reference frame.
    """
    sample_ids = ['sample1', 'sample2', 'sample3', 'sample4']
    raw_columns = {
        'id': sample_ids,
        'subject': ['sub1', 'sub1', 'sub2', 'sub2'],
        'group': [1, 2, 2, 3],
    }
    md_baseline = Metadata(pd.DataFrame(raw_columns).set_index('id'))
    obs_feature = pd.Series(data=[1, 0, 1, 0], index=sample_ids)

    observed_time, observed_ref = group_timepoints(
        diversity_measure=obs_feature,
        metadata=md_baseline,
        distance_to='baseline',
        time_column='group',
        subject_column='subject',
        baseline_timepoint=1,
        filter_missing_references=True,
    )

    expected_time = pd.DataFrame({
        'id': ['sample2'],
        'measure': [0],
        'group': [2.0],
        'subject': ['sub1'],
    })
    expected_ref = pd.DataFrame({
        'id': ['sample1'],
        'measure': [1],
        'group': ['reference'],
    })

    pd.testing.assert_frame_equal(observed_time, expected_time)
    pd.testing.assert_frame_equal(observed_ref, expected_ref)

def test_d2_baseline_alpha_drop_na_tp(self):
    """Samples with a missing timepoint value are absent from the output.

    ``sample3`` and ``sample4`` carry ``NaN`` in the time column, so the
    expected frames only contain ``sub1``: ``sample2`` as the timepoint
    sample and ``sample1`` as its baseline reference.
    """
    sample_ids = ['sample1', 'sample2', 'sample3', 'sample4']
    raw_columns = {
        'id': sample_ids,
        'subject': ['sub1', 'sub1', 'sub2', 'sub2'],
        'group': [1, 2, np.nan, np.nan],
    }
    md_baseline = Metadata(pd.DataFrame(raw_columns).set_index('id'))
    obs_feature = pd.Series(data=[1, 0, 1, 0], index=sample_ids)

    observed_time, observed_ref = group_timepoints(
        diversity_measure=obs_feature,
        metadata=md_baseline,
        distance_to='baseline',
        time_column='group',
        subject_column='subject',
        baseline_timepoint=1,
        filter_missing_references=True,
    )

    expected_time = pd.DataFrame({
        'id': ['sample2'],
        'measure': [0],
        'group': [2.0],
        'subject': ['sub1'],
    })
    expected_ref = pd.DataFrame({
        'id': ['sample1'],
        'measure': [1],
        'group': ['reference'],
    })

    pd.testing.assert_frame_equal(observed_time, expected_time)
    pd.testing.assert_frame_equal(observed_ref, expected_ref)

def test_d2_baseline_alpha_invalid_tp(self):
    """A baseline timepoint absent from the time column must be rejected.

    The time column ``group`` only holds the values 1, 2 and 3, so
    asking for ``baseline_timepoint=7`` is expected to raise an
    ``AssertionError`` whose message matches 'The provided .* group.'.
    """
    sample_ids = ['sample1', 'sample2', 'sample3', 'sample4']
    raw_columns = {
        'id': sample_ids,
        'subject': ['sub1', 'sub1', 'sub2', 'sub2'],
        'group': [1, 2, 2, 3],
    }
    md_baseline = Metadata(pd.DataFrame(raw_columns).set_index('id'))
    obs_feature = pd.Series(data=[1, 0, 1, 0], index=sample_ids)

    expected_message = 'The provided .* group.'
    with self.assertRaisesRegex(AssertionError, expected_message):
        group_timepoints(
            diversity_measure=obs_feature,
            metadata=md_baseline,
            distance_to='baseline',
            time_column='group',
            subject_column='subject',
            baseline_timepoint=7,
        )

def test_examples(self):
    """Run the usage examples through ``self.execute_examples()``."""
    self.execute_examples()

Expand Down

0 comments on commit 7d7cb10

Please sign in to comment.