From 5e534836315dd78de1120a04afcfa64018d5383b Mon Sep 17 00:00:00 2001 From: Liz Gehret <54517601+lizgehret@users.noreply.github.com> Date: Wed, 4 May 2022 10:34:43 -0700 Subject: [PATCH] LINT: linting cleanup (#17) --- LICENSE | 1 - Makefile | 4 +- ci/recipe/meta.yaml | 6 +- q2_fmt/__init__.py | 5 +- q2_fmt/_engraftment.py | 142 +++++++++++------- q2_fmt/_examples.py | 15 +- q2_fmt/_stats.py | 36 ++--- q2_fmt/_transformer.py | 4 +- q2_fmt/_visualizer.py | 4 +- q2_fmt/plugin_setup.py | 156 ++++++++++++-------- q2_fmt/tests/test_engraftment.py | 237 +++++++++++++++++++------------ 11 files changed, 390 insertions(+), 220 deletions(-) diff --git a/LICENSE b/LICENSE index 9700189..bffa534 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,3 @@ - BSD 3-Clause License Copyright (c) 2022, QIIME 2 development team. diff --git a/Makefile b/Makefile index 4b8cf7c..1b9c4d2 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ PYTHON ?= python -all: +all: ; lint: q2lint @@ -22,4 +22,4 @@ dev: all clean: distclean -distclean: +distclean: ; diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml index ffbb184..362393a 100644 --- a/ci/recipe/meta.yaml +++ b/ci/recipe/meta.yaml @@ -24,14 +24,16 @@ requirements: run: - python {{ python }} + - pandas + - scipy + - scikit-bio + - jinja2 - qiime2 {{ qiime2_epoch }}.* - - q2templates {{ qiime2_epoch }}.* - q2-types {{ qiime2_epoch }}.* test: requires: - qiime2 >={{ qiime2 }} - - q2templates >={{ q2templates }} - q2-types >={{ q2_types }} - pytest diff --git a/q2_fmt/__init__.py b/q2_fmt/__init__.py index f58c592..8b1589e 100644 --- a/q2_fmt/__init__.py +++ b/q2_fmt/__init__.py @@ -15,5 +15,6 @@ __version__ = get_versions()['version'] del get_versions -__all__ = ['RecordTSVFileFormat', 'AnnotatedTSVDirFmt', 'StatsTable', 'Pairwise', - 'GroupDist', 'Ordered', 'Unordered', 'Matched', 'Independent', 'engraftment'] +__all__ = ['RecordTSVFileFormat', 'AnnotatedTSVDirFmt', 'StatsTable', + 'Pairwise', 'GroupDist', 'Ordered', 'Unordered', 'Matched', + 'Independent', 'engraftment'] diff --git a/q2_fmt/_engraftment.py b/q2_fmt/_engraftment.py index d941eea..3882274 100644 --- a/q2_fmt/_engraftment.py +++ b/q2_fmt/_engraftment.py @@ -16,7 +16,8 @@ def engraftment( ctx, diversity_measure, metadata, hypothesis, time_column, reference_column, subject_column, control_column=None, filter_missing_references=False, where=None, against_group=None, - p_val_approx='auto'): + p_val_approx='auto' +): raincloud_plot = ctx.get_action('fmt', 'plot_rainclouds') group_timepoints = ctx.get_action('fmt', 'group_timepoints') @@ -24,25 +25,29 @@ def engraftment( results = [] time_dist, ref_dist = group_timepoints(diversity_measure, metadata, - time_column, reference_column, subject_column, control_column, - filter_missing_references, where) + time_column, reference_column, + subject_column, control_column, + filter_missing_references, where) if hypothesis == 'reference' or hypothesis == 'all-pairwise': mann_whitney_u = ctx.get_action('fmt', 'mann_whitney_u') stats = mann_whitney_u(distribution=ref_dist, hypothesis=hypothesis, reference_group=against_group, - against_each=ref_dist, p_val_approx=p_val_approx) + against_each=ref_dist, + p_val_approx=p_val_approx) else: wilcoxon_srt = ctx.get_action('fmt', 'wilcoxon_srt') stats = wilcoxon_srt(distribution=time_dist, hypothesis=hypothesis, - baseline_group=against_group, p_val_approx=p_val_approx) + baseline_group=against_group, + p_val_approx=p_val_approx) results += stats results += raincloud_plot(data=time_dist, stats=stats[0]) return tuple(results) + def group_timepoints( diversity_measure: pd.Series, metadata: qiime2.Metadata, time_column: str, reference_column: str, subject_column: str = False, @@ -53,9 +58,9 @@ def group_timepoints( diversity_measure.index = _sort_multi_index(diversity_measure.index) is_beta, used_references, time_col, subject_col, used_controls = \ - _data_filtering(diversity_measure, metadata, time_column, reference_column, - subject_column, control_column, filter_missing_references, - where) + _data_filtering(diversity_measure, metadata, time_column, + reference_column, subject_column, control_column, + filter_missing_references, where) original_measure_name = diversity_measure.name diversity_measure.name = 'measure' @@ -72,7 +77,7 @@ def group_timepoints( ordered_df['id'].attrs.update(id_annotation) ordered_df['measure'].attrs.update({ 'unit': ('Distance to %s' % used_references.name) - if is_beta else original_measure_name, + if is_beta else original_measure_name, 'description': '...' }) ordered_df['group'].attrs.update({ @@ -85,9 +90,9 @@ def group_timepoints( 'description': '...' }) - independent_df = _independent_dists(diversity_measure, metadata, - used_references, is_beta, used_controls) + used_references, is_beta, + used_controls) # id, measure, group, [A, B] if is_beta: @@ -107,7 +112,7 @@ def group_timepoints( }) independent_df['group'].attrs.update({ 'unit': used_references.name if used_controls is None else - '%s or %s' % (used_references.name, used_controls.name), + '%s or %s' % (used_references.name, used_controls.name), 'description': '...' }) if is_beta: @@ -116,15 +121,18 @@ def group_timepoints( return ordered_df, independent_df + # HELPER FUNCTION FOR DATA FILTERING def _data_filtering(diversity_measure: pd.Series, metadata: qiime2.Metadata, - time_column: str, reference_column: str, subject_column: str = False, - control_column: str = None, filter_missing_references: bool = False, - where: str = None): + time_column: str, reference_column: str, + subject_column: str = False, control_column: str = None, + filter_missing_references: bool = False, + where: str = None): if diversity_measure.empty: raise ValueError('Empty diversity measure detected.' - ' Please make sure your diversity measure contains data.') + ' Please make sure your diversity measure' + ' contains data.') if isinstance(diversity_measure.index, pd.MultiIndex): is_beta = True @@ -137,7 +145,10 @@ def _data_filtering(diversity_measure: pd.Series, metadata: qiime2.Metadata, metadata = metadata.filter_ids(ids_to_keep=ids_with_data) if where is not None: - metadata = metadata.filter_ids(ids_to_keep=metadata.get_ids(where=where)) + metadata = (metadata + .filter_ids(ids_to_keep=metadata + .get_ids(where=where)) + ) def _get_series_from_col(md, col_name, param_name, expected_type=None, drop_missing_values=False): @@ -145,7 +156,7 @@ def _get_series_from_col(md, col_name, param_name, expected_type=None, column = md.get_column(col_name) except ValueError as e: raise ValueError("There was an issue with the argument for %r. %s" - % (param_name, e)) from e + % (param_name, e)) from e if expected_type is not None and not isinstance(column, expected_type): if type(expected_type) is tuple: @@ -154,18 +165,22 @@ def _get_series_from_col(md, col_name, param_name, expected_type=None, exp = expected_type.type raise ValueError("Provided column for %r is %r, not %r." - % (param_name, column.type, exp)) + % (param_name, column.type, exp)) if drop_missing_values: column = column.drop_missing_values() return column.to_series() - time_col = _get_series_from_col(md=metadata, col_name=time_column, param_name='time_column', - expected_type=qiime2.NumericMetadataColumn) + time_col = _get_series_from_col( + md=metadata, col_name=time_column, + param_name='time_column', + expected_type=qiime2.NumericMetadataColumn) - reference_col = _get_series_from_col(md=metadata, col_name=reference_column, param_name='reference_column', - expected_type=qiime2.CategoricalMetadataColumn) + reference_col = _get_series_from_col( + md=metadata, col_name=reference_column, + param_name='reference_column', + expected_type=qiime2.CategoricalMetadataColumn) used_references = reference_col[~time_col.isna()] @@ -174,44 +189,55 @@ def _get_series_from_col(md, col_name, param_name, expected_type=None, used_references = used_references.dropna() else: nan_references = used_references.index[used_references.isna()] - raise KeyError('Missing references for the associated sample data. Please make sure' - ' that all samples with a timepoint value have an associated reference.' - ' IDs where missing references were found: %s' % (tuple(nan_references),)) + raise KeyError('Missing references for the associated sample data.' + ' Please make sure that all samples with a' + ' timepoint value have an associated reference.' + ' IDs where missing references were found:' + ' %s' % (tuple(nan_references),)) available_references = (used_references.isin(ids_with_data)) if not available_references.all(): if filter_missing_references: used_references = used_references[available_references] else: - raise KeyError('References included in the metadata are missing from the diversity measure.' - ' Please make sure all references included in the metadata are also present' - ' in the diversity measure. Missing references: %s' - % list(used_references[~available_references].unique()) - ) + raise KeyError('References included in the metadata are missing' + ' from the diversity measure. Please make sure all' + ' references included in the metadata are also' + ' present in the diversity measure.' + ' Missing references: %s' + % list(used_references[~available_references] + .unique())) if used_references.empty: raise KeyError('No references were found within the diversity metric.') subject_col = None if subject_column: - subject_col = _get_series_from_col(md=metadata, col_name=subject_column, param_name='subject_column', - expected_type=qiime2.CategoricalMetadataColumn) + subject_col = _get_series_from_col( + md=metadata, col_name=subject_column, + param_name='subject_column', + expected_type=qiime2.CategoricalMetadataColumn) used_controls = None if control_column is not None: - control_col = _get_series_from_col(md=metadata, col_name=control_column, param_name='control_column') + control_col = _get_series_from_col(md=metadata, + col_name=control_column, + param_name='control_column') used_controls = control_col[~control_col.isna()] return is_beta, used_references, time_col, subject_col, used_controls + # HELPER FUNCTION FOR sorting a multi-index (for dist matrix and metadata) def _sort_multi_index(index): sorted_levels = list(map(sorted, index)) sorted_multi = pd.MultiIndex.from_tuples(sorted_levels) return sorted_multi + # HELPER FUNCTION FOR GroupDists[Ordered, Matched | Independent] -def _ordered_dists(diversity_measure: pd.Series, is_beta, used_references, time_col, subject_col): +def _ordered_dists(diversity_measure: pd.Series, is_beta, + used_references, time_col, subject_col): if is_beta: idx = pd.MultiIndex.from_frame( used_references.to_frame().reset_index()) @@ -222,11 +248,16 @@ def _ordered_dists(diversity_measure: pd.Series, is_beta, used_references, time_ idx.name = 'id' try: - sliced_df = diversity_measure[idx].to_frame().reset_index().set_index('id') + sliced_df = (diversity_measure[idx] + .to_frame() + .reset_index() + .set_index('id') + ) except KeyError: - raise KeyError('Pairwise comparisons were unsuccessful. Please double check that your' - ' chosen reference column contains values that are also present in the ID column for' - ' the associated metadata.') + raise KeyError('Pairwise comparisons were unsuccessful. Please double' + ' check that your chosen reference column contains' + ' values that are also present in the ID column for' + ' the associated metadata.') if is_beta: sliced_df.index = used_references.index @@ -239,8 +270,10 @@ def _ordered_dists(diversity_measure: pd.Series, is_beta, used_references, time_ return ordinal_df.reset_index() + # HELPER FUNCTION FOR GroupDists[Unordered, Independent] -def _independent_dists(diversity_measure, metadata, used_references, is_beta, used_controls): +def _independent_dists(diversity_measure, metadata, + used_references, is_beta, used_controls): unique_references = sorted(used_references.unique()) if is_beta: @@ -248,18 +281,26 @@ def _independent_dists(diversity_measure, metadata, used_references, is_beta, us ref_idx = pd.MultiIndex.from_tuples( itertools.combinations(unique_references, 2)) except TypeError: - raise TypeError('Single reference value detected. More than one unique reference must be' - ' provided for successful grouping.') + raise TypeError('Single reference value detected. More than one' + ' unique reference must be provided for' + ' successful grouping.') ref_idx.names = ['A', 'B'] if used_controls is not None: - grouped_md = metadata.to_dataframe().loc[used_controls.index].groupby(used_controls) + grouped_md = (metadata + .to_dataframe() + .loc[used_controls.index] + .groupby(used_controls) + ) ctrl_list = list() for group_id, grouped_ctrls in grouped_md: if len(grouped_ctrls.index) < 2: continue - ctrl_combos = list(itertools.combinations(grouped_ctrls.index, 2)) + ctrl_combos = list( + itertools.combinations( + grouped_ctrls.index, 2) + ) ctrl_idx = pd.MultiIndex.from_tuples(ctrl_combos) ctrl_series = pd.Series(group_id, index=ctrl_idx) ctrl_list.append(ctrl_series) @@ -267,8 +308,10 @@ def _independent_dists(diversity_measure, metadata, used_references, is_beta, us try: ctrl_series = pd.concat(ctrl_list) except ValueError: - raise ValueError('One or less controls detected. When including controls in your data,' - ' please include more than one for successful grouping.') + raise ValueError('One or less controls detected.' + ' When including controls in your data,' + ' please include more than one for' + ' successful grouping.') ctrl_series.name = 'group' ctrl_series.index.names = ['A', 'B'] @@ -282,9 +325,10 @@ def _independent_dists(diversity_measure, metadata, used_references, is_beta, us try: nominal_df = diversity_measure[ref_idx].to_frame().reset_index() except KeyError: - raise KeyError('Pairwise comparisons were unsuccessful. Please double check that your' - ' chosen reference column contains values that are also present in the ID column for' - ' the associated metadata.') + raise KeyError('Pairwise comparisons were unsuccessful. Please double' + ' check that your chosen reference column contains' + ' values that are also present in the ID column for' + ' the associated metadata.') nominal_df['group'] = 'reference' diff --git a/q2_fmt/_examples.py b/q2_fmt/_examples.py index 65bc395..5ab06c0 100644 --- a/q2_fmt/_examples.py +++ b/q2_fmt/_examples.py @@ -11,6 +11,7 @@ import qiime2 + def _get_data_from_tests(path): return pkg_resources.resource_filename('q2_fmt.tests', os.path.join('data', path)) @@ -20,38 +21,47 @@ def alpha_md_factory(): return qiime2.Metadata.load( _get_data_from_tests('sample_metadata_alpha_div.tsv')) + def beta_md_factory(): return qiime2.Metadata.load( _get_data_from_tests('sample_metadata_donors.tsv')) + def alpha_div_factory(): return qiime2.Artifact.import_data( 'SampleData[AlphaDiversity]', _get_data_from_tests('alpha_div.tsv')) + def beta_div_factory(): return qiime2.Artifact.import_data( 'DistanceMatrix', _get_data_from_tests('dist_matrix_donors.tsv')) + def faithpd_timedist_factory(): return qiime2.Artifact.import_data( 'GroupDist[Ordered, Matched]', _get_data_from_tests('faithpd_timedist') ) + def faithpd_refdist_factory(): return qiime2.Artifact.import_data( - 'GroupDist[Unordered, Independent]', _get_data_from_tests('faithpd_refdist') + 'GroupDist[Unordered, Independent]', + _get_data_from_tests('faithpd_refdist') ) + def faithpd_md_factory(): return qiime2.Metadata.load( _get_data_from_tests('metadata-faithpd.tsv') ) + def faithpd_div_factory(): return qiime2.Artifact.import_data( 'SampleData[AlphaDiversity]', _get_data_from_tests('faithpd.tsv') ) + def group_timepoints_alpha_independent(use): alpha = use.init_artifact('alpha', alpha_div_factory) metadata = use.init_metadata('metadata', alpha_md_factory) @@ -97,6 +107,7 @@ def group_timepoints_beta(use): timepoints.assert_output_type('GroupDist[Ordered, Matched]') references.assert_output_type('GroupDist[Unordered, Independent]') + def wilcoxon_baseline0(use): timedist = use.init_artifact('timedist', faithpd_timedist_factory) @@ -115,6 +126,7 @@ def wilcoxon_baseline0(use): stats_table.assert_output_type('StatsTable[Pairwise]') + def mann_whitney_pairwise(use): timedist = use.init_artifact('timedist', faithpd_timedist_factory) refdist = use.init_artifact('refdist', faithpd_refdist_factory) @@ -134,6 +146,7 @@ def mann_whitney_pairwise(use): stats_table.assert_output_type('StatsTable[Pairwise]') + # Engraftment example using faith PD, baseline0 hypothesis def engraftment_baseline(use): md = use.init_metadata('md', faithpd_md_factory) diff --git a/q2_fmt/_stats.py b/q2_fmt/_stats.py index 388e69c..acdbdbc 100644 --- a/q2_fmt/_stats.py +++ b/q2_fmt/_stats.py @@ -12,9 +12,9 @@ def mann_whitney_u(distribution: pd.DataFrame, hypothesis: str, - reference_group: str=None, - against_each: pd.DataFrame=None, - p_val_approx: str='auto') -> pd.DataFrame: + reference_group: str = None, + against_each: pd.DataFrame = None, + p_val_approx: str = 'auto') -> pd.DataFrame: dists = [distribution] @@ -27,14 +27,15 @@ def mann_whitney_u(distribution: pd.DataFrame, hypothesis: str, elif hypothesis == 'all-pairwise': if reference_group is not None: raise ValueError("`all-pairwise` was selected as the hypothesis," - " but a `reference_group` was added. Please either" - " select `reference` as the hypothesis, or remove" - " the `reference_group` parameter from your command.") + " but a `reference_group` was added." + " Please either select `reference` as the" + " hypothesis, or remove the `reference_group`" + " parameter from your command.") comparisons = _comp_all_pairwise(distribution, against_each=against_each) else: - raise ValueError("Invalid hypothesis. Please either choose `reference` or" - " `all-pairwise` as your hypothesis.") + raise ValueError("Invalid hypothesis. Please either choose `reference`" + " or `all-pairwise` as your hypothesis.") table = [] for (idx_a, comp_a), (idx_b, comp_b) in comparisons: @@ -80,7 +81,6 @@ def mann_whitney_u(distribution: pd.DataFrame, hypothesis: str, df['q-value'].attrs.update( dict(unit='Benjamini–Hochberg', description='...')) - return df @@ -125,20 +125,22 @@ def _compare_mannwhitneyu(group_a, group_b, p_val_approx): def wilcoxon_srt(distribution: pd.DataFrame, hypothesis: str, - baseline_group: str=None, p_val_approx: str='auto') -> pd.DataFrame: + baseline_group: str = None, + p_val_approx: str = 'auto') -> pd.DataFrame: if hypothesis == 'baseline': comparisons = _comp_baseline(distribution, baseline_group) elif hypothesis == 'consecutive': if baseline_group is not None: raise ValueError("`consecutive` was selected as the hypothesis," - " but a `baseline_group` was added. Please either" - " select `baseline` as the hypothesis, or remove" - " the `baseline_group` parameter from your command.") + " but a `baseline_group` was added. Please" + " either select `baseline` as the hypothesis," + " or remove the `baseline_group` parameter" + " from your command.") comparisons = _comp_consecutive(distribution) else: - raise ValueError("Invalid hypothesis. Please either choose `baseline` or" - " `consecutive` as your hypothesis.") + raise ValueError("Invalid hypothesis. Please either choose `baseline`" + " or `consecutive` as your hypothesis.") table = [] for comp_a, comp_b in comparisons: @@ -230,6 +232,7 @@ def _compare_wilcoxon(group_a, group_b, p_val_approx) -> dict: return results + def _fdr_correction(p_vals): ranked_p_values = scipy.stats.rankdata(p_vals) fdr = p_vals * len(p_vals) / ranked_p_values @@ -237,6 +240,7 @@ def _fdr_correction(p_vals): return fdr + def _get_reference_from_column(series, reference_value, param_name): if reference_value is None: raise ValueError("%s must be provided." % param_name) @@ -253,4 +257,4 @@ def _get_reference_from_column(series, reference_value, param_name): return reference_value raise ValueError("%r was not found as a group within the distribution." - % reference_value) + % reference_value) diff --git a/q2_fmt/_transformer.py b/q2_fmt/_transformer.py index 0e9a8e5..25227d0 100644 --- a/q2_fmt/_transformer.py +++ b/q2_fmt/_transformer.py @@ -1,5 +1,5 @@ # ---------------------------------------------------------------------------- -# Copyright (c) 2021, QIIME 2 development team. +# Copyright (c) 2022, QIIME 2 development team. # # Distributed under the terms of the Modified BSD License. # @@ -32,6 +32,7 @@ def _3(ff: LSMatFormat) -> pd.Series: dm = skbio.DistanceMatrix.read(str(ff), format='lsmat', verify=False) return dm.to_series() + @plugin.register_transformer def _4(df: AnnotatedTSVDirFmt) -> pd.DataFrame: data = df.data.view(pd.DataFrame) @@ -44,6 +45,7 @@ def _4(df: AnnotatedTSVDirFmt) -> pd.DataFrame: return data + @plugin.register_transformer def _5(obj: pd.DataFrame) -> AnnotatedTSVDirFmt: metadata = [] diff --git a/q2_fmt/_visualizer.py b/q2_fmt/_visualizer.py index c1c33df..a87b2d9 100644 --- a/q2_fmt/_visualizer.py +++ b/q2_fmt/_visualizer.py @@ -14,7 +14,7 @@ def plot_rainclouds(output_dir: str, data: pd.DataFrame, - stats: pd.DataFrame=None): + stats: pd.DataFrame = None): table1 = None if stats is not None: table1, stats = _make_stats(stats) @@ -119,5 +119,5 @@ def _make_group_col(prefix, df): group_n = " (n=" + group_n.apply(str) + ")" series = group_series + group_n - series.name = f"Group " + prefix + series.name = f'{"Group "}' + prefix return series diff --git a/q2_fmt/plugin_setup.py b/q2_fmt/plugin_setup.py index 02cd658..f4d75f7 100644 --- a/q2_fmt/plugin_setup.py +++ b/q2_fmt/plugin_setup.py @@ -8,7 +8,8 @@ import importlib -from qiime2.plugin import Str, Plugin, Metadata, TypeMap, Bool, Choices, Visualization +from qiime2.plugin import (Str, Plugin, Metadata, TypeMap, + Bool, Choices, Visualization) from q2_types.sample_data import SampleData, AlphaDiversity from q2_types.distance_matrix import DistanceMatrix @@ -18,7 +19,8 @@ from q2_fmt._stats import mann_whitney_u, wilcoxon_srt from q2_fmt._format import AnnotatedTSVDirFmt from q2_fmt._visualizer import plot_rainclouds -from q2_fmt._type import GroupDist, Matched, Independent, Ordered, Unordered, StatsTable, Pairwise +from q2_fmt._type import (GroupDist, Matched, Independent, Ordered, + Unordered, StatsTable, Pairwise) import q2_fmt._examples as ex plugin = Plugin(name='fmt', @@ -29,10 +31,12 @@ short_description='Plugin for analyzing FMT data.') plugin.register_formats(RecordTSVFileFormat, AnnotatedTSVDirFmt) -plugin.register_semantic_types(StatsTable, Pairwise, GroupDist, Matched, Independent, - Ordered, Unordered) +plugin.register_semantic_types(StatsTable, Pairwise, GroupDist, Matched, + Independent, Ordered, Unordered) plugin.register_semantic_type_to_format( - GroupDist[Ordered | Unordered, Matched | Independent] | StatsTable[Pairwise], AnnotatedTSVDirFmt) + GroupDist[Ordered | Unordered, + Matched | Independent] | StatsTable[Pairwise], AnnotatedTSVDirFmt + ) T_subject, T_dependence = TypeMap({ Bool % Choices(False): Independent, @@ -47,40 +51,54 @@ 'baseline', 'consecutive'), 'time_column': Str, 'reference_column': Str, 'subject_column': T_subject, 'control_column': Str, - 'filter_missing_references': Bool, 'where': Str, 'against_group': Str, + 'filter_missing_references': Bool, 'where': Str, + 'against_group': Str, 'p_val_approx': Str % Choices('auto', 'exact', 'asymptotic')}, outputs=[ ('stats', StatsTable[Pairwise]), ('raincloud_plot', Visualization) ], - input_descriptions= { + input_descriptions={ 'diversity_measure': '', }, parameter_descriptions={ 'metadata': 'The sample metadata.', - 'hypothesis': 'The hypothesis that will be used to analyze the input `distribution`.' - ' Either `reference`, `all-pairwise`, `baseline` or `consecutive` must be selected.', - 'time_column': 'The column within the `metadata` that the `diversity_measure` should be grouped by.' + 'hypothesis': 'The hypothesis that will be used to analyze the input' + ' `distribution`. Either `reference`, `all-pairwise`,' + ' `baseline` or `consecutive` must be selected.', + 'time_column': 'The column within the `metadata` that the' + ' `diversity_measure` should be grouped by.' ' This column should contain simple integer values.', - 'control_column': 'The column within the `metadata` that contains any relevant control group IDs.' - ' Actual treatment samples should not contain any value within this column.', - 'reference_column': 'The column within the `metadata` that contains the sample to use as a reference' - ' for a given beta `diversity_measure`.' - ' For example, this may be the relevant donor sample to compare against.', - 'subject_column': 'The column within the `metadata` that contains the subject ID to be tracked against timepoints.', - 'filter_missing_references': 'Filter out references contained within the metadata that are not present' - ' in the diversity measure. Default behavior is to raise an error.', + 'control_column': 'The column within the `metadata` that contains any' + ' relevant control group IDs.' + ' Actual treatment samples should not contain any' + ' value within this column.', + 'reference_column': 'The column within the `metadata` that contains' + ' the sample to use as a reference for a given' + ' beta `diversity_measure`. For example, this' + ' may be the relevant donor sample to compare' + ' against.', + 'subject_column': 'The column within the `metadata` that contains the' + ' subject ID to be tracked against timepoints.', + 'filter_missing_references': 'Filter out references contained within' + ' the metadata that are not present in' + ' the diversity measure.' + ' Default behavior is to raise an error.', 'where': '..', - 'against_group': 'Based on the selected hypothesis, this is the column that will be used' - ' to compare all samples against.', - 'p_val_approx': '"exact" will calculate an exact p-value for distributions,' - ' "asymptotic" will use a normal distribution, and "auto" will use either "exact"' - ' when one of the groups has less than 8 observations and there are no ties, otherwise "asymptotic".' + 'against_group': 'Based on the selected hypothesis, this is the column' + ' that will be used to compare all samples against.', + 'p_val_approx': '"exact" will calculate an exact p-value' + ' for distributions, "asymptotic" will use a normal' + ' distribution, and "auto" will use either "exact"' + ' when one of the groups has less than 8 observations' + ' and there are no ties, otherwise "asymptotic".' }, output_descriptions={ - 'stats': 'Either the Mann-Whitney U or Wilcoxon SRT distribution for the chosen hypothesis.', - 'raincloud_plot': 'Raincloud plot for the computed significance test (either Mann-Whitney U or Wilxocon SRT)' - ' from the grouped diversity data and selected hypothesis.', + 'stats': 'Either the Mann-Whitney U or Wilcoxon SRT distribution' + ' for the chosen hypothesis.', + 'raincloud_plot': 'Raincloud plot for the computed significance test' + ' (either Mann-Whitney U or Wilxocon SRT) from the' + ' grouped diversity data and selected hypothesis.', }, name='Engraftment Pipeline for FMT Analysis', description='', @@ -93,31 +111,45 @@ function=group_timepoints, inputs={'diversity_measure': DistanceMatrix | SampleData[AlphaDiversity]}, parameters={'metadata': Metadata, 'time_column': Str, - 'reference_column': Str, 'subject_column': T_subject, 'control_column': Str, - 'filter_missing_references': Bool, 'where': Str}, + 'reference_column': Str, 'subject_column': T_subject, + 'control_column': Str, 'filter_missing_references': Bool, + 'where': Str}, outputs=[('timepoint_dists', GroupDist[Ordered, T_dependence]), ('reference_dists', GroupDist[Unordered, Independent])], parameter_descriptions={ 'metadata': 'The sample metadata.', - 'time_column': 'The column within the `metadata` that the `diversity_measure` should be grouped by.' + 'time_column': 'The column within the `metadata` that the' + ' `diversity_measure` should be grouped by.' ' This column should contain simple integer values.', - 'control_column': 'The column within the `metadata` that contains any relevant control group IDs.' - ' Actual treatment samples should not contain any value within this column.', - 'reference_column': 'The column within the `metadata` that contains the sample to use as a reference' + 'control_column': 'The column within the `metadata` that contains any' + ' relevant control group IDs.' + ' Actual treatment samples should not contain any' + ' value within this column.', + 'reference_column': 'The column within the `metadata` that contains' + ' the sample to use as a reference' ' for a given beta `diversity_measure`.' - ' For example, this may be the relevant donor sample to compare against.', - 'subject_column': 'The column within the `metadata` that contains the subject ID to be tracked against timepoints.', - 'filter_missing_references': 'Filter out references contained within the metadata that are not present' - ' in the diversity measure. Default behavior is to raise an error.', + ' For example, this may be the relevant donor' + ' sample to compare against.', + 'subject_column': 'The column within the `metadata` that contains the' + ' subject ID to be tracked against timepoints.', + 'filter_missing_references': 'Filter out references contained within' + ' the metadata that are not present' + ' in the diversity measure.' + ' Default behavior is to raise an error.', 'where': '..', }, output_descriptions={ - 'timepoint_dists': 'The distributions for the `diversity_measure`, grouped by the selected `time_column`.' - ' May also contain subject IDs, if `subject_column` is provided in the `metadata`.', - 'reference_dists': 'The inter-group reference and inter-group control (when provided) distributions.' - ' When `diversity_measure` is DistanceMatrix, the inter-group calculations' - ' will be all pairwise comparisons within a group.' - ' Otherwise, these are just the per-sample measurements of alpha-diversity.' + 'timepoint_dists': 'The distributions for the `diversity_measure`,' + ' grouped by the selected `time_column`.' + ' May also contain subject IDs, if `subject_column`' + ' is provided in the `metadata`.', + 'reference_dists': 'The inter-group reference and inter-group control' + ' (when provided) distributions.' + ' When `diversity_measure` is DistanceMatrix, the' + ' inter-group calculations will be all pairwise' + ' comparisons within a group.' + ' Otherwise, these are just the per-sample' + ' measurements of alpha-diversity.' }, name='', description='', @@ -130,22 +162,28 @@ plugin.methods.register_function( function=mann_whitney_u, inputs={'distribution': GroupDist[Unordered | Ordered, Independent], - 'against_each': GroupDist[Unordered | Ordered, Matched | Independent]}, + 'against_each': GroupDist[Unordered | Ordered, + Matched | Independent]}, parameters={'hypothesis': Str % Choices('reference', 'all-pairwise'), 'reference_group': Str, 'p_val_approx': Str % Choices('auto', 'exact', 'asymptotic')}, outputs=[('stats', StatsTable[Pairwise])], parameter_descriptions={ - 'hypothesis': 'The hypothesis that will be used to analyze the input `distribution`.' - ' Either `reference` or `all-pairwise` must be selected.', - 'reference_group': 'If `reference` is the selected hypothesis, this is the column that will be used' + 'hypothesis': 'The hypothesis that will be used to analyze the input' + ' `distribution`. Either `reference` or `all-pairwise`' + ' must be selected.', + 'reference_group': 'If `reference` is the selected hypothesis, this' + ' is the column that will be used' ' to compare all samples against.', - 'p_val_approx': '"exact" will calculate an exact p-value for distributions,' - ' "asymptotic" will use a normal distribution, and "auto" will use either "exact"' - ' when one of the groups has less than 8 observations and there are no ties, otherwise "asymptotic".' + 'p_val_approx': '"exact" will calculate an exact p-value for' + ' distributions, "asymptotic" will use a normal' + ' distribution, and "auto" will use either "exact"' + ' when one of the groups has less than 8 observations' + ' and there are no ties, otherwise "asymptotic".' }, output_descriptions={ - 'stats': 'The Mann-Whitney U distribution for either the `reference` or `all-pairwise` hypothesis.', + 'stats': 'The Mann-Whitney U distribution for either the `reference`' + ' or `all-pairwise` hypothesis.', }, name='Mann-Whitney U Test', description='', @@ -162,15 +200,21 @@ 'p_val_approx': Str % Choices('auto', 'exact', 'asymptotic')}, outputs=[('stats', StatsTable[Pairwise])], parameter_descriptions={ - 'hypothesis': 'The hypothesis that will be used to analyze the input `distribution`.' - ' Either `baseline` or `consecutive` must be selected.', - 'baseline_group': 'If `baseline` is the selected hypothesis, this is the column that will be used' + 'hypothesis': 'The hypothesis that will be used to analyze the input' + ' `distribution`. Either `baseline` or `consecutive`' + ' must be selected.', + 'baseline_group': 'If `baseline` is the selected hypothesis, this is' + ' the column that will be used' ' to compare all samples against.', - 'p_val_approx': '"exact" will calculate an exact p-value for distributions of up to 25 (inclusive) measurements,' - ' "asymptotic" will use a normal distribution, and "auto" will use either "exact" or "approx" depending on size.' + 'p_val_approx': '"exact" will calculate an exact p-value for' + ' distributions of up to 25 (inclusive) measurements,' + ' "asymptotic" will use a normal distribution,' + ' and "auto" will use either "exact" or "approx"' + ' depending on size.' }, output_descriptions={ - 'stats': 'The Wilcoxon SRT distribution for either the `baseline` or `consecutive` hypothesis.', + 'stats': 'The Wilcoxon SRT distribution for either the `baseline`' + ' or `consecutive` hypothesis.', }, name='Wilcoxon Signed Rank Test', description='', diff --git a/q2_fmt/tests/test_engraftment.py b/q2_fmt/tests/test_engraftment.py index 4f7d628..c5727d8 100644 --- a/q2_fmt/tests/test_engraftment.py +++ b/q2_fmt/tests/test_engraftment.py @@ -18,16 +18,20 @@ class TestBase(TestPluginBase): - package='q2_fmt.tests' + package = 'q2_fmt.tests' def setUp(self): super().setUp() - self.md_beta = Metadata.load(self.get_data_path('sample_metadata_donors.tsv')) - self.md_alpha = Metadata.load(self.get_data_path('sample_metadata_alpha_div.tsv')) + self.md_beta = Metadata.load(self.get_data_path( + 'sample_metadata_donors.tsv')) + self.md_alpha = Metadata.load(self.get_data_path( + 'sample_metadata_alpha_div.tsv')) - self.dm = DistanceMatrix.read(self.get_data_path('dist_matrix_donors.tsv')).to_series() - self.alpha = pd.read_csv(self.get_data_path('alpha_div.tsv'), sep='\t', index_col=0, squeeze=True) + self.dm = DistanceMatrix.read(self.get_data_path( + 'dist_matrix_donors.tsv')).to_series() + self.alpha = pd.read_csv(self.get_data_path('alpha_div.tsv'), + sep='\t', index_col=0, squeeze=True) self.faithpd_timedist = faithpd_timedist_factory().view(pd.DataFrame) self.faithpd_refdist = faithpd_refdist_factory().view(pd.DataFrame) @@ -35,7 +39,8 @@ def setUp(self): class ErrorMixins: def test_with_time_column_input_not_in_metadata(self): - with self.assertRaisesRegex(ValueError, 'time_column.*foo.*metadata'): + with self.assertRaisesRegex(ValueError, + 'time_column.*foo.*metadata'): group_timepoints(diversity_measure=self.div, metadata=self.md, time_column='foo', @@ -43,7 +48,8 @@ def test_with_time_column_input_not_in_metadata(self): control_column='control') def test_with_reference_column_input_not_in_metadata(self): - with self.assertRaisesRegex(ValueError, 'reference_column.*foo.*metadata'): + with self.assertRaisesRegex(ValueError, + 'reference_column.*foo.*metadata'): group_timepoints(diversity_measure=self.div, metadata=self.md, time_column='days_post_transplant', @@ -51,7 +57,8 @@ def test_with_reference_column_input_not_in_metadata(self): control_column='control') def test_with_control_column_input_not_in_metadata(self): - with self.assertRaisesRegex(ValueError, 'control_column.*foo.*metadata'): + with self.assertRaisesRegex(ValueError, + 'control_column.*foo.*metadata'): group_timepoints(diversity_measure=self.div, metadata=self.md, time_column='days_post_transplant', @@ -59,13 +66,15 @@ def test_with_control_column_input_not_in_metadata(self): control_column='foo') def test_with_non_numeric_time_column(self): - with self.assertRaisesRegex(ValueError, 'time_column.*categorical.*numeric'): + with self.assertRaisesRegex(ValueError, + 'time_column.*categorical.*numeric'): group_timepoints(diversity_measure=self.div, metadata=self.md, time_column='non_numeric_time_column', reference_column='relevant_donor', control_column='control') + class TestAlphaErrors(TestBase, ErrorMixins): def setUp(self): super().setUp() @@ -73,6 +82,7 @@ def setUp(self): self.div = self.alpha self.md = self.md_alpha + class TestBetaErrors(TestBase, ErrorMixins): def setUp(self): super().setUp() @@ -80,6 +90,7 @@ def setUp(self): self.div = self.dm self.md = self.md_beta + class TestGroupTimepoints(TestBase): # Beta Diversity (Distance Matrix) Test Cases def test_beta_dists_with_donors_and_controls(self): @@ -93,9 +104,12 @@ def test_beta_dists_with_donors_and_controls(self): 'id': ['donor1..donor2', 'donor1..donor3', 'donor2..donor3', 'sampleB..sampleC', 'sampleB..sampleD', 'sampleC..sampleD'], 'measure': [0.24, 0.41, 0.74, 0.37, 0.44, 0.31], - 'group': ['reference', 'reference', 'reference', 'control1', 'control1', 'control1'], - 'A': ['donor1', 'donor1', 'donor2', 'sampleB', 'sampleB', 'sampleC'], - 'B': ['donor2', 'donor3', 'donor3', 'sampleC', 'sampleD', 'sampleD'] + 'group': ['reference', 'reference', 'reference', + 'control1', 'control1', 'control1'], + 'A': ['donor1', 'donor1', 'donor2', + 'sampleB', 'sampleB', 'sampleC'], + 'B': ['donor2', 'donor3', 'donor3', + 'sampleC', 'sampleD', 'sampleD'] }) time_df, ref_df = group_timepoints(diversity_measure=self.dm, @@ -112,16 +126,20 @@ def test_beta_dists_with_donors_controls_and_subjects(self): 'id': ['sampleA', 'sampleB', 'sampleC', 'sampleD', 'sampleE'], 'measure': [0.45, 0.40, 0.28, 0.78, 0.66], 'group': [7.0, 7.0, 9.0, 11.0, 11.0], - 'subject': ['subject1', 'subject1', 'subject1', 'subject2', 'subject2'] + 'subject': ['subject1', 'subject1', + 'subject1', 'subject2', 'subject2'] }) exp_ref_df = pd.DataFrame({ 'id': ['donor1..donor2', 'donor1..donor3', 'donor2..donor3', 'sampleB..sampleC', 'sampleB..sampleD', 'sampleC..sampleD'], 'measure': [0.24, 0.41, 0.74, 0.37, 0.44, 0.31], - 'group': ['reference', 'reference', 'reference', 'control1', 'control1', 'control1'], - 'A': ['donor1', 'donor1', 'donor2', 'sampleB', 'sampleB', 'sampleC'], - 'B': ['donor2', 'donor3', 'donor3', 'sampleC', 'sampleD', 'sampleD'] + 'group': ['reference', 'reference', 'reference', + 'control1', 'control1', 'control1'], + 'A': ['donor1', 'donor1', 'donor2', + 'sampleB', 'sampleB', 'sampleC'], + 'B': ['donor2', 'donor3', 'donor3', + 'sampleC', 'sampleD', 'sampleD'] }) time_df, ref_df = group_timepoints(diversity_measure=self.dm, @@ -135,7 +153,8 @@ def test_beta_dists_with_donors_controls_and_subjects(self): pd.testing.assert_frame_equal(ref_df, exp_ref_df) def test_beta_dists_with_same_donor_for_all_samples(self): - with self.assertRaisesRegex(TypeError, 'Single reference value detected'): + with self.assertRaisesRegex(TypeError, + 'Single reference value detected'): group_timepoints(diversity_measure=self.dm, metadata=self.md_beta, time_column='days_post_transplant', @@ -143,20 +162,23 @@ def test_beta_dists_with_same_donor_for_all_samples(self): control_column='control') def test_beta_dists_with_one_donor_and_controls(self): - with self.assertRaisesRegex(KeyError, 'Missing references for the associated sample data'): + with self.assertRaisesRegex(KeyError, + 'Missing references for the associated' + ' sample data'): group_timepoints(diversity_measure=self.dm, - metadata=self.md_beta, - time_column='days_post_transplant', - reference_column='single_donor', - control_column='control') + metadata=self.md_beta, + time_column='days_post_transplant', + reference_column='single_donor', + control_column='control') def test_beta_dists_with_donors_and_one_control(self): - with self.assertRaisesRegex(ValueError, 'One or less controls detected'): + with self.assertRaisesRegex(ValueError, + 'One or less controls detected'): group_timepoints(diversity_measure=self.dm, - metadata=self.md_beta, - time_column='days_post_transplant', - reference_column='relevant_donor', - control_column='single_control') + metadata=self.md_beta, + time_column='days_post_transplant', + reference_column='relevant_donor', + control_column='single_control') def test_beta_dists_with_donors_no_controls(self): exp_time_df = pd.DataFrame({ @@ -182,16 +204,19 @@ def test_beta_dists_with_donors_no_controls(self): pd.testing.assert_frame_equal(ref_df, exp_ref_df) def test_beta_dists_no_donors_with_controls(self): - with self.assertRaisesRegex(TypeError, r"group_timepoints\(\) missing 1 required positional argument: " - "'reference_column'"): + with self.assertRaisesRegex( + TypeError, + r"group_timepoints\(\) missing 1 required positional argument: " + "'reference_column'"): group_timepoints(diversity_measure=self.dm, metadata=self.md_beta, time_column='days_post_transplant', control_column='control') def test_beta_dists_with_invalid_ref_column(self): - with self.assertRaisesRegex(KeyError, 'References included in the metadata are missing' - ' from the diversity measure.*foo.*bar.*baz'): + with self.assertRaisesRegex(KeyError, 'References included in the' + ' metadata are missing from the diversity' + ' measure.*foo.*bar.*baz'): group_timepoints(diversity_measure=self.dm, metadata=self.md_beta, time_column='days_post_transplant', @@ -201,7 +226,8 @@ def test_beta_dists_with_invalid_ref_column(self): def test_beta_dists_with_empty_diversity_series(self): empty_beta_series = pd.Series() - with self.assertRaisesRegex(ValueError, 'Empty diversity measure detected'): + with self.assertRaisesRegex(ValueError, + 'Empty diversity measure detected'): group_timepoints(diversity_measure=empty_beta_series, metadata=self.md_beta, time_column='days_post_transplant', @@ -209,7 +235,8 @@ def test_beta_dists_with_empty_diversity_series(self): control_column='control') def test_beta_dists_with_extra_samples_in_metadata_not_in_diversity(self): - extra_md = Metadata.load(self.get_data_path('sample_metadata_donors_missing.tsv')) + extra_md = Metadata.load(self.get_data_path( + 'sample_metadata_donors_missing.tsv')) exp_time_df = pd.DataFrame({ 'id': ['sampleA', 'sampleB', 'sampleC', 'sampleD', 'sampleE'], @@ -221,9 +248,12 @@ def test_beta_dists_with_extra_samples_in_metadata_not_in_diversity(self): 'id': ['donor1..donor2', 'donor1..donor3', 'donor2..donor3', 'sampleB..sampleC', 'sampleB..sampleD', 'sampleC..sampleD'], 'measure': [0.24, 0.41, 0.74, 0.37, 0.44, 0.31], - 'group': ['reference', 'reference', 'reference', 'control1', 'control1', 'control1'], - 'A': ['donor1', 'donor1', 'donor2', 'sampleB', 'sampleB', 'sampleC'], - 'B': ['donor2', 'donor3', 'donor3', 'sampleC', 'sampleD', 'sampleD'] + 'group': ['reference', 'reference', 'reference', + 'control1', 'control1', 'control1'], + 'A': ['donor1', 'donor1', 'donor2', + 'sampleB', 'sampleB', 'sampleC'], + 'B': ['donor2', 'donor3', 'donor3', + 'sampleC', 'sampleD', 'sampleD'] }) time_df, ref_df = group_timepoints(diversity_measure=self.dm, @@ -236,9 +266,12 @@ def test_beta_dists_with_extra_samples_in_metadata_not_in_diversity(self): pd.testing.assert_frame_equal(ref_df, exp_ref_df) def test_beta_dists_with_extra_samples_in_diversity_not_in_metadata(self): - extra_dm = DistanceMatrix.read(self.get_data_path('dist_matrix_donors_missing.tsv')).to_series() + extra_dm = DistanceMatrix.read(self.get_data_path( + 'dist_matrix_donors_missing.tsv')).to_series() - with self.assertRaisesRegex(ValueError, 'The following IDs are not present in the metadata'): + with self.assertRaisesRegex(ValueError, + 'The following IDs are not present' + ' in the metadata'): group_timepoints(diversity_measure=extra_dm, metadata=self.md_beta, time_column='days_post_transplant', @@ -310,20 +343,23 @@ def test_alpha_dists_with_same_donor_for_all_samples(self): exp_ref_df = pd.DataFrame({ 'id': ['donor1', 'sampleC', 'sampleD', 'sampleE', 'sampleF'], 'measure': [32, 15, 6, 44, 17], - 'group': ['reference', 'control1', 'control1', 'control2', 'control2'] + 'group': ['reference', 'control1', + 'control1', 'control2', 'control2'] }) - time_df, ref_df = group_timepoints(diversity_measure=self.alpha, - metadata=self.md_alpha, - time_column='days_post_transplant', - reference_column='relevant_donor_all', - control_column='control') + time_df, ref_df = group_timepoints( + diversity_measure=self.alpha, metadata=self.md_alpha, + time_column='days_post_transplant', + reference_column='relevant_donor_all', + control_column='control') pd.testing.assert_frame_equal(time_df, exp_time_df) pd.testing.assert_frame_equal(ref_df, exp_ref_df) def test_alpha_dists_with_one_donor_and_controls(self): - with self.assertRaisesRegex(KeyError, 'Missing references for the associated sample data'): + with self.assertRaisesRegex(KeyError, + 'Missing references for the associated' + ' sample data'): group_timepoints(diversity_measure=self.alpha, metadata=self.md_alpha, time_column='days_post_transplant', @@ -341,14 +377,15 @@ def test_alpha_dists_with_donors_and_one_control(self): exp_ref_df = pd.DataFrame({ 'id': ['donor1', 'donor2', 'donor3', 'donor4', 'sampleB'], 'measure': [32, 51, 3, 19, 37], - 'group': ['reference', 'reference', 'reference', 'reference', 'control1'] + 'group': ['reference', 'reference', 'reference', + 'reference', 'control1'] }) time_df, ref_df = group_timepoints(diversity_measure=self.alpha, - metadata=self.md_alpha, - time_column='days_post_transplant', - reference_column='relevant_donor', - control_column='single_control') + metadata=self.md_alpha, + time_column='days_post_transplant', + reference_column='relevant_donor', + control_column='single_control') pd.testing.assert_frame_equal(time_df, exp_time_df) pd.testing.assert_frame_equal(ref_df, exp_ref_df) @@ -376,16 +413,19 @@ def test_alpha_dists_with_donors_no_controls(self): pd.testing.assert_frame_equal(ref_df, exp_ref_df) def test_alpha_dists_no_donors_with_controls(self): - with self.assertRaisesRegex(TypeError, r"group_timepoints\(\) missing 1 required positional argument: " - "'reference_column'"): + with self.assertRaisesRegex( + TypeError, + r"group_timepoints\(\) missing 1 required positional argument: " + "'reference_column'"): group_timepoints(diversity_measure=self.alpha, metadata=self.md_alpha, time_column='days_post_transplant', control_column='control') def test_alpha_dists_with_invalid_ref_column(self): - with self.assertRaisesRegex(KeyError, 'References included in the metadata are missing' - ' from the diversity measure.*foo.*bar.*baz'): + with self.assertRaisesRegex(KeyError, 'References included in the' + ' metadata are missing from the diversity' + ' measure.*foo.*bar.*baz'): group_timepoints(diversity_measure=self.alpha, metadata=self.md_alpha, time_column='days_post_transplant', @@ -395,7 +435,8 @@ def test_alpha_dists_with_invalid_ref_column(self): def test_alpha_dists_with_empty_diversity_series(self): empty_alpha_series = pd.Series() - with self.assertRaisesRegex(ValueError, 'Empty diversity measure detected'): + with self.assertRaisesRegex(ValueError, + 'Empty diversity measure detected'): group_timepoints(diversity_measure=empty_alpha_series, metadata=self.md_alpha, time_column='days_post_transplant', @@ -403,7 +444,8 @@ def test_alpha_dists_with_empty_diversity_series(self): control_column='control') def test_alpha_dists_with_extra_samples_in_metadata_not_in_diversity(self): - extra_md = Metadata.load(self.get_data_path('sample_metadata_alpha_div_missing.tsv')) + extra_md = Metadata.load(self.get_data_path( + 'sample_metadata_alpha_div_missing.tsv')) exp_time_df = pd.DataFrame({ 'id': ['sampleA', 'sampleB', 'sampleC', 'sampleD', @@ -430,9 +472,11 @@ def test_alpha_dists_with_extra_samples_in_metadata_not_in_diversity(self): pd.testing.assert_frame_equal(ref_df, exp_ref_df) def test_alpha_dists_with_extra_samples_in_diversity_not_in_metadata(self): - extra_alpha = pd.read_csv(self.get_data_path('alpha_div_missing.tsv'), sep='\t', index_col=0, squeeze=True) + extra_alpha = pd.read_csv(self.get_data_path('alpha_div_missing.tsv'), + sep='\t', index_col=0, squeeze=True) - with self.assertRaisesRegex(ValueError, 'The following IDs are not present in the metadata'): + with self.assertRaisesRegex(ValueError, 'The following IDs are not' + ' present in the metadata'): group_timepoints(diversity_measure=extra_alpha, metadata=self.md_alpha, time_column='days_post_transplant', @@ -442,12 +486,14 @@ def test_alpha_dists_with_extra_samples_in_diversity_not_in_metadata(self): def test_examples(self): self.execute_examples() + class TestStats(TestBase): # Wilcoxon SRT test cases # Data in the exp_stats_data dataframes were pulled from Greg Caporaso's # Autism study repo on github, which can be found here: - # https://github.com/caporaso-lab/autism-fmt1/blob/18-month-followup/16S/engraftment.ipynb + # https://github.com/caporaso-lab/autism-fmt1/ + # blob/18-month-followup/16S/engraftment.ipynb def test_wilcoxon_with_faith_pd_baseline0_asymptotic(self): exp_stats_data = pd.DataFrame({ 'A:group': [0.0, 0.0, 0.0, 0.0], @@ -462,8 +508,10 @@ def test_wilcoxon_with_faith_pd_baseline0_asymptotic(self): 'q-value': [0.758312374, 0.005782696, 0.00154471, 0.002246758] }) - stats_data = wilcoxon_srt(distribution=self.faithpd_timedist, hypothesis='baseline', - baseline_group='0', p_val_approx='asymptotic') + stats_data = wilcoxon_srt(distribution=self.faithpd_timedist, + hypothesis='baseline', + baseline_group='0', + p_val_approx='asymptotic') pd.testing.assert_frame_equal(stats_data, exp_stats_data) @@ -482,54 +530,65 @@ def test_wilcoxon_with_faith_pd_consecutive_asymptotic(self): }) stats_data = wilcoxon_srt(distribution=self.faithpd_timedist, - hypothesis='consecutive', p_val_approx='asymptotic') + hypothesis='consecutive', + p_val_approx='asymptotic') pd.testing.assert_frame_equal(stats_data, exp_stats_data) def test_wilcoxon_consecutive_hypothesis_with_baseline_group(self): - with self.assertRaisesRegex(ValueError, "`consecutive` was selected as the hypothesis," - " but a `baseline_group` was added."): + with self.assertRaisesRegex(ValueError, "`consecutive` was selected as" + " the hypothesis, but a `baseline_group`" + " was added."): wilcoxon_srt(distribution=self.faithpd_timedist, hypothesis='consecutive', baseline_group='reference') def test_wilcoxon_invalid_hypothesis(self): - with self.assertRaisesRegex(ValueError, "Invalid hypothesis. Please either choose" - " `baseline` or `consecutive` as your hypothesis."): - wilcoxon_srt(distribution=self.faithpd_timedist, hypothesis='foo') + with self.assertRaisesRegex(ValueError, "Invalid hypothesis. Please" + " either choose `baseline` or" + " `consecutive` as your hypothesis."): + wilcoxon_srt(distribution=self.faithpd_timedist, + hypothesis='foo') def test_wilcoxon_invalid_baseline_group(self): - with self.assertRaisesRegex(ValueError, "'foo' was not found as a group" - " within the distribution."): + with self.assertRaisesRegex(ValueError, "'foo' was not found as a" + " group within the distribution."): wilcoxon_srt(distribution=self.faithpd_timedist, hypothesis='baseline', baseline_group='foo') # Mann-Whitney U test cases - # Data in the exp_stats_data dataframes were calculated 'by hand' in a jupyter notebook - # using the same data, manually organized into groups and subsequently compared using - # scipy.stats.mannwhitneyu to calculate the test-statistic and p-values - # Notebook can be found here, for reference: + # Data in the exp_stats_data dataframes were calculated 'by hand' in a + # jupyter notebook using the same data, manually organized into groups + # and subsequently compared using scipy.stats.mannwhitneyu to calculate + # the test-statistic and p-values. Notebook can be found here: # https://gist.github.com/lizgehret/c9add7b451e5e91b1017a2a963276bff def test_mann_whitney_pairwise_against_each(self): exp_stats_data = pd.DataFrame({ 'A:group': ['control', 'control', 'control', 'control', 'control', - 'reference', 'reference', 'reference', 'reference', 'reference'], + 'reference', 'reference', 'reference', + 'reference', 'reference'], 'A:n': [23, 23, 23, 23, 23, 5, 5, 5, 5, 5], - 'A:measure': [11.64962736, 11.64962736, 11.64962736, 11.64962736, 11.64962736, - 10.24883918, 10.24883918, 10.24883918, 10.24883918, 10.24883918], + 'A:measure': [11.64962736, 11.64962736, 11.64962736, + 11.64962736, 11.64962736, + 10.24883918, 10.24883918, 10.24883918, + 10.24883918, 10.24883918], 'B:group': [0, 3, 10, 18, 100, 0, 3, 10, 18, 100], 'B:n': [18, 17, 18, 18, 16, 18, 17, 18, 18, 16], - 'B:measure': [9.54973486, 9.592979726, 10.9817719, 11.39392352, 12.97286672, - 9.54973486, 9.592979726, 10.9817719, 11.39392352, 12.97286672], + 'B:measure': [9.54973486, 9.592979726, 10.9817719, + 11.39392352, 12.97286672, + 9.54973486, 9.592979726, 10.9817719, + 11.39392352, 12.97286672], 'n': [41, 40, 41, 41, 39, 23, 22, 23, 23, 21], 'test-statistic': [282.0, 260.0, 194.0, 190.0, 104.0, 49.0, 43.0, 20.0, 14.0, 6.0], - 'p-value': [0.050330911733538534, 0.07994303215567311, 0.7426248650660427, - 0.6646800940267454, 0.02321456407322841, 0.7941892150565809, + 'p-value': [0.050330911733538534, 0.07994303215567311, + 0.7426248650660427, 0.6646800940267454, + 0.02321456407322841, 0.7941892150565809, 1.0, 0.06783185968744732, 0.023005953105134484, 0.0056718704407604376], - 'q-value': [0.12582728, 0.13323839, 0.92828108, 0.94954299, 0.07738188, - 0.88243246, 1.0, 0.13566372, 0.11502977, 0.0567187], + 'q-value': [0.12582728, 0.13323839, 0.92828108, 0.94954299, + 0.07738188, 0.88243246, 1.0, 0.13566372, + 0.11502977, 0.0567187], }) stats_data = mann_whitney_u(distribution=self.faithpd_refdist, @@ -561,20 +620,22 @@ def test_mann_whitney_reference(self): pd.testing.assert_frame_equal(stats_data, exp_stats_data) def test_mann_whitney_all_pairwise_hypothesis_with_reference_group(self): - with self.assertRaisesRegex(ValueError, "`all-pairwise` was selected as the" - " hypothesis, but a `reference_group` was added."): + with self.assertRaisesRegex(ValueError, "`all-pairwise` was selected" + " as the hypothesis, but a" + " `reference_group` was added."): mann_whitney_u(distribution=self.faithpd_refdist, hypothesis='all-pairwise', reference_group='reference') def test_mann_whitney_invalid_hypothesis(self): - with self.assertRaisesRegex(ValueError, "Invalid hypothesis. Please either" - " choose `reference` or `all-pairwise` as your hypothesis."): + with self.assertRaisesRegex(ValueError, "Invalid hypothesis. Please" + " either choose `reference` or" + " `all-pairwise` as your hypothesis."): mann_whitney_u(distribution=self.faithpd_refdist, hypothesis='foo') def test_mann_whitney_invalid_reference_group(self): - with self.assertRaisesRegex(ValueError, "'foo' was not found as a group" - " within the distribution."): + with self.assertRaisesRegex(ValueError, "'foo' was not found as a" + " group within the distribution."): mann_whitney_u(distribution=self.faithpd_refdist, hypothesis='reference', reference_group='foo')