Skip to content

Commit

Permalink
LINT: linting cleanup (#17)
Browse files Browse the repository at this point in the history
  • Loading branch information
lizgehret authored May 4, 2022
1 parent 52cd403 commit 5e53483
Show file tree
Hide file tree
Showing 11 changed files with 390 additions and 220 deletions.
1 change: 0 additions & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

BSD 3-Clause License

Copyright (c) 2022, QIIME 2 development team.
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

PYTHON ?= python

all:
all: ;

lint:
q2lint
Expand All @@ -22,4 +22,4 @@ dev: all

clean: distclean

distclean:
distclean: ;
6 changes: 4 additions & 2 deletions ci/recipe/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,16 @@ requirements:

run:
- python {{ python }}
- pandas
- scipy
- scikit-bio
- jinja2
- qiime2 {{ qiime2_epoch }}.*
- q2templates {{ qiime2_epoch }}.*
- q2-types {{ qiime2_epoch }}.*

test:
requires:
- qiime2 >={{ qiime2 }}
- q2templates >={{ q2templates }}
- q2-types >={{ q2_types }}
- pytest

Expand Down
5 changes: 3 additions & 2 deletions q2_fmt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,6 @@
__version__ = get_versions()['version']
del get_versions

__all__ = ['RecordTSVFileFormat', 'AnnotatedTSVDirFmt', 'StatsTable', 'Pairwise',
'GroupDist', 'Ordered', 'Unordered', 'Matched', 'Independent', 'engraftment']
__all__ = ['RecordTSVFileFormat', 'AnnotatedTSVDirFmt', 'StatsTable',
'Pairwise', 'GroupDist', 'Ordered', 'Unordered', 'Matched',
'Independent', 'engraftment']
142 changes: 93 additions & 49 deletions q2_fmt/_engraftment.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,33 +16,38 @@ def engraftment(
ctx, diversity_measure, metadata, hypothesis, time_column,
reference_column, subject_column, control_column=None,
filter_missing_references=False, where=None, against_group=None,
p_val_approx='auto'):
p_val_approx='auto'
):

raincloud_plot = ctx.get_action('fmt', 'plot_rainclouds')
group_timepoints = ctx.get_action('fmt', 'group_timepoints')

results = []

time_dist, ref_dist = group_timepoints(diversity_measure, metadata,
time_column, reference_column, subject_column, control_column,
filter_missing_references, where)
time_column, reference_column,
subject_column, control_column,
filter_missing_references, where)

if hypothesis == 'reference' or hypothesis == 'all-pairwise':
mann_whitney_u = ctx.get_action('fmt', 'mann_whitney_u')
stats = mann_whitney_u(distribution=ref_dist, hypothesis=hypothesis,
reference_group=against_group,
against_each=ref_dist, p_val_approx=p_val_approx)
against_each=ref_dist,
p_val_approx=p_val_approx)

else:
wilcoxon_srt = ctx.get_action('fmt', 'wilcoxon_srt')
stats = wilcoxon_srt(distribution=time_dist, hypothesis=hypothesis,
baseline_group=against_group, p_val_approx=p_val_approx)
baseline_group=against_group,
p_val_approx=p_val_approx)

results += stats
results += raincloud_plot(data=time_dist, stats=stats[0])

return tuple(results)


def group_timepoints(
diversity_measure: pd.Series, metadata: qiime2.Metadata,
time_column: str, reference_column: str, subject_column: str = False,
Expand All @@ -53,9 +58,9 @@ def group_timepoints(
diversity_measure.index = _sort_multi_index(diversity_measure.index)

is_beta, used_references, time_col, subject_col, used_controls = \
_data_filtering(diversity_measure, metadata, time_column, reference_column,
subject_column, control_column, filter_missing_references,
where)
_data_filtering(diversity_measure, metadata, time_column,
reference_column, subject_column, control_column,
filter_missing_references, where)

original_measure_name = diversity_measure.name
diversity_measure.name = 'measure'
Expand All @@ -72,7 +77,7 @@ def group_timepoints(
ordered_df['id'].attrs.update(id_annotation)
ordered_df['measure'].attrs.update({
'unit': ('Distance to %s' % used_references.name)
if is_beta else original_measure_name,
if is_beta else original_measure_name,
'description': '...'
})
ordered_df['group'].attrs.update({
Expand All @@ -85,9 +90,9 @@ def group_timepoints(
'description': '...'
})


independent_df = _independent_dists(diversity_measure, metadata,
used_references, is_beta, used_controls)
used_references, is_beta,
used_controls)

# id, measure, group, [A, B]
if is_beta:
Expand All @@ -107,7 +112,7 @@ def group_timepoints(
})
independent_df['group'].attrs.update({
'unit': used_references.name if used_controls is None else
'%s or %s' % (used_references.name, used_controls.name),
'%s or %s' % (used_references.name, used_controls.name),
'description': '...'
})
if is_beta:
Expand All @@ -116,15 +121,18 @@ def group_timepoints(

return ordered_df, independent_df


# HELPER FUNCTION FOR DATA FILTERING
def _data_filtering(diversity_measure: pd.Series, metadata: qiime2.Metadata,
time_column: str, reference_column: str, subject_column: str = False,
control_column: str = None, filter_missing_references: bool = False,
where: str = None):
time_column: str, reference_column: str,
subject_column: str = False, control_column: str = None,
filter_missing_references: bool = False,
where: str = None):

if diversity_measure.empty:
raise ValueError('Empty diversity measure detected.'
' Please make sure your diversity measure contains data.')
' Please make sure your diversity measure'
' contains data.')

if isinstance(diversity_measure.index, pd.MultiIndex):
is_beta = True
Expand All @@ -137,15 +145,18 @@ def _data_filtering(diversity_measure: pd.Series, metadata: qiime2.Metadata,
metadata = metadata.filter_ids(ids_to_keep=ids_with_data)

if where is not None:
metadata = metadata.filter_ids(ids_to_keep=metadata.get_ids(where=where))
metadata = (metadata
.filter_ids(ids_to_keep=metadata
.get_ids(where=where))
)

def _get_series_from_col(md, col_name, param_name, expected_type=None,
drop_missing_values=False):
try:
column = md.get_column(col_name)
except ValueError as e:
raise ValueError("There was an issue with the argument for %r. %s"
% (param_name, e)) from e
% (param_name, e)) from e

if expected_type is not None and not isinstance(column, expected_type):
if type(expected_type) is tuple:
Expand All @@ -154,18 +165,22 @@ def _get_series_from_col(md, col_name, param_name, expected_type=None,
exp = expected_type.type

raise ValueError("Provided column for %r is %r, not %r."
% (param_name, column.type, exp))
% (param_name, column.type, exp))

if drop_missing_values:
column = column.drop_missing_values()

return column.to_series()

time_col = _get_series_from_col(md=metadata, col_name=time_column, param_name='time_column',
expected_type=qiime2.NumericMetadataColumn)
time_col = _get_series_from_col(
md=metadata, col_name=time_column,
param_name='time_column',
expected_type=qiime2.NumericMetadataColumn)

reference_col = _get_series_from_col(md=metadata, col_name=reference_column, param_name='reference_column',
expected_type=qiime2.CategoricalMetadataColumn)
reference_col = _get_series_from_col(
md=metadata, col_name=reference_column,
param_name='reference_column',
expected_type=qiime2.CategoricalMetadataColumn)

used_references = reference_col[~time_col.isna()]

Expand All @@ -174,44 +189,55 @@ def _get_series_from_col(md, col_name, param_name, expected_type=None,
used_references = used_references.dropna()
else:
nan_references = used_references.index[used_references.isna()]
raise KeyError('Missing references for the associated sample data. Please make sure'
' that all samples with a timepoint value have an associated reference.'
' IDs where missing references were found: %s' % (tuple(nan_references),))
raise KeyError('Missing references for the associated sample data.'
' Please make sure that all samples with a'
' timepoint value have an associated reference.'
' IDs where missing references were found:'
' %s' % (tuple(nan_references),))

available_references = (used_references.isin(ids_with_data))
if not available_references.all():
if filter_missing_references:
used_references = used_references[available_references]
else:
raise KeyError('References included in the metadata are missing from the diversity measure.'
' Please make sure all references included in the metadata are also present'
' in the diversity measure. Missing references: %s'
% list(used_references[~available_references].unique())
)
raise KeyError('References included in the metadata are missing'
' from the diversity measure. Please make sure all'
' references included in the metadata are also'
' present in the diversity measure.'
' Missing references: %s'
% list(used_references[~available_references]
.unique()))

if used_references.empty:
raise KeyError('No references were found within the diversity metric.')

subject_col = None
if subject_column:
subject_col = _get_series_from_col(md=metadata, col_name=subject_column, param_name='subject_column',
expected_type=qiime2.CategoricalMetadataColumn)
subject_col = _get_series_from_col(
md=metadata, col_name=subject_column,
param_name='subject_column',
expected_type=qiime2.CategoricalMetadataColumn)

used_controls = None
if control_column is not None:
control_col = _get_series_from_col(md=metadata, col_name=control_column, param_name='control_column')
control_col = _get_series_from_col(md=metadata,
col_name=control_column,
param_name='control_column')
used_controls = control_col[~control_col.isna()]

return is_beta, used_references, time_col, subject_col, used_controls


# HELPER FUNCTION FOR sorting a multi-index (for dist matrix and metadata)
def _sort_multi_index(index):
    """Return a MultiIndex whose entries are the sorted entries of ``index``.

    Each entry of ``index`` (e.g. an ``(A, B)`` pair of IDs) is sorted on its
    own, so ``('b', 'a')`` and ``('a', 'b')`` both become ``('a', 'b')``.
    The order of the entries themselves is left untouched.

    Parameters
    ----------
    index : pd.MultiIndex or iterable of tuple-likes
        Entries to normalize.

    Returns
    -------
    pd.MultiIndex
        New index built from the sorted entries. Level names of ``index``
        are not carried over. If ``index`` has no entries it is returned
        unchanged.
    """
    sorted_entries = [tuple(sorted(entry)) for entry in index]

    # ``MultiIndex.from_tuples`` cannot infer the number of levels from an
    # empty list and raises a TypeError; there is nothing to sort in that
    # case, so hand the input back and let callers report the empty data.
    if not sorted_entries:
        return index

    return pd.MultiIndex.from_tuples(sorted_entries)


# HELPER FUNCTION FOR GroupDists[Ordered, Matched | Independent]
def _ordered_dists(diversity_measure: pd.Series, is_beta, used_references, time_col, subject_col):
def _ordered_dists(diversity_measure: pd.Series, is_beta,
used_references, time_col, subject_col):
if is_beta:
idx = pd.MultiIndex.from_frame(
used_references.to_frame().reset_index())
Expand All @@ -222,11 +248,16 @@ def _ordered_dists(diversity_measure: pd.Series, is_beta, used_references, time_
idx.name = 'id'

try:
sliced_df = diversity_measure[idx].to_frame().reset_index().set_index('id')
sliced_df = (diversity_measure[idx]
.to_frame()
.reset_index()
.set_index('id')
)
except KeyError:
raise KeyError('Pairwise comparisons were unsuccessful. Please double check that your'
' chosen reference column contains values that are also present in the ID column for'
' the associated metadata.')
raise KeyError('Pairwise comparisons were unsuccessful. Please double'
' check that your chosen reference column contains'
' values that are also present in the ID column for'
' the associated metadata.')

if is_beta:
sliced_df.index = used_references.index
Expand All @@ -239,36 +270,48 @@ def _ordered_dists(diversity_measure: pd.Series, is_beta, used_references, time_

return ordinal_df.reset_index()


# HELPER FUNCTION FOR GroupDists[Unordered, Independent]
def _independent_dists(diversity_measure, metadata, used_references, is_beta, used_controls):
def _independent_dists(diversity_measure, metadata,
used_references, is_beta, used_controls):
unique_references = sorted(used_references.unique())

if is_beta:
try:
ref_idx = pd.MultiIndex.from_tuples(
itertools.combinations(unique_references, 2))
except TypeError:
raise TypeError('Single reference value detected. More than one unique reference must be'
' provided for successful grouping.')
raise TypeError('Single reference value detected. More than one'
' unique reference must be provided for'
' successful grouping.')

ref_idx.names = ['A', 'B']

if used_controls is not None:
grouped_md = metadata.to_dataframe().loc[used_controls.index].groupby(used_controls)
grouped_md = (metadata
.to_dataframe()
.loc[used_controls.index]
.groupby(used_controls)
)
ctrl_list = list()
for group_id, grouped_ctrls in grouped_md:
if len(grouped_ctrls.index) < 2:
continue
ctrl_combos = list(itertools.combinations(grouped_ctrls.index, 2))
ctrl_combos = list(
itertools.combinations(
grouped_ctrls.index, 2)
)
ctrl_idx = pd.MultiIndex.from_tuples(ctrl_combos)
ctrl_series = pd.Series(group_id, index=ctrl_idx)
ctrl_list.append(ctrl_series)

try:
ctrl_series = pd.concat(ctrl_list)
except ValueError:
raise ValueError('One or less controls detected. When including controls in your data,'
' please include more than one for successful grouping.')
raise ValueError('One or less controls detected.'
' When including controls in your data,'
' please include more than one for'
' successful grouping.')

ctrl_series.name = 'group'
ctrl_series.index.names = ['A', 'B']
Expand All @@ -282,9 +325,10 @@ def _independent_dists(diversity_measure, metadata, used_references, is_beta, us
try:
nominal_df = diversity_measure[ref_idx].to_frame().reset_index()
except KeyError:
raise KeyError('Pairwise comparisons were unsuccessful. Please double check that your'
' chosen reference column contains values that are also present in the ID column for'
' the associated metadata.')
raise KeyError('Pairwise comparisons were unsuccessful. Please double'
' check that your chosen reference column contains'
' values that are also present in the ID column for'
' the associated metadata.')

nominal_df['group'] = 'reference'

Expand Down
Loading

0 comments on commit 5e53483

Please sign in to comment.