Skip to content

Commit

Permalink
Workflows Base Module (#229)
Browse files Browse the repository at this point in the history
* close #237
* add workflows.base (code for reading CSV file of projects and running automated workflows on each project)
* add workflows.registry (with a single example workflow, DihedralAnalysis)
* add new testing data, .csv for workflows base module; add workflows to STATES dictionary
* add documentation for workflows registry and base module
* update CHANGES
  • Loading branch information
cadeduckworth authored Apr 4, 2023
1 parent b7d0b06 commit cafb300
Show file tree
Hide file tree
Showing 9 changed files with 361 additions and 4 deletions.
11 changes: 7 additions & 4 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,13 @@ Changes

Enhancements

* new workflows module (PR #217)
* new workflows registry that contains each EnsembleAnalysis for which
a workflows module exists, for use with workflows base module (#229)
* new workflows base module that provides iterative workflow use for
directories that contain multiple projects (#229)
* new workflows module (#217)
* new automated dihedral analysis workflow (detect dihedrals with SMARTS,
analyze with EnsembleAnalysis, and generate seaborn violinplots)
PR #217)
analyze with EnsembleAnalysis, and generate seaborn violinplots) (#217)

Fixes

Expand All @@ -36,7 +39,7 @@ Fixes
* fix ensemble.EnsembleAnalysis.check_groups_from_common_ensemble (#212)


2021-01-03 0.8.0
2022-01-03 0.8.0
ALescoulie, orbeckst

Changes
Expand Down
2 changes: 2 additions & 0 deletions doc/sphinx/source/workflows.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,6 @@ for use with :class:`~mdpow.analysis.dihedral.DihedralAnalysis`.
.. toctree::
:maxdepth: 1

workflows/base
workflows/registry
workflows/dihedrals
7 changes: 7 additions & 0 deletions doc/sphinx/source/workflows/base.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
==============
Workflows Base
==============

.. versionadded:: 0.9.0

.. automodule:: mdpow.workflows.base
7 changes: 7 additions & 0 deletions doc/sphinx/source/workflows/registry.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
==================
Workflows Registry
==================

.. versionadded:: 0.9.0

.. automodule:: mdpow.workflows.registry
1 change: 1 addition & 0 deletions mdpow/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@
"FEP": RESOURCES.join("states", "FEP"),
"base": RESOURCES.join("states", "base"),
"md_npt": RESOURCES.join("states", "FEP"),
"workflows": RESOURCES.join("states", "workflows"),
}
CONFIGURATIONS = RESOURCES.join("test_configurations")
101 changes: 101 additions & 0 deletions mdpow/tests/test_workflows_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import re
import os
import sys
import yaml
import pybol
import pytest
import pathlib
import logging

import pandas as pd

from . import RESOURCES
from . import STATES

import py.path

from ..workflows import base

from pkg_resources import resource_filename

# NOTE: this rebinds ``RESOURCES`` (imported from the tests package above) to a
# ``pathlib.PurePath`` so that the ``/`` operator can be used to build MANIFEST.
RESOURCES = pathlib.PurePath(resource_filename(__name__, 'testing_resources'))
# Path of the manifest file handed to ``pybol.Manifest`` by the fixture below.
MANIFEST = RESOURCES / 'manifest.yml'

@pytest.fixture(scope='function')
def molname_workflows_directory(tmp_path):
    """Assemble the 'workflows' manifest entry into a fresh temporary directory.

    Builds a :class:`pybol.Manifest` from ``MANIFEST``, calls its ``assemble``
    method for the ``'workflows'`` entry with pytest's per-test *tmp_path* as
    target, and returns that same *tmp_path*.
    """
    manifest = pybol.Manifest(str(MANIFEST))
    manifest.assemble('workflows', tmp_path)
    return tmp_path

class TestWorkflowsBase(object):
    """Tests for :mod:`mdpow.workflows.base`.

    Covers :func:`base.project_paths` (directory and csv input) and
    :func:`base.automated_project_analysis` (successful run and invalid
    ``ensemble_analysis`` key).
    """

    @pytest.fixture(scope='function')
    def SM_tmp_dir(self, molname_workflows_directory):
        """Return the temporary directory assembled by ``molname_workflows_directory``."""
        dirname = molname_workflows_directory
        return dirname

    @pytest.fixture(scope='function')
    def csv_input_data(self):
        """Return the path of the reference csv and its content as a DataFrame."""
        csv_path = STATES['workflows'] / 'project_paths.csv'
        csv_df = pd.read_csv(csv_path).reset_index(drop=True)
        return csv_path, csv_df

    @pytest.fixture(scope='function')
    def test_df_data(self):
        """Return the expected molecule/resname columns for the test projects."""
        test_dict = {'molecule': ['SM25', 'SM26'],
                     'resname': ['SM25', 'SM26']}
        test_df = pd.DataFrame(test_dict).reset_index(drop=True)
        return test_df

    @pytest.fixture(scope='function')
    def project_paths_data(self, SM_tmp_dir):
        """Return the DataFrame produced by ``base.project_paths`` for the tmp dir."""
        project_paths = base.project_paths(parent_directory=SM_tmp_dir)
        return project_paths

    def test_project_paths(self, test_df_data, project_paths_data):
        """Every expected (molecule, resname) pair must be detected.

        The comparison is deliberately independent of row order: the projects
        are discovered with ``os.walk``, which yields directory entries in an
        arbitrary, filesystem-dependent order, so positional checks such as
        ``project_paths['molecule'][0] == 'SM25'`` can fail spuriously.
        """
        test_df = test_df_data
        project_paths = project_paths_data

        found = set(zip(project_paths['molecule'], project_paths['resname']))
        for expected in zip(test_df['molecule'], test_df['resname']):
            assert expected in found, (
                f'project {expected} was not detected by project_paths')

    def test_project_paths_csv_input(self, csv_input_data):
        """Reading the reference csv must reproduce its content unchanged."""
        csv_path, csv_df = csv_input_data
        project_paths = base.project_paths(csv=csv_path)

        pd.testing.assert_frame_equal(project_paths, csv_df)

    def test_automated_project_analysis(self, project_paths_data, caplog):
        """The workflow must run for every project and log its completion."""
        # The final message is emitted at INFO level; capture it explicitly
        # instead of depending on whatever logging configuration is active.
        caplog.set_level(logging.INFO, logger='mdpow.workflows.base')

        project_paths = project_paths_data
        # change resname to match topology (every SAMPL7 resname is 'UNK')
        # only necessary for this dataset, not necessary for normal use
        project_paths['resname'] = 'UNK'

        base.automated_project_analysis(project_paths, solvents=('water',),
                                        ensemble_analysis='DihedralAnalysis')

        assert 'all analyses completed' in caplog.text, ('automated_dihedral_analysis '
                'did not iteratively run to completion for the provided project')

    def test_automated_project_analysis_KeyError(self, project_paths_data, caplog):
        """An unknown ``ensemble_analysis`` must raise KeyError and log an error."""
        caplog.clear()
        caplog.set_level(logging.ERROR, logger='mdpow.workflows.base')

        project_paths = project_paths_data
        # change resname to match topology (every SAMPL7 resname is 'UNK')
        # only necessary for this dataset, not necessary for normal use
        project_paths['resname'] = 'UNK'

        # test error output when raised
        with pytest.raises(KeyError,
                           match="Invalid ensemble_analysis 'DarthVaderAnalysis'. "
                                 "An EnsembleAnalysis type that corresponds to an existing "
                                 "automated workflow module must be input as a kwarg. ex: "
                                 "ensemble_analysis='DihedralAnalysis'"):
            base.automated_project_analysis(project_paths, ensemble_analysis='DarthVaderAnalysis', solvents=('water',))

        # test logger error recording
        assert "'DarthVaderAnalysis' is an invalid selection" in caplog.text, ('did not catch incorrect '
                'key specification for workflows.registry that results in KeyError')
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
molecule,resname,path
SM25,SM25,mdpow/tests/testing_resources/states/workflows/SM25
SM26,SM26,mdpow/tests/testing_resources/states/workflows/SM26
180 changes: 180 additions & 0 deletions mdpow/workflows/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
# MDPOW: base.py
# 2022 Cade Duckworth

"""
:mod:`mdpow.workflows.base` --- Automated workflow base functions
=================================================================
To analyze multiple MDPOW projects, provide :func:`project_paths`
with the top-level directory containing all MDPOW projects' simulation data
to obtain a :class:`pandas.DataFrame` containing the project information
and paths. Then, :func:`automated_project_analysis` takes as input the
aforementioned :class:`pandas.DataFrame` and runs the specified
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` for all MDPOW projects
under the top-level directory provided to :func:`project_paths`.
.. seealso:: :mod:`~mdpow.workflows.registry`
.. autofunction:: project_paths
.. autofunction:: automated_project_analysis
"""

import os
import re
import pandas as pd

from mdpow.workflows import registry

import logging

logger = logging.getLogger('mdpow.workflows.base')

def project_paths(parent_directory=None, csv=None, csv_save_dir=None):
    """Takes a top directory containing MDPOW projects and determines
    the molname, resname, and path, of each MDPOW project within.

    Optionally takes a .csv file containing `molecule`, `resname`, and
    `path`, in that order. If both *parent_directory* and *csv* are
    given, *parent_directory* takes precedence and *csv* is ignored.

    :keywords:

    *parent_directory*
        the path for the location of the top directory
        under which the subdirectories of MDPOW simulation
        data exist, additionally creates a 'project_paths.csv' file
        for user manipulation of metadata and for future reference;
        a directory counts as a project if it directly contains a
        subdirectory whose name starts with ``FEP``, and the name of
        the project directory is used for both `molecule` and `resname`

    *csv*
        .csv file containing the molecule names, resnames,
        and paths, in that order, for the MDPOW simulation
        data to be iterated over must contain header of the
        form: `molecule,resname,path`

    *csv_save_dir*
        optionally provided directory to save .csv file, otherwise,
        data will be saved in current working directory

    :returns:

    *project_paths*
        :class:`pandas.DataFrame` containing MDPOW project metadata
        (columns `molecule`, `resname`, `path`), sorted by these
        columns and with a fresh 0-based index

    :raises:

    *ValueError*
        if neither *parent_directory* nor *csv* is provided

    .. rubric:: Example

    Typical Workflow::

        project_paths = project_paths(parent_directory='/foo/bar/MDPOW_projects')
        automated_project_analysis(project_paths)

    or::

        project_paths = project_paths(csv='/foo/bar/MDPOW.csv')
        automated_project_analysis(project_paths)

    """
    if parent_directory is None and csv is None:
        # Without this guard the function would fail on an unbound local.
        raise ValueError("Either 'parent_directory' or 'csv' must be provided.")

    sort_columns = ['molecule', 'resname', 'path']

    if parent_directory is not None:
        fep_pattern = re.compile('FEP')

        # A directory is an MDPOW project if one of its immediate
        # subdirectories has a name that starts with 'FEP'.
        locations = [
            dirpath.strip()
            for dirpath, dirnames, _ in os.walk(parent_directory)
            if any(fep_pattern.match(dirname) for dirname in dirnames)
        ]

        # normpath/basename works with any path separator and also when the
        # path ends with a separator (plain split('/') does not).
        resnames = [os.path.basename(os.path.normpath(loc)) for loc in locations]

        projects = pd.DataFrame(
            {
                'molecule': resnames,
                'resname': resnames,
                'path': locations,
            }
        )
        # os.walk yields directories in arbitrary order; sort so that the
        # result is deterministic and consistent with the csv input branch.
        projects = projects.sort_values(by=sort_columns).reset_index(drop=True)

        if csv_save_dir is not None:
            save_dir = csv_save_dir
            csv_file = os.path.join(csv_save_dir, 'project_paths.csv')
        else:
            save_dir = os.getcwd()
            csv_file = 'project_paths.csv'

        projects.to_csv(csv_file, index=False)
        logger.info(f'project_paths saved under {save_dir}')

    else:
        projects = pd.read_csv(csv).sort_values(by=sort_columns).reset_index(drop=True)

    return projects

def automated_project_analysis(project_paths, ensemble_analysis, **kwargs):
    """Takes a :class:`pandas.DataFrame` created by :func:`~mdpow.workflows.base.project_paths`
    and iteratively runs the specified :class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
    for each of the projects by running the associated automated workflow
    in each project directory returned by :func:`~mdpow.workflows.base.project_paths`.

    Compatibility with more automated analyses in development.

    The workflow is looked up in :data:`mdpow.workflows.registry.registry`
    once, before any project is processed. Exceptions raised by the
    workflow itself propagate unchanged so that they are not mistaken
    for an invalid *ensemble_analysis* selection.

    :keywords:

    *project_paths*
        :class:`pandas.DataFrame` that provides paths to MDPOW projects;
        the columns `molecule`, `resname`, and `path` are read for each row

    *ensemble_analysis*
        name of the :class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
        that corresponds to the desired automated workflow module

    *kwargs*
        keyword arguments for the supported automated workflows,
        see the :mod:`~mdpow.workflows.registry` for all available
        workflows and their call signatures; passed on unchanged to
        the workflow for every project

    :raises:

    *KeyError*
        if *ensemble_analysis* is not a key of the registry

    *TypeError*
        if *ensemble_analysis* cannot be used as a registry key or the
        registry entry is not callable (no workflow available yet)

    .. rubric:: Example

    A typical workflow is the automated dihedral analysis from
    :mod:`mdpow.workflows.dihedrals`, which applies the *ensemble analysis*
    :class:`~mdpow.analysis.dihedral.DihedralAnalysis` to each project.
    The :data:`~mdpow.workflows.registry.registry` contains this automated
    workflow under the key *"DihedralAnalysis"* and so the automated execution
    for all `project_paths` (obtained via :func:`project_paths`) is performed by
    passing the specific key to :func:`automated_project_analysis`::

        project_paths = project_paths(parent_directory='/foo/bar/MDPOW_projects')
        automated_project_analysis(project_paths, ensemble_analysis='DihedralAnalysis', **kwargs)

    """
    type_error_msg = (f"Invalid ensemble_analysis {ensemble_analysis}. An EnsembleAnalysis type that "
                      "corresponds to an existing automated workflow module must be input as a kwarg. "
                      "ex: ensemble_analysis='DihedralAnalysis'")

    # Only the registry lookup is guarded: a KeyError/TypeError raised inside
    # the workflow must not be reported as an invalid workflow selection.
    try:
        workflow = registry.registry[ensemble_analysis]

    except KeyError as err:
        msg = (f"Invalid ensemble_analysis {err}. An EnsembleAnalysis type that corresponds "
               "to an existing automated workflow module must be input as a kwarg. "
               "ex: ensemble_analysis='DihedralAnalysis'")
        logger.error(f'{err} is an invalid selection')

        raise KeyError(msg) from err

    except TypeError as err:
        # e.g. an unhashable value was passed as ensemble_analysis
        logger.error(f'workflow module for {ensemble_analysis} does not exist yet')

        raise TypeError(type_error_msg) from err

    if not callable(workflow):
        # registry entry exists but does not (yet) point to a workflow function
        logger.error(f'workflow module for {ensemble_analysis} does not exist yet')

        raise TypeError(type_error_msg)

    for row in project_paths.itertuples():
        molname = row.molecule

        logger.info(f'starting {molname}')

        workflow(dirname=row.path, resname=row.resname, molname=molname, **kwargs)

        logger.info(f'{molname} completed')

    logger.info('all analyses completed')
53 changes: 53 additions & 0 deletions mdpow/workflows/registry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# MDPOW: registry.py
# 2023 Cade Duckworth

"""
:mod:`mdpow.workflows.registry` --- Registry of currently supported automated workflows
=======================================================================================
The :mod:`mdpow.workflows.registry` module hosts a dictionary whose keys each name an
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` for which a corresponding automated workflow exists.
.. table:: Currently supported automated workflows.
:widths: auto
:name: workflows_registry
+-------------------------------+------------------------------------------------------------------------------------------------------+
| key/keyword: EnsembleAnalysis | value: <workflow module>.<top-level automated analysis function> |
+===============================+======================================================================================================+
| DihedralAnalysis | :any:`dihedrals.automated_dihedral_analysis <mdpow.workflows.dihedrals.automated_dihedral_analysis>` |
+-------------------------------+------------------------------------------------------------------------------------------------------+
.. autodata:: registry
.. seealso:: :mod:`~mdpow.workflows.base`
"""

# import analysis
from mdpow.workflows import dihedrals

# Maps the name of an EnsembleAnalysis to the top-level function of the
# automated workflow that runs it.
registry = {
    'DihedralAnalysis': dihedrals.automated_dihedral_analysis,
}

"""
In the `registry`, each entry corresponds to an
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
for which exists a corresponding automated workflow.
Intended for use with :mod:`mdpow.workflows.base` to specify which
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` should run iteratively over
the provided project data directory.
To include a new automated workflow for use with :mod:`mdpow.workflows.base`,
create a key that is the name of the corresponding
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis`, with the value defined as
`<workflow module>.<top-level automated analysis function>`.
The available automated workflows (key-value pairs) are listed in the
following table :any:`Currently supported automated workflows. <workflows_registry>`
"""

0 comments on commit cafb300

Please sign in to comment.