-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* close #237 * add workflows.base (code for reading CSV file of projects and running automated workflows on each project) * add workflows.registry (with a single example workflow, DihedralAnalysis) * add new testing data, .csv for workflows base module; add workflows to STATES dictionary * add documentation for workflows registry and base module * update CHANGES
- Loading branch information
1 parent
b7d0b06
commit cafb300
Showing
9 changed files
with
361 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
============== | ||
Workflows Base | ||
============== | ||
|
||
.. versionadded:: 0.9.0 | ||
|
||
.. automodule:: mdpow.workflows.base |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
================== | ||
Workflows Registry | ||
================== | ||
|
||
.. versionadded:: 0.9.0 | ||
|
||
.. automodule:: mdpow.workflows.registry |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
import re | ||
import os | ||
import sys | ||
import yaml | ||
import pybol | ||
import pytest | ||
import pathlib | ||
import logging | ||
|
||
import pandas as pd | ||
|
||
from . import RESOURCES | ||
from . import STATES | ||
|
||
import py.path | ||
|
||
from ..workflows import base | ||
|
||
from pkg_resources import resource_filename | ||
|
||
# Location of the 'testing_resources' resource, resolved relative to this
# module through pkg_resources.  NOTE: this rebinds the RESOURCES name that
# was imported from the package (``from . import RESOURCES``) above.
RESOURCES = pathlib.PurePath(resource_filename(__name__, 'testing_resources'))
# Manifest file that is handed to pybol.Manifest by the fixture below.
MANIFEST = RESOURCES / 'manifest.yml'
|
||
@pytest.fixture(scope='function')
def molname_workflows_directory(tmp_path):
    """Assemble the 'workflows' manifest entry into *tmp_path* and return it.

    The manifest is read from :data:`MANIFEST`; pytest supplies a fresh
    *tmp_path* for every test function.
    """
    manifest = pybol.Manifest(str(MANIFEST))
    manifest.assemble('workflows', tmp_path)
    return tmp_path
|
||
class TestWorkflowsBase(object):
    """Tests for :func:`base.project_paths` and
    :func:`base.automated_project_analysis`.
    """

    @pytest.fixture(scope='function')
    def SM_tmp_dir(self, molname_workflows_directory):
        """Temporary directory holding the assembled 'workflows' test data."""
        return molname_workflows_directory

    @pytest.fixture(scope='function')
    def csv_input_data(self):
        """Return the path of the reference CSV and its content as a DataFrame."""
        csv_path = STATES['workflows'] / 'project_paths.csv'
        csv_df = pd.read_csv(csv_path).reset_index(drop=True)
        return csv_path, csv_df

    @pytest.fixture(scope='function')
    def test_df_data(self):
        """Expected molecule/resname columns for the two test projects."""
        test_dict = {
            'molecule': ['SM25', 'SM26'],
            'resname': ['SM25', 'SM26'],
        }
        return pd.DataFrame(test_dict).reset_index(drop=True)

    @pytest.fixture(scope='function')
    def project_paths_data(self, SM_tmp_dir):
        """DataFrame produced by scanning the temporary project directory."""
        # Save the generated project_paths.csv inside the temporary directory
        # so that running the tests does not leave a file behind in the
        # current working directory.
        return base.project_paths(parent_directory=SM_tmp_dir,
                                  csv_save_dir=SM_tmp_dir)

    def test_project_paths(self, test_df_data, project_paths_data):
        test_df = test_df_data
        # Directory traversal order is not guaranteed, so sort before
        # comparing row by row.
        project_paths = project_paths_data.sort_values(
            by='molecule').reset_index(drop=True)

        assert project_paths['molecule'].tolist() == test_df['molecule'].tolist()
        assert project_paths['resname'].tolist() == test_df['resname'].tolist()

    def test_project_paths_csv_input(self, csv_input_data):
        csv_path, csv_df = csv_input_data
        project_paths = base.project_paths(csv=csv_path)

        pd.testing.assert_frame_equal(project_paths, csv_df)

    def test_automated_project_analysis(self, project_paths_data, caplog):
        # The completion message is logged at INFO level; make sure it is
        # captured regardless of the logging configuration in effect.
        caplog.set_level(logging.INFO, logger='mdpow.workflows.base')

        project_paths = project_paths_data
        # change resname to match topology (every SAMPL7 resname is 'UNK')
        # only necessary for this dataset, not necessary for normal use
        project_paths['resname'] = 'UNK'

        base.automated_project_analysis(project_paths, solvents=('water',),
                                        ensemble_analysis='DihedralAnalysis')

        assert 'all analyses completed' in caplog.text, (
            'automated_dihedral_analysis '
            'did not iteratively run to completion for the provided project')

    def test_automated_project_analysis_KeyError(self, project_paths_data, caplog):
        caplog.clear()
        caplog.set_level(logging.ERROR, logger='mdpow.workflows.base')

        project_paths = project_paths_data
        # change resname to match topology (every SAMPL7 resname is 'UNK')
        # only necessary for this dataset, not necessary for normal use
        project_paths['resname'] = 'UNK'

        # test error output when raised
        with pytest.raises(KeyError,
                           match="Invalid ensemble_analysis 'DarthVaderAnalysis'. "
                                 "An EnsembleAnalysis type that corresponds to an existing "
                                 "automated workflow module must be input as a kwarg. ex: "
                                 "ensemble_analysis='DihedralAnalysis'"):
            base.automated_project_analysis(project_paths,
                                            ensemble_analysis='DarthVaderAnalysis',
                                            solvents=('water',))

        # test logger error recording
        assert "'DarthVaderAnalysis' is an invalid selection" in caplog.text, (
            'did not catch incorrect '
            'key specification for workflows.registry that results in KeyError')
3 changes: 3 additions & 0 deletions
3
mdpow/tests/testing_resources/states/workflows/project_paths.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
molecule,resname,path | ||
SM25,SM25,mdpow/tests/testing_resources/states/workflows/SM25 | ||
SM26,SM26,mdpow/tests/testing_resources/states/workflows/SM26 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
# MDPOW: base.py | ||
# 2022 Cade Duckworth | ||
|
||
""" | ||
:mod:`mdpow.workflows.base` --- Automated workflow base functions | ||
================================================================= | ||
To analyze multiple MDPOW projects, provide :func:`project_paths` | ||
with the top-level directory containing all MDPOW projects' simulation data | ||
to obtain a :class:`pandas.DataFrame` containing the project information | ||
and paths. Then, :func:`automated_project_analysis` takes as input the | ||
aforementioned :class:`pandas.DataFrame` and runs the specified | ||
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` for all MDPOW projects | ||
under the top-level directory provided to :func:`project_paths`. | ||
.. seealso:: :mod:`~mdpow.workflows.registry` | ||
.. autofunction:: project_paths | ||
.. autofunction:: automated_project_analysis | ||
""" | ||
|
||
import os | ||
import re | ||
import pandas as pd | ||
|
||
from mdpow.workflows import registry | ||
|
||
import logging | ||
|
||
logger = logging.getLogger('mdpow.workflows.base')


def project_paths(parent_directory=None, csv=None, csv_save_dir=None):
    """Collect the molecule name, resname, and path of MDPOW projects.

    Either scans a top directory for MDPOW projects or reads the same
    information from a .csv file. If both `parent_directory` and `csv`
    are given, `parent_directory` takes precedence.

    :keywords:

    *parent_directory*
        the path for the location of the top directory under which the
        subdirectories of MDPOW simulation data exist; every directory
        that directly contains a subdirectory whose name starts with
        ``FEP`` is treated as one project, and the directory name is
        used for both `molecule` and `resname`. Additionally creates a
        'project_paths.csv' file for user manipulation of metadata and
        for future reference.

    *csv*
        .csv file containing the molecule names, resnames, and paths,
        in that order, for the MDPOW simulation data to be iterated
        over; must contain a header of the form:
        `molecule,resname,path`

    *csv_save_dir*
        optionally provided directory to save the .csv file, otherwise
        the file is saved in the current working directory; only used
        together with `parent_directory`

    :returns:

    *project_paths*
        :class:`pandas.DataFrame` containing MDPOW project metadata,
        sorted by `molecule`, `resname`, and `path`

    :raises:

    *ValueError*
        if neither `parent_directory` nor `csv` is provided

    .. rubric:: Example

    Typical Workflow::

        project_paths = project_paths(parent_directory='/foo/bar/MDPOW_projects')
        automated_project_analysis(project_paths)

    or::

        project_paths = project_paths(csv='/foo/bar/MDPOW.csv')
        automated_project_analysis(project_paths)

    """
    columns = ['molecule', 'resname', 'path']

    if parent_directory is not None:
        # A directory is a project when one of its immediate subdirectories
        # has a name beginning with "FEP".
        locations = [
            dirpath
            for dirpath, dirnames, _ in os.walk(parent_directory)
            if any(dirname.startswith('FEP') for dirname in dirnames)
        ]
        # normpath drops a trailing separator so that basename never yields
        # an empty name, and works with the platform's path separator.
        names = [os.path.basename(os.path.normpath(loc)) for loc in locations]

        frame = pd.DataFrame(
            {
                'molecule': names,
                'resname': names,
                'path': locations,
            }
        )
        # os.walk yields directories in arbitrary order; sort the rows the
        # same way as the csv branch so the result is deterministic.
        frame = frame.sort_values(by=columns).reset_index(drop=True)

        if csv_save_dir is not None:
            frame.to_csv(os.path.join(csv_save_dir, 'project_paths.csv'),
                         index=False)
            logger.info('project_paths saved under %s', csv_save_dir)
        else:
            frame.to_csv('project_paths.csv', index=False)
            logger.info('project_paths saved under %s', os.getcwd())

        return frame

    if csv is not None:
        return pd.read_csv(csv).sort_values(by=columns).reset_index(drop=True)

    raise ValueError("either 'parent_directory' or 'csv' must be provided")
|
||
def automated_project_analysis(project_paths, ensemble_analysis, **kwargs):
    """Run an automated workflow for every project in `project_paths`.

    Takes a :class:`pandas.DataFrame` created by
    :func:`~mdpow.workflows.base.project_paths` and iteratively runs the
    specified :class:`~mdpow.analysis.ensemble.EnsembleAnalysis` for each
    of the projects by running the associated automated workflow in each
    project directory returned by
    :func:`~mdpow.workflows.base.project_paths`.

    Compatibility with more automated analyses in development.

    :keywords:

    *project_paths*
        :class:`pandas.DataFrame` that provides paths to MDPOW projects;
        the columns `molecule`, `resname`, and `path` are read from
        each row

    *ensemble_analysis*
        name of the :class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
        that corresponds to the desired automated workflow module

    *kwargs*
        keyword arguments for the supported automated workflows,
        see the :mod:`~mdpow.workflows.registry` for all available
        workflows and their call signatures

    :raises:

    *KeyError*
        if `ensemble_analysis` is not a key of the registry

    *TypeError*
        if `ensemble_analysis` cannot be used as a registry key or the
        registered entry is not callable

    .. rubric:: Example

    A typical workflow is the automated dihedral analysis from
    :mod:`mdpow.workflows.dihedrals`, which applies the *ensemble analysis*
    :class:`~mdpow.analysis.dihedral.DihedralAnalysis` to each project.
    The :data:`~mdpow.workflows.registry.registry` contains this automated
    workflow under the key *"DihedralAnalysis"* and so the automated execution
    for all `project_paths` (obtained via :func:`project_paths`) is performed by
    passing the specific key to :func:`automated_project_analysis`::

        project_paths = project_paths(parent_directory='/foo/bar/MDPOW_projects')
        automated_project_analysis(project_paths, ensemble_analysis='DihedralAnalysis', **kwargs)

    """
    hint = ("An EnsembleAnalysis type that corresponds to an existing "
            "automated workflow module must be input as a kwarg. "
            "ex: ensemble_analysis='DihedralAnalysis'")

    # Resolve the workflow once, before any project is processed. Keeping the
    # try block limited to the lookup means that a KeyError or TypeError
    # raised *inside* a workflow is not misreported as an invalid selection.
    try:
        workflow = registry.registry[ensemble_analysis]
        if not callable(workflow):
            raise TypeError(f'registry entry {workflow!r} is not callable')
    except KeyError as err:
        logger.error('%s is an invalid selection', err)
        raise KeyError(f"Invalid ensemble_analysis {err}. {hint}") from err
    except TypeError as err:
        logger.error('workflow module for %s does not exist yet',
                     ensemble_analysis)
        raise TypeError(
            f"Invalid ensemble_analysis {ensemble_analysis}. {hint}") from err

    for row in project_paths.itertuples():
        molname = row.molecule

        logger.info('starting %s', molname)
        workflow(dirname=row.path, resname=row.resname, molname=molname,
                 **kwargs)
        logger.info('%s completed', molname)

    logger.info('all analyses completed')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# MDPOW: registry.py | ||
# 2023 Cade Duckworth | ||
|
||
""" | ||
:mod:`mdpow.workflows.registry` --- Registry of currently supported automated workflows | ||
======================================================================================= | ||
The :mod:`mdpow.workflows.registry` module hosts a dictionary whose keys each name an | ||
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` for which a corresponding automated workflow exists. | ||
.. table:: Currently supported automated workflows. | ||
:widths: auto | ||
:name: workflows_registry | ||
+-------------------------------+------------------------------------------------------------------------------------------------------+ | ||
| key/keyword: EnsembleAnalysis | value: <workflow module>.<top-level automated analysis function> | | ||
+===============================+======================================================================================================+ | ||
| DihedralAnalysis | :any:`dihedrals.automated_dihedral_analysis <mdpow.workflows.dihedrals.automated_dihedral_analysis>` | | ||
+-------------------------------+------------------------------------------------------------------------------------------------------+ | ||
.. autodata:: registry | ||
.. seealso:: :mod:`~mdpow.workflows.base` | ||
""" | ||
|
||
# import analysis | ||
from mdpow.workflows import dihedrals | ||
|
||
registry = {
    'DihedralAnalysis': dihedrals.automated_dihedral_analysis,
}
"""
In the `registry`, each entry corresponds to an
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
for which a corresponding automated workflow exists.

Intended for use with :mod:`mdpow.workflows.base` to specify which
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` should run iteratively over
the provided project data directory.

To include a new automated workflow for use with :mod:`mdpow.workflows.base`,
create a key that is the name of the corresponding
:class:`~mdpow.analysis.ensemble.EnsembleAnalysis`, with the value defined as
`<workflow module>.<top-level automated analysis function>`.

The available automated workflows (key-value pairs) are listed in the
following table :any:`Currently supported automated workflows. <workflows_registry>`
"""