Skip to content

Commit

Permalink
ENH: Bring the core anchoring options up to par with its ligand-based…
Browse files Browse the repository at this point in the history
… counterpart
  • Loading branch information
BvB93 authored and Bas van Beek committed Dec 2, 2021
1 parent 8dd0a61 commit 36694b3
Show file tree
Hide file tree
Showing 12 changed files with 449 additions and 99 deletions.
115 changes: 115 additions & 0 deletions CAT/attachment/core_anchoring.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""A module designed for finding core functional groups.
Index
-----
.. currentmodule:: CAT.attachment.ligand_anchoring
.. autosummary::
set_core_anchors
find_core_substructure
API
---
.. autofunction:: set_core_anchors
.. autofunction:: find_core_substructure
"""

from typing import Tuple, Any, Mapping, TYPE_CHECKING

import numpy as np
from scm.plams import Molecule, MoleculeError, to_rdmol

from .distribution import distribute_idx
from ..utils import AllignmentEnum, AllignmentTup, AnchorTup

if TYPE_CHECKING:
from numpy.typing import NDArray
from numpy import int64 as i8

__all__ = ["set_core_anchors", "find_core_substructure"]


def set_core_anchors(
mol: Molecule,
anchor_tup: AnchorTup,
allignment_tup: AllignmentTup,
subset_kwargs: "None | Mapping[str, Any]" = None,
) -> Tuple[str, str]:
"""Identify and parse the core anchors within the passed molecule.
Returns two strings: The (parsed) molecular formula and the anchor indices.
"""
# Checks the if the anchor is a string (atomic symbol) or integer (atomic number)
formula = mol.get_formula()

# Get the indices of all anchor atom ligand placeholders in the core
anchors = mol.properties.dummies
if not anchors:
anchor_idx, remove_idx = find_core_substructure(mol, anchor_tup)
else:
anchor_idx = np.fromiter(anchors, count=len(anchors), dtype=np.int64)
anchor_idx -= 1
remove_idx = anchor_idx.copy()
if subset_kwargs:
anchor_idx = distribute_idx(mol, anchor_idx, **subset_kwargs)
if not len(anchor_idx):
raise MoleculeError(f"No valid anchoring groups found in the core {formula!r}")

# Convert atomic indices into Atoms
anchor_idx += 1
anchor_idx.sort()
mol.properties.dummies = [mol[i] for i in anchor_idx]

# Returns an error if no anchor atoms were found
if len(anchor_idx) < 4 and allignment_tup.kind == AllignmentEnum.SURFACE:
raise NotImplementedError(
'`optional.core.allignment = "surface"` is not supported for cores with less '
f'than 4 anchor atoms ({mol.get_formula()}); consider using '
'`optional.core.allignment = "sphere"`'
)

# Delete all core anchor atoms
if remove_idx is not None:
remove_idx += 1
remove_idx.sort()
for i in reversed(remove_idx):
mol.delete_atom(mol[i])
return formula, ' '.join(anchor_idx.astype(str))


def find_core_substructure(
mol: Molecule,
anchor_tup: AnchorTup,
) -> Tuple["NDArray[i8]", "None | NDArray[i8]"]:
"""Identify substructures within the passed core based on **anchor_tup**.
Returns two indice-arrays, respectivelly containing the indices of the anchor
atoms and all to-be removed atoms.
"""
rdmol = to_rdmol(mol)
matches = rdmol.GetSubstructMatches(anchor_tup.mol, useChirality=True)
remove = anchor_tup.remove

# Remove all duplicate matches, each heteroatom (match[0]) should have <= 1 entry
ref_set = set()
anchor_list = []
remove_list = []
for idx_tup in matches:
anchor_idx_tup = tuple(idx_tup[i] for i in anchor_tup.group_idx)
if anchor_idx_tup in ref_set:
continue # Skip duplicates
else:
ref_set.add(anchor_idx_tup)

if remove is not None:
remove_list += [idx_tup[i] for i in remove]
anchor_list.append(anchor_idx_tup[0])

anchor_array = np.fromiter(anchor_list, dtype=np.int64, count=len(anchor_list))
if remove is not None:
remove_array = np.fromiter(remove_list, dtype=np.int64, count=len(remove_list))
return anchor_array, remove_array
else:
return anchor_array, None
55 changes: 8 additions & 47 deletions CAT/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@
from .__version__ import __version__

from .logger import logger
from .mol_utils import to_symbol
from .settings_dataframe import SettingsDataFrame
from .utils import AllignmentEnum

from .data_handling.mol_import import read_mol
from .data_handling.update_qd_df import update_qd_df
Expand All @@ -44,9 +42,9 @@
from .multi_ligand import init_multi_ligand
from .attachment.qd_opt import init_qd_opt
from .attachment.ligand_opt import init_ligand_opt, allign_axis
from .attachment.distribution import distribute_idx
from .attachment.ligand_attach import init_qd_construction
from .attachment.ligand_anchoring import init_ligand_anchoring
from .attachment.core_anchoring import set_core_anchors

from .workflows import MOL

Expand Down Expand Up @@ -209,50 +207,13 @@ def prep_core(core_df: SettingsDataFrame) -> SettingsDataFrame:
"""
# Unpack arguments
anchor = core_df.settings.optional.core.anchor
subset = core_df.settings.optional.core.subset

idx_tuples = []
for core in core_df[MOL]:
# Checks the if the anchor is a string (atomic symbol) or integer (atomic number)
formula = core.get_formula()

# Returns the indices of all anchor atom ligand placeholders in the core
if not core.properties.dummies:
at_idx = np.array([i for i, atom in enumerate(core) if atom.atnum == anchor])
else:
dummies = core.properties.dummies
at_idx = np.fromiter(dummies, count=len(dummies), dtype=int)
at_idx -= 1
if subset:
at_idx = distribute_idx(core, at_idx, **subset)

# Convert atomic indices into Atoms
at_idx += 1
at_idx.sort()
core.properties.dummies = dummies = [core[i] for i in at_idx]

# Returns an error if no anchor atoms were found
if not dummies:
raise MoleculeError(f"{repr(to_symbol(anchor))} was specified as core anchor atom, yet "
f"no matching atoms were found in {core.properties.name} "
f"(formula: {formula})")
elif (
len(dummies) < 4 and
core_df.settings.optional.core.allignment.kind == AllignmentEnum.SURFACE
):
raise NotImplementedError(
'`optional.core.allignment = "surface"` is not supported for cores with less '
f'than 4 anchor atoms ({core.get_formula()}); consider using '
'`optional.core.allignment = "sphere"`'
)

# Delete all core anchor atoms
for at in dummies:
core.delete_atom(at)
idx_tuples.append(
(formula, ' '.join(at_idx.astype(str)))
)
core_options = core_df.settings.optional.core
anchor_tup = core_options.anchor[0]
allignment_tup = core_options.allignment
subset = core_options.subset

# Set the core anchors
idx_tuples = [set_core_anchors(i, anchor_tup, allignment_tup, subset) for i in core_df[MOL]]

# Create and return a new dataframe
idx = pd.MultiIndex.from_tuples(idx_tuples, names=['formula', 'anchor'])
Expand Down
78 changes: 57 additions & 21 deletions CAT/data_handling/anchor_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import Union, Tuple, Iterable, SupportsFloat

from rdkit.Chem import Mol
from scm.plams import Units
from scm.plams import Units, PT
from schema import Schema, Use, Optional
from typing_extensions import TypedDict, SupportsIndex

Expand Down Expand Up @@ -103,25 +103,42 @@ def _parse_angle_offset(
})


#: All atom types that have to be encapsulated in square brackets when parsing SMILES strings
SQUARE_BRACKET_ATOMS = frozenset(
PT.symtonum.keys() - {'B', 'Br', 'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S'}
)


def parse_anchors(
patterns: Union[
None,
SupportsIndex,
str,
Mol,
AnchorTup,
_UnparsedAnchorDict,
"Iterable[str | Mol | AnchorTup | _UnparsedAnchorDict]",
"Iterable[str | SupportsIndex | Mol | AnchorTup | _UnparsedAnchorDict]",
] = None,
split: bool = True,
is_core: bool = False,
) -> Tuple[AnchorTup, ...]:
"""Parse the user-specified anchors."""
if patterns is None:
if is_core:
raise TypeError("`anchor=None` is not supported for core anchors")
patterns = get_functional_groups(None, split)
elif isinstance(patterns, (Mol, str, dict, AnchorTup)):
elif isinstance(patterns, (Mol, str, dict, AnchorTup, SupportsIndex)):
patterns = [patterns]

ret = []
for p in patterns: # type: _UnparsedAnchorDict | str | Mol | AnchorTup
for p in patterns: # type: _UnparsedAnchorDict | str | Mol | SupportsIndex | AnchorTup
try:
atnum = operator.index(p) # Check for atomic symbols
except TypeError:
pass
else:
p = PT.get_symbol(atnum)

if isinstance(p, AnchorTup):
ret.append(p)
elif isinstance(p, Mol):
Expand All @@ -132,40 +149,59 @@ def parse_anchors(
ret.append(AnchorTup(mol=mol, remove=remove))
elif isinstance(p, str):
group = p
if group in SQUARE_BRACKET_ATOMS:
group = f"[{group}]"
mol = _smiles_to_rdmol(group)
remove = None if not split else (list(mol.GetAtoms())[-1].GetIdx(),)
ret.append(AnchorTup(mol=mol, group=group, remove=remove))
else:
kwargs: _AnchorDict = anchor_schema.validate(p)

# Check that `group_idx` and `remove` are disjoint
group_idx = kwargs["group_idx"]
remove = kwargs["remove"]
if remove is not None and not set(group_idx).isdisjoint(remove):
raise ValueError("`group_idx` and `remove` must be disjoint")

# Check that at least 3 atoms are available for `angle_offset`
# (so a plane can be defined)
angle_offset = kwargs["angle_offset"]
if angle_offset is not None and len(group_idx) < 3:
raise ValueError("`group_idx` must contain at least 3 atoms when "
"`angle_offset` is specified")

# Check that at least 2 atoms are available for `dihedral`
# (so the third dihedral-defining vector can be defined)
dihedral = kwargs["dihedral"]
if dihedral is not None and len(group_idx) < 2:
raise ValueError("`group_idx` must contain at least 2 atoms when "
"`dihedral` is specified")

group = kwargs.pop("group")
if group in SQUARE_BRACKET_ATOMS:
group = f"[{group}]"
mol = _smiles_to_rdmol(group)

# Dihedral and angle-offset options are not supported for core anchors
if is_core:
if dihedral is not None:
raise TypeError("`dihedral != None` is not supported for core anchors")
elif angle_offset is not None:
raise TypeError("`angle_offset != None` is not supported for core anchors")
elif kwargs["kind"] != KindEnum.FIRST:
raise NotImplementedError('`kind != "first"` is not yet supported')
else:
# Check that at least 3 atoms are available for `angle_offset`
# (so a plane can be defined)
if angle_offset is not None and len(group_idx) < 3:
raise ValueError("`group_idx` must contain at least 3 atoms when "
"`angle_offset` is specified")

# Check that at least 2 atoms are available for `dihedral`
# (so the third dihedral-defining vector can be defined)
if dihedral is not None and len(group_idx) < 2:
raise ValueError("`group_idx` must contain at least 2 atoms when "
"`dihedral` is specified")

# Check that `group_idx` and `remove` are disjoint
# TODO: Investigate if this check can be removed
if remove is not None and not set(group_idx).isdisjoint(remove):
raise ValueError("`group_idx` and `remove` must be disjoint")

# Check that the indices in `group_idx` and `remove` are not out of bounds
mol = _smiles_to_rdmol(kwargs["group"])
atom_count = len(mol.GetAtoms())
if atom_count <= max(group_idx):
raise IndexError(f"`group_idx` index {max(group_idx)} is out of bounds "
f"for a `group` with {atom_count} atoms")
elif remove is not None and atom_count <= max(remove):
raise IndexError(f"`remove` index {max(remove)} is out of bounds "
f"for a `group` with {atom_count} atoms")
ret.append(AnchorTup(**kwargs, mol=mol))
ret.append(AnchorTup(**kwargs, group=group, mol=mol))
if is_core and len(ret) > 1:
raise NotImplementedError("Cores with multiple anchor types aren't supported yet")
return tuple(ret)
3 changes: 2 additions & 1 deletion CAT/data_handling/validate_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,4 +190,5 @@ def validate_input(s: Settings, validate_only: bool = True) -> None:
del s.optional.ligand.functional_groups

split = s.optional.ligand.split
s.optional.ligand.anchor = parse_anchors(func_groups, split)
s.optional.ligand.anchor = parse_anchors(func_groups, split=split)
s.optional.core.anchor = parse_anchors(s.optional.core.anchor, split=True, is_core=True)
16 changes: 2 additions & 14 deletions CAT/data_handling/validation_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,21 +303,9 @@ def _get_crsjob() -> type:
And(str, error='optional.core.dirname expects a string'),

# Alias for `optional.core.anchor`
Optional_('dummy', default=None): # Return a tuple of atomic numbers
Or(
None,
And(val_int, Use(lambda n: to_atnum(int(n)))),
And(str, Use(to_atnum)),
error='optional.core.dummy expects a valid atomic number (int) or symbol (string)'
),
Optional_('dummy', default=None): object,

Optional_('anchor', default=None): # Return a tuple of atomic numbers
Or(
None,
And(val_int, Use(lambda n: to_atnum(int(n)))),
And(str, Use(to_atnum)),
error='optional.core.anchor expects a valid atomic number (int) or symbol (string)'
),
Optional_('anchor', default=None): object,

Optional_('subset', default=None):
Or(None, dict, error="optional.core.subset epected 'None' or a dictionary"),
Expand Down
18 changes: 17 additions & 1 deletion docs/4_optional.rst
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,23 @@ Core
replaced with ligands. Alternatively, anchor atoms can be manually specified
with the core_indices variable.

This optiona can alternatively be provided as ``optional.core.dummy``.
Further customization can be achieved by passing a dictionary:

* :attr:`anchor.group <optional.ligand.anchor.group>`
* :attr:`anchor.group_idx <optional.ligand.anchor.group_idx>`
* :attr:`anchor.remove <optional.ligand.anchor.remove>`

.. note::

.. code:: yaml
optional:
core:
anchor:
group: "[H]Cl" # Remove HCl and attach at previous Cl position
group_idx: 1
remove: [0, 1]
.. attribute:: optional.core.allignment
Expand Down
Loading

0 comments on commit 36694b3

Please sign in to comment.