ENH: Bring the core anchoring options up to par with its ligand-based…

… counterpart
nlesc-nano · Dec 2, 2021 · 36694b3 · 36694b3
1 parent 8dd0a61
commit 36694b3
Show file tree

Hide file tree

Showing 12 changed files with 449 additions and 99 deletions.
diff --git a/CAT/attachment/core_anchoring.py b/CAT/attachment/core_anchoring.py
@@ -0,0 +1,115 @@
+"""A module designed for finding core functional groups.
+
+Index
+-----
+.. currentmodule:: CAT.attachment.ligand_anchoring
+.. autosummary::
+    set_core_anchors
+    find_core_substructure
+
+API
+---
+.. autofunction:: set_core_anchors
+.. autofunction:: find_core_substructure
+
+"""
+
+from typing import Tuple, Any, Mapping, TYPE_CHECKING
+
+import numpy as np
+from scm.plams import Molecule, MoleculeError, to_rdmol
+
+from .distribution import distribute_idx
+from ..utils import AllignmentEnum, AllignmentTup, AnchorTup
+
+if TYPE_CHECKING:
+    from numpy.typing import NDArray
+    from numpy import int64 as i8
+
+__all__ = ["set_core_anchors", "find_core_substructure"]
+
+
+def set_core_anchors(
+    mol: Molecule,
+    anchor_tup: AnchorTup,
+    allignment_tup: AllignmentTup,
+    subset_kwargs: "None | Mapping[str, Any]" = None,
+) -> Tuple[str, str]:
+    """Identify and parse the core anchors within the passed molecule.
+
+    Returns two strings: The (parsed) molecular formula and the anchor indices.
+
+    """
+    # Checks the if the anchor is a string (atomic symbol) or integer (atomic number)
+    formula = mol.get_formula()
+
+    # Get the indices of all anchor atom ligand placeholders in the core
+    anchors = mol.properties.dummies
+    if not anchors:
+        anchor_idx, remove_idx = find_core_substructure(mol, anchor_tup)
+    else:
+        anchor_idx = np.fromiter(anchors, count=len(anchors), dtype=np.int64)
+        anchor_idx -= 1
+        remove_idx = anchor_idx.copy()
+    if subset_kwargs:
+        anchor_idx = distribute_idx(mol, anchor_idx, **subset_kwargs)
+    if not len(anchor_idx):
+        raise MoleculeError(f"No valid anchoring groups found in the core {formula!r}")
+
+    # Convert atomic indices into Atoms
+    anchor_idx += 1
+    anchor_idx.sort()
+    mol.properties.dummies = [mol[i] for i in anchor_idx]
+
+    # Returns an error if no anchor atoms were found
+    if len(anchor_idx) < 4 and allignment_tup.kind == AllignmentEnum.SURFACE:
+        raise NotImplementedError(
+            '`optional.core.allignment = "surface"` is not supported for cores with less '
+            f'than 4 anchor atoms ({mol.get_formula()}); consider using '
+            '`optional.core.allignment = "sphere"`'
+        )
+
+    # Delete all core anchor atoms
+    if remove_idx is not None:
+        remove_idx += 1
+        remove_idx.sort()
+        for i in reversed(remove_idx):
+            mol.delete_atom(mol[i])
+    return formula, ' '.join(anchor_idx.astype(str))
+
+
+def find_core_substructure(
+    mol: Molecule,
+    anchor_tup: AnchorTup,
+) -> Tuple["NDArray[i8]", "None | NDArray[i8]"]:
+    """Identify substructures within the passed core based on **anchor_tup**.
+
+    Returns two indice-arrays, respectivelly containing the indices of the anchor
+    atoms and all to-be removed atoms.
+
+    """
+    rdmol = to_rdmol(mol)
+    matches = rdmol.GetSubstructMatches(anchor_tup.mol, useChirality=True)
+    remove = anchor_tup.remove
+
+    # Remove all duplicate matches, each heteroatom (match[0]) should have <= 1 entry
+    ref_set = set()
+    anchor_list = []
+    remove_list = []
+    for idx_tup in matches:
+        anchor_idx_tup = tuple(idx_tup[i] for i in anchor_tup.group_idx)
+        if anchor_idx_tup in ref_set:
+            continue  # Skip duplicates
+        else:
+            ref_set.add(anchor_idx_tup)
+
+        if remove is not None:
+            remove_list += [idx_tup[i] for i in remove]
+        anchor_list.append(anchor_idx_tup[0])
+
+    anchor_array = np.fromiter(anchor_list, dtype=np.int64, count=len(anchor_list))
+    if remove is not None:
+        remove_array = np.fromiter(remove_list, dtype=np.int64, count=len(remove_list))
+        return anchor_array, remove_array
+    else:
+        return anchor_array, None
diff --git a/CAT/base.py b/CAT/base.py
@@ -33,9 +33,7 @@
 from .__version__ import __version__
 
 from .logger import logger
-from .mol_utils import to_symbol
 from .settings_dataframe import SettingsDataFrame
-from .utils import AllignmentEnum
 
 from .data_handling.mol_import import read_mol
 from .data_handling.update_qd_df import update_qd_df
@@ -44,9 +42,9 @@
 from .multi_ligand import init_multi_ligand
 from .attachment.qd_opt import init_qd_opt
 from .attachment.ligand_opt import init_ligand_opt, allign_axis
-from .attachment.distribution import distribute_idx
 from .attachment.ligand_attach import init_qd_construction
 from .attachment.ligand_anchoring import init_ligand_anchoring
+from .attachment.core_anchoring import set_core_anchors
 
 from .workflows import MOL
 
@@ -209,50 +207,13 @@ def prep_core(core_df: SettingsDataFrame) -> SettingsDataFrame:
 
     """
     # Unpack arguments
-    anchor = core_df.settings.optional.core.anchor
-    subset = core_df.settings.optional.core.subset
-
-    idx_tuples = []
-    for core in core_df[MOL]:
-        # Checks the if the anchor is a string (atomic symbol) or integer (atomic number)
-        formula = core.get_formula()
-
-        # Returns the indices of all anchor atom ligand placeholders in the core
-        if not core.properties.dummies:
-            at_idx = np.array([i for i, atom in enumerate(core) if atom.atnum == anchor])
-        else:
-            dummies = core.properties.dummies
-            at_idx = np.fromiter(dummies, count=len(dummies), dtype=int)
-            at_idx -= 1
-        if subset:
-            at_idx = distribute_idx(core, at_idx, **subset)
-
-        # Convert atomic indices into Atoms
-        at_idx += 1
-        at_idx.sort()
-        core.properties.dummies = dummies = [core[i] for i in at_idx]
-
-        # Returns an error if no anchor atoms were found
-        if not dummies:
-            raise MoleculeError(f"{repr(to_symbol(anchor))} was specified as core anchor atom, yet "
-                                f"no matching atoms were found in {core.properties.name} "
-                                f"(formula: {formula})")
-        elif (
-            len(dummies) < 4 and
-            core_df.settings.optional.core.allignment.kind == AllignmentEnum.SURFACE
-        ):
-            raise NotImplementedError(
-                '`optional.core.allignment = "surface"` is not supported for cores with less '
-                f'than 4 anchor atoms ({core.get_formula()}); consider using '
-                '`optional.core.allignment = "sphere"`'
-            )
-
-        # Delete all core anchor atoms
-        for at in dummies:
-            core.delete_atom(at)
-        idx_tuples.append(
-            (formula, ' '.join(at_idx.astype(str)))
-        )
+    core_options = core_df.settings.optional.core
+    anchor_tup = core_options.anchor[0]
+    allignment_tup = core_options.allignment
+    subset = core_options.subset
+
+    # Set the core anchors
+    idx_tuples = [set_core_anchors(i, anchor_tup, allignment_tup, subset) for i in core_df[MOL]]
 
     # Create and return a new dataframe
     idx = pd.MultiIndex.from_tuples(idx_tuples, names=['formula', 'anchor'])

diff --git a/CAT/data_handling/anchor_parsing.py b/CAT/data_handling/anchor_parsing.py
@@ -5,7 +5,7 @@
 from typing import Union, Tuple, Iterable, SupportsFloat
 
 from rdkit.Chem import Mol
-from scm.plams import Units
+from scm.plams import Units, PT
 from schema import Schema, Use, Optional
 from typing_extensions import TypedDict, SupportsIndex
 
@@ -103,25 +103,42 @@ def _parse_angle_offset(
 })
 
 
+#: All atom types that have to be encapsulated in square brackets when parsing SMILES strings
+SQUARE_BRACKET_ATOMS = frozenset(
+    PT.symtonum.keys() - {'B', 'Br', 'C', 'Cl', 'F', 'I', 'N', 'O', 'P', 'S'}
+)
+
+
 def parse_anchors(
     patterns: Union[
         None,
+        SupportsIndex,
         str,
         Mol,
         AnchorTup,
         _UnparsedAnchorDict,
-        "Iterable[str | Mol | AnchorTup | _UnparsedAnchorDict]",
+        "Iterable[str | SupportsIndex | Mol | AnchorTup | _UnparsedAnchorDict]",
     ] = None,
     split: bool = True,
+    is_core: bool = False,
 ) -> Tuple[AnchorTup, ...]:
     """Parse the user-specified anchors."""
     if patterns is None:
+        if is_core:
+            raise TypeError("`anchor=None` is not supported for core anchors")
         patterns = get_functional_groups(None, split)
-    elif isinstance(patterns, (Mol, str, dict, AnchorTup)):
+    elif isinstance(patterns, (Mol, str, dict, AnchorTup, SupportsIndex)):
         patterns = [patterns]
 
     ret = []
-    for p in patterns:  # type: _UnparsedAnchorDict | str | Mol | AnchorTup
+    for p in patterns:  # type: _UnparsedAnchorDict | str | Mol | SupportsIndex | AnchorTup
+        try:
+            atnum = operator.index(p)  # Check for atomic symbols
+        except TypeError:
+            pass
+        else:
+            p = PT.get_symbol(atnum)
+
         if isinstance(p, AnchorTup):
             ret.append(p)
         elif isinstance(p, Mol):
@@ -132,40 +149,59 @@ def parse_anchors(
             ret.append(AnchorTup(mol=mol, remove=remove))
         elif isinstance(p, str):
             group = p
+            if group in SQUARE_BRACKET_ATOMS:
+                group = f"[{group}]"
             mol = _smiles_to_rdmol(group)
             remove = None if not split else (list(mol.GetAtoms())[-1].GetIdx(),)
             ret.append(AnchorTup(mol=mol, group=group, remove=remove))
         else:
             kwargs: _AnchorDict = anchor_schema.validate(p)
 
-            # Check that `group_idx` and `remove` are disjoint
             group_idx = kwargs["group_idx"]
             remove = kwargs["remove"]
-            if remove is not None and not set(group_idx).isdisjoint(remove):
-                raise ValueError("`group_idx` and `remove` must be disjoint")
-
-            # Check that at least 3 atoms are available for `angle_offset`
-            # (so a plane can be defined)
             angle_offset = kwargs["angle_offset"]
-            if angle_offset is not None and len(group_idx) < 3:
-                raise ValueError("`group_idx` must contain at least 3 atoms when "
-                                 "`angle_offset` is specified")
-
-            # Check that at least 2 atoms are available for `dihedral`
-            # (so the third dihedral-defining vector can be defined)
             dihedral = kwargs["dihedral"]
-            if dihedral is not None and len(group_idx) < 2:
-                raise ValueError("`group_idx` must contain at least 2 atoms when "
-                                 "`dihedral` is specified")
+
+            group = kwargs.pop("group")
+            if group in SQUARE_BRACKET_ATOMS:
+                group = f"[{group}]"
+            mol = _smiles_to_rdmol(group)
+
+            # Dihedral and angle-offset options are not supported for core anchors
+            if is_core:
+                if dihedral is not None:
+                    raise TypeError("`dihedral != None` is not supported for core anchors")
+                elif angle_offset is not None:
+                    raise TypeError("`angle_offset != None` is not supported for core anchors")
+                elif kwargs["kind"] != KindEnum.FIRST:
+                    raise NotImplementedError('`kind != "first"` is not yet supported')
+            else:
+                # Check that at least 3 atoms are available for `angle_offset`
+                # (so a plane can be defined)
+                if angle_offset is not None and len(group_idx) < 3:
+                    raise ValueError("`group_idx` must contain at least 3 atoms when "
+                                     "`angle_offset` is specified")
+
+                # Check that at least 2 atoms are available for `dihedral`
+                # (so the third dihedral-defining vector can be defined)
+                if dihedral is not None and len(group_idx) < 2:
+                    raise ValueError("`group_idx` must contain at least 2 atoms when "
+                                     "`dihedral` is specified")
+
+                # Check that `group_idx` and `remove` are disjoint
+                # TODO: Investigate if this check can be removed
+                if remove is not None and not set(group_idx).isdisjoint(remove):
+                    raise ValueError("`group_idx` and `remove` must be disjoint")
 
             # Check that the indices in `group_idx` and `remove` are not out of bounds
-            mol = _smiles_to_rdmol(kwargs["group"])
             atom_count = len(mol.GetAtoms())
             if atom_count <= max(group_idx):
                 raise IndexError(f"`group_idx` index {max(group_idx)} is out of bounds "
                                  f"for a `group` with {atom_count} atoms")
             elif remove is not None and atom_count <= max(remove):
                 raise IndexError(f"`remove` index {max(remove)} is out of bounds "
                                  f"for a `group` with {atom_count} atoms")
-            ret.append(AnchorTup(**kwargs, mol=mol))
+            ret.append(AnchorTup(**kwargs, group=group, mol=mol))
+    if is_core and len(ret) > 1:
+        raise NotImplementedError("Cores with multiple anchor types aren't supported yet")
     return tuple(ret)
diff --git a/CAT/data_handling/validate_input.py b/CAT/data_handling/validate_input.py
@@ -190,4 +190,5 @@ def validate_input(s: Settings, validate_only: bool = True) -> None:
         del s.optional.ligand.functional_groups
 
         split = s.optional.ligand.split
-        s.optional.ligand.anchor = parse_anchors(func_groups, split)
+        s.optional.ligand.anchor = parse_anchors(func_groups, split=split)
+        s.optional.core.anchor = parse_anchors(s.optional.core.anchor, split=True, is_core=True)
diff --git a/CAT/data_handling/validation_schemas.py b/CAT/data_handling/validation_schemas.py
@@ -303,21 +303,9 @@ def _get_crsjob() -> type:
         And(str, error='optional.core.dirname expects a string'),
 
     # Alias for `optional.core.anchor`
-    Optional_('dummy', default=None):  # Return a tuple of atomic numbers
-        Or(
-            None,
-            And(val_int, Use(lambda n: to_atnum(int(n)))),
-            And(str, Use(to_atnum)),
-            error='optional.core.dummy expects a valid atomic number (int) or symbol (string)'
-        ),
+    Optional_('dummy', default=None): object,
 
-    Optional_('anchor', default=None):  # Return a tuple of atomic numbers
-        Or(
-            None,
-            And(val_int, Use(lambda n: to_atnum(int(n)))),
-            And(str, Use(to_atnum)),
-            error='optional.core.anchor expects a valid atomic number (int) or symbol (string)'
-        ),
+    Optional_('anchor', default=None): object,
 
     Optional_('subset', default=None):
         Or(None, dict, error="optional.core.subset epected 'None' or a dictionary"),

diff --git a/docs/4_optional.rst b/docs/4_optional.rst
@@ -264,7 +264,23 @@ Core
         replaced with ligands. Alternatively, anchor atoms can be manually specified
         with the core_indices variable.
 
-        This optiona can alternatively be provided as ``optional.core.dummy``.
+        Further customization can be achieved by passing a dictionary:
+
+        * :attr:`anchor.group <optional.ligand.anchor.group>`
+        * :attr:`anchor.group_idx <optional.ligand.anchor.group_idx>`
+        * :attr:`anchor.remove <optional.ligand.anchor.remove>`
+
+        .. note::
+
+            .. code:: yaml
+
+                optional:
+                    core:
+                        anchor:
+                            group: "[H]Cl"  # Remove HCl and attach at previous Cl position
+                            group_idx: 1
+                            remove: [0, 1]
+
 
 
     .. attribute:: optional.core.allignment