From 6a940f1cb6e9a7706902703ebf189863ead5ba85 Mon Sep 17 00:00:00 2001
From: Bas van Beek <b.f.van.beek@vu.nl>
Date: Thu, 2 Dec 2021 15:55:12 +0100
Subject: [PATCH] MAINT: Backport the PLAMS <= 1.5.1 `Molecule.get_formula`
 method

---
 CAT/attachment/ligand_anchoring.py |  4 ++--
 CAT/data_handling/mol_import.py    |  3 ++-
 CAT/multi_ligand.py                |  4 ++--
 CAT/utils.py                       | 12 +++++++++++-
 tests/test_gen_job_manager.py      |  3 ++-
 tests/test_mol_import.py           |  3 ++-
 tests/test_utils.py                | 25 +++++++++++++++++++------
 7 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/CAT/attachment/ligand_anchoring.py b/CAT/attachment/ligand_anchoring.py
index 405b3600..0ab878e8 100644
--- a/CAT/attachment/ligand_anchoring.py
+++ b/CAT/attachment/ligand_anchoring.py
@@ -34,7 +34,7 @@
 from rdkit import Chem
 
 from ..logger import logger
-from ..utils import get_template, AnchorTup, KindEnum
+from ..utils import get_template, AnchorTup, KindEnum, get_formula
 from ..mol_utils import separate_mod   # noqa: F401
 from ..workflows import MOL, FORMULA, HDF5_INDEX, OPT
 from ..settings_dataframe import SettingsDataFrame
@@ -114,7 +114,7 @@ def _get_df(
     # Create, fill and return the dataframe
     df = SettingsDataFrame(-1, index=idx, columns=columns, settings=settings)
     df[MOL] = mol_list
-    df[FORMULA] = [lig.get_formula() for lig in df[MOL]]
+    df[FORMULA] = [get_formula(lig) for lig in df[MOL]]
     df[OPT] = False
     return df[~df.index.duplicated(keep='first')]  # Remove duplicate indices
 
diff --git a/CAT/data_handling/mol_import.py b/CAT/data_handling/mol_import.py
index d1055c34..78da4295 100644
--- a/CAT/data_handling/mol_import.py
+++ b/CAT/data_handling/mol_import.py
@@ -51,6 +51,7 @@
 
 from rdkit import Chem, RDLogger
 
+from ..utils import get_formula
 from ..logger import logger
 from ..data_handling.validate_mol import validate_mol
 
@@ -353,7 +354,7 @@ def set_mol_prop(mol: Molecule, mol_dict: Settings) -> None:
     """Set molecular and atomic properties."""
     if mol_dict.is_core:
         residue_name = 'COR'
-        mol.properties.name = mol.get_formula()
+        mol.properties.name = get_formula(mol)
     else:
         residue_name = 'LIG'
         mol.properties.name = mol_dict.name
diff --git a/CAT/multi_ligand.py b/CAT/multi_ligand.py
index da6a1283..7790bda3 100755
--- a/CAT/multi_ligand.py
+++ b/CAT/multi_ligand.py
@@ -8,7 +8,7 @@
 
 from scm.plams import Molecule, MoleculeError
 
-from .utils import AnchorTup
+from .utils import AnchorTup, get_formula
 from .workflows import WorkFlow
 from .mol_utils import to_symbol
 from .data_handling import mol_to_file
@@ -95,7 +95,7 @@ def _multi_lig_anchor(qd_series, ligands, path, anchor, allignment) -> np.ndarra
                 assert atoms
             except AssertionError as ex:
                 raise MoleculeError(f'Failed to identify {to_symbol(atnum)!r} in '
-                                    f'{qd.get_formula()!r}') from ex
+                                    f'{get_formula(qd)!r}') from ex
 
             coords = Molecule.as_array(None, atom_subset=atoms)
             qd.properties.dummies = np.array(coords, ndmin=2, dtype=float)
diff --git a/CAT/utils.py b/CAT/utils.py
index cb749c09..ddce6c85 100644
--- a/CAT/utils.py
+++ b/CAT/utils.py
@@ -39,7 +39,7 @@
 from os.path import join, isdir, isfile, exists
 from itertools import cycle, chain, repeat
 from contextlib import redirect_stdout
-from collections import abc
+from collections import abc, Counter
 from typing import (
     Iterable, Union, TypeVar, Mapping, Type, Generator, Iterator, Optional,
     Any, NoReturn, Dict, overload, Callable, NamedTuple, Tuple,
@@ -567,3 +567,13 @@ class AllignmentTup(NamedTuple):
 
     kind: AllignmentEnum
     invert: bool
+
+
+def get_formula(mol: Molecule) -> str:
+    """Return the formula of **mol** with its elements in alphabetical order.
+
+    Backport of the PLAMS <= 1.5.1 ``Molecule.get_formula`` method; later PLAMS
+    versions use the Hill system (which prioritizes ``CH`` pairs) instead.
+    """
+    symbol_count = Counter(atom.symbol for atom in mol)
+    return "".join(f"{sym}{symbol_count[sym]}" for sym in sorted(symbol_count))
diff --git a/tests/test_gen_job_manager.py b/tests/test_gen_job_manager.py
index f41aea96..61893532 100644
--- a/tests/test_gen_job_manager.py
+++ b/tests/test_gen_job_manager.py
@@ -7,6 +7,7 @@
 from assertionlib import assertion
 
 from CAT.gen_job_manager import GenJobManager
+from CAT.utils import get_formula
 
 SETTINGS = Settings({'counter_len': 3, 'hashing': 'input', 'remove_empty_directories': True})
 PATH = join('tests', 'test_files')
@@ -50,7 +51,7 @@ def test_load_job() -> None:
     assertion.isinstance(job.settings, Settings)
     assertion.eq(job.depend, [])
     assertion.eq(job._dont_pickle, [])
-    assertion.eq(job.molecule.get_formula(), 'C78Cd68H182O26Se55')
+    assertion.eq(get_formula(job.molecule), 'C78Cd68H182O26Se55')
 
 
 def _test_check_hash() -> None:
diff --git a/tests/test_mol_import.py b/tests/test_mol_import.py
index 032981a0..78d7a5e5 100644
--- a/tests/test_mol_import.py
+++ b/tests/test_mol_import.py
@@ -9,6 +9,7 @@
 import scm.plams.interfaces.molecule.rdkit as molkit
 from assertionlib import assertion
 
+from CAT.utils import get_formula
 from CAT.data_handling.mol_import import (
     read_mol_xyz, read_mol_pdb, read_mol_mol, read_mol_smiles, read_mol_plams, read_mol_rdkit,
     read_mol_folder, read_mol_txt, get_charge_dict, set_mol_prop, canonicalize_mol
@@ -92,7 +93,7 @@ def test_read_mol_folder() -> None:
     """Test :func:`CAT.data_handling.validate_input.read_mol_folder`."""
     mol_dict = Settings({'mol': PATH, 'path': PATH, 'guess_bonds': True, 'is_core': False})
     _mol_list = read_mol_folder(mol_dict)
-    mol_list = [mol for mol in _mol_list if mol.get_formula() == 'C1H4O1']
+    mol_list = [mol for mol in _mol_list if get_formula(mol) == 'C1H4O1']
 
     for mol in mol_list:
         assertion.isinstance(mol, Molecule)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 4dd3434d..ea835bc4 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,11 +1,12 @@
 """Tests for :mod:`CAT.utils`."""
 
 import os
-from os.path import join
+import re
+from pathlib import Path
 
 from unittest import mock
 
-from scm.plams import config
+from scm.plams import config, Molecule
 from scm.plams.interfaces.adfsuite.ams import AMSJob
 from scm.plams.interfaces.adfsuite.adf import ADFJob
 from scm.plams.interfaces.thirdparty.orca import ORCAJob
@@ -15,10 +16,16 @@
 from assertionlib import assertion
 
 from CAT.utils import (
-    type_to_string, dict_concatenate, get_template, validate_path, check_sys_var, restart_init
+    type_to_string,
+    dict_concatenate,
+    get_template,
+    validate_path,
+    check_sys_var,
+    restart_init,
+    get_formula,
 )
 
-PATH = join('tests', 'test_files')
+PATH = Path('tests') / 'test_files'
 FOLDER = 'test_plams_workdir'
 
 
@@ -60,8 +67,8 @@ def test_validate_path() -> None:
     assertion.eq(validate_path(''), os.getcwd())
     assertion.eq(validate_path('.'), os.getcwd())
     assertion.eq(validate_path(PATH), PATH)
-    assertion.assert_(validate_path, join(PATH, 'bob'), exception=FileNotFoundError)
-    assertion.assert_(validate_path, join(PATH, 'Methanol.xyz'), exception=NotADirectoryError)
+    assertion.assert_(validate_path, PATH / 'bob', exception=FileNotFoundError)
+    assertion.assert_(validate_path, PATH / 'Methanol.xyz', exception=NotADirectoryError)
 
 
 @mock.patch.dict(
@@ -80,3 +87,9 @@ def test_restart_init() -> None:
 
     _hash = '0da9b13507022986d26bbc57b4c366cf1ead1fe70ff750e071e79e393b14dfb5'
     assertion.contains(manager.hashes, _hash)
+
+
+def test_get_formula() -> None:
+    """Test :func:`CAT.utils.get_formula`."""
+    matches = re.findall(r"([a-zA-Z]+)[0-9]+", get_formula(Molecule(PATH / "multi_ligand.pdb")))
+    assertion.eq(matches, ["C", "Cd", "F", "H", "O", "Se"])