Skip to content

Commit

Permalink
partial sanitization, allow iterables with Chem.Mol objects
Browse files Browse the repository at this point in the history
  • Loading branch information
eloyfelix committed Oct 6, 2023
1 parent 2948019 commit a702b1b
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 19 deletions.
41 changes: 28 additions & 13 deletions FPSim2/io/chem.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,33 +90,48 @@ def build_fp(rdmol, fp_type, fp_params, mol_id):
return fp


def load_molecule(mol_string: str) -> Chem.Mol:
def partial_sanitization(rdmol: Chem.Mol) -> Chem.Mol:
    """Apply a lightweight sanitization to an RDKit molecule.

    Implements the "partialSanit5" recipe from
    http://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html:
    the property cache is updated and a fast ring search is run; no other
    sanitization step is performed here.

    Parameters
    ----------
    rdmol : Chem.Mol
        Molecule to process. Both calls are made directly on this object.

    Returns
    -------
    rdmol : Chem.Mol
        The same molecule object that was passed in.
    """
    # Order matters: refresh the property cache first, then find rings.
    rdmol.UpdatePropertyCache()
    Chem.FastFindRings(rdmol)
    return rdmol


def load_molecule(molecule: Any) -> Chem.Mol:
"""Reads SMILES, molblock or InChI and returns a RDKit mol.
Parameters
----------
mol_string : str
SMILES, molblock or InChI.
molecule : Any
Chem.Mol, SMILES, molblock or InChI.
Returns
-------
mol: ROMol
RDKit molecule.
"""
if re.search(MOLFILE_RE, mol_string, flags=re.MULTILINE):
rdmol = Chem.MolFromMolBlock(mol_string)
elif mol_string.startswith("InChI="):
if isinstance(molecule, Chem.Mol):
# RDKit mol object parsed by the user
# no further sanitization will be done
return molecule
if re.search(MOLFILE_RE, molecule, flags=re.MULTILINE):
rdmol = Chem.MolFromMolBlock(molecule, sanitize=False)
elif molecule.startswith("InChI="):
try:
rdmol = Chem.MolFromInchi(mol_string)
rdmol = Chem.MolFromInchi(molecule, sanitize=False)
except:
rdmol = None
else:
rdmol = Chem.MolFromSmiles(mol_string)
rdmol = Chem.MolFromSmiles(molecule, sanitize=False)
try:
rdmol = partial_sanitization(rdmol)
except:
rdmol = None
return rdmol


def get_fp_length(fp_type: str, fp_params: Dict[str, Any]) -> int:
"""Returns the FP length given the name of the FP function and it's parameters.
"""Returns the FP length given the name of the FP function and its parameters.
Parameters
----------
Expand Down Expand Up @@ -193,23 +208,23 @@ def it_mol_supplier(
"""
for new_mol_id, mol in enumerate(iterable, 1):
if isinstance(mol, str):
mol_string = mol
molecule = mol
mol_id = new_mol_id
else:
if gen_ids:
mol_string = mol[0]
molecule = mol[0]
mol_id = new_mol_id
else:
try:
mol_string = mol[0]
molecule = mol[0]
mol_id = int(mol[1])
except ValueError:
raise Exception(
"FPSim only supports integer ids for molecules, "
"cosinder setting gen_ids=True when running "
"create_db_file to autogenerate them."
)
rdmol = load_molecule(mol_string)
rdmol = load_molecule(molecule)
if rdmol:
yield mol_id, rdmol
else:
Expand Down
2 changes: 1 addition & 1 deletion docsrc/source/user_guide/limitations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Limitations
===========

Due to it's simplicity FPSim2 can only use integer ids to store the fingerprints, however it can generate new ids for the provided molecules using gen_ids flag.
Due to its simplicity FPSim2 can only use integer ids to store the fingerprints, however it can generate new ids for the provided molecules using gen_ids flag.

>>> create_db_file('mols.smi', 'mols.h5', 'Morgan', {'radius': 2, 'nBits': 2048}, gen_ids=True)

Expand Down
15 changes: 10 additions & 5 deletions tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
get_mol_supplier,
get_bounds_range,
build_fp,
partial_sanitization,
)
from rdkit import Chem
import tables as tb
Expand Down Expand Up @@ -36,22 +37,26 @@
def test_suppliers():
smi_file = os.path.join(TESTS_DIR, "data/10mols.smi")
smi_mols = [
Chem.MolToSmiles(x[1]) for x in smi_mol_supplier(smi_file, gen_ids=False)
partial_sanitization(Chem.MolToSmiles(x[1], sanitize=False))
for x in smi_mol_supplier(smi_file, gen_ids=False)
]
sdf_file = os.path.join(TESTS_DIR, "data/10mols.sdf")
sdf_mols = [
Chem.MolToSmiles(x[1])
partial_sanitization(Chem.MolToSmiles(x[1], sanitize=False))
for x in sdf_mol_supplier(sdf_file, gen_ids=False, mol_id_prop="mol_id")
]
sdfgz_file = os.path.join(TESTS_DIR, "data/10mols.sdf.gz")
sdfgz_mols = [
Chem.MolToSmiles(x[1])
partial_sanitization(Chem.MolToSmiles(x[1], sanitize=False))
for x in sdf_mol_supplier(sdfgz_file, gen_ids=False, mol_id_prop="mol_id")
]
it_mols = [
Chem.MolToSmiles(x[1]) for x in it_mol_supplier(smiles_list, gen_ids=True)
partial_sanitization(Chem.MolToSmiles(x[1], sanitize=False))
for x in it_mol_supplier(smiles_list, gen_ids=True)
]
assert smi_mols == sdf_mols == sdfgz_mols == it_mols
# check when iterable with Chem.Mol
it_rd_mols = [x for x in it_mol_supplier(it_mols, gen_ids=True)]
assert smi_mols == sdf_mols == sdfgz_mols == it_mols == it_rd_mols


def test_get_mol_supplier():
Expand Down

0 comments on commit a702b1b

Please sign in to comment.