Skip to content

Commit

Permalink
Synthon search fp bug (rdkit#8086)
Browse files Browse the repository at this point in the history
* Fix bug - the query fragments' connector patterns weren't being matched against the synthon connector patterns.

* Tiny tweak.

* Typo in comment.

---------

Co-authored-by: David Cosgrove <[email protected]>
  • Loading branch information
DavidACosgrove and David Cosgrove authored Dec 12, 2024
1 parent 607912a commit 403cd55
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 18 deletions.
23 changes: 16 additions & 7 deletions Code/GraphMol/SynthonSpaceSearch/SynthonSet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ void SynthonSet::readFromDBStream(std::istream &is, std::uint32_t) {
d_synthonFPs[i][j] = std::make_unique<ExplicitBitVect>(fString);
}
}
// So that d_synthConnPatts is filled in. Next time the binary file format
// is updated they can be put in it, but they're cheap enough to calculate
// so leave it for now.
assignConnectorsUsed();
}

void SynthonSet::enumerateToStream(std::ostream &os) const {
Expand Down Expand Up @@ -322,13 +326,18 @@ void SynthonSet::assignConnectorsUsed() {
}
}
d_connectors.resize(MAX_CONNECTOR_NUM + 1, false);
for (const auto &reagSet : d_synthons) {
for (const auto &reag : reagSet) {
for (size_t i = 0; i < MAX_CONNECTOR_NUM; ++i) {
if (std::regex_search(reag->getSmiles(), connRegexs[2 * i]) ||
std::regex_search(reag->getSmiles(), connRegexs[2 * i + 1])) {
d_connectors.set(i + 1);
}
d_synthConnPatts.clear();
for (const auto &synthSet : d_synthons) {
// We only need to look at the first synthon in each set, as all the
// synthons in a set should have the same connector pattern.
d_synthConnPatts.emplace_back();
d_synthConnPatts.back().resize(MAX_CONNECTOR_NUM + 1, false);
const auto &reag = synthSet.front();
for (size_t i = 0; i < MAX_CONNECTOR_NUM; ++i) {
if (std::regex_search(reag->getSmiles(), connRegexs[2 * i]) ||
std::regex_search(reag->getSmiles(), connRegexs[2 * i + 1])) {
d_connectors.set(i + 1);
d_synthConnPatts.back().set(i + 1);
}
}
}
Expand Down
9 changes: 8 additions & 1 deletion Code/GraphMol/SynthonSpaceSearch/SynthonSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
return d_synthons;
}
// Returns a const reference to d_connectors, the bitset recording which
// connectors are present in this SynthonSet's synthons.
const boost::dynamic_bitset<> &getConnectors() const { return d_connectors; }
// Returns the connector patterns held in d_synthConnPatts: one bitset per
// synthon set, showing which connectors that set's synthons use.
// The result is a const reference to the member, so bind it with
// `const auto &` at the call site to avoid copying the whole vector.
const std::vector<boost::dynamic_bitset<>> &getSynthonConnectorPatterns()
const {
return d_synthConnPatts;
}
const std::vector<std::shared_ptr<ROMol>> &getConnectorRegions() const;

const std::unique_ptr<ExplicitBitVect> &getConnRegFP() const;
Expand Down Expand Up @@ -98,8 +102,11 @@ class RDKIT_SYNTHONSPACESEARCH_EXPORT SynthonSet {
// The lists of synthons. A product of the reaction is created by
// combining 1 synthon from each of the outer vectors.
std::vector<std::vector<std::unique_ptr<Synthon>>> d_synthons;
// 4 bits showing which connectors are present in the synthons.
// 4 bits showing which connectors are present in at least one of the
// synthon sets, i.e. the union over all the sets.
boost::dynamic_bitset<> d_connectors;
// and the connector patterns for each synthon set.
std::vector<boost::dynamic_bitset<>> d_synthConnPatts;

// The connector regions of a molecule are the pieces of up to 3 bonds from
// a connector atom into the molecule. We keep a vector of all the ones
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,15 @@ std::vector<boost::dynamic_bitset<>> getHitSynthons(
}
}
if (!fragMatched) {
// No synthons matched this fragment, so the whole fragment set is a
// bust.
synthonsToUse.clear();
return synthonsToUse;
}
}

// Fill in any synthons where they all didn't match.
// Fill in any synthon sets where nothing matched because there were
// fewer fragments than synthon sets.
details::expandBitSet(synthonsToUse);
return synthonsToUse;
}
Expand Down Expand Up @@ -89,10 +92,12 @@ std::vector<SynthonSpaceHitSet> SynthonSpaceFingerprintSearcher::searchFragSet(
if (fragSet.size() > reaction->getSynthons().size()) {
continue;
}
auto synthConnPatts = reaction->getSynthonConnectorPatterns();

// Need to try all combinations of synthon orders.
auto synthonOrders =
details::permMFromN(fragSet.size(), reaction->getSynthons().size());
for (const auto &so : synthonOrders) {
for (const auto &synthonOrder : synthonOrders) {
// Get all the possible permutations of connector numbers compatible with
// the number of synthon sets in this reaction. So if the
// fragmented molecule is C[1*].N[2*] and there are 3 synthon sets
Expand All @@ -102,27 +107,30 @@ std::vector<SynthonSpaceHitSet> SynthonSpaceFingerprintSearcher::searchFragSet(
fragSet, conns, reaction->getConnectors());

for (auto &connComb : connCombs) {
// All the fragment connectors must match something in the corresponding
// synthon.
// Make sure that for this connector combination, the synthons in this
// order have something similar. All query fragment connectors must
// match something in the corresponding synthon. The synthon can
// have unused connectors.
auto connCombConnPatterns = details::getConnectorPatterns(connComb);
bool skip = false;
for (size_t i = 0; i < connCombConnPatterns.size(); ++i) {
if ((connCombConnPatterns[i] & connPatterns[i]).count() <
connPatterns[i].count()) {
if ((connCombConnPatterns[i] & synthConnPatts[synthonOrder[i]])
.count() < connCombConnPatterns[i].count()) {
skip = true;
break;
}
}
if (skip) {
continue;
}
// It appears that for Morgan fingerprints, the isotope numbers are

// It appears that for fingerprints, the isotope numbers are
// ignored so there's no need to worry about the connector numbers
// in the fingerprints.
auto theseSynthons = getHitSynthons(
fragFPs,
getParams().similarityCutoff - getParams().fragSimilarityAdjuster,
reaction, so);
reaction, synthonOrder);
if (!theseSynthons.empty()) {
const size_t numHits = std::accumulate(
theseSynthons.begin(), theseSynthons.end(), 1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ TEST_CASE("FP Small tests") {
"C[C@@H]1CC(NC(=O)NC2COC2)CN(C(=O)c2nccnc2F)C1",
};

std::vector<size_t> expNumHits{2, 4, 4};
std::vector<size_t> expNumHits{2, 3, 4};

for (size_t i = 0; i < libNames.size(); i++) {
SynthonSpace synthonspace;
Expand All @@ -112,7 +112,18 @@ TEST_CASE("FP Small tests") {
for (const auto &r : names) {
fullSmis.insert(MolToSmiles(*mols[r]));
}
CHECK(resSmis == fullSmis);
if (i != 1) {
CHECK(resSmis == fullSmis);
} else {
// In the triazole library, one of the hits found by the brute force
// method (triazole-1_1-1_2-2_3-1) is missed by the SynthonSpaceSearch
// because it requires that the fragment [1*]n([3*])C1CCCC1 is similar
// to synthon c1ccccc1-n([3*])[1*] which it isn't. Instead, make sure
// all the ones that are found are in the brute force results.
for (const auto &rs : resSmis) {
CHECK(fullSmis.find(rs) != fullSmis.end());
}
}
}
}

Expand Down

0 comments on commit 403cd55

Please sign in to comment.