1 #!/bin/env python
2 #
3 # File: RDKitPerformSynthonSpaceSearch.py
4 # Author: Manish Sud <msud@san.rr.com>
5 #
6 # Acknowledgment: Dave Cosgrove
7 #
8 # Copyright (C) 2026 Manish Sud. All rights reserved.
9 #
10 # The functionality available in this script is implemented using RDKit, an
11 # open source toolkit for cheminformatics developed by Greg Landrum.
12 #
13 # This file is part of MayaChemTools.
14 #
15 # MayaChemTools is free software; you can redistribute it and/or modify it under
16 # the terms of the GNU Lesser General Public License as published by the Free
17 # Software Foundation; either version 3 of the License, or (at your option) any
18 # later version.
19 #
20 # MayaChemTools is distributed in the hope that it will be useful, but without
21 # any warranty; without even the implied warranty of merchantability of fitness
22 # for a particular purpose. See the GNU Lesser General Public License for more
23 # details.
24 #
25 # You should have received a copy of the GNU Lesser General Public License
26 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
27 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
28 # Boston, MA, 02111-1307, USA.
29 #
30
31 from __future__ import print_function
32
33 import os
34 import sys
35 import time
36 import re
37 import multiprocessing as mp
38
39 # RDKit imports...
40 try:
41 from rdkit import rdBase
42 from rdkit import Chem
43 from rdkit.Chem import AllChem
44 from rdkit.Chem import rdSynthonSpaceSearch
45 from rdkit.Chem import rdFingerprintGenerator
46 from rdkit.Chem import rdRascalMCES
47 except ImportError as ErrMsg:
48 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
49 sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
50 sys.exit(1)
51
52 # MayaChemTools imports...
53 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
54 try:
55 from docopt import docopt
56 import MiscUtil
57 import RDKitUtil
58 except ImportError as ErrMsg:
59 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
60 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
61 sys.exit(1)
62
# Base name of the script as invoked; used in start-up and completion messages.
ScriptName = os.path.basename(sys.argv[0])

# Command line options keyed by option name (e.g. "--list", "--infile").
# NOTE(review): presumably populated by RetrieveOptions() - confirm.
Options = {}

# Processed and validated option values shared by the functions in this script.
OptionsInfo = {}
66
67
def main():
    """Run the script from start to finish.

    Prints a start-up banner, retrieves the command line options and then
    either lists information about the synthon space (when the "--list"
    option is set) or processes the options and performs the requested
    synthon space search. The total elapsed time is reported at the end.
    """

    Banner = "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (
        ScriptName,
        rdBase.rdkitVersion,
        MiscUtil.GetMayaChemToolsVersion(),
        time.asctime(),
    )
    MiscUtil.PrintInfo(Banner)

    # Capture the start times needed to report the elapsed time at the end...
    (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()

    # Retrieve command line arguments and options...
    RetrieveOptions()

    ListOnly = Options and Options["--list"]
    if not ListOnly:
        # Validate options and perform the search...
        ProcessOptions()
        PerformSynthonSpaceSearch()
    else:
        # Only list information about the synthon space...
        ProcessListSynthonSearchSpace()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)

    ElapsedTime = MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)
    MiscUtil.PrintInfo("Total time: %s" % ElapsedTime)
93
94
def PerformSynthonSpaceSearch():
    """Dispatch to the handler for the mode specified using "--mode" option.

    The mode name is matched in full, ignoring case. An error is reported
    for a mode which doesn't correspond to any handler.
    """

    Mode = OptionsInfo["Mode"]

    # Anchored patterns paired with the functions implementing each mode...
    ModeHandlers = (
        ("^FingerprintsGeneration$", GenerateFingerprints),
        ("^BinaryDBFileGeneration$", GenerateBinaryDatabaseFile),
        ("^LibraryEnumeration$", PerformLibraryEnumeration),
        ("^RascalSimilaritySearch$", PerformRascalSimilaritySearch),
        ("^SimilaritySearch$", PerformSimilaritySearch),
        ("^SubstructureSearch$", PerformSubtructureSearch),
    )

    for Pattern, Handler in ModeHandlers:
        if re.match(Pattern, Mode, re.I):
            Handler()
            return

    MiscUtil.PrintError('The value specified, %s, for option "--mode" is not valid.' % Mode)
113
114
def GenerateFingerprints():
    """Build fingerprints for synthons and write out a binary database file.

    Reads the synthon space from the input file, builds the synthon
    fingerprints using the generator for the specified fingerprints type,
    reports the time taken and writes the synthon space to the output file.
    """

    MiscUtil.PrintInfo("\nGenerating fingerprints (Mode: %s)..." % OptionsInfo["Mode"])

    SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])

    # Time only the fingerprints generation step...
    TimerStart = time.perf_counter()

    MiscUtil.PrintInfo("\nGenerating fingerprints (Type: %s)..." % OptionsInfo["SpecifiedFingerprints"])
    Generator = InitializeFingerprintsGenerator()
    SynthonSpace.BuildSynthonFingerprints(Generator)

    MiscUtil.PrintInfo("Total time: %.2f secs" % (time.perf_counter() - TimerStart))

    WriteSynthonSpaceBinaryFile(SynthonSpace, OptionsInfo["Outfile"])
132
133
def GenerateBinaryDatabaseFile():
    """Read the synthon space input file and write it out as a binary file."""

    MiscUtil.PrintInfo("\nGenerating binary database file (Mode: %s)..." % OptionsInfo["Mode"])

    Infile, Outfile = OptionsInfo["Infile"], OptionsInfo["Outfile"]
    WriteSynthonSpaceBinaryFile(ReadSynthonSpaceFile(Infile), Outfile)
141
142
def PerformLibraryEnumeration():
    """Enumerate the library for the synthon space and write it to the output file."""

    MiscUtil.PrintInfo("\nPerforming library enumeration (Mode: %s)..." % OptionsInfo["Mode"])

    SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])

    Outfile = OptionsInfo["Outfile"]
    MiscUtil.PrintInfo("\nWriting file %s ..." % Outfile)
    SynthonSpace.WriteEnumeratedFile(Outfile)
152
153
def PerformSimilaritySearch():
    """Run a fingerprints based similarity search for each query molecule.

    Query molecules are read from the query file. Missing or empty
    molecules are counted and skipped. For each valid query, a line is
    written to the hits info file. Unless only hits are being counted, the
    hit molecules are also written either to the single output file or to
    a separate output file for each query molecule.
    """

    WriteToSingleFile = OptionsInfo["SingleOutFileMode"]
    OnlyCountHits = OptionsInfo["CountHitsMode"]
    SearchParams = OptionsInfo["SynthonSearchParams"]

    MiscUtil.PrintInfo(
        "\nPerforming similiarity search (Fingerprints: %s; SimilarityCutoff: %s; MaxHits: %s)..."
        % (OptionsInfo["SpecifiedFingerprints"], SearchParams["SimilarityCutoff"], SearchParams["MaxHits"])
    )

    # Synthon space along with the fingerprints generator used for queries...
    SynthonSpace, FPGenerator = SetupSynthonSpaceForSimilaritySearch()

    # Writers for the hit molecules and the hits info...
    SingleOutFileWriter, HitsInfoWriter = SetupOutfileWriters()

    MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["QueryFile"])
    QueryMols = RDKitUtil.ReadMolecules(OptionsInfo["QueryFile"], **OptionsInfo["QueryFileParams"])

    QueryMolCount = 0
    ValidQueryMolCount = 0
    for QueryMolCount, QueryMol in enumerate(QueryMols, 1):
        if QueryMol is None or RDKitUtil.IsMolEmpty(QueryMol):
            continue

        ValidQueryMolCount += 1
        QueryMolName = RDKitUtil.GetMolName(QueryMol, QueryMolCount)

        HitMols, HitMolsCount, MaxPossibleHits = PerformSynthonSpaceSimilaritySearch(
            SynthonSpace, FPGenerator, QueryMol
        )

        if OnlyCountHits:
            # No hit structures are written out in this mode...
            WriteHitsInfo(HitsInfoWriter, [QueryMolName, MaxPossibleHits])
            continue

        WriteHitsInfo(HitsInfoWriter, [QueryMolName, HitMolsCount, MaxPossibleHits])

        if WriteToSingleFile:
            WriteMolecules(SingleOutFileWriter, QueryMolName, HitMols)
        else:
            # A separate output file is written and closed for each query...
            Writer = SetupMoleculeWriter(WriteToSingleFile, QueryMolCount)
            WriteMolecules(Writer, QueryMolName, HitMols)
            if Writer is not None:
                Writer.close()

    for OpenWriter in (SingleOutFileWriter, HitsInfoWriter):
        if OpenWriter is not None:
            OpenWriter.close()

    IgnoredQueryMolCount = QueryMolCount - ValidQueryMolCount
    MiscUtil.PrintInfo("\nTotal number of query molecules: %d" % QueryMolCount)
    MiscUtil.PrintInfo("Number of valid query molecules: %d" % ValidQueryMolCount)
    MiscUtil.PrintInfo("Number of ignored query molecules: %d" % IgnoredQueryMolCount)
215
216
def PerformSubtructureSearch():
    """Run a substructure search for each query pattern molecule.

    Query patterns are taken from OptionsInfo["QueryPatternMols"] and named
    "Pattern<Number>" in the order they are processed. For each pattern, a
    line is written to the hits info file. Unless only hits are being
    counted, the hit molecules are also written either to the single output
    file or to a separate output file for each pattern.
    """

    WriteToSingleFile = OptionsInfo["SingleOutFileMode"]
    OnlyCountHits = OptionsInfo["CountHitsMode"]
    SearchParams = OptionsInfo["SynthonSearchParams"]

    MiscUtil.PrintInfo("\nPerforming substructue search (MaxHits: %s)..." % (SearchParams["MaxHits"]))

    SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])

    # Writers for the hit molecules and the hits info...
    SingleOutFileWriter, HitsInfoWriter = SetupOutfileWriters()

    MiscUtil.PrintInfo("\nProcessing query patterns...")

    QueryMolCount = 0
    for QueryMolCount, QueryMol in enumerate(OptionsInfo["QueryPatternMols"], 1):
        QueryMolName = "Pattern%s" % QueryMolCount

        HitMols, HitMolsCount, MaxPossibleHits = PerformSynthonSpaceSubstructureSearch(SynthonSpace, QueryMol)

        if OnlyCountHits:
            # No hit structures are written out in this mode...
            WriteHitsInfo(HitsInfoWriter, [QueryMolName, MaxPossibleHits])
            continue

        WriteHitsInfo(HitsInfoWriter, [QueryMolName, HitMolsCount, MaxPossibleHits])

        if WriteToSingleFile:
            WriteMolecules(SingleOutFileWriter, QueryMolName, HitMols)
        else:
            # A separate output file is written and closed for each pattern...
            Writer = SetupMoleculeWriter(WriteToSingleFile, QueryMolCount)
            WriteMolecules(Writer, QueryMolName, HitMols)
            if Writer is not None:
                Writer.close()

    for OpenWriter in (SingleOutFileWriter, HitsInfoWriter):
        if OpenWriter is not None:
            OpenWriter.close()

    MiscUtil.PrintInfo("\nTotal number of query patterns: %d" % QueryMolCount)
261
262
def PerformRascalSimilaritySearch():
    """Run a RASCAL similarity search for each query molecule.

    Query molecules are read from the query file. Missing or empty
    molecules are counted and skipped. For each valid query, a line is
    written to the hits info file. Unless only hits are being counted, the
    hit molecules are also written either to the single output file or to
    a separate output file for each query molecule.
    """

    WriteToSingleFile = OptionsInfo["SingleOutFileMode"]
    OnlyCountHits = OptionsInfo["CountHitsMode"]
    RascalParams = OptionsInfo["RascalSearchParams"]
    SearchParams = OptionsInfo["SynthonSearchParams"]

    MiscUtil.PrintInfo(
        "\nPerforming RASCAL similiarity search (SimilarityThreshold: %s; MaxHits: %s)..."
        % (RascalParams["SimilarityThreshold"], SearchParams["MaxHits"])
    )

    SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])

    # Writers for the hit molecules and the hits info...
    SingleOutFileWriter, HitsInfoWriter = SetupOutfileWriters()

    MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["QueryFile"])
    QueryMols = RDKitUtil.ReadMolecules(OptionsInfo["QueryFile"], **OptionsInfo["QueryFileParams"])

    QueryMolCount = 0
    ValidQueryMolCount = 0
    for QueryMolCount, QueryMol in enumerate(QueryMols, 1):
        if QueryMol is None or RDKitUtil.IsMolEmpty(QueryMol):
            continue

        ValidQueryMolCount += 1
        QueryMolName = RDKitUtil.GetMolName(QueryMol, QueryMolCount)

        HitMols, HitMolsCount, MaxPossibleHits = PerformSynthonSpaceRascalSimilaritySearch(SynthonSpace, QueryMol)

        if OnlyCountHits:
            # No hit structures are written out in this mode...
            WriteHitsInfo(HitsInfoWriter, [QueryMolName, MaxPossibleHits])
            continue

        WriteHitsInfo(HitsInfoWriter, [QueryMolName, HitMolsCount, MaxPossibleHits])

        if WriteToSingleFile:
            WriteMolecules(SingleOutFileWriter, QueryMolName, HitMols)
        else:
            # A separate output file is written and closed for each query...
            Writer = SetupMoleculeWriter(WriteToSingleFile, QueryMolCount)
            WriteMolecules(Writer, QueryMolName, HitMols)
            if Writer is not None:
                Writer.close()

    for OpenWriter in (SingleOutFileWriter, HitsInfoWriter):
        if OpenWriter is not None:
            OpenWriter.close()

    IgnoredQueryMolCount = QueryMolCount - ValidQueryMolCount
    MiscUtil.PrintInfo("\nTotal number of query molecules: %d" % QueryMolCount)
    MiscUtil.PrintInfo("Number of valid query molecules: %d" % ValidQueryMolCount)
    MiscUtil.PrintInfo("Number of ignored query molecules: %d" % IgnoredQueryMolCount)
319
320
def ProcessListSynthonSearchSpace():
    """List a summary of the synthon space and its fingerprints type.

    The input file specified using "-i, --infile" option is validated,
    stored in OptionsInfo["Infile"] and read before listing information.
    """

    MiscUtil.PrintInfo("\nListing information...")

    Infile = Options["--infile"]

    # Validate path and extension of the input file...
    MiscUtil.ValidateOptionFilePath("-i, --infile", Infile)
    MiscUtil.ValidateOptionFileExt("-i, --infile", Infile, "txt csv spc")

    OptionsInfo["Infile"] = Infile

    SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])

    MiscUtil.PrintInfo("\nSummary of synthon space:\n")
    SynthonSpace.Summarise()

    ListSynthonSpaceFingerprintsType(SynthonSpace)
339
340
def PerformSynthonSpaceSimilaritySearch(SynthonSpace, FPGenerator, QueryMol):
    """Search the synthon space for molecules similar to a query molecule.

    Arguments:
        SynthonSpace: Synthon space object providing FingerprintSearch().
        FPGenerator: Fingerprints generator passed to the search.
        QueryMol: Query molecule.

    Returns:
        tuple: (HitMols, HitMolsCount, MaxPossibleHits) as returned by
            GetSynthonSpaceHitMolecules().
    """

    try:
        SearchResults = SynthonSpace.FingerprintSearch(
            QueryMol, FPGenerator, params=OptionsInfo["RDKitSynthonSearchParams"]
        )
    except Exception as ErrMsg:
        MiscUtil.PrintInfo("")
        MiscUtil.PrintError("Failed to perform synthon space fingerprints seach:\n%s\n" % (ErrMsg))

    return GetSynthonSpaceHitMolecules(SearchResults)
353
354
def PerformSynthonSpaceRascalSimilaritySearch(SynthonSpace, QueryMol):
    """Search the synthon space for a query molecule using RASCAL similarity.

    Arguments:
        SynthonSpace: Synthon space object providing RascalSearch().
        QueryMol: Query molecule.

    Returns:
        tuple: (HitMols, HitMolsCount, MaxPossibleHits) as returned by
            GetSynthonSpaceHitMolecules().
    """

    try:
        SearchResults = SynthonSpace.RascalSearch(
            QueryMol,
            OptionsInfo["RDKitRascalSearchParams"],
            params=OptionsInfo["RDKitSynthonSearchParams"],
        )
    except Exception as ErrMsg:
        MiscUtil.PrintInfo("")
        MiscUtil.PrintError("Failed to perform synthon space RASCAL similarity seach:\n%s\n" % (ErrMsg))

    return GetSynthonSpaceHitMolecules(SearchResults)
369
370
def PerformSynthonSpaceSubstructureSearch(SynthonSpace, QueryMol):
    """Search the synthon space for molecules matching a query pattern.

    Arguments:
        SynthonSpace: Synthon space object providing SubstructureSearch().
        QueryMol: Query pattern molecule.

    Returns:
        tuple: (HitMols, HitMolsCount, MaxPossibleHits) as returned by
            GetSynthonSpaceHitMolecules().
    """

    try:
        SearchResults = SynthonSpace.SubstructureSearch(
            QueryMol,
            substructMatchParams=OptionsInfo["RDKitSubstructureMatchParams"],
            params=OptionsInfo["RDKitSynthonSearchParams"],
        )
    except Exception as ErrMsg:
        MiscUtil.PrintInfo("")
        MiscUtil.PrintError("Failed to perform synthon space substructure seach:\n%s\n" % (ErrMsg))

    return GetSynthonSpaceHitMolecules(SearchResults)
387
388
def GetSynthonSpaceHitMolecules(Results):
    """Retrieve hit molecules and hit counts from synthon space search results.

    Arguments:
        Results: Search results object providing GetHitMolecules() and
            GetMaxNumResults().

    Returns:
        tuple: (HitMols, HitMolsCount, MaxPossibleHits). HitMols and
            HitMolsCount are both None when there are no hit molecules.
    """

    Hits = Results.GetHitMolecules()
    NumHits = len(Hits)
    MaxPossibleHits = Results.GetMaxNumResults()

    if NumHits == 0:
        return (None, None, MaxPossibleHits)

    return (Hits, NumHits, MaxPossibleHits)
402
403
def SetupSynthonSpaceForSimilaritySearch():
    """Read the synthon space and set up a fingerprints generator.

    An error is reported when the synthon space doesn't contain any
    fingerprints. A warning is issued when the fingerprints type in the
    synthon space doesn't appear to match the specified fingerprints.

    Returns:
        tuple: (SynthonSpace, FPGenerator).
    """

    Infile = OptionsInfo["Infile"]
    SynthonSpace = ReadSynthonSpaceFile(Infile)

    FPType, _FPInfo = GetSynthonFingerprintsInfo(SynthonSpace)
    if FPType is None:
        MiscUtil.PrintInfo("")
        MiscUtil.PrintError(
            "The synthon space input file, %s, doesn't contain any fingerprints. You must specify a synthon space binary database file containing appropriate fingerprints for similarity search.."
            % OptionsInfo["Infile"]
        )

    # The specified name is looked up anywhere in the detected type name...
    TypeMatches = re.search("%s" % OptionsInfo["SpecifiedFingerprints"], FPType, re.I)
    if not TypeMatches:
        MiscUtil.PrintInfo("")
        MiscUtil.PrintWarning(
            'The fingerprints type, %s, in synthon space input file, %s, doesn\'t appear to match fingerprints, %s, specified using "--fingerprints" option for similarity search.'
            % (FPType, OptionsInfo["Infile"], OptionsInfo["SpecifiedFingerprints"])
        )

    return (SynthonSpace, InitializeFingerprintsGenerator())
427
428
def InitializeFingerprintsGenerator():
    """Initialize a fingerprints generator for the specified fingerprints.

    The fingerprints type is taken from OptionsInfo["SpecifiedFingerprints"]
    and matched in full, ignoring case. The generator parameters are taken
    from the corresponding entry in OptionsInfo["FingerprintsParamsInfo"].

    Returns:
        object: RDKit fingerprints generator, or None when the fingerprints
            type is not valid and the error reporting function returns.
    """

    FPGenerator = None
    SpecifiedFingerprints = OptionsInfo["SpecifiedFingerprints"]
    if re.match("^AtomPairs$", SpecifiedFingerprints, re.I):
        FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["AtomPairs"]
        FPGenerator = rdFingerprintGenerator.GetAtomPairGenerator(
            minDistance=FPParamsInfo["MinLength"],
            maxDistance=FPParamsInfo["MaxLength"],
            includeChirality=FPParamsInfo["UseChirality"],
            use2D=FPParamsInfo["Use2D"],
            fpSize=FPParamsInfo["FPSize"],
        )
    elif re.match("^Morgan$", SpecifiedFingerprints, re.I):
        FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["Morgan"]
        FPGenerator = rdFingerprintGenerator.GetMorganGenerator(
            radius=FPParamsInfo["Radius"],
            includeChirality=FPParamsInfo["UseChirality"],
            useBondTypes=FPParamsInfo["UseBondTypes"],
            includeRingMembership=FPParamsInfo["UseRingMembership"],
            fpSize=FPParamsInfo["FPSize"],
        )
    elif re.match("^MorganFeatures$", SpecifiedFingerprints, re.I):
        FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["MorganFeatures"]
        # Feature based Morgan fingerprints require the feature atom invariants
        # generator. GetMorganAtomInvGen() provides the standard connectivity
        # invariants and doesn't generate feature based fingerprints.
        FPGenerator = rdFingerprintGenerator.GetMorganGenerator(
            radius=FPParamsInfo["Radius"],
            includeChirality=FPParamsInfo["UseChirality"],
            useBondTypes=FPParamsInfo["UseBondTypes"],
            includeRingMembership=FPParamsInfo["UseRingMembership"],
            fpSize=FPParamsInfo["FPSize"],
            atomInvariantsGenerator=rdFingerprintGenerator.GetMorganFeatureAtomInvGen(),
        )
    elif re.match("^PathLength$", SpecifiedFingerprints, re.I):
        FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["PathLength"]
        FPGenerator = rdFingerprintGenerator.GetRDKitFPGenerator(
            minPath=FPParamsInfo["MinPath"],
            maxPath=FPParamsInfo["MaxPath"],
            useHs=FPParamsInfo["UseExplicitHs"],
            branchedPaths=FPParamsInfo["UseBranchedPaths"],
            useBondOrder=FPParamsInfo["UseBondOrder"],
            fpSize=FPParamsInfo["FPSize"],
            numBitsPerFeature=FPParamsInfo["BitsPerHash"],
        )
    elif re.match("^TopologicalTorsions$", SpecifiedFingerprints, re.I):
        FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["TopologicalTorsions"]
        FPGenerator = rdFingerprintGenerator.GetTopologicalTorsionGenerator(
            includeChirality=FPParamsInfo["UseChirality"], fpSize=FPParamsInfo["FPSize"]
        )
    else:
        # Include the invalid value in the message...
        MiscUtil.PrintError(
            'The value specified, %s, for option "--fingerprints" is not valid.' % SpecifiedFingerprints
        )

    return FPGenerator
482
483
def ReadSynthonSpaceFile(Infile):
    """Read a synthon space from a binary database or a text file.

    A file with "spc" extension is read as a binary database file. Any
    other file is read as a text file. The time taken is reported.

    Arguments:
        Infile (str): Name of the synthon space file.

    Returns:
        object: RDKit SynthonSpace object.
    """

    MiscUtil.PrintInfo("\nReading synthon space file %s..." % Infile)
    SynthonSpace = rdSynthonSpaceSearch.SynthonSpace()

    TimerStart = time.perf_counter()

    try:
        IsBinaryDBFile = MiscUtil.CheckFileExt(Infile, "spc")
        if not IsBinaryDBFile:
            SynthonSpace.ReadTextFile(Infile)
        else:
            SynthonSpace.ReadDBFile(Infile)
    except Exception as ErrMsg:
        MiscUtil.PrintInfo("")
        MiscUtil.PrintError("Failed to read synthon space file:\n%s\n" % (ErrMsg))

    MiscUtil.PrintInfo("Total time: %.2f secs" % (time.perf_counter() - TimerStart))

    return SynthonSpace
505
506
def WriteSynthonSpaceBinaryFile(SynthonSpace, Outfile):
    """Write the synthon space to a binary database file.

    The time taken is reported.

    Arguments:
        SynthonSpace: Synthon space object providing WriteDBFile().
        Outfile (str): Name of the output file.

    Returns:
        object: The synthon space object passed in.
    """

    MiscUtil.PrintInfo("\nWriting synthon space file %s..." % Outfile)
    TimerStart = time.perf_counter()

    try:
        SynthonSpace.WriteDBFile(Outfile)
    except Exception as ErrMsg:
        MiscUtil.PrintInfo("")
        MiscUtil.PrintError("Failed to write synthon space file:\n%s\n" % (ErrMsg))

    ElapsedSecs = time.perf_counter() - TimerStart
    MiscUtil.PrintInfo("Total time: %.2f secs" % ElapsedSecs)

    return SynthonSpace
523
524
def ListSynthonSpaceFingerprintsType(SynthonSpace):
    """Print the fingerprints type and information for a synthon space.

    Arguments:
        SynthonSpace: Synthon space object passed to
            GetSynthonFingerprintsInfo().
    """

    FPType, FPInfo = GetSynthonFingerprintsInfo(SynthonSpace)

    if FPInfo is not None:
        Message = "\nFingerprints type: %s\nFingerprints Info: %s" % (FPType, FPInfo)
    else:
        # No fingerprints information is available...
        Message = "\nFingerprints type: %s" % (FPInfo)

    MiscUtil.PrintInfo(Message)
534
535
def GetSynthonFingerprintsInfo(SynthonSpace):
    """Get the fingerprints type name and information for a synthon space.

    The type name is derived by looking for known argument names, ignoring
    case, in the string returned by GetSynthonFingerprintType().

    Arguments:
        SynthonSpace: Synthon space object providing
            GetSynthonFingerprintType().

    Returns:
        tuple: (FPType, FPInfo). Both values are None when the fingerprints
            information is empty. FPType is "Unknown" when no known
            argument name is found.
    """

    FPInfo = SynthonSpace.GetSynthonFingerprintType()
    if len(FPInfo) == 0:
        return (None, None)

    # Argument names paired with fingerprints type names, checked in order...
    KnownTypes = (
        ("AtomPairArguments", "AtomPairs"),
        ("MorganArguments", "Morgan or MorganFeatures"),
        ("RDKitFPArguments", "PathLength"),
        ("TopologicalTorsionArguments", "TopologicalTorsions"),
    )

    for ArgumentsName, TypeName in KnownTypes:
        if re.search(ArgumentsName, FPInfo, re.I):
            return (TypeName, FPInfo)

    return ("Unknown", FPInfo)
555
556
def SetupMoleculeWriter(SIngleOutFile, MolCount=0):
    """Set up a writer for hit molecules.

    Arguments:
        SIngleOutFile (bool): Use the specified output file name. Otherwise,
            the file name is built from the output file root, suffix,
            MolCount and extension.
        MolCount (int): Number used in the name of a per query output file.

    Returns:
        object: Text file handle for text output files. Otherwise, a
            molecule writer returned by RDKitUtil.MoleculesWriter().
    """

    IsTextOutFile = OptionsInfo["TextOutFileMode"]
    Delim = OptionsInfo["TextOutFileDelim"]
    WriteTitleLine = OptionsInfo["TextOutFileTitleLine"]

    if not SIngleOutFile:
        Outfile = "%s_%s%s.%s" % (
            OptionsInfo["OutFileRoot"],
            OptionsInfo["OutFileSuffix"],
            MolCount,
            OptionsInfo["OutFileExt"],
        )
    else:
        Outfile = OptionsInfo["Outfile"]

    if not IsTextOutFile:
        Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
    else:
        Writer = open(Outfile, "w")

    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)

    # Text files optionally start with a header line...
    if IsTextOutFile and WriteTitleLine:
        WriteTextFileHeaderLine(Writer, Delim)

    return Writer
586
587
def WriteTextFileHeaderLine(Writer, TextOutFileDelim):
    """Write a header line to a text output file including SMILES file.

    The column names depend on the search mode. An empty line is written
    when none of the search modes is set.

    Arguments:
        Writer: Object providing write().
        TextOutFileDelim (str): Delimiter placed between column names.
    """

    Header = ""
    if OptionsInfo["SubstructureSearchMode"]:
        Header = TextOutFileDelim.join(["SMILES", "Name", "QueryPatternNumber"])
    elif OptionsInfo["SimilaritySearchMode"] or OptionsInfo["RascalSimilaritySearchMode"]:
        # Both similarity search modes share the same columns...
        Header = TextOutFileDelim.join(["SMILES", "Name", "Similarity", "QueryMolName"])

    Writer.write("%s\n" % Header)
600
601
def WriteMolecules(Writer, QueryMolName, HitMols):
    """Write hit molecules for similarity and substructure search.

    For text output files, a line containing SMILES, molecule name,
    similarity (similarity search modes only) and query name is written for
    each hit molecule. Otherwise, the query name and the formatted
    similarity are set as properties on each hit molecule before it is
    written out using the molecule writer.

    Arguments:
        Writer: Text file handle or molecule writer providing write().
        QueryMolName (str): Name of the query molecule or pattern.
        HitMols (list or None): Hit molecules. Nothing is written for None,
            which is what GetSynthonSpaceHitMolecules() returns for a query
            without any hits.
    """

    # A query without any hits has no molecules to iterate over...
    if HitMols is None:
        return

    RascalSimilaritySearchMode = OptionsInfo["RascalSimilaritySearchMode"]
    SimilaritySearchMode = OptionsInfo["SimilaritySearchMode"]
    SubstructureSearchMode = OptionsInfo["SubstructureSearchMode"]

    TextOutFileMode = OptionsInfo["TextOutFileMode"]
    TextOutFileDelim = OptionsInfo["TextOutFileDelim"]

    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]

    SMILESIsomeric = OptionsInfo["OutfileParams"]["SMILESIsomeric"]
    SMILESKekulize = OptionsInfo["OutfileParams"]["SMILESKekulize"]

    AnySimilaritySearchMode = SimilaritySearchMode or RascalSimilaritySearchMode

    for HitMolCount, HitMol in enumerate(HitMols, 1):
        if TextOutFileMode:
            # Write out text file including SMILES file...
            LineWords = [
                Chem.MolToSmiles(HitMol, isomericSmiles=SMILESIsomeric, kekuleSmiles=SMILESKekulize),
                RDKitUtil.GetMolName(HitMol, HitMolCount),
            ]

            if AnySimilaritySearchMode:
                LineWords.append("%.2f" % float(HitMol.GetProp("Similarity")))

            LineWords.append(QueryMolName)

            Writer.write("%s\n" % TextOutFileDelim.join(LineWords))
        else:
            # Write out SD file...
            if AnySimilaritySearchMode:
                HitMol.SetProp("QueryMolName", QueryMolName)

                # Store the similarity value using two decimal places...
                HitMol.SetProp("Similarity", "%.2f" % float(HitMol.GetProp("Similarity")))
            elif SubstructureSearchMode:
                HitMol.SetProp("QueryPatternNum", QueryMolName)

            if Compute2DCoords:
                AllChem.Compute2DCoords(HitMol)
            Writer.write(HitMol)
649
650
def SetupOutfileWriters():
    """Set up writers for hit molecules and hits info.

    A writer for hit molecules is only set up when hits are not merely
    being counted and a single output file is being generated. The hits
    info writer is always set up.

    Returns:
        tuple: (SingleOutFileWriter, HitsInfoWriter). SingleOutFileWriter
            is None unless a single output file is being written.
    """

    SingleOutFileWriter = None

    if OptionsInfo["CountHitsMode"]:
        MiscUtil.PrintInfo(
            "\nSkipping generation of output files containing hit structures and only counting hits (BuildHits: No)..."
        )
    elif OptionsInfo["SingleOutFileMode"]:
        SingleOutFileWriter = SetupMoleculeWriter(OptionsInfo["SingleOutFileMode"])
        MiscUtil.PrintInfo("\nGenerating output file %s..." % OptionsInfo["Outfile"])
    else:
        # Writers for multiple output files are set up for each query later...
        MiscUtil.PrintInfo(
            "\nGenerating output file(s) %s_%s*.%s..."
            % (OptionsInfo["OutFileRoot"], OptionsInfo["OutFileSuffix"], OptionsInfo["OutFileExt"])
        )

    return (SingleOutFileWriter, SetupHitsInfoWriter())
673
674
def SetupHitsInfoWriter():
    """Open the hits info output file and write out its header line.

    The name of the first column depends on the search mode. A "HitsCount"
    column is included unless only hits are being counted.

    Returns:
        file: Open file handle for the hits info output file.
    """

    OutFile = OptionsInfo["HitsInfoOutFile"]
    Delim = OptionsInfo["HitsInfoOutFileDelim"]

    MiscUtil.PrintInfo("\nGenerating output file %s..." % OutFile)

    Writer = open(OutFile, "w")

    # Name of the column identifying the query...
    if OptionsInfo["SubstructureSearchMode"]:
        MolIDColName = "QueryPatternNumber"
    elif OptionsInfo["SimilaritySearchMode"] or OptionsInfo["RascalSimilaritySearchMode"]:
        MolIDColName = "QueryMolName"
    else:
        MolIDColName = "MolID"

    ColNames = [MolIDColName]
    if not OptionsInfo["CountHitsMode"]:
        ColNames.append("HitsCount")
    ColNames.append("MaxPossibleHits")

    Writer.write("%s\n" % Delim.join(ColNames))

    return Writer
702
703
def WriteHitsInfo(Writer, HitsInfo):
    """Write a line of hits information to the hits info output file.

    Arguments:
        Writer: Object providing write().
        HitsInfo (list): Values to write out. Each value is converted to a
            string and the values are joined using the hits info delimiter.
    """

    Words = ["%s" % Value for Value in HitsInfo]
    Delim = OptionsInfo["HitsInfoOutFileDelim"]

    Writer.write("%s\n" % Delim.join(Words))
713
714
def ProcessFingerprintsParameters():
    """Set up default fingerprints parameters and process specified values.

    The defaults are set up first, followed by processing of the specified
    fingerprints name and then the specified fingerprints parameters.
    """

    Steps = (
        SetupFingerprintsNamesAndParameters,
        ProcessSpecifiedFingerprintsName,
        ProcessSpecifiedFingerprintsParameters,
    )
    for Step in Steps:
        Step()
722
723
def SetupFingerprintsNamesAndParameters():
    """Set up supported fingerprints names and their default parameters.

    The names are stored in OptionsInfo["FingerprintsNames"] and the default
    parameter values for each name in OptionsInfo["FingerprintsParamsInfo"].
    """

    # Both types of Morgan fingerprints start with the same default values...
    MorganDefaults = {
        "Radius": 2,
        "UseChirality": False,
        "UseBondTypes": True,
        "UseRingMembership": True,
        "FPSize": 2048,
    }

    ParamsInfo = {
        "AtomPairs": {
            "MinLength": 1,
            "MaxLength": 30,
            "UseChirality": False,
            "Use2D": True,
            "FPSize": 2048,
        },
        "Morgan": dict(MorganDefaults),
        "MorganFeatures": dict(MorganDefaults),
        "PathLength": {
            "MinPath": 1,
            "MaxPath": 7,
            "UseExplicitHs": True,
            "UseBranchedPaths": True,
            "UseBondOrder": True,
            "FPSize": 2048,
            "BitsPerHash": 2,
        },
        "TopologicalTorsions": {"UseChirality": False, "FPSize": 2048},
    }

    OptionsInfo["FingerprintsNames"] = ["AtomPairs", "Morgan", "MorganFeatures", "PathLength", "TopologicalTorsions"]
    OptionsInfo["FingerprintsParamsInfo"] = ParamsInfo
761
762
def ProcessSpecifiedFingerprintsName():
    """Validate the specified fingerprints name and store its canonical form.

    The name in OptionsInfo["Fingerprints"] is compared, ignoring case,
    against the supported names. The matching supported name is stored in
    OptionsInfo["SpecifiedFingerprints"].
    """

    # Map lowercase names to the supported fingerprints names...
    NamesMap = {Name.lower(): Name for Name in OptionsInfo["FingerprintsNames"]}

    SpecifiedName = OptionsInfo["Fingerprints"].lower()
    if SpecifiedName not in NamesMap:
        MiscUtil.PrintError(
            'The fingerprints name, %s, specified using "-f, --fingerprints" option is not a valid name.'
            % (OptionsInfo["Fingerprints"])
        )

    OptionsInfo["SpecifiedFingerprints"] = NamesMap[SpecifiedName]
781
782
def ProcessSpecifiedFingerprintsParameters():
    """Process fingerprints parameters specified using "--fingerprintsParams".

    The value in OptionsInfo["FingerprintsParams"] is a comma delimited list
    of parameter name and value pairs. Nothing is processed for a value of
    "auto". Parameter names are matched, ignoring case, against the names
    available for the specified fingerprints. Boolean parameters accept
    Yes, No, True or False. All other parameters must be integers greater
    than zero. Valid values replace the defaults in
    OptionsInfo["FingerprintsParamsInfo"] for the specified fingerprints.
    """

    if re.match("^auto$", OptionsInfo["FingerprintsParams"], re.I):
        # Nothing to process...
        return

    SpecifiedFingerprintsName = OptionsInfo["SpecifiedFingerprints"]

    # Parse specified fingerprints parameters...
    FingerprintsParams = re.sub(" ", "", OptionsInfo["FingerprintsParams"])
    if not FingerprintsParams:
        MiscUtil.PrintError(
            'No valid parameter name and value pairs specified using "--fingerprintsParams" option corrresponding to fingerprints %s.'
            % (SpecifiedFingerprintsName)
        )

    FingerprintsParamsWords = FingerprintsParams.split(",")
    if len(FingerprintsParamsWords) % 2:
        MiscUtil.PrintError(
            'The number of comma delimited paramater names and values, %d, specified using "--fingerprintsParams" option must be an even number.'
            % (len(FingerprintsParamsWords))
        )

    # Setup canonical parameter names for specified fingerprints...
    ParamsInfo = OptionsInfo["FingerprintsParamsInfo"][SpecifiedFingerprintsName]
    ValidParamNames = sorted(ParamsInfo)
    CanonicalParamNamesMap = {ParamName.lower(): ParamName for ParamName in ValidParamNames}

    # Validate and set parameter names and values...
    for Name, Value in zip(FingerprintsParamsWords[0::2], FingerprintsParamsWords[1::2]):
        CanonicalName = Name.lower()
        if CanonicalName not in CanonicalParamNamesMap:
            MiscUtil.PrintError(
                'The parameter name, %s, specified using "--fingerprintsParams" option for fingerprints, %s, is not a valid name. Supported parameter names: %s'
                % (Name, SpecifiedFingerprintsName, " ".join(ValidParamNames))
            )

        ParamName = CanonicalParamNamesMap[CanonicalName]
        if re.match(
            "^(UseChirality|Use2D|UseBondTypes|UseRingMembership|UseExplicitHs|UseBranchedPaths|UseBondOrder)$",
            ParamName,
            re.I,
        ):
            if not re.match("^(Yes|No|True|False)$", Value, re.I):
                MiscUtil.PrintError(
                    'The parameter value, %s, specified using "--fingerprintsParams" option for fingerprints, %s, is not a valid value. Supported values: Yes No True False'
                    % (Value, SpecifiedFingerprintsName)
                )
            ParamValue = bool(re.match("^(Yes|True)$", Value, re.I))
        else:
            # Report a value which is not an integer using the same error
            # message as an out of range value instead of failing with an
            # unhandled exception...
            try:
                ParamValue = int(Value)
            except ValueError:
                ParamValue = None

            if ParamValue is None or ParamValue <= 0:
                MiscUtil.PrintError(
                    'The parameter value, %s, specified using "--fingerprintsParams" option for fingerprints, %s, is not a valid value. Supported values: > 0'
                    % (Value, SpecifiedFingerprintsName)
                )

        # Set value...
        ParamsInfo[ParamName] = ParamValue
850
851
def ProcessOutfileParameters():
    """Process output file related options and store them in OptionsInfo.

    Sets up the output file name and parameters, the single or multiple
    output files mode, the root, extension and suffix used to name multiple
    output files, the hits info file name and the settings for writing text
    output files. For multiple output files, an error is reported when
    matching files already exist and "--overwrite" option is not specified.
    """

    Mode = OptionsInfo["Mode"]

    OptionsInfo["Outfile"] = Options["--outfile"]
    OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters(
        "--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]
    )

    # OutfileMode is only used for similarity and substructure search...
    OptionsInfo["OutFileMode"] = Options["--outfileMode"]
    OptionsInfo["SingleOutFileMode"] = bool(re.match("^SingleFile$", Options["--outfileMode"], re.I))

    FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
    OptionsInfo["OutFileRoot"] = FileName
    OptionsInfo["OutFileExt"] = FileExt

    # Suffix placed before the query number in names of multiple output
    # files. RASCAL similarity search processes query molecules in the same
    # way as similarity search and uses the same suffix...
    OutFileSuffix = ""
    if re.match("^SubstructureSearch$", Mode, re.I):
        OutFileSuffix = "Pattern"
    elif re.match("^(SimilaritySearch|RascalSimilaritySearch)$", Mode, re.I):
        OutFileSuffix = "Mol"
    OptionsInfo["OutFileSuffix"] = OutFileSuffix

    OptionsInfo["HitsInfoOutFile"] = "%s_HitCount.csv" % OptionsInfo["OutFileRoot"]
    OptionsInfo["HitsInfoOutFileDelim"] = ","

    # Text output settings apply to all three search modes. RASCAL similarity
    # search must be included here for the header line and rows written for
    # it in text output files to be reachable...
    TextOutFileMode, TextOutFileDelim, TextOutFileTitleLine = [None] * 3
    if re.match("^(SimilaritySearch|RascalSimilaritySearch|SubstructureSearch)$", Mode, re.I):
        TextOutFileMode = False
        TextOutFileDelim = ""
        TextOutFileTitleLine = True

        if MiscUtil.CheckFileExt(Options["--outfile"], "csv"):
            TextOutFileMode = True
            TextOutFileDelim = ","
        elif MiscUtil.CheckFileExt(Options["--outfile"], "tsv txt"):
            TextOutFileMode = True
            TextOutFileDelim = "\t"
        elif MiscUtil.CheckFileExt(Options["--outfile"], "smi"):
            TextOutFileMode = True
            TextOutFileDelim = OptionsInfo["OutfileParams"]["SMILESDelimiter"]
            TextOutFileTitleLine = OptionsInfo["OutfileParams"]["SMILESTitleLine"]

    OptionsInfo["TextOutFileMode"] = TextOutFileMode
    OptionsInfo["TextOutFileDelim"] = TextOutFileDelim
    OptionsInfo["TextOutFileTitleLine"] = TextOutFileTitleLine

    # Avoid silently overwriting previously generated multiple output files...
    if not OptionsInfo["SingleOutFileMode"]:
        FilesSpec = "%s_%s*.%s" % (OptionsInfo["OutFileRoot"], OptionsInfo["OutFileSuffix"], OptionsInfo["OutFileExt"])
        FileNames = MiscUtil.ExpandFileNames(FilesSpec)
        if len(FileNames) and not Options["--overwrite"]:
            MiscUtil.PrintError(
                'The output files, %s, corresponding to output file specified, %s, for option "-o, --outfile" already exist. Use option "--ov" or "--overwrite" and try again.'
                % (FilesSpec, OptionsInfo["Outfile"])
            )
913
914
def ProcessRascalSearchParametersOption():
    """Process "--rascalSearchParams" option for RASCAL similarity search.

    The default parameter values are first synchronized with the values
    available in a freshly created rdRascalMCES.RascalOptions object. The
    name and value pairs specified by the user are then processed using
    MiscUtil.ProcessOptionNameValuePairParameters, range checked, and copied
    over to a new RascalOptions object.

    The processed parameters are stored in OptionsInfo["RascalSearchParams"]
    and the RDKit object in OptionsInfo["RDKitRascalSearchParams"].
    """

    OptionName = "--rascalSearchParams"
    OptionValue = Options[OptionName]

    # Parameter name mapped to its type and default value...
    DefaultsInfo = {
        "AllBestMCESs": ["bool", False],
        "CompleteAromaticRings": ["bool", True],
        "CompleteSmallestRings": ["bool", False],
        "ExactConnectionsMatch": ["bool", False],
        "IgnoreAtomAromaticity": ["bool", True],
        "IgnoreBondOrders": ["bool", False],
        "MaxBondMatchPairs": ["int", 1000],
        "MaxFragSeparation": ["int", -1],
        "MinCliqueSize": ["int", 0],
        "MinFragSize": ["int", -1],
        "ReturnEmptyMCES": ["bool", False],
        "RingMatchesRingOnly": ["bool", False],
        "SimilarityThreshold": ["float", 0.7],
        "SingleLargestFrag": ["bool", False],
        "Timeout": ["int", 60],
    }

    # Synchronize default values with the ones reported by RDKit...
    RDKitDefaults = rdRascalMCES.RascalOptions()
    for Name, TypeAndValue in DefaultsInfo.items():
        RDKitName = LowercaseFirstLetter(Name)
        if not hasattr(RDKitDefaults, RDKitName):
            MiscUtil.PrintWarning(
                "The RASCAL search parameter, %s, is not available in RDKit. Ignoring parameter..." % Name
            )
            continue
        TypeAndValue[1] = getattr(RDKitDefaults, RDKitName)

    Params = MiscUtil.ProcessOptionNameValuePairParameters(OptionName, OptionValue, DefaultsInfo)

    # Strictly positive value...
    Name = "MaxBondMatchPairs"
    Value = Params[Name]
    if Value <= 0:
        MiscUtil.PrintError(
            'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
            % (Value, Name, OptionName)
        )

    # Non-negative values...
    for Name in ("MinCliqueSize", "SimilarityThreshold"):
        Value = Params[Name]
        if Value < 0:
            MiscUtil.PrintError(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: >= 0\n'
                % (Value, Name, OptionName)
            )

    # Similarity threshold additionally has an upper bound...
    Name = "SimilarityThreshold"
    Value = Params[Name]
    if Value > 1:
        MiscUtil.PrintError(
            'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: <= 1\n'
            % (Value, Name, OptionName)
        )

    # Either -1 or a strictly positive value...
    for Name in ("MaxFragSeparation", "MinFragSize", "Timeout"):
        Value = Params[Name]
        if Value != -1 and Value <= 0:
            MiscUtil.PrintError(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: -1 or > 0\n'
                % (Value, Name, OptionName)
            )

    # Transfer processed values to a new RDKit options object. The RDKit
    # attribute names start with a lowercase letter...
    RDKitParams = rdRascalMCES.RascalOptions()
    for Name, Value in Params.items():
        RDKitName = LowercaseFirstLetter(Name)
        if not hasattr(RDKitParams, RDKitName):
            MiscUtil.PrintWarning(
                "The RASCAL searh parameter, %s, is not available in RDKit. Ignoring parameter..." % Name
            )
            continue
        setattr(RDKitParams, RDKitName, Value)

    OptionsInfo["RascalSearchParams"] = Params
    OptionsInfo["RDKitRascalSearchParams"] = RDKitParams
1001
1002
def ProcessSubstructureMatchParametersOption():
    """Process "--substructureMatchParams" option.

    The default parameter values are first synchronized with the values
    available in a freshly created Chem.SubstructMatchParameters object. The
    name and value pairs specified by the user are then processed using
    MiscUtil.ProcessOptionNameValuePairParameters, range checked, and copied
    over to a new SubstructMatchParameters object.

    The processed parameters are stored in
    OptionsInfo["SubstructureMatchParams"] and the RDKit object in
    OptionsInfo["RDKitSubstructureMatchParams"].
    """

    OptionName = "--substructureMatchParams"
    OptionValue = Options[OptionName]

    # Parameter name mapped to its type and default value...
    DefaultsInfo = {
        "AromaticMatchesConjugated": ["bool", False],
        "MaxMatches": ["int", 1000],
        "MaxRecursiveMatches": ["int", 1000],
        "RecursionPossible": ["bool", True],
        "SpecifiedStereoQueryMatchesUnspecified": ["bool", False],
        "Uniquify": ["bool", True],
        "UseChirality": ["bool", False],
        "UseEnhancedStereo": ["bool", False],
        "UseGenericMatchers": ["bool", False],
    }

    # Synchronize default values with the ones reported by RDKit...
    RDKitDefaults = Chem.SubstructMatchParameters()
    for Name, TypeAndValue in DefaultsInfo.items():
        RDKitName = LowercaseFirstLetter(Name)
        if not hasattr(RDKitDefaults, RDKitName):
            MiscUtil.PrintWarning(
                "The substructure match parameter, %s, is not available in RDKit. Ignoring parameter..." % Name
            )
            continue
        TypeAndValue[1] = getattr(RDKitDefaults, RDKitName)

    Params = MiscUtil.ProcessOptionNameValuePairParameters(OptionName, OptionValue, DefaultsInfo)

    # Strictly positive values...
    for Name in ("MaxMatches", "MaxRecursiveMatches"):
        Value = Params[Name]
        if Value <= 0:
            MiscUtil.PrintError(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
                % (Value, Name, OptionName)
            )

    # Transfer processed values to a new RDKit parameters object. The RDKit
    # attribute names start with a lowercase letter...
    RDKitParams = Chem.SubstructMatchParameters()
    for Name, Value in Params.items():
        RDKitName = LowercaseFirstLetter(Name)
        if not hasattr(RDKitParams, RDKitName):
            MiscUtil.PrintWarning(
                "The substructure match parameter, %s, is not available in RDKit. Ignoring parameter..." % Name
            )
            continue
        setattr(RDKitParams, RDKitName, Value)

    OptionsInfo["SubstructureMatchParams"] = Params
    OptionsInfo["RDKitSubstructureMatchParams"] = RDKitParams
1061
1062
def ProcessSynthonSearchParamatersOption():
    """Process option for synthon search parameters.

    The default parameter values are first synchronized with the values
    available in a freshly created rdSynthonSpaceSearch.SynthonSpaceSearchParams
    object. The name and value pairs specified using "--synthonSearchParams"
    are then processed using MiscUtil.ProcessOptionNameValuePairParameters,
    range checked, and copied over to a new SynthonSpaceSearchParams object.

    Supported ranges:

        ApproxSimilarityAdjuster, FragSimilarityAdjuster, HitStart: >= 0
        SimilarityCutoff: >= 0 and <= 1
        MaxNumFrags: > 0
        TimeOut: >= 0. A value of 0 is accepted as it is documented in the
            usage text of this script to imply no timeout.
        MaxHits, RandomSeed: -1 or > 0
        NumThreads: Any value. A warning is issued for an absolute value
            greater than the number of CPUs returned by mp.cpu_count().

    The processed parameters are stored in OptionsInfo["SynthonSearchParams"]
    and the RDKit object in OptionsInfo["RDKitSynthonSearchParams"]. In
    addition, OptionsInfo["CountHitsMode"] is set to True when BuildHits is
    turned off.
    """

    ParamsOptionName = "--synthonSearchParams"
    ParamsOptionValue = Options[ParamsOptionName]

    ParamsDefaultInfo = {
        "ApproxSimilarityAdjuster": ["float", 0.1],
        "BuildHits": ["bool", True],
        "FragSimilarityAdjuster": ["float", 0.1],
        "HitStart": ["int", 0],
        "MaxHits": ["int", 1000],
        "MaxNumFrags": ["int", 100000],
        "NumThreads": ["int", 1],
        "RandomSample": ["bool", False],
        "RandomSeed": ["int", -1],
        "SimilarityCutoff": ["float", 0.5],
        "TimeOut": ["int", 600],
    }

    # Update default values to match RDKit default values...
    RDKitSynthonSearchParams = rdSynthonSpaceSearch.SynthonSpaceSearchParams()
    for ParamName in ParamsDefaultInfo:
        RDKitParamName = LowercaseFirstLetter(ParamName)
        if hasattr(RDKitSynthonSearchParams, RDKitParamName):
            ParamsDefaultInfo[ParamName][1] = getattr(RDKitSynthonSearchParams, RDKitParamName)
        else:
            MiscUtil.PrintWarning(
                "The synthon space search paramater, %s, is not available in RDKit. Ignoring parameter..." % ParamName
            )

    SynthonSearchParams = MiscUtil.ProcessOptionNameValuePairParameters(
        ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo
    )

    for ParamName in ["ApproxSimilarityAdjuster", "FragSimilarityAdjuster", "SimilarityCutoff", "HitStart"]:
        ParamValue = SynthonSearchParams[ParamName]
        if ParamValue < 0:
            MiscUtil.PrintError(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: >= 0\n'
                % (ParamValue, ParamName, ParamsOptionName)
            )
        if ParamName == "SimilarityCutoff" and ParamValue > 1:
            MiscUtil.PrintError(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: <= 1\n'
                % (ParamValue, ParamName, ParamsOptionName)
            )

    ParamName = "MaxNumFrags"
    ParamValue = SynthonSearchParams[ParamName]
    if ParamValue <= 0:
        MiscUtil.PrintError(
            'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
            % (ParamValue, ParamName, ParamsOptionName)
        )

    # A value of 0 is documented to imply no timeout and must be allowed...
    ParamName = "TimeOut"
    ParamValue = SynthonSearchParams[ParamName]
    if ParamValue < 0:
        MiscUtil.PrintError(
            'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: >= 0\n'
            % (ParamValue, ParamName, ParamsOptionName)
        )

    for ParamName in ["MaxHits", "RandomSeed"]:
        ParamValue = SynthonSearchParams[ParamName]
        if not (ParamValue == -1 or ParamValue > 0):
            MiscUtil.PrintError(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: -1 or > 0\n'
                % (ParamValue, ParamName, ParamsOptionName)
            )

    # Warn about number of threads exceeding available CPUs. No check is
    # needed for a value of 0...
    ParamName = "NumThreads"
    ParamValue = SynthonSearchParams[ParamName]
    NumCPUs = mp.cpu_count()
    if ParamValue > 0:
        if ParamValue > NumCPUs:
            MiscUtil.PrintWarning(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option is greater than number of CPUs, %s, returned by mp.cpu_count().'
                % (ParamValue, ParamName, ParamsOptionName, NumCPUs)
            )
    elif ParamValue < 0:
        if abs(ParamValue) > NumCPUs:
            MiscUtil.PrintWarning(
                'The absolute parameter value, %s, specified for parameter name, %s, using "%s" option is greater than number of CPUs, %s, returned by mp.cpu_count().'
                % (abs(ParamValue), ParamName, ParamsOptionName, NumCPUs)
            )

    # Setup RDKit object for synthon space search parameters...
    RDKitSynthonSearchParams = rdSynthonSpaceSearch.SynthonSpaceSearchParams()
    for ParamName, ParamValue in SynthonSearchParams.items():
        # Convert first letter to lower case for RDKit param name and set its value...
        RDKitParamName = LowercaseFirstLetter(ParamName)
        if hasattr(RDKitSynthonSearchParams, RDKitParamName):
            setattr(RDKitSynthonSearchParams, RDKitParamName, ParamValue)
        else:
            MiscUtil.PrintWarning(
                "The synthon space search paramater, %s, is not available in RDKit. Ignoring parameter..." % ParamName
            )

    # Hits are only counted, and not built, when BuildHits is turned off...
    OptionsInfo["CountHitsMode"] = not SynthonSearchParams["BuildHits"]

    OptionsInfo["SynthonSearchParams"] = SynthonSearchParams
    OptionsInfo["RDKitSynthonSearchParams"] = RDKitSynthonSearchParams
1162
1163
def LowercaseFirstLetter(Text):
    """Return Text with its first character converted to lowercase.

    A value of None or an empty Text is returned unchanged.

    Arguments:
        Text (str): Text to process.

    Returns:
        str: Text starting with a lowercase character.
    """

    if Text is not None and len(Text) > 0:
        FirstLetter, Remainder = Text[0], Text[1:]
        return FirstLetter.lower() + Remainder

    return Text
1171
1172
def ProcessQueryPatternOption():
    """Process "--queryPattern" option.

    A value of none, in any letter case, implies no query pattern. Otherwise,
    the value is split on whitespace and a pattern molecule is created for
    each SMARTS pattern using Chem.MolFromSmarts. An error is reported using
    MiscUtil.PrintError for a pattern which fails to generate a molecule.

    The specified pattern string is stored in OptionsInfo["QueryPattern"] and
    the list of pattern molecules in OptionsInfo["QueryPatternMols"]. Both
    values are None when no query pattern is specified.
    """

    PatternsSpec = Options["--queryPattern"]
    if re.match("^None$", PatternsSpec, re.I):
        PatternsSpec = None

    PatternMols = None
    if PatternsSpec is not None:
        PatternMols = []
        for SMARTS in PatternsSpec.split():
            Mol = Chem.MolFromSmarts(SMARTS)
            if Mol is None:
                MiscUtil.PrintError(
                    'The value specified, %s, using option "--queryPattern" is not a valid SMARTS: Failed to create pattern molecule'
                    % (SMARTS)
                )
            PatternMols.append(Mol)

    OptionsInfo["QueryPattern"] = PatternsSpec
    OptionsInfo["QueryPatternMols"] = PatternMols
1193
1194
def ProcessOptions():
    """Process and validate command line arguments and options.

    The options are validated using ValidateOptions and the processed values
    are stored in OptionsInfo for the following keys: Mode,
    RascalSimilaritySearchMode, SimilaritySearchMode, SubstructureSearchMode,
    Fingerprints, FingerprintsParams, Infile, Overwrite, QueryFile, and
    QueryFileParams. The rest of the option values are processed and stored
    by the individual Process* functions called here.

    A value of none for "--queryFile" is recognized in any letter case, to
    match its handling during option validation, and leads to None values for
    both QueryFile and QueryFileParams.
    """

    MiscUtil.PrintInfo("Processing options...")

    # Validate options...
    ValidateOptions()

    Mode = Options["--mode"]
    OptionsInfo["Mode"] = Mode
    OptionsInfo["RascalSimilaritySearchMode"] = bool(re.match("^RASCALSimilaritySearch$", Mode, re.I))
    OptionsInfo["SimilaritySearchMode"] = bool(re.match("^SimilaritySearch$", Mode, re.I))
    OptionsInfo["SubstructureSearchMode"] = bool(re.match("^SubstructureSearch$", Mode, re.I))

    OptionsInfo["Fingerprints"] = Options["--fingerprints"]

    OptionsInfo["FingerprintsParams"] = Options["--fingerprintsParams"]
    ProcessFingerprintsParameters()

    OptionsInfo["Infile"] = Options["--infile"]

    ProcessOutfileParameters()

    OptionsInfo["Overwrite"] = Options["--overwrite"]

    ProcessQueryPatternOption()

    # Ignore letter case for none to stay consistent with option validation,
    # which skips query file checks for any letter case of none...
    QueryFile = Options["--queryFile"]
    if re.match("^None$", QueryFile, re.I):
        QueryFile = None

    OptionsInfo["QueryFile"] = QueryFile
    if QueryFile is None:
        OptionsInfo["QueryFileParams"] = None
    else:
        OptionsInfo["QueryFileParams"] = MiscUtil.ProcessOptionInfileParameters(
            "--queryFileParams", Options["--queryFileParams"], QueryFile
        )

    ProcessRascalSearchParametersOption()

    ProcessSubstructureMatchParametersOption()
    ProcessSynthonSearchParamatersOption()
1237
1238
def RetrieveOptions():
    """Retrieve command line arguments and options.

    The command line is parsed by docopt using the usage text in
    _docoptUsage_ and the results are stored in the global Options. The
    current working directory is changed when a working directory is
    specified. For the examples option, the examples text is printed and the
    script exits with a status of 0.
    """

    global Options

    # Parse command line...
    Options = docopt(_docoptUsage_)

    # Switch over to the specified working directory...
    if Options["--workingdir"]:
        os.chdir(Options["--workingdir"])

    # Nothing more to do unless examples are requested...
    if not ("--examples" in Options and Options["--examples"]):
        return

    ExamplesText = MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)
    MiscUtil.PrintInfo(ExamplesText)
    sys.exit(0)
1255
1256
def ValidateOptions():
    """Validate option values.

    The values in Options for "--mode", "--fingerprints", "--infile",
    "--outfile", "--outfileMode", "--overwrite", "--queryPattern", and
    "--queryFile" are checked. Problems are reported using the MiscUtil
    validation functions and MiscUtil.PrintError.

    The "--outfileMode" value is validated before its first use. Its valid
    values are passed as a space delimited list, like the other text value
    checks in this function, to avoid treating the word "or" as a valid
    value.
    """

    Mode = Options["--mode"]
    Infile = Options["--infile"]
    Outfile = Options["--outfile"]
    QueryFile = Options["--queryFile"]

    MiscUtil.ValidateOptionTextValue(
        "-m, --mode",
        Mode,
        "FingerprintsGeneration BinaryDBFileGeneration LibraryEnumeration RASCALSimilaritySearch SimilaritySearch SubstructureSearch",
    )

    MiscUtil.ValidateOptionTextValue(
        "-f, --fingerprints",
        Options["--fingerprints"],
        "AtomPairs Morgan MorganFeatures PathLength TopologicalTorsions",
    )

    # Validate output file mode before relying on its value below...
    MiscUtil.ValidateOptionTextValue("--outfileMode", Options["--outfileMode"], "SingleFile MultipleFiles")

    MiscUtil.ValidateOptionFilePath("-i, --infile", Infile)
    MiscUtil.ValidateOptionFileExt("-i, --infile", Infile, "txt csv spc")

    MiscUtil.ValidateOptionFileExt("-o, --outfile", Outfile, "sdf sd smi csv tsv txt spc")
    if re.match("^SingleFile$", Options["--outfileMode"], re.I):
        MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Outfile, "--overwrite", Options["--overwrite"])
    MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Infile, "-o, --outfile", Outfile)

    # Output file formats are mode specific...
    if re.match("^(FingerprintsGeneration|BinaryDBFileGeneration)$", Mode, re.I):
        MiscUtil.ValidateOptionFileExt("-o, --outfile", Outfile, "spc")
        if not MiscUtil.CheckFileExt(Outfile, "spc"):
            MiscUtil.PrintError(
                'The file name specified , %s, for option "--outfile" is not valid during, %s, value of "--mode" option. Supported file formats: spc\n'
                % (Outfile, Mode)
            )
    elif re.match("^LibraryEnumeration$", Mode, re.I):
        if not MiscUtil.CheckFileExt(Outfile, "smi"):
            MiscUtil.PrintError(
                'The file name specified , %s, for option "--outfile" is not valid during, %s, value of "--mode" option. Supported file formats: smi\n'
                % (Outfile, Mode)
            )
    elif re.match("^(RASCALSimilaritySearch|SimilaritySearch|SubstructureSearch)$", Mode, re.I):
        if not MiscUtil.CheckFileExt(Outfile, "sdf sd smi csv tsv txt"):
            MiscUtil.PrintError(
                'The file name specified , %s, for option "--outfile" is not valid during, %s, value of "--mode" option. Supported file formats: sdf sd smi csv tsv txt\n'
                % (Outfile, Mode)
            )

    SubstructureSearchMode = bool(re.match("^SubstructureSearch$", Mode, re.I))

    QueryPattern = Options["--queryPattern"]
    QueryPatternSpecified = not re.match("^None$", QueryPattern, re.I)
    if SubstructureSearchMode and not QueryPatternSpecified:
        MiscUtil.PrintError(
            'You must specify a valid SMARTS pattern(s) for option "--queryPattern" during, SubstructureSearch, value of "-m, --mode" option.'
        )

    PatternMols = []
    if QueryPatternSpecified:
        for Pattern in QueryPattern.split():
            PatternMol = Chem.MolFromSmarts(Pattern)
            if PatternMol is None:
                MiscUtil.PrintError(
                    'The value specified, %s, using option "--queryPattern" is not a valid SMARTS: Failed to create pattern molecule'
                    % (Pattern)
                )
            PatternMols.append(PatternMol)

    # Catch a query pattern value containing no patterns, such as whitespace only...
    if SubstructureSearchMode and len(PatternMols) == 0:
        MiscUtil.PrintError(
            'You must specify a valid SMARTS pattern(s) for option "--queryPattern" during, SubstructureSearch, value of "-m, --mode" option.'
        )

    QueryFileSpecified = not re.match("^None$", QueryFile, re.I)
    if re.match("^(RASCALSimilaritySearch|SimilaritySearch)$", Mode, re.I) and not QueryFileSpecified:
        # Report the mode actually in use instead of a fixed mode name...
        MiscUtil.PrintError(
            'You must specify a valid filename for option "--queryFile" during, %s, value of "-m, --mode" option.'
            % (Mode)
        )

    if QueryFileSpecified:
        MiscUtil.ValidateOptionFilePath("--queryFile", QueryFile)
        MiscUtil.ValidateOptionFileExt("--queryFile", QueryFile, "sdf sd smi csv tsv")
1340
1341
1342 # Setup a usage string for docopt...
1343 _docoptUsage_ = """
1344 RDKitPerformSynthonSpaceSearch.py - Perform a synthon space search
1345
1346 Usage:
1347 RDKitPerformSynthonSpaceSearch.py [--fingerprints <Morgan, PathLength...>] [--fingerprintsParams <Name,Value,...>]
1348 [--mode <SubstructureSearch...>] [ --outfileParams <Name,Value,...>] [--outfileMode <SingleFile or MultipleFiles>]
1349 [--overwrite] [--queryPattern <SMARTS>] [--queryFileParams <Name,Value,...>] [--queryFile <filename>]
1350 [--rascalSearchParams <Name,Value,...>] [--substructureMatchParams <Name,Value,...>]
1351 [--synthonSearchParams <Name,Value,...>] [-w <dir>] -i <infile> -o <outfile>
1352 RDKitPerformSynthonSpaceSearch.py -l | --list -i <infile>
1353 RDKitPerformSynthonSpaceSearch.py -h | --help | -e | --examples
1354
1355 Description:
1356 Perform a similarity or substructure search, using query molecules or SMARTS
1357 patterns, against a synthon space [ Ref 174 ] in an input file, and write out the
1358 hit molecules to output file(s). You may optionally count the hits without
1359 building and writing them out.
1360
1361 In addition, you may enumerate a combinatorial library corresponding to a
1362 synthon space, generate fingerprints for a synthon space, or list information
1363 about a synthon space.
1364
1365 You must provide a valid synthon space text or binary database file supported
1366 by RDKit module rdSynthonSpaceSearch.
1367
1368 You may perform similarity search using fingerprints or employ RASCAL (RApid
1369 Similarity CALculations using Maximum Edge Subgrahps) methodology [ Ref 175 ].
1370
1371 A number of fingerprints are available for performing similarity search. The
1372 similarity metric, however, is calculated using Tanimoto similarity on hashed
1373 fingerprints.
1374
1375 The RASCAL similarity between two molecuels is calculated based on MCES
1376 (Maximum Common Edge Subgraphs) and corresponds to Johnson similarity.
1377
1378 The supported input file formats are: CSV/TXT synthon space (.csv, .txt) or
1379 binary synthon space (.spc).
1380
1381 The supported outfile formats, for different '--mode' values, are shown
1382 below:
1383
1384 BinaryDBFileGeneration: Binary database file (.spc)
1385 FingerprintsGeneration: Binary database file (.spc)
1386 LibraryEnumeration: SMILES (.smi)
1387 SimilaritySearch or SubstructureSearch: SD (.sdf, .sd), SMILES (.smi),
1388 CSV/TSV (.csv or .tsv)
1389
1390 Possible output files:
1391
1392 <OutfileRoot>.<sdf,sd,smi,csv,tsv>
1393
1394 <OutfileRoot>_Mol<Num>.<sdf,sd,smi,csv,tsv>
1395 <OutfileRoot>_Pattern<Num>.<sdf,sd,smi,csv,tsv>
1396
1397 <OutfileRoot>_HitCount.csv
1398
1399 The <OutfileRoot>_HitCount.csv contains aditional information regarding hit
1400 counts and is writter out for both similarity and substructure search.
1401
1402 Options:
1403 -f, --fingerprints <Morgan, PathLength...> [default: Morgan]
1404 Fingerprints to use for performing synthon space similarity search.
1405 Supported values: AtomPairs, Morgan, MorganFeatures, PathLength,
1406 TopologicalTorsions. The PathLength fingerprints are Daylight like
1407 fingerprints. The Morgan and MorganFeature fingerprints are circular
1408 fingerprints, corresponding Scitegic's Extended Connectivity Fingerprints
1409 (ECFP) and Features Connectivity Fingerprints (FCFP). The values of
1410 default parameters for generating fingerprints can be modified using
1411 '--fingerprintsParams' option.
1412 --fingerprintsParams <Name,Value,...> [default: auto]
1413 Parameter values to use for generating fingerprints. The default values
1414 are dependent on the value of '-f, --fingerprints' option. In general, it is a
1415 comma delimited list of parameter name and value pairs for the name of
1416 fingerprints specified using '-f, --fingerprints' option. The supported
1417 parameter names along with their default values for valid fingerprints
1418 names are shown below:
1419
1420 AtomPairs: minLength,1 ,maxLength,useChirality,No,
1421 use2D, yes, fpSize, 2048
1422 Morgan: radius,2, useChirality,No, useBondTypes, yes,
1423 useRingMembership, yes, fpSize, 2048
1424 MorganFeatures: radius,2, useChirality,No, useBondTypes, yes,
1425 useRingMembership, yes, fpSize, 2048
1426 PathLength: minPath,1, maxPath,7, useExplicitHs, yes,
1427 useBranchedPaths, yes,useBondOrder,yes, fpSize, 2048,
1428 bitsPerHash,2
1429 TopologicalTorsions: useChirality,No, fpSize, 2048
1430
1431 A brief description of parameters, taken from RDKit documentation, is
1432 provided below:
1433
1434 AtomPairs:
1435
1436 minLength: Minimum distance between atoms.
1437 maxLength: Maximum distance between atoms.
1438 useChirality: Use chirality for atom invariants.
1439 use2D: Use topological distance matrix.
1440 fpSize: Size of the fingerpints bit vector.
1441
1442 Morgan and MorganFeatures:
1443
1444 radius: Neighborhood radius.
1445 useChirality: Use chirality to generate fingerprints.
1446 useBondTypes: Use bond type for the bond invariants.
1447 useRingMembership: Use ring membership.
1448 fpSize: Size of the fingerpints bit vector.
1449
1450 PathLength:
1451
1452 minPath: Minimum bond path length.
1453 maxPath: Maximum bond path length.
1454 useExplicitHs: Use explicit hydrogens.
1455 useBranchedPaths: Use branched paths along with linear paths.
1456 useBondOrder: Us bond order in the path hashes.
1457 fpSize: Size of the fingerpints bit vector.
1458 bitsPerHash: Number of bits set per path.
1459
1460 TopologicalTorsions
1461
1462 useChirality: Use chirality to generate fingerprints.
1463 fpSize: Size of the fingerpints bit vector.
1464
1465 -e, --examples
1466 Print examples.
1467 -h, --help
1468 Print this help message.
1469 -i, --infile <infile>
1470 Synthon space Input file name.
1471 -l, --list
1472 List information about synthon space.
1473 -m, --mode <SubstructureSearch...> [default: SimilaritySearch]
1474 Perform similarity or substructure search, enumerate synthon space,
1475 or list information about a synthon space. The supported values along
1476 with a brief explanation of the expected behavior are shown below:
1477
1478 BinaryDBFileGeneration: Write out a binary database file for a
1479 synthon space.
1480 FingerprintsGeneration: Generate fingerints for a synthon space and
1481 write out a binary database file along with fingerprints.
1482 LibraryEnumeration: Enumerate a combinatorial library for a synthon
1483 space and write out a SMILES file.
1484 RASCALSimilaritySearch: Perform a RASCAL (RApid Similarity
1485 CALculations using Maximum Edge Subgrahps) similarity search.
1486 SimilaritySearch: Perform a similarity search using fingerprints.
1487 SubstructureSearch: Perform a substructure search using specified
1488 SMARTS patterns.
1489
1490 -o, --outfile <outfile>
1491 Output file name. The <OutfileRoot> and <OutfileExt> are used to generate
1492 file names during 'MultipleFiles' value for '--outfileMode' option.
1493 --outfileMode <SingleFile or MultipleFiles> [default: SingleFile]
1494 Write out a single file containing hit molecules for substructure or
1495 similarity search or generate an individual file for each query pattern
1496 or molecule. Possible values: SingleFile or MultipleFiles. The query
1497 pattern number or molecule name is written to output file(s). The query
1498 pattern or molecule number is also appended to output file names during
1499 the generation of multiple output files.
1500 --outfileParams <Name,Value,...> [default: auto]
1501 A comma delimited list of parameter name and value pairs for writing
1502 molecules to files during similarity and substructue search. The supported
1503 parameter names for different file formats, along with their default values,
1504 are shown below:
1505
1506 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
1507 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
1508 smilesTitleLine,yes
1509
1510 Default value for compute2DCoords: yes for SMILES input file; no for all other
1511 file types. The kekulize and smilesIsomeric parameters are also used during
1512 generation of SMILES strings for CSV/TSV files.
1513 --queryPattern <SMARTS SMARTS ...> [default: none]
1514 A space delimited list of SMARTS patterns for performing substructure
1515 search. This is required for 'SubstructureSearch' value of '--mode' option.
1516 --queryFile <filename> [default: none]
1517 Input file containing query molecules for performing similarity search. This
1518 is required for 'SimilaritySearch' value of '--mode' option.
1519 --queryFileParams <Name,Value,...> [default: auto]
1520 A comma delimited list of parameter name and value pairs for reading
1521 molecules from query files during similarity search. The supported
1522 parameter names for different file formats, along with their default
1523 values, are shown below:
1524
1525 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
1526 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
1527 smilesTitleLine,auto,sanitize,yes
1528
1529 Possible values for smilesDelimiter: space, comma or tab.
1530 --rascalSearchParams <Name,Value,...> [default: auto]
1531 Parameter values to use for RASCAL similarity search.
1532
1533 The default values are automatically updated to match RDKit default values.
1534 The supported parameter names along with their default values are
1535 are shown below:
1536
1537 allBestMCESs, no, completeAromaticRings, yes,
1538 completeSmallestRings, no, exactConnectionsMatch, no,
1539 ignoreAtomAromaticity, yes, ignoreBondOrders, no,
1540 maxBondMatchPairs, 1000, maxFragSeparation, -1, minCliqueSize, 0,
1541 minFragSize, -1, returnEmptyMCES, false, ringMatchesRingOnly, false,
1542 similarityThreshold, 0.7, singleLargestFrag, no,
1543 timeout, 60
1544
1545 A brief description of parameters, taken from RDKit documentation, is
1546 provided below:
1547
1548 allBestMCESs: Find all Maximum Common Edge Subgraphs (MCES).
1549 completeAromaticRings: Use only complete aromatic rings.
1550 completeSmallestRings: Only complete rings present in both
1551 molecules.
1552 exactConnectionsMatch: Match atoms only when they have the same
1553 number of explicit connections.
1554 ignoreAtomAromaticity: Ignore aromaticity during atom matching.
1555 ignoreBondOrders: Ignore bond orders during atom matching.
1556 maxBondMatchPairs: Maximum number of matching bond pairs.
1557 maxFragSeparation: Maximum bond distance that bonds can match.
1558 value of -1 implies no maximum.
1559 minCliqueSize: A value of > 0 overrides the similarityThreshold.
1560 This refers to the minimum number of bonds in the MCES.
1561 minFragSize: Minimum number of atoms in a fragment. A value of -1
1562 implies no minimum.
1563 returnEmptyMCES: Return empty MCES results.
1564 ringMatchesRingOnly: Match ring bonds to only ring bonds.
1565 similarityThreshold: Similarity threshold for matching and
1566 evaluating MCES.
1567 singleLargestFrag: Find only a single fragment for the MCES. By
1568 default, multiple fragments are generated as necessary.
1569 timeout: Max run time in seconds. A value of -1 implies no max.
1570
1571 --substructureMatchParams <Name,Value,...> [default: auto]
1572 Parameter values to use for substructure match during synthon substructure
1573 search.
1574
1575 The default values are automatically updated to match RDKit default values.
1576 The supported parameter names along with their default values are
1577 are shown below:
1578
1579 aromaticMatchesConjugated, no, maxMatches, 1000,
1580 maxRecursiveMatches, 1000, recursionPossible, yes,
1581 specifiedStereoQueryMatchesUnspecified, no, uniquify, yes,
1582 useChirality, no, useEnhancedStereo, no, useGenericMatchers, no,
1583
1584 A brief description of parameters, taken from RDKit documentation, is
1585 provided below:
1586
1587 aromaticMatchesConjugated: Match aromatic and conjugated bonds.
1588 maxMatches: Maximum number of matches.
1589 maxRecursiveMatches: Maximum number of recursive matches.
1590 recursionPossible: Allow recursive queries.
1591 specifiedStereoQueryMatchesUnspecified: Match query atoms and bonds
1592 with specified stereochemistry to atoms and bonds with unspecified
1593 stereochemistry.
1594 uniquify: Uniquify match results using atom indices.
1595 useChirality: Use chirality to match atom and bonds.
1596 useEnhancedStereo: Use enhanced stereochemistry during the use
1597 of chirality.
1598 useGenericMatchers: Use generic groups as a post-filtering step.
1599
1600 --synthonSearchParams <Name,Value,...> [default: auto]
1601 Parameter values to use for performing synthon substructure and similarity
1602 search.
1603
1604 The default values are automatically updated to match RDKit default values.
1605 The supported parameter names along with their default values are
1606 are shown below:
1607
1608 approxSimilarityAdjuster, 0.1, [ Default value for Morgan FPs ]
1609 buildHits, yes, fragSimilarityAdjuster, 0.1, hitStart, 0,
1610 maxHits, 1000, [ A value of -1 retrives all hits ]
1611 maxNumFrags, 100000,
1612 numThreads, 1 [ 0: Use maximum number of threads supported by the
1613 hardware; Negative value: Added to the maxiumum number of
1614 threads supported by the hardware ]
1615 randomSample, no,
1616 randomSeed, -1 [ Default value implies use random seed ]
1617 similarityCutoff, 0.5, [ Default for Morgan FPs. Ignored during RASCAL
1618 similarity search; instead, RASCAL parameter similarityThreshold is
1619 used. ]
1620 timeOut, 600 [ Unit: sec. The RASCAL searches take longer and may
1621 need a higher value for timeOut. For example: 3600 ]
1622
1623 A brief description of parameters, taken from RDKit documentation, is
1624 provided below:
1625
1626 approxSimilarityAdjuster: Value used for reducing similarity cutoff
1627 during approximate similarity check for fingerprint search. A
1628 lower value leads to faster run times at the risk of missing
1629 some hits.
1630 buildHits: A value of no implies reporting the maximum number of hits a
1631 search could generate without returning any hits.
1632 fragSimilarityAdjuster: Value used for reducing fragment matching
1633 similarity cutoff to accommodate low bit densities for fragments.
1634 hitStart: Return hits starting from the specified sequence number
1635 to support retrieval of hits in batches.
1636 maxHits: Maximum number of hits to return. A value of -1 implies
1637 retrieve all hits.
1638 maxNumFrags: Maximum number of fragments for breaking a query.
1639 numThreads: Number of threads to use for search. A value of 0
1640 implies the use of all available hardware threads. A negative
1641 value is added to the number of available hardware threads to
1642 calculate number of threads to use.
1643 randomSample: Return a random sample of hits up to maxHits.
1644 randomSeed: Random number seed to use during search. A value of -1
1645 implies the use of a random seed.
1646 similarityCutoff: Similarity cutoff for returning hits by fingerprint
1647 similarity search. A default value of 0.5 is set for Morgan
1648 fingerprints.
1649 timeOut: Time limit for search, in seconds. A value of 0 implies
1650 no timeout.
1651
1652 --overwrite
1653 Overwrite existing files.
1654 -w, --workingdir <dir>
1655 Location of working directory which defaults to the current directory.
1656
1657 Examples:
1658 To list information about a synthon space in a text file, type:
1659
1660 % RDKitPerformSynthonSpaceSearch.py --list -i SampleSynthonSpace.csv
1661
1662 To generate a binary database file for a synthon space in a text file, type:
1663
1664 % RDKitPerformSynthonSpaceSearch.py -m BinaryDBFileGeneration
1665 -i SampleSynthonSpace.csv -o SampleSynthonSpace.spc
1666
1667 To enumerate a combinatorial library for a synthon space in a text file and
1668 write out a SMILES file, type:
1669
1670 % RDKitPerformSynthonSpaceSearch.py -m LibraryEnumeration
1671 -i SampleSynthonSpace.csv -o SampleSynthonSpace_Library.smi
1672
1673 To generate Morgan fingerprints for a synthon space in a text file, employing
1674 radius of 2 and bit vector size of 2048, and write out a binary database file,
1675 type:
1676
1677 % RDKitPerformSynthonSpaceSearch.py -m FingerprintsGeneration
1678 -i SampleSynthonSpace.csv -o SampleSynthonSpace_MorganFPs.spc
1679
1680 To perform a similarity search using Morgan fingerprints for query molecules
1681 in an input file, against a binary database file synthon space containing
1682 Morgan fingerprints, employing radius 2 and bit vector size of 2048, finding
1683 a maximum of 1000 hits for each query molecule, and write out a single output
1684 file containing hit molecules, type:
1685
1686 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
1687 -i SampleSynthonSpace_MorganFPs.spc
1688 --queryFile SampleSynthonSpaceQuery.sdf
1689 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf
1690
1691 or only count hits without building hits and writing them to an output
1692 file:
1693
1694 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
1695 -i SampleSynthonSpace_MorganFPs.spc
1696 --queryFile SampleSynthonSpaceQuery.sdf
1697 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf
1698 --synthonSearchParams "buildHits,No"
1699
1700 To run previous example for writing individual output files for each query
1701 molecule, type:
1702
1703 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
1704 -i SampleSynthonSpace_MorganFPs.spc
1705 --queryFile SampleSynthonSpaceQuery.sdf
1706 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf
1707 --outfileMode MultipleFiles
1708
1709 To run previous example for retrieving all possible hits for query molecules
1710 and write out individual output files for each query molecule, type:
1711
1712 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
1713 -i SampleSynthonSpace_MorganFPs.spc
1714 --queryFile SampleSynthonSpaceQuery.sdf
1715 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf
1716 --outfileMode MultipleFiles
1717 --synthonSearchParams "maxHits,-1"
1718
1719 To run the previous example using multi-threading employing all available
1720 threads on your machine, retrieve maximum of 1000 hits for each query
1721 molecule and generate various output files, type:
1722
1723 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
1724 -i SampleSynthonSpace_MorganFPs.spc
1725 --queryFile SampleSynthonSpaceQuery.smi
1726 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.smi
1727 --outfileMode MultipleFiles
1728 --synthonSearchParams "maxHits, 1000, numThreads, 0"
1729
1730 To run the previous example using multi-threading employing all but one
1731 available threads on your machine, type:
1732
1733 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
1734 -i SampleSynthonSpace_MorganFPs.spc
1735 --queryFile SampleSynthonSpaceQuery.smi
1736 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.smi
1737 --outfileMode MultipleFiles
1738 --synthonSearchParams "maxHits, 1000, numThreads, -1"
1739
1740 To perform a substructure search using query pattern SMARTS against a synthon
1741 space file, finding a maximum of 1000 hits for each query pattern and write out
1742 a single output file containing hit molecules, type:
1743
1744 % RDKitPerformSynthonSpaceSearch.py -m SubstructureSearch
1745 -i SampleSynthonSpace.spc
1746 --queryPattern "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1"
1747 -o SampleSynthonSpace_SubstructureSearchResults.sdf
1748
1749 % RDKitPerformSynthonSpaceSearch.py -m SubstructureSearch
1750 -i SampleSynthonSpace.csv
1751 --queryPattern 'c1c[n,s,o][n,s,o,c]c1C(=O)[$(N1CCCCC1),$(N1CCCC1)]'
1752 -o SampleSynthonSpace_SubstructureSearchResults.sdf
1753
1754 To run previous example for writing out individual output files
1755 for each query molecule, type:
1756
1757 % RDKitPerformSynthonSpaceSearch.py -m SubstructureSearch
1758 -i SampleSynthonSpace.spc
1759 --queryPattern "CCN(C(=O)c1cc2cc(OC)ccc2nc1C)C1CCCN(C(=O)OC(C)(C)C)C1
1760 C=CCc1c(N[C@H](C)c2cccc(C)c2)ncnc1N(C)CCCC(=O)OC"
1761 -o SampleSynthonSpace_SubstructureSearchResults.sdf
1762 --outfileMode MultipleFiles
1763
1764 To perform RASCAL similarity search for query molecules in an input file,
1765 against a binary database file synthon space, finding a maximum of 1000 hits
1766 for each query molecule, using multi-threading employing all available CPUs,
1767 timing out after 3600 seconds, and write out a single output file containing
1768 hit molecules, type:
1769
1770 % RDKitPerformSynthonSpaceSearch.py -m RASCALSimilaritySearch
1771 -i SampleSynthonSpace.spc
1772 --queryFile SampleSynthonSpaceQuery.sdf
1773 -o SampleSynthonSpace_RASCALSimilaritySearchResults.sdf
1774 --synthonSearchParams "maxHits, 1000, numThreads, 0, timeOut, 3600"
1775
1776 Author:
1777 Manish Sud(msud@san.rr.com)
1778
1779 Acknowledgment:
1780 Dave Cosgrove
1781
1782 See also:
1783 RDKitConvertFileFormat.py, RDKitPickDiverseMolecules.py, RDKitSearchFunctionalGroups.py,
1784 RDKitSearchSMARTS.py
1785
1786 Copyright:
1787 Copyright (C) 2026 Manish Sud. All rights reserved.
1788
1789 The functionality available in this script is implemented using RDKit, an
1790 open source toolkit for cheminformatics developed by Greg Landrum.
1791
1792 This file is part of MayaChemTools.
1793
1794 MayaChemTools is free software; you can redistribute it and/or modify it under
1795 the terms of the GNU Lesser General Public License as published by the Free
1796 Software Foundation; either version 3 of the License, or (at your option) any
1797 later version.
1798
1799 """
1800
# Script entry point: call main() only when this file is run directly, not when it is imported.
1801 if __name__ == "__main__":
1802 main()