1 #!/bin/env python 2 # 3 # File: RDKitPerformSynthonSpaceSearch.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Acknowledgments: Dave Cosgrove 7 # 8 # Copyright (C) 2025 Manish Sud. All rights reserved. 9 # 10 # The functionality available in this script is implemented using RDKit, an 11 # open source toolkit for cheminformatics developed by Greg Landrum. 12 # 13 # This file is part of MayaChemTools. 14 # 15 # MayaChemTools is free software; you can redistribute it and/or modify it under 16 # the terms of the GNU Lesser General Public License as published by the Free 17 # Software Foundation; either version 3 of the License, or (at your option) any 18 # later version. 19 # 20 # MayaChemTools is distributed in the hope that it will be useful, but without 21 # any warranty; without even the implied warranty of merchantability of fitness 22 # for a particular purpose. See the GNU Lesser General Public License for more 23 # details. 24 # 25 # You should have received a copy of the GNU Lesser General Public License 26 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 27 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 28 # Boston, MA, 02111-1307, USA. 29 # 30 31 from __future__ import print_function 32 33 # Add local python path to the global path and import standard library modules... 34 import os 35 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 36 import time 37 import re 38 import multiprocessing as mp 39 40 # RDKit imports... 41 try: 42 from rdkit import rdBase 43 from rdkit import Chem 44 from rdkit.Chem import AllChem 45 from rdkit.Chem import rdSynthonSpaceSearch 46 from rdkit.Chem import rdFingerprintGenerator 47 from rdkit.Chem import rdRascalMCES 48 except ImportError as ErrMsg: 49 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 50 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 51 sys.exit(1) 52 53 # MayaChemTools imports... 54 try: 55 from docopt import docopt 56 import MiscUtil 57 import RDKitUtil 58 except ImportError as ErrMsg: 59 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 60 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 61 sys.exit(1) 62 63 ScriptName = os.path.basename(sys.argv[0]) 64 Options = {} 65 OptionsInfo = {} 66 67 def main(): 68 """Start execution of the script.""" 69 70 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 71 72 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 73 74 # Retrieve command line arguments and options... 75 RetrieveOptions() 76 77 if Options and Options["--list"]: 78 # Process list option... 79 ProcessListSynthonSearchSpace() 80 else: 81 # Process and validate command line arguments and options... 82 ProcessOptions() 83 84 # Perform actions required by the script... 85 PerformSynthonSpaceSearch() 86 87 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 88 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 89 90 def PerformSynthonSpaceSearch(): 91 """Perform synthon space search.""" 92 93 Mode = OptionsInfo["Mode"] 94 if re.match("^FingerprintsGeneration$", Mode, re.I): 95 GenerateFingerprints() 96 elif re.match("^BinaryDBFileGeneration$", Mode, re.I): 97 GenerateBinaryDatabaseFile() 98 elif re.match("^LibraryEnumeration$", Mode, re.I): 99 PerformLibraryEnumeration() 100 elif re.match("^RascalSimilaritySearch$", Mode, re.I): 101 PerformRascalSimilaritySearch() 102 elif re.match("^SimilaritySearch$", Mode, re.I): 103 PerformSimilaritySearch() 104 elif re.match("^SubstructureSearch$", Mode, re.I): 105 PerformSubtructureSearch() 106 else: 107 MiscUtil.PrintError("The value specified, %s, for option \"--mode\" is not valid." % Mode) 108 109 def GenerateFingerprints(): 110 """Generate fingerprints for synthons and write out a binary file.""" 111 112 MiscUtil.PrintInfo("\nGenerating fingerprints (Mode: %s)..." % OptionsInfo["Mode"]) 113 114 SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"]) 115 116 StartTime = time.perf_counter() 117 118 MiscUtil.PrintInfo("\nGenerating fingerprints (Type: %s)..." % OptionsInfo["SpecifiedFingerprints"]) 119 FPGenerator = InitializeFingerprintsGenerator() 120 SynthonSpace.BuildSynthonFingerprints(FPGenerator) 121 122 TotalTime = time.perf_counter() - StartTime 123 MiscUtil.PrintInfo("Total time: %.2f secs" % TotalTime) 124 125 WriteSynthonSpaceBinaryFile(SynthonSpace, OptionsInfo["Outfile"]) 126 127 def GenerateBinaryDatabaseFile(): 128 """Write out a binary file for synthons.""" 129 130 MiscUtil.PrintInfo("\nGenerating binary database file (Mode: %s)..." % OptionsInfo["Mode"]) 131 132 SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"]) 133 WriteSynthonSpaceBinaryFile(SynthonSpace, OptionsInfo["Outfile"]) 134 135 def PerformLibraryEnumeration(): 136 """Enumerate library using synthons and write out a SMILES file.""" 137 138 MiscUtil.PrintInfo("\nPerforming library enumeration (Mode: %s)..." % OptionsInfo["Mode"]) 139 140 SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"]) 141 142 MiscUtil.PrintInfo("\nWriting file %s ..." % OptionsInfo["Outfile"]) 143 SynthonSpace.WriteEnumeratedFile(OptionsInfo["Outfile"]) 144 145 def PerformSimilaritySearch(): 146 """Perform similarity search.""" 147 148 SingleOutFileMode = OptionsInfo["SingleOutFileMode"] 149 CountHitsMode = OptionsInfo["CountHitsMode"] 150 SynthonSearchParams = OptionsInfo["SynthonSearchParams"] 151 152 MiscUtil.PrintInfo("\nPerforming similiarity search (Fingerprints: %s; SimilarityCutoff: %s; MaxHits: %s)..." % (OptionsInfo["SpecifiedFingerprints"], SynthonSearchParams["SimilarityCutoff"], SynthonSearchParams["MaxHits"])) 153 154 # Setup synthon space... 155 SynthonSpace, FPGenerator = SetupSynthonSpaceForSimilaritySearch() 156 157 # Setup out file writers... 158 SingleOutFileWriter, HitsInfoWriter = SetupOutfileWriters() 159 160 # Setup a molecule reader... 161 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["QueryFile"]) 162 QueryMols = RDKitUtil.ReadMolecules(OptionsInfo["QueryFile"], **OptionsInfo["QueryFileParams"]) 163 164 # Process query molecules... 165 (QueryMolCount, ValidQueryMolCount) = [0] * 2 166 for QueryMol in QueryMols: 167 QueryMolCount += 1 168 if QueryMol is None or RDKitUtil.IsMolEmpty(QueryMol): 169 continue 170 171 ValidQueryMolCount += 1 172 QueryMolName = RDKitUtil.GetMolName(QueryMol, QueryMolCount) 173 174 HitMols, HitMolsCount, MaxPossibleHits = PerformSynthonSpaceSimilaritySearch(SynthonSpace, FPGenerator, QueryMol) 175 176 if CountHitsMode: 177 WriteHitsInfo(HitsInfoWriter, [QueryMolName, MaxPossibleHits]) 178 else: 179 WriteHitsInfo(HitsInfoWriter, [QueryMolName, HitMolsCount, MaxPossibleHits]) 180 181 Writer = SingleOutFileWriter if SingleOutFileMode else SetupMoleculeWriter(SingleOutFileMode, QueryMolCount) 182 WriteMolecules(Writer, QueryMolName, HitMols) 183 184 if not SingleOutFileMode: 185 if Writer is not None: 186 Writer.close() 187 188 if SingleOutFileWriter is not None: 189 SingleOutFileWriter.close() 190 191 if HitsInfoWriter is not None: 192 HitsInfoWriter.close() 193 194 MiscUtil.PrintInfo("\nTotal number of query molecules: %d" % QueryMolCount) 195 MiscUtil.PrintInfo("Number of valid query molecules: %d" % ValidQueryMolCount) 196 MiscUtil.PrintInfo("Number of ignored query molecules: %d" % (QueryMolCount - ValidQueryMolCount)) 197 198 def PerformSubtructureSearch(): 199 """Perform substructure search.""" 200 201 SingleOutFileMode = OptionsInfo["SingleOutFileMode"] 202 CountHitsMode = OptionsInfo["CountHitsMode"] 203 SynthonSearchParams = OptionsInfo["SynthonSearchParams"] 204 205 MiscUtil.PrintInfo("\nPerforming substructue search (MaxHits: %s)..." % (SynthonSearchParams["MaxHits"])) 206 207 # Setup synthon space... 208 SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"]) 209 210 # Setup out file writers... 211 SingleOutFileWriter, HitsInfoWriter = SetupOutfileWriters() 212 213 # Process query pattern molecules... 214 MiscUtil.PrintInfo("\nProcessing query patterns...") 215 216 QueryMolCount = 0 217 for QueryMol in OptionsInfo["QueryPatternMols"]: 218 QueryMolCount += 1 219 QueryMolName = "Pattern%s" % QueryMolCount 220 221 HitMols, HitMolsCount, MaxPossibleHits = PerformSynthonSpaceSubstructureSearch(SynthonSpace, QueryMol) 222 223 if CountHitsMode: 224 WriteHitsInfo(HitsInfoWriter, [QueryMolName, MaxPossibleHits]) 225 else: 226 WriteHitsInfo(HitsInfoWriter, [QueryMolName, HitMolsCount, MaxPossibleHits]) 227 228 Writer = SingleOutFileWriter if SingleOutFileMode else SetupMoleculeWriter(SingleOutFileMode, QueryMolCount) 229 WriteMolecules(Writer, QueryMolName, HitMols) 230 231 if not SingleOutFileMode: 232 if Writer is not None: 233 Writer.close() 234 235 if SingleOutFileWriter is not None: 236 SingleOutFileWriter.close() 237 238 if HitsInfoWriter is not None: 239 HitsInfoWriter.close() 240 241 MiscUtil.PrintInfo("\nTotal number of query patterns: %d" % QueryMolCount) 242 243 def PerformRascalSimilaritySearch(): 244 """Perform RASCAL similarity search.""" 245 246 SingleOutFileMode = OptionsInfo["SingleOutFileMode"] 247 CountHitsMode = OptionsInfo["CountHitsMode"] 248 RascalSearchParams = OptionsInfo["RascalSearchParams"] 249 SynthonSearchParams = OptionsInfo["SynthonSearchParams"] 250 251 MiscUtil.PrintInfo("\nPerforming RASCAL similiarity search (SimilarityThreshold: %s; MaxHits: %s)..." % (RascalSearchParams["SimilarityThreshold"], SynthonSearchParams["MaxHits"])) 252 253 # Setup synthon space... 254 SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"]) 255 256 # Setup out file writers... 257 SingleOutFileWriter, HitsInfoWriter = SetupOutfileWriters() 258 259 # Setup a molecule reader... 260 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["QueryFile"]) 261 QueryMols = RDKitUtil.ReadMolecules(OptionsInfo["QueryFile"], **OptionsInfo["QueryFileParams"]) 262 263 # Process query molecules... 264 (QueryMolCount, ValidQueryMolCount) = [0] * 2 265 for QueryMol in QueryMols: 266 QueryMolCount += 1 267 if QueryMol is None or RDKitUtil.IsMolEmpty(QueryMol): 268 continue 269 270 ValidQueryMolCount += 1 271 QueryMolName = RDKitUtil.GetMolName(QueryMol, QueryMolCount) 272 273 HitMols, HitMolsCount, MaxPossibleHits = PerformSynthonSpaceRascalSimilaritySearch(SynthonSpace, QueryMol) 274 275 if CountHitsMode: 276 WriteHitsInfo(HitsInfoWriter, [QueryMolName, MaxPossibleHits]) 277 else: 278 WriteHitsInfo(HitsInfoWriter, [QueryMolName, HitMolsCount, MaxPossibleHits]) 279 280 Writer = SingleOutFileWriter if SingleOutFileMode else SetupMoleculeWriter(SingleOutFileMode, QueryMolCount) 281 WriteMolecules(Writer, QueryMolName, HitMols) 282 283 if not SingleOutFileMode: 284 if Writer is not None: 285 Writer.close() 286 287 if SingleOutFileWriter is not None: 288 SingleOutFileWriter.close() 289 290 if HitsInfoWriter is not None: 291 HitsInfoWriter.close() 292 293 MiscUtil.PrintInfo("\nTotal number of query molecules: %d" % QueryMolCount) 294 MiscUtil.PrintInfo("Number of valid query molecules: %d" % ValidQueryMolCount) 295 MiscUtil.PrintInfo("Number of ignored query molecules: %d" % (QueryMolCount - ValidQueryMolCount)) 296 297 def ProcessListSynthonSearchSpace(): 298 """Process list synthon search space information.""" 299 300 MiscUtil.PrintInfo("\nListing information...") 301 302 # Validate infile.. 303 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 304 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "txt csv spc") 305 306 # Process infile.. 307 OptionsInfo["Infile"] = Options["--infile"] 308 309 SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"]) 310 311 MiscUtil.PrintInfo("\nSummary of synthon space:\n") 312 SynthonSpace.Summarise() 313 314 ListSynthonSpaceFingerprintsType(SynthonSpace) 315 316 def PerformSynthonSpaceSimilaritySearch(SynthonSpace, FPGenerator, QueryMol): 317 """Perform synthon space similarity search.""" 318 319 try: 320 Results = SynthonSpace.FingerprintSearch(QueryMol, FPGenerator, params = OptionsInfo["RDKitSynthonSearchParams"]) 321 except Exception as ErrMsg: 322 MiscUtil.PrintInfo("") 323 MiscUtil.PrintError("Failed to perform synthon space fingerprints seach:\n%s\n" % (ErrMsg)) 324 325 HitMols, HitMolsCount, MaxPossibleHits = GetSynthonSpaceHitMolecules(Results) 326 327 return (HitMols, HitMolsCount, MaxPossibleHits) 328 329 def PerformSynthonSpaceRascalSimilaritySearch(SynthonSpace, QueryMol): 330 """Perform synthon space RASCAL similarity search.""" 331 332 try: 333 Results = SynthonSpace.RascalSearch(QueryMol, OptionsInfo["RDKitRascalSearchParams"], params = OptionsInfo["RDKitSynthonSearchParams"]) 334 except Exception as ErrMsg: 335 MiscUtil.PrintInfo("") 336 MiscUtil.PrintError("Failed to perform synthon space RASCAL similarity seach:\n%s\n" % (ErrMsg)) 337 338 HitMols, HitMolsCount, MaxPossibleHits = GetSynthonSpaceHitMolecules(Results) 339 340 return (HitMols, HitMolsCount, MaxPossibleHits) 341 342 def PerformSynthonSpaceSubstructureSearch(SynthonSpace, QueryMol): 343 """Perform synthon space substructure search.""" 344 345 try: 346 Results = SynthonSpace.SubstructureSearch(QueryMol, substructMatchParams = OptionsInfo["RDKitSubstructureMatchParams"], params = OptionsInfo["RDKitSynthonSearchParams"]) 347 except Exception as ErrMsg: 348 MiscUtil.PrintInfo("") 349 MiscUtil.PrintError("Failed to perform synthon space substructure seach:\n%s\n" % (ErrMsg)) 350 351 HitMols, HitMolsCount, MaxPossibleHits = GetSynthonSpaceHitMolecules(Results) 352 353 return (HitMols, HitMolsCount, MaxPossibleHits) 354 355 def GetSynthonSpaceHitMolecules(Results): 356 """Retrieve synthon space hit molecues.""" 357 358 HitMols = Results.GetHitMolecules() 359 360 HitMolsCount = len(HitMols) 361 if HitMolsCount == 0: 362 HitMols = None 363 HitMolsCount = None 364 365 MaxPossibleHits = Results.GetMaxNumResults() 366 367 return (HitMols, HitMolsCount, MaxPossibleHits) 368 369 def SetupSynthonSpaceForSimilaritySearch(): 370 """Setup synthon space for similarity search.""" 371 372 SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"]) 373 374 FPType, FPInfo = GetSynthonFingerprintsInfo(SynthonSpace) 375 if FPType is None: 376 MiscUtil.PrintInfo("") 377 MiscUtil.PrintError("The synthon space input file, %s, doesn't contain any fingerprints. You must specify a synthon space binary database file containing appropriate fingerprints for similarity search.." % OptionsInfo["Infile"]) 378 379 if not re.search("%s" % OptionsInfo["SpecifiedFingerprints"], FPType, re.I): 380 MiscUtil.PrintInfo("") 381 MiscUtil.PrintWarning("The fingerprints type, %s, in synthon space input file, %s, doesn't appear to match fingerprints, %s, specified using \"--fingerprints\" option for similarity search." % (FPType, OptionsInfo["Infile"], OptionsInfo["SpecifiedFingerprints"])) 382 383 FPGenerator = InitializeFingerprintsGenerator() 384 385 return (SynthonSpace, FPGenerator) 386 387 def InitializeFingerprintsGenerator(): 388 """Initialize fingerprints generator.""" 389 390 FPGenerator = None 391 SpecifiedFingerprints = OptionsInfo["SpecifiedFingerprints"] 392 if re.match("^AtomPairs$", SpecifiedFingerprints, re.I): 393 FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["AtomPairs"] 394 FPGenerator = rdFingerprintGenerator.GetAtomPairGenerator(minDistance = FPParamsInfo["MinLength"], maxDistance = FPParamsInfo["MaxLength"], includeChirality = FPParamsInfo["UseChirality"], use2D = FPParamsInfo["Use2D"], fpSize = FPParamsInfo["FPSize"]) 395 elif re.match("^Morgan$", SpecifiedFingerprints, re.I): 396 FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["Morgan"] 397 FPGenerator = rdFingerprintGenerator.GetMorganGenerator(radius = FPParamsInfo["Radius"], includeChirality = FPParamsInfo["UseChirality"], useBondTypes = FPParamsInfo["UseBondTypes"], includeRingMembership = FPParamsInfo["UseRingMembership"], fpSize = FPParamsInfo["FPSize"]) 398 elif re.match("^MorganFeatures$", SpecifiedFingerprints, re.I): 399 FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["MorganFeatures"] 400 FPGenerator = rdFingerprintGenerator.GetMorganGenerator(radius = FPParamsInfo["Radius"], includeChirality = FPParamsInfo["UseChirality"], useBondTypes = FPParamsInfo["UseBondTypes"], includeRingMembership = FPParamsInfo["UseRingMembership"], fpSize = FPParamsInfo["FPSize"], atomInvariantsGenerator = rdFingerprintGenerator.GetMorganAtomInvGen()) 401 elif re.match("^PathLength$", SpecifiedFingerprints, re.I): 402 FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["PathLength"] 403 FPGenerator = rdFingerprintGenerator.GetRDKitFPGenerator(minPath = FPParamsInfo["MinPath"], maxPath = FPParamsInfo["MaxPath"], useHs = FPParamsInfo["UseExplicitHs"], branchedPaths = FPParamsInfo["UseBranchedPaths"], useBondOrder = FPParamsInfo["UseBondOrder"], fpSize = FPParamsInfo["FPSize"], numBitsPerFeature = FPParamsInfo["BitsPerHash"]) 404 elif re.match("^TopologicalTorsions$", SpecifiedFingerprints, re.I): 405 FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["TopologicalTorsions"] 406 FPGenerator = rdFingerprintGenerator.GetTopologicalTorsionGenerator(includeChirality = FPParamsInfo["UseChirality"], fpSize = FPParamsInfo["FPSize"]) 407 else: 408 MiscUtil.PrintError("The value specified, %s, for option \"--fingerprints\" is not valid.") 409 410 return FPGenerator 411 412 def ReadSynthonSpaceFile(Infile): 413 """Read synthon space file.""" 414 415 MiscUtil.PrintInfo("\nReading synthon space file %s..." % Infile) 416 SynthonSpace = rdSynthonSpaceSearch.SynthonSpace() 417 418 StartTime = time.perf_counter() 419 420 try: 421 if MiscUtil.CheckFileExt(Infile, "spc"): 422 SynthonSpace.ReadDBFile(Infile) 423 else: 424 SynthonSpace.ReadTextFile(Infile) 425 except Exception as ErrMsg: 426 MiscUtil.PrintInfo("") 427 MiscUtil.PrintError("Failed to read synthon space file:\n%s\n" % (ErrMsg)) 428 429 TotalTime = time.perf_counter() - StartTime 430 MiscUtil.PrintInfo("Total time: %.2f secs" % TotalTime) 431 432 return SynthonSpace 433 434 def WriteSynthonSpaceBinaryFile(SynthonSpace, Outfile): 435 """Write synthon space binary file.""" 436 437 MiscUtil.PrintInfo("\nWriting synthon space file %s..." % Outfile) 438 StartTime = time.perf_counter() 439 440 try: 441 SynthonSpace.WriteDBFile(Outfile) 442 except Exception as ErrMsg: 443 MiscUtil.PrintInfo("") 444 MiscUtil.PrintError("Failed to write synthon space file:\n%s\n" % (ErrMsg)) 445 446 TotalTime = time.perf_counter() - StartTime 447 MiscUtil.PrintInfo("Total time: %.2f secs" % TotalTime) 448 449 return SynthonSpace 450 451 def ListSynthonSpaceFingerprintsType(SynthonSpace): 452 """List synthon space fingerprints type. """ 453 454 FPType, FPInfo = GetSynthonFingerprintsInfo(SynthonSpace) 455 456 if FPInfo is None: 457 MiscUtil.PrintInfo("\nFingerprints type: %s" % (FPInfo)) 458 else: 459 MiscUtil.PrintInfo("\nFingerprints type: %s\nFingerprints Info: %s" % (FPType, FPInfo)) 460 461 def GetSynthonFingerprintsInfo(SynthonSpace): 462 """Get synthon fingerprints information.""" 463 464 FPInfo = SynthonSpace.GetSynthonFingerprintType() 465 if len(FPInfo) == 0: 466 return (None, None) 467 468 if re.search("AtomPairArguments", FPInfo, re.I): 469 FPType = "AtomPairs" 470 elif re.search("MorganArguments", FPInfo, re.I): 471 FPType = "Morgan or MorganFeatures" 472 elif re.search("RDKitFPArguments", FPInfo, re.I): 473 FPType = "PathLength" 474 elif re.search("TopologicalTorsionArguments", FPInfo, re.I): 475 FPType = "TopologicalTorsions" 476 else: 477 FPType = "Unknown" 478 479 return (FPType, FPInfo) 480 481 def SetupMoleculeWriter(SIngleOutFile, MolCount = 0): 482 """Setup molecule writer. """ 483 484 TextOutFileMode = OptionsInfo["TextOutFileMode"] 485 TextOutFileDelim = OptionsInfo["TextOutFileDelim"] 486 TextOutFileTitleLine = OptionsInfo["TextOutFileTitleLine"] 487 488 if SIngleOutFile: 489 Outfile = OptionsInfo["Outfile"] 490 else: 491 Outfile = "%s_%s%s.%s" % (OptionsInfo["OutFileRoot"], OptionsInfo["OutFileSuffix"], MolCount, OptionsInfo["OutFileExt"]) 492 493 if TextOutFileMode: 494 Writer = open(Outfile, "w") 495 else: 496 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"]) 497 if Writer is None: 498 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 499 500 if TextOutFileMode: 501 if TextOutFileTitleLine: 502 WriteTextFileHeaderLine(Writer, TextOutFileDelim) 503 504 return Writer 505 506 def WriteTextFileHeaderLine(Writer, TextOutFileDelim): 507 """Write out a header line for text files including SMILES file.""" 508 509 Line = "" 510 if OptionsInfo["SubstructureSearchMode"]: 511 Line = TextOutFileDelim.join(["SMILES", "Name", "QueryPatternNumber"]) 512 elif OptionsInfo["SimilaritySearchMode"]: 513 Line = TextOutFileDelim.join(["SMILES", "Name", "Similarity", "QueryMolName"]) 514 elif OptionsInfo["RascalSimilaritySearchMode"]: 515 Line = TextOutFileDelim.join(["SMILES", "Name", "Similarity", "QueryMolName"]) 516 517 Writer.write("%s\n" % Line) 518 519 def WriteMolecules(Writer, QueryMolName, HitMols): 520 """Write hit molecules for similarity and substructure search.""" 521 522 RascalSimilaritySearchMode = OptionsInfo["RascalSimilaritySearchMode"] 523 SimilaritySearchMode = OptionsInfo["SimilaritySearchMode"] 524 SubstructureSearchMode = OptionsInfo["SubstructureSearchMode"] 525 526 TextOutFileMode = OptionsInfo["TextOutFileMode"] 527 TextOutFileDelim = OptionsInfo["TextOutFileDelim"] 528 529 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 530 531 SMILESIsomeric = OptionsInfo["OutfileParams"]["SMILESIsomeric"] 532 SMILESKekulize = OptionsInfo["OutfileParams"]["SMILESKekulize"] 533 534 HitMolCount = 0 535 for HitMol in HitMols: 536 HitMolCount += 1 537 538 if TextOutFileMode: 539 # Write out text file including SMILES file... 540 LineWords = [] 541 LineWords.append(Chem.MolToSmiles(HitMol, isomericSmiles = SMILESIsomeric, kekuleSmiles = SMILESKekulize)) 542 LineWords.append(RDKitUtil.GetMolName(HitMol, HitMolCount)) 543 544 if SimilaritySearchMode or RascalSimilaritySearchMode: 545 Similarity = "%.2f" % float(HitMol.GetProp("Similarity")) 546 LineWords.append(Similarity) 547 548 LineWords.append(QueryMolName) 549 550 Line = TextOutFileDelim.join(LineWords) 551 Writer.write("%s\n" % Line) 552 else: 553 # Write out SD file... 554 if SimilaritySearchMode or RascalSimilaritySearchMode: 555 HitMol.SetProp("QueryMolName", QueryMolName) 556 elif SubstructureSearchMode: 557 HitMol.SetProp("QueryPatternNum", QueryMolName) 558 559 if SimilaritySearchMode or RascalSimilaritySearchMode: 560 Similarity = "%.2f" % float(HitMol.GetProp("Similarity")) 561 HitMol.SetProp("Similarity", Similarity) 562 563 if Compute2DCoords: 564 AllChem.Compute2DCoords(HitMol) 565 Writer.write(HitMol) 566 567 def SetupOutfileWriters(): 568 """Setup outfile writers.""" 569 570 SingleOutFileWriter, HitsInfoWriter = [None] * 2 571 572 if OptionsInfo["CountHitsMode"]: 573 MiscUtil.PrintInfo("\nSkipping generation of output files containing hit structures and only counting hits (BuildHits: No)...") 574 else: 575 if OptionsInfo["SingleOutFileMode"]: 576 SingleOutFileWriter = SetupMoleculeWriter(OptionsInfo["SingleOutFileMode"]) 577 MiscUtil.PrintInfo("\nGenerating output file %s..." % OptionsInfo["Outfile"]) 578 else: 579 MiscUtil.PrintInfo("\nGenerating output file(s) %s_%s*.%s..." % (OptionsInfo["OutFileRoot"], OptionsInfo["OutFileSuffix"], OptionsInfo["OutFileExt"])) 580 581 HitsInfoWriter = SetupHitsInfoWriter() 582 583 return (SingleOutFileWriter, HitsInfoWriter) 584 585 def SetupHitsInfoWriter(): 586 """Setup hits info writer.""" 587 588 HitsInfoOutFile = OptionsInfo["HitsInfoOutFile"] 589 HitsInfoOutFileDelim = OptionsInfo["HitsInfoOutFileDelim"] 590 591 MiscUtil.PrintInfo("\nGenerating output file %s..." % HitsInfoOutFile) 592 593 Writer = open(HitsInfoOutFile, "w") 594 595 # Setup and write out header... 596 MolIDColName = "MolID" 597 if OptionsInfo["SubstructureSearchMode"]: 598 MolIDColName = "QueryPatternNumber" 599 elif OptionsInfo["SimilaritySearchMode"]: 600 MolIDColName = "QueryMolName" 601 elif OptionsInfo["RascalSimilaritySearchMode"]: 602 MolIDColName = "QueryMolName" 603 604 if OptionsInfo["CountHitsMode"]: 605 Line = HitsInfoOutFileDelim.join([MolIDColName, "MaxPossibleHits"]) 606 else: 607 Line = HitsInfoOutFileDelim.join([MolIDColName, "HitsCount", "MaxPossibleHits"]) 608 609 Writer.write("%s\n" % Line) 610 611 return Writer 612 613 def WriteHitsInfo(Writer, HitsInfo): 614 """Write hits info.""" 615 616 HitsInfoWords = ["%s" % HitInfo for HitInfo in HitsInfo] 617 618 HitsInfoOutFileDelim = OptionsInfo["HitsInfoOutFileDelim"] 619 Line = HitsInfoOutFileDelim.join(HitsInfoWords) 620 621 Writer.write("%s\n" % Line) 622 623 def ProcessFingerprintsParameters(): 624 """Set up and process fingerprints parameters.""" 625 626 SetupFingerprintsNamesAndParameters() 627 628 ProcessSpecifiedFingerprintsName() 629 ProcessSpecifiedFingerprintsParameters() 630 631 def SetupFingerprintsNamesAndParameters(): 632 """Set up fingerprints parameters.""" 633 634 OptionsInfo["FingerprintsNames"] = ["AtomPairs", "Morgan", "MorganFeatures", "PathLength", "TopologicalTorsions"] 635 636 OptionsInfo["FingerprintsParamsInfo"] = {} 637 OptionsInfo["FingerprintsParamsInfo"]["AtomPairs"] = {"MinLength": 1, "MaxLength": 30, "UseChirality": False, "Use2D": True, "FPSize": 2048} 638 OptionsInfo["FingerprintsParamsInfo"]["Morgan"] = {"Radius": 2, "UseChirality": False, "UseBondTypes": True, "UseRingMembership": True, "FPSize": 2048} 639 OptionsInfo["FingerprintsParamsInfo"]["MorganFeatures"] = {"Radius": 2, "UseChirality": False, "UseBondTypes": True, "UseRingMembership": True, "FPSize": 2048} 640 OptionsInfo["FingerprintsParamsInfo"]["PathLength"] = {"MinPath": 1, "MaxPath": 7, "UseExplicitHs": True, "UseBranchedPaths": True, "UseBondOrder": True, "FPSize": 2048, "BitsPerHash": 2} 641 OptionsInfo["FingerprintsParamsInfo"]["TopologicalTorsions"] = {"UseChirality": False, "FPSize": 2048} 642 643 def ProcessSpecifiedFingerprintsName(): 644 """Process specified fingerprints name.""" 645 646 # Set up a canonical fingerprints name map... 647 CanonicalFingerprintsNamesMap = {} 648 for Name in OptionsInfo["FingerprintsNames"]: 649 CanonicalName = Name.lower() 650 CanonicalFingerprintsNamesMap[CanonicalName] = Name 651 652 # Validate specified fingerprints name... 653 CanonicalFingerprintsName = OptionsInfo["Fingerprints"].lower() 654 if CanonicalFingerprintsName not in CanonicalFingerprintsNamesMap: 655 MiscUtil.PrintError("The fingerprints name, %s, specified using \"-f, --fingerprints\" option is not a valid name." % (OptionsInfo["Fingerprints"])) 656 657 OptionsInfo["SpecifiedFingerprints"] = CanonicalFingerprintsNamesMap[CanonicalFingerprintsName] 658 659 def ProcessSpecifiedFingerprintsParameters(): 660 """Process specified fingerprints parameters.""" 661 662 if re.match("^auto$", OptionsInfo["FingerprintsParams"], re.I): 663 # Nothing to process... 664 return 665 666 SpecifiedFingerprintsName = OptionsInfo["SpecifiedFingerprints"] 667 668 # Parse specified fingerprints parameters... 669 FingerprintsParams = re.sub(" ", "", OptionsInfo["FingerprintsParams"]) 670 if not FingerprintsParams: 671 MiscUtil.PrintError("No valid parameter name and value pairs specified using \"--fingerprintsParams\" option corrresponding to fingerprints %s." % (SpecifiedFingerprintsName)) 672 673 FingerprintsParamsWords = FingerprintsParams.split(",") 674 if len(FingerprintsParamsWords) % 2: 675 MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"--fingerprintsParams\" option must be an even number." % (len(FingerprintsParamsWords))) 676 677 # Setup canonical parameter names for specified fingerprints... 678 ValidParamNames = [] 679 CanonicalParamNamesMap = {} 680 for ParamName in sorted(OptionsInfo["FingerprintsParamsInfo"][SpecifiedFingerprintsName]): 681 ValidParamNames.append(ParamName) 682 CanonicalParamNamesMap[ParamName.lower()] = ParamName 683 684 # Validate and set paramater names and value... 685 for Index in range(0, len(FingerprintsParamsWords), 2): 686 Name = FingerprintsParamsWords[Index] 687 Value = FingerprintsParamsWords[Index + 1] 688 689 CanonicalName = Name.lower() 690 if not CanonicalName in CanonicalParamNamesMap: 691 MiscUtil.PrintError("The parameter name, %s, specified using \"--fingerprintsParams\" option for fingerprints, %s, is not a valid name. Supported parameter names: %s" % (Name, SpecifiedFingerprintsName, " ".join(ValidParamNames))) 692 693 ParamName = CanonicalParamNamesMap[CanonicalName] 694 if re.match("^(UseChirality|Use2D|UseBondTypes|UseRingMembership|UseExplicitHs|UseBranchedPaths|UseBondOrder)$", ParamName, re.I): 695 if not re.match("^(Yes|No|True|False)$", Value, re.I): 696 MiscUtil.PrintError("The parameter value, %s, specified using \"--fingerprintsParams\" option for fingerprints, %s, is not a valid value. Supported values: Yes No True False" % (Value, SpecifiedFingerprintsName)) 697 ParamValue = False 698 if re.match("^(Yes|True)$", Value, re.I): 699 ParamValue = True 700 else: 701 ParamValue = int(Value) 702 if ParamValue <= 0: 703 MiscUtil.PrintError("The parameter value, %s, specified using \"--fingerprintsParams\" option for fingerprints, %s, is not a valid value. Supported values: > 0" % (Value, SpecifiedFingerprintsName)) 704 705 # Set value... 706 OptionsInfo["FingerprintsParamsInfo"][SpecifiedFingerprintsName][ParamName] = ParamValue 707 708 def ProcessOutfileParameters(): 709 """Process outfile related parameters""" 710 711 Mode = OptionsInfo["Mode"] 712 713 OptionsInfo["Outfile"] = Options["--outfile"] 714 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 715 716 # OutfileMode is only used for similarity and substructure search... 717 OptionsInfo["OutFileMode"] = Options["--outfileMode"] 718 SingleOutFileMode = True 719 if not re.match("^SingleFile$", Options["--outfileMode"], re.I): 720 SingleOutFileMode = False 721 OptionsInfo["SingleOutFileMode"] = SingleOutFileMode 722 723 FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"]) 724 OptionsInfo["OutFileRoot"] = FileName 725 OptionsInfo["OutFileExt"] = FileExt 726 727 OutFileSuffix = "" 728 if re.match("^SubstructureSearch$", Mode, re.I): 729 OutFileSuffix = "Pattern" 730 elif re.match("^SimilaritySearch$", Mode, re.I): 731 OutFileSuffix = "Mol" 732 OptionsInfo["OutFileSuffix"] = OutFileSuffix 733 734 OptionsInfo["HitsInfoOutFile"] = "%s_HitCount.csv" % OptionsInfo["OutFileRoot"] 735 OptionsInfo["HitsInfoOutFileDelim"] = "," 736 737 TextOutFileMode, TextOutFileDelim, TextOutFileTitleLine = [None] * 3 738 if re.match("^(SimilaritySearch|SubstructureSearch)$", Mode, re.I): 739 TextOutFileMode = False 740 TextOutFileDelim = "" 741 TextOutFileTitleLine = True 742 743 if MiscUtil.CheckFileExt(Options["--outfile"], "csv"): 744 TextOutFileMode = True 745 TextOutFileDelim = "," 746 elif MiscUtil.CheckFileExt(Options["--outfile"], "tsv txt"): 747 TextOutFileMode = True 748 TextOutFileDelim = "\t" 749 elif MiscUtil.CheckFileExt(Options["--outfile"], "smi"): 750 TextOutFileMode = True 751 TextOutFileDelim = OptionsInfo["OutfileParams"]["SMILESDelimiter"] 752 TextOutFileTitleLine = OptionsInfo["OutfileParams"]["SMILESTitleLine"] 753 754 OptionsInfo["TextOutFileMode"] = TextOutFileMode 755 OptionsInfo["TextOutFileDelim"] = TextOutFileDelim 756 OptionsInfo["TextOutFileTitleLine"] = TextOutFileTitleLine 757 758 if not OptionsInfo["SingleOutFileMode"]: 759 FilesSpec = "%s_%s*.%s" % (OptionsInfo["OutFileRoot"], OptionsInfo["OutFileSuffix"], OptionsInfo["OutFileExt"]) 760 FileNames = MiscUtil.ExpandFileNames(FilesSpec) 761 if len(FileNames): 762 if not Options["--overwrite"]: 763 MiscUtil.PrintError("The output files, %s, corresponding to output file specified, %s, for option \"-o, --outfile\" already exist. Use option \"--ov\" or \"--overwrite\" and try again." % (FilesSpec, OptionsInfo["Outfile"])) 764 765 def ProcessRascalSearchParametersOption(): 766 """Process option for RASCAL similarity search.""" 767 768 ParamsOptionName = "--rascalSearchParams" 769 ParamsOptionValue = Options[ParamsOptionName] 770 771 ParamsDefaultInfo = { "AllBestMCESs": ["bool", False], "CompleteAromaticRings": ["bool", True], "CompleteSmallestRings": ["bool", False], "ExactConnectionsMatch": ["bool", False], "IgnoreAtomAromaticity": ["bool", True], "IgnoreBondOrders": ["bool", False], "MaxBondMatchPairs": ["int", 1000], "MaxFragSeparation": ["int", -1], "MinCliqueSize": ["int", 0], "MinFragSize": ["int", -1], "ReturnEmptyMCES": ["bool", False], "RingMatchesRingOnly": ["bool", False], "SimilarityThreshold": ["float", 0.7], "SingleLargestFrag": ["bool", False], "Timeout": ["int", 60]} 772 773 # Update default values to match RDKit default values... 774 RDKitRascalSearchParams = rdRascalMCES.RascalOptions() 775 for ParamName in ParamsDefaultInfo.keys(): 776 RDKitParamName = LowercaseFirstLetter(ParamName) 777 if hasattr(RDKitRascalSearchParams, RDKitParamName): 778 RDKitParamValue = getattr(RDKitRascalSearchParams, RDKitParamName) 779 ParamsDefaultInfo[ParamName][1] = RDKitParamValue 780 else: 781 MiscUtil.PrintWarning("The RASCAL search parameter, %s, is not available in RDKit. Ignoring parameter..." % ParamName) 782 783 RascalSearchParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo) 784 785 for ParamName in ["MaxBondMatchPairs"]: 786 ParamValue = RascalSearchParams[ParamName] 787 if ParamValue <= 0: 788 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName)) 789 790 for ParamName in ["MinCliqueSize", "SimilarityThreshold"]: 791 ParamValue = RascalSearchParams[ParamName] 792 if ParamValue < 0: 793 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: >= 0\n" % (ParamValue, ParamName, ParamsOptionName)) 794 if re.match("^SimilarityThreshold$", ParamName, re.I): 795 if ParamValue > 1: 796 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: <= 1\n" % (ParamValue, ParamName, ParamsOptionName)) 797 798 for ParamName in ["MaxFragSeparation", "MinFragSize", "Timeout"]: 799 ParamValue = RascalSearchParams[ParamName] 800 if not (ParamValue == -1 or ParamValue > 0): 801 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: -1 or > 0\n" % (ParamValue, ParamName, ParamsOptionName)) 802 803 804 # Setup RDKit object for RASCAL match parameters... 805 RDKitRascalSearchParams = rdRascalMCES.RascalOptions() 806 for ParamName in RascalSearchParams.keys(): 807 ParamValue = RascalSearchParams[ParamName] 808 809 # Convert first letter to lower case for RDKit param name and set its value... 810 RDKitParamName = LowercaseFirstLetter(ParamName) 811 if hasattr(RDKitRascalSearchParams, RDKitParamName): 812 setattr(RDKitRascalSearchParams, RDKitParamName, ParamValue) 813 else: 814 MiscUtil.PrintWarning("The RASCAL searh parameter, %s, is not available in RDKit. Ignoring parameter..." % ParamName) 815 816 OptionsInfo["RascalSearchParams"] = RascalSearchParams 817 OptionsInfo["RDKitRascalSearchParams"] = RDKitRascalSearchParams 818 819 RDKitParamInfo = {} 820 for ParamName in RascalSearchParams.keys(): 821 RDKitParamName = LowercaseFirstLetter(ParamName) 822 RDKitParamValue = getattr(RDKitRascalSearchParams, RDKitParamName) 823 RDKitParamInfo[RDKitParamName] = RDKitParamValue 824 825 def ProcessSubstructureMatchParametersOption(): 826 """Process option for substructure match parameters. """ 827 828 ParamsOptionName = "--substructureMatchParams" 829 ParamsOptionValue = Options[ParamsOptionName] 830 831 ParamsDefaultInfo = { "AromaticMatchesConjugated": ["bool", False], "MaxMatches": ["int", 1000], "MaxRecursiveMatches": ["int", 1000], "RecursionPossible": ["bool", True], "SpecifiedStereoQueryMatchesUnspecified": ["bool", False], "Uniquify": ["bool", True], "UseChirality": ["bool", False], "UseEnhancedStereo": ["bool", False], "UseGenericMatchers": ["bool", False]} 832 833 # Update default values to match RDKit default values... 834 RDKitSubstructureMatchParams = Chem.SubstructMatchParameters() 835 for ParamName in ParamsDefaultInfo.keys(): 836 RDKitParamName = LowercaseFirstLetter(ParamName) 837 if hasattr(RDKitSubstructureMatchParams, RDKitParamName): 838 RDKitParamValue = getattr(RDKitSubstructureMatchParams, RDKitParamName) 839 ParamsDefaultInfo[ParamName][1] = RDKitParamValue 840 else: 841 MiscUtil.PrintWarning("The substructure match parameter, %s, is not available in RDKit. Ignoring parameter..." % ParamName) 842 843 SubstructureMatchParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo) 844 845 for ParamName in ["MaxMatches", "MaxRecursiveMatches"]: 846 ParamValue = SubstructureMatchParams[ParamName] 847 if ParamValue <= 0: 848 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName)) 849 850 # Setup RDKit object for substructure match parameters... 851 RDKitSubstructureMatchParams = Chem.SubstructMatchParameters() 852 for ParamName in SubstructureMatchParams.keys(): 853 ParamValue = SubstructureMatchParams[ParamName] 854 855 # Convert first letter to lower case for RDKit param name and set its value... 856 RDKitParamName = LowercaseFirstLetter(ParamName) 857 if hasattr(RDKitSubstructureMatchParams, RDKitParamName): 858 setattr(RDKitSubstructureMatchParams, RDKitParamName, ParamValue) 859 else: 860 MiscUtil.PrintWarning("The substructure match parameter, %s, is not available in RDKit. Ignoring parameter..." % ParamName) 861 862 OptionsInfo["SubstructureMatchParams"] = SubstructureMatchParams 863 OptionsInfo["RDKitSubstructureMatchParams"] = RDKitSubstructureMatchParams 864 865 def ProcessSynthonSearchParamatersOption(): 866 """Process option for synthon search parameters. """ 867 868 ParamsOptionName = "--synthonSearchParams" 869 ParamsOptionValue = Options[ParamsOptionName] 870 871 ParamsDefaultInfo = {"ApproxSimilarityAdjuster": ["float", 0.1], "BuildHits": ["bool", True], "FragSimilarityAdjuster": ["float", 0.1], "HitStart": ["int", 0], "MaxHits": ["int", 1000], "MaxNumFrags": ["int", 100000], "NumThreads": ["int", 1], "RandomSample": ["bool", False], "RandomSeed": ["int", -1], "SimilarityCutoff": ["float", 0.5], "TimeOut": ["int", 600]} 872 873 # Update default values to match RDKit default values... 874 RDKitSynthonSearchParams = rdSynthonSpaceSearch.SynthonSpaceSearchParams() 875 for ParamName in ParamsDefaultInfo.keys(): 876 RDKitParamName = LowercaseFirstLetter(ParamName) 877 if hasattr(RDKitSynthonSearchParams, RDKitParamName): 878 RDKitParamValue = getattr(RDKitSynthonSearchParams, RDKitParamName) 879 ParamsDefaultInfo[ParamName][1] = RDKitParamValue 880 else: 881 MiscUtil.PrintWarning("The synthon space search paramater, %s, is not available in RDKit. Ignoring parameter..." % ParamName) 882 883 SynthonSearchParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo) 884 885 for ParamName in ["ApproxSimilarityAdjuster", "FragSimilarityAdjuster", "SimilarityCutoff", "HitStart"]: 886 ParamValue = SynthonSearchParams[ParamName] 887 if ParamValue < 0: 888 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: >= 0\n" % (ParamValue, ParamName, ParamsOptionName)) 889 if re.match("^SimilarityCutoff$", ParamName, re.I): 890 if ParamValue > 1: 891 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: <= 1\n" % (ParamValue, ParamName, ParamsOptionName)) 892 893 for ParamName in ["MaxNumFrags", "TimeOut"]: 894 ParamValue = SynthonSearchParams[ParamName] 895 if ParamValue <= 0: 896 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName)) 897 898 for ParamName in ["MaxHits", "RandomSeed"]: 899 ParamValue = SynthonSearchParams[ParamName] 900 if not (ParamValue == -1 or ParamValue > 0): 901 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: -1 or > 0\n" % (ParamValue, ParamName, ParamsOptionName)) 902 903 ParamName = "NumThreads" 904 ParamValue = SynthonSearchParams[ParamName] 905 if ParamValue > 0: 906 if ParamValue > mp.cpu_count(): 907 MiscUtil.PrintWarning("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is greater than number of CPUs, %s, returned by mp.cpu_count()." % (ParamValue, ParamName, ParamsOptionName, mp.cpu_count())) 908 elif ParamValue < 0: 909 if abs(ParamValue) > mp.cpu_count(): 910 MiscUtil.PrintWarning("The absolute parameter value, %s, specified for parameter name, %s, using \"%s\" option is greater than number of CPUs, %s, returned by mp.cpu_count()." % (abs(ParamValue), ParamName, ParamsOptionName, mp.cpu_count())) 911 912 # Setup RDKit object for synthon space search parameters... 913 RDKitSynthonSearchParams = rdSynthonSpaceSearch.SynthonSpaceSearchParams() 914 for ParamName in SynthonSearchParams.keys(): 915 ParamValue = SynthonSearchParams[ParamName] 916 917 # Convert first letter to lower case for RDKit param name and set its value... 918 RDKitParamName = LowercaseFirstLetter(ParamName) 919 if hasattr(RDKitSynthonSearchParams, RDKitParamName): 920 setattr(RDKitSynthonSearchParams, RDKitParamName, ParamValue) 921 else: 922 MiscUtil.PrintWarning("The synthon space search paramater, %s, is not available in RDKit. Ignoring parameter..." % ParamName) 923 924 OptionsInfo["CountHitsMode"] = False if SynthonSearchParams["BuildHits"] else True 925 926 OptionsInfo["SynthonSearchParams"] = SynthonSearchParams 927 OptionsInfo["RDKitSynthonSearchParams"] = RDKitSynthonSearchParams 928 929 def LowercaseFirstLetter(Text): 930 """Convert first letter of a string to lowercase. """ 931 932 if Text is None or len(Text) == 0: 933 return Text 934 935 return Text[0].lower() + Text[1:] 936 937 def ProcessQueryPatternOption(): 938 """Process query pattern option. """ 939 940 QueryPattern = None if re.match("^None$", Options["--queryPattern"], re.I) else Options["--queryPattern"] 941 QueryPatternMols = None 942 943 if QueryPattern is not None: 944 QueryPatternMols = [] 945 Patterns = QueryPattern.split() 946 for Pattern in Patterns: 947 PatternMol = Chem.MolFromSmarts(Pattern) 948 if PatternMol is None: 949 MiscUtil.PrintError("The value specified, %s, using option \"--queryPattern\" is not a valid SMARTS: Failed to create pattern molecule" % (Pattern)) 950 QueryPatternMols.append(PatternMol) 951 952 OptionsInfo["QueryPattern"] = QueryPattern 953 OptionsInfo["QueryPatternMols"] = QueryPatternMols 954 955 def ProcessOptions(): 956 """Process and validate command line arguments and options.""" 957 958 MiscUtil.PrintInfo("Processing options...") 959 960 # Validate options... 961 ValidateOptions() 962 963 OptionsInfo["Mode"] = Options["--mode"] 964 OptionsInfo["RascalSimilaritySearchMode"] = True if re.match("^RASCALSimilaritySearch$", Options["--mode"], re.I) else False 965 OptionsInfo["SimilaritySearchMode"] = True if re.match("^SimilaritySearch$", Options["--mode"], re.I) else False 966 OptionsInfo["SubstructureSearchMode"] = True if re.match("^SubstructureSearch$", Options["--mode"], re.I) else False 967 968 OptionsInfo["Fingerprints"] = Options["--fingerprints"] 969 970 OptionsInfo["FingerprintsParams"] = Options["--fingerprintsParams"] 971 ProcessFingerprintsParameters() 972 973 OptionsInfo["Infile"] = Options["--infile"] 974 975 ProcessOutfileParameters() 976 977 OptionsInfo["Overwrite"] = Options["--overwrite"] 978 979 ProcessQueryPatternOption() 980 981 OptionsInfo["QueryFile"] = None if re.match("^none$", Options["--queryFile"]) else Options["--queryFile"] 982 if OptionsInfo["QueryFile"] is None: 983 OptionsInfo["QueryFileParams"] = None 984 else: 985 OptionsInfo["QueryFileParams"] = MiscUtil.ProcessOptionInfileParameters("--queryFileParams", Options["--queryFileParams"], Options["--queryFile"]) 986 987 ProcessRascalSearchParametersOption() 988 989 ProcessSubstructureMatchParametersOption() 990 ProcessSynthonSearchParamatersOption() 991 992 OptionsInfo["Overwrite"] = Options["--overwrite"] 993 994 def RetrieveOptions(): 995 """Retrieve command line arguments and options.""" 996 997 # Get options... 998 global Options 999 Options = docopt(_docoptUsage_) 1000 1001 # Set current working directory to the specified directory... 1002 WorkingDir = Options["--workingdir"] 1003 if WorkingDir: 1004 os.chdir(WorkingDir) 1005 1006 # Handle examples option... 1007 if "--examples" in Options and Options["--examples"]: 1008 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 1009 sys.exit(0) 1010 1011 def ValidateOptions(): 1012 """Validate option values.""" 1013 1014 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "FingerprintsGeneration BinaryDBFileGeneration LibraryEnumeration RASCALSimilaritySearch SimilaritySearch SubstructureSearch") 1015 1016 MiscUtil.ValidateOptionTextValue("-f, --fingerprints", Options["--fingerprints"], "AtomPairs Morgan MorganFeatures PathLength TopologicalTorsions") 1017 1018 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 1019 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "txt csv spc") 1020 1021 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi csv tsv txt spc") 1022 if re.match("^SingleFile$", Options["--outfileMode"], re.I): 1023 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 1024 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 1025 1026 if re.match("^(FingerprintsGeneration|BinaryDBFileGeneration)$", Options["--mode"], re.I): 1027 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "spc") 1028 if not MiscUtil.CheckFileExt(Options["--outfile"], "spc"): 1029 MiscUtil.PrintError("The file name specified , %s, for option \"--outfile\" is not valid during, %s, value of \"--mode\" option. Supported file formats: spc\n" % (Options["--outfile"], Options["--mode"])) 1030 elif re.match("^LibraryEnumeration$", Options["--mode"], re.I): 1031 if not MiscUtil.CheckFileExt(Options["--outfile"], "smi"): 1032 MiscUtil.PrintError("The file name specified , %s, for option \"--outfile\" is not valid during, %s, value of \"--mode\" option. Supported file formats: smi\n" % (Options["--outfile"], Options["--mode"])) 1033 elif re.match("^(RASCALSimilaritySearch|SimilaritySearch|SubstructureSearch)$", Options["--mode"], re.I): 1034 if not MiscUtil.CheckFileExt(Options["--outfile"], "sdf sd smi csv tsv txt"): 1035 MiscUtil.PrintError("The file name specified , %s, for option \"--outfile\" is not valid during, %s, value of \"--mode\" option. Supported file formats: sdf sd smi csv tsv txt\n" % (Options["--outfile"], Options["--mode"])) 1036 1037 MiscUtil.ValidateOptionTextValue("--outfileMode", Options["--outfileMode"], "SingleFile or MultipleFiles") 1038 1039 QueryPattern = Options["--queryPattern"] 1040 if re.match("^SubstructureSearch$", Options["--mode"], re.I): 1041 if re.match("^None$", QueryPattern, re.I): 1042 MiscUtil.PrintError("You must specify a valid SMARTS pattern(s) for option \"--queryPattern\" during, SubstructureSearch, value of \"-m, --mode\" option.") 1043 1044 PatternMols = [] 1045 if not re.match("^None$", QueryPattern, re.I): 1046 Patterns = QueryPattern.split() 1047 for Pattern in Patterns: 1048 PatternMol = Chem.MolFromSmarts(Pattern) 1049 if PatternMol is None: 1050 MiscUtil.PrintError("The value specified, %s, using option \"--queryPattern\" is not a valid SMARTS: Failed to create pattern molecule" % (Pattern)) 1051 PatternMols.append(PatternMol) 1052 1053 if re.match("^SubstructureSearch$", Options["--mode"], re.I): 1054 if len(PatternMols) == 0: 1055 MiscUtil.PrintError("You must specify a valid SMARTS pattern(s) for option \"--queryPattern\" during, SubstructureSearch, value of \"-m, --mode\" option.") 1056 1057 if re.match("^(RASCALSimilaritySearch|SimilaritySearch)$", Options["--mode"], re.I): 1058 if re.match("^None$", Options["--queryFile"], re.I): 1059 MiscUtil.PrintError("You must specify a valid filename for option \"--queryFile\" during, SimilaritySearch, value of \"-m, --mode\" option.") 1060 1061 if not re.match("^None$", Options["--queryFile"], re.I): 1062 MiscUtil.ValidateOptionFilePath("--queryFile", Options["--queryFile"]) 1063 MiscUtil.ValidateOptionFileExt("--queryFile", Options["--queryFile"], "sdf sd smi csv tsv") 1064 1065 # Setup a usage string for docopt... 1066 _docoptUsage_ = """ 1067 RDKitPerformSynthonSpaceSearch.py - Perform a synthon space search 1068 1069 Usage: 1070 RDKitPerformSynthonSpaceSearch.py [--fingerprints <Morgan, PathLength...>] [--fingerprintsParams <Name,Value,...>] 1071 [--mode <SubstructureSearch...>] [ --outfileParams <Name,Value,...>] [--outfileMode <SingleFile or MultipleFiles>] 1072 [--overwrite] [--queryPattern <SMARTS>] [--queryFileParams <Name,Value,...>] [--queryFile <filename>] 1073 [--rascalSearchParams <Name,Value,...>] [--substructureMatchParams <Name,Value,...>] 1074 [--synthonSearchParams <Name,Value,...>] [-w <dir>] -i <infile> -o <outfile> 1075 RDKitPerformSynthonSpaceSearch.py -l | --list -i <infile> 1076 RDKitPerformSynthonSpaceSearch.py -h | --help | -e | --examples 1077 1078 Description: 1079 Perform a similarity or substructure search, using query molecules or SMARTS 1080 patterns, against a synthon space [ Ref 174 ] in an input file, and write out the 1081 hit molecules to output file(s). You may optionally count the hits without 1082 building and writing them out. 1083 1084 In addition, you may enumerate a combinatorial library corresponding to a 1085 synthon space, generate fingerprints for a synthon space, or list information 1086 about a synthon space. 1087 1088 You must provide a valid synthon space text or binary database file supported 1089 by RDKit module rdSynthonSpaceSearch. 1090 1091 You may perform similarity search using fingerprints or employ RASCAL (RApid 1092 Similarity CALculations using Maximum Edge Subgrahps) methodology [ Ref 175 ]. 1093 1094 A number of fingerprints are available for performing similarity search. The 1095 similarity metric, however, is calculated using Tanimoto similarity on hashed 1096 fingerprints. 1097 1098 The RASCAL similarity between two molecuels is calculated based on MCES 1099 (Maximum Common Edge Subgraphs) and corresponds to Johnson similarity. 1100 1101 The supported input file formats are: CSV/TXT synthon space (.csv, .txt) or 1102 binary synthon space (.spc). 1103 1104 The supported outfile formats, for different '--mode' values, are shown 1105 below: 1106 1107 BinaryDBFileGeneration: Binary database file (.spc) 1108 FingerprintsGeneration: Binary database file (.spc) 1109 LibraryEnumeration: SMILES (.smi) 1110 SimilaritySearch or SubstructureSearch: SD (.sdf, .sd), SMILES (.smi), 1111 CSV/TSV (.csv or .tsv) 1112 1113 Possible output files: 1114 1115 <OutfileRoot>.<sdf,sd,smi,csv,tsv> 1116 1117 <OutfileRoot>_Mol<Num>.<sdf,sd,smi,csv,tsv> 1118 <OutfileRoot>_Pattern<Num>.<sdf,sd,smi,csv,tsv> 1119 1120 <OutfileRoot>_HitCount.csv 1121 1122 The <OutfileRoot>_HitCount.csv contains aditional information regarding hit 1123 counts and is writter out for both similarity and substructure search. 1124 1125 Options: 1126 -f, --fingerprints <Morgan, PathLength...> [default: Morgan] 1127 Fingerprints to use for performing synthon space similarity search. 1128 Supported values: AtomPairs, Morgan, MorganFeatures, PathLength, 1129 TopologicalTorsions. The PathLength fingerprints are Daylight like 1130 fingerprints. The Morgan and MorganFeature fingerprints are circular 1131 fingerprints, corresponding Scitegic's Extended Connectivity Fingerprints 1132 (ECFP) and Features Connectivity Fingerprints (FCFP). The values of 1133 default parameters for generating fingerprints can be modified using 1134 '--fingerprintsParams' option. 1135 --fingerprintsParams <Name,Value,...> [default: auto] 1136 Parameter values to use for generating fingerprints. The default values 1137 are dependent on the value of '-f, --fingerprints' option. In general, it is a 1138 comma delimited list of parameter name and value pairs for the name of 1139 fingerprints specified using '-f, --fingerprints' option. The supported 1140 parameter names along with their default values for valid fingerprints 1141 names are shown below: 1142 1143 AtomPairs: minLength,1 ,maxLength,useChirality,No, 1144 use2D, yes, fpSize, 2048 1145 Morgan: radius,2, useChirality,No, useBondTypes, yes, 1146 useRingMembership, yes, fpSize, 2048 1147 MorganFeatures: radius,2, useChirality,No, useBondTypes, yes, 1148 useRingMembership, yes, fpSize, 2048 1149 PathLength: minPath,1, maxPath,7, useExplicitHs, yes, 1150 useBranchedPaths, yes,useBondOrder,yes, fpSize, 2048, 1151 bitsPerHash,2 1152 TopologicalTorsions: useChirality,No, fpSize, 2048 1153 1154 A brief description of parameters, taken from RDKit documentation, is 1155 provided below: 1156 1157 AtomPairs: 1158 1159 minLength: Minimum distance between atoms. 1160 maxLength: Maximum distance between atoms. 1161 useChirality: Use chirality for atom invariants. 1162 use2D: Use topological distance matrix. 1163 fpSize: Size of the fingerpints bit vector. 1164 1165 Morgan and MorganFeatures: 1166 1167 radius: Neighborhood radius. 1168 useChirality: Use chirality to generate fingerprints. 1169 useBondTypes: Use bond type for the bond invariants. 1170 useRingMembership: Use ring membership. 1171 fpSize: Size of the fingerpints bit vector. 1172 1173 PathLength: 1174 1175 minPath: Minimum bond path length. 1176 maxPath: Maximum bond path length. 1177 useExplicitHs: Use explicit hydrogens. 1178 useBranchedPaths: Use branched paths along with linear paths. 1179 useBondOrder: Us bond order in the path hashes. 1180 fpSize: Size of the fingerpints bit vector. 1181 bitsPerHash: Number of bits set per path. 1182 1183 TopologicalTorsions 1184 1185 useChirality: Use chirality to generate fingerprints. 1186 fpSize: Size of the fingerpints bit vector. 1187 1188 -e, --examples 1189 Print examples. 1190 -h, --help 1191 Print this help message. 1192 -i, --infile <infile> 1193 Synthon space Input file name. 1194 -l, --list 1195 List information about synthon space. 1196 -m, --mode <SubstructureSearch...> [default: SimilaritySearch] 1197 Perform similarity or substructure search, enumerate synthon space, 1198 or list information about a synthon space. The supported values along 1199 with a brief explanation of the expected behavior are shown below: 1200 1201 BinaryDBFileGeneration: Write out a binary database file for a 1202 synthon space. 1203 FingerprintsGeneration: Generate fingerints for a synthon space and 1204 write out a binary database file along with fingerprints. 1205 LibraryEnumeration: Enumerate a combinatorial library for a synthon 1206 space and write out a SMILES file. 1207 RASCALSimilaritySearch: Perform a RASCAL (RApid Similarity 1208 CALculations using Maximum Edge Subgrahps) similarity search. 1209 SimilaritySearch: Perform a similarity search using fingerprints. 1210 SubstructureSearch: Perform a substructure search using specified 1211 SMARTS patterns. 1212 1213 -o, --outfile <outfile> 1214 Output file name. The <OutfileRoot> and <OutfileExt> are used to generate 1215 file names during 'MultipleFiles' value for '--outfileMode' option. 1216 --outfileMode <SingleFile or MultipleFiles> [default: SingleFile] 1217 Write out a single file containing hit molecules for substructure or 1218 similarity search or generate an individual file for each query pattern 1219 or molecule. Possible values: SingleFile or MultipleFiles. The query 1220 pattern number or molecule name is written to output file(s). The query 1221 pattern or molecule number is also appended to output file names during 1222 the generation of multiple output files. 1223 --outfileParams <Name,Value,...> [default: auto] 1224 A comma delimited list of parameter name and value pairs for writing 1225 molecules to files during similarity and substructue search. The supported 1226 parameter names for different file formats, along with their default values, 1227 are shown below: 1228 1229 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 1230 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 1231 smilesTitleLine,yes 1232 1233 Default value for compute2DCoords: yes for SMILES input file; no for all other 1234 file types. The kekulize and smilesIsomeric parameters are also used during 1235 generation of SMILES strings for CSV/TSV files. 1236 --queryPattern <SMARTS SMARTS ...> [default: none] 1237 A space delimited list of SMARTS patterns for performing substructure 1238 search. This is required for 'SubstructureSearch' value of '--mode' option. 1239 --queryFile <filename> [default: none] 1240 Input file containing query molecules for performing similarity search. This 1241 is required for 'SimilaritySearch' value of '--mode' option. 1242 --queryFileParams <Name,Value,...> [default: auto] 1243 A comma delimited list of parameter name and value pairs for reading 1244 molecules from query files during similarity search. The supported 1245 parameter names for different file formats, along with their default 1246 values, are shown below: 1247 1248 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 1249 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 1250 smilesTitleLine,auto,sanitize,yes 1251 1252 Possible values for smilesDelimiter: space, comma or tab. 1253 --rascalSearchParams <Name,Value,...> [default: auto] 1254 Parameter values to use for RASCAL similarity search. 1255 1256 The default values are automatically updated to match RDKit default values. 1257 The supported parameter names along with their default values are 1258 are shown below: 1259 1260 allBestMCESs, no, completeAromaticRings, yes, 1261 completeSmallestRings, no, exactConnectionsMatch, no, 1262 ignoreAtomAromaticity, yes, ignoreBondOrders, no, 1263 maxBondMatchPairs, 1000, maxFragSeparation, -1, minCliqueSize, 0, 1264 minFragSize, -1, returnEmptyMCES, false, ringMatchesRingOnly, false, 1265 similarityThreshold, 0.7, singleLargestFrag, no, 1266 timeout, 60 1267 1268 A brief description of parameters, taken from RDKit documentation, is 1269 provided below: 1270 1271 allBestMCESs: Find all Maximum Common Edge Subgraphs (MCES). 1272 completeAromaticRings: Use only complete aromatic rings. 1273 completeSmallestRings: Only complete rings present in both 1274 molecules. 1275 exactConnectionsMatch: Match atoms only when they have the same 1276 number of explicit connections. 1277 ignoreAtomAromaticity: Ignore aromaticity during atom matching. 1278 ignoreBondOrders: Ignore bond orders during atom matching. 1279 maxBondMatchPairs: Maximum number of matching bond pairs. 1280 maxFragSeparation: Maximum bond distance that bonds can match. 1281 value of -1 implies no maximum. 1282 minCliqueSize: A value of > 0 overrides the similarityThreshold. 1283 This refers to the minimum number of bonds in the MCES. 1284 minFragSize: Minimum number of atoms in a fragment. A value of -1 1285 implies no minimum. 1286 returnEmptyMCES: Return empty MCES results. 1287 ringMatchesRingOnly: Match ring bonds to only ring bonds. 1288 similarityThreshold: Similarity threshold for matching and 1289 evaluating MCES. 1290 singleLargestFrag: Find only a single fragment for the MCES. By 1291 default, multiple fragments are generated as necessary. 1292 timeout: Max run time in seconds. A value of -1 implies no max. 1293 1294 --substructureMatchParams <Name,Value,...> [default: auto] 1295 Parameter values to use for substructure match during synthon substructure 1296 search. 1297 1298 The default values are automatically updated to match RDKit default values. 1299 The supported parameter names along with their default values are 1300 are shown below: 1301 1302 aromaticMatchesConjugated, no, maxMatches, 1000, 1303 maxRecursiveMatches, 1000, recursionPossible, yes, 1304 specifiedStereoQueryMatchesUnspecified, no, uniquify, yes, 1305 useChirality, no, useEnhancedStereo, no, useGenericMatchers, no, 1306 1307 A brief description of parameters, taken from RDKit documentation, is 1308 provided below: 1309 1310 aromaticMatchesConjugated: Match aromatic and conjugated bonds. 1311 maxMatches: Maximum number of matches. 1312 maxRecursiveMatches: Maximum number of recursive matches. 1313 recursionPossible: Allow recursive queries. 1314 specifiedStereoQueryMatchesUnspecified: Match query atoms and bonds 1315 with specified stereochemistry to atoms and bonds with unspecified 1316 stereochemistry. 1317 uniquify: Uniquify match results using atom indices. 1318 useChirality: Use chirality to match atom and bonds. 1319 useEnhancedStereo: Use enhanced stereochemistry during the use 1320 of chirality. 1321 useGenericMatchers: Use generic groups as a post-filtering step. 1322 1323 --synthonSearchParams <Name,Value,...> [default: auto] 1324 Parameter values to use for performing synthon substructure and similarity 1325 search. 1326 1327 The default values are automatically updated to match RDKit default values. 1328 The supported parameter names along with their default values are 1329 are shown below: 1330 1331 approxSimilarityAdjuster, 0.1, [ Default value for Morgan FPs ] 1332 buildHits, yes, fragSimilarityAdjuster, 0.1, hitStart, 0, 1333 maxHits, 1000, [ A value of -1 retrives all hits ] 1334 maxNumFrags, 100000, 1335 numThreads, 1 [ 0: Use maximum number of threads supported by the 1336 hardware; Negative value: Added to the maxiumum number of 1337 threads supported by the hardware ] 1338 randomSample, no, 1339 randomSeed, -1 [ Default value implies use random seed ] 1340 similarityCutoff, 0.5, [ Default for Morgan FPs. Ignored during RASCAL 1341 similarity search; instead, RASCAL parameter similarityThreshold is 1342 used. ] 1343 timeOut, 600 [ Unit: sec. The RASCAL searches take longer and may 1344 need a higher value for timeOut. For example: 3600 ] 1345 1346 A brief description of parameters, taken from RDKit documentation, is 1347 provided below: 1348 1349 approxSimilarityAdjuster: Value used for reducing similarity cutoff 1350 during approximate similarity check for fingerprint search. A 1351 lower value leads to faster run times at the risk of missing 1352 some hits. 1353 buildHits: A no value implies to report the maximum number of hits a 1354 search could generate without returning any hits. 1355 fragSimilarityAdjuster: Value used for reducing fragment matching 1356 similarity cutoff to accommodate low bit densities for fragments. 1357 hitStart: Return hits starting from the specified sequence number 1358 to support retrieval of hits in batches. 1359 maxHits: Maximum number of hits to return. A value of -1 implies 1360 retrieve all hits. 1361 maxNumFrags: Maximum number of fragments for breaking a query. 1362 numThreads: Number of threads to use for search. A value of 0 1363 implies the use of all available hardware threads. A negative 1364 value is added to the number of available hardware threads to 1365 calculate number of threads to use. 1366 randomSample: Return a random sample of hits up to maxHits. 1367 randomSeed: Random number seed to use during search. A value of -1 1368 implies the use of a random seed. 1369 similarityCutoff: Similarity cutoff for returning hits by fingerprint 1370 similarity search. A default value of 0.5 is set for Morgan 1371 fingeprints. 1372 timeOut: Time limit for search, in seconds. A valus of 0 implies 1373 no timeout. 1374 1375 --overwrite 1376 Overwrite existing files. 1377 -w, --workingdir <dir> 1378 Location of working directory which defaults to the current directory. 1379 1380 Examples: 1381 To list information about a synthon space in a text file, type: 1382 1383 % RDKitPerformSynthonSpaceSearch.py --list -i SampleSynthonSpace.csv 1384 1385 To generate a binary database file for a synthon space in a text file, type: 1386 1387 % RDKitPerformSynthonSpaceSearch.py -m BinaryDBFileGeneration 1388 -i SampleSynthonSpace.csv -o SampleSynthonSpace.spc 1389 1390 To enumerate a combnatorial library for a synthon space in a text file and 1391 write out a SMILES file, type: 1392 1393 % RDKitPerformSynthonSpaceSearch.py -m LibraryEnumeration 1394 -i SampleSynthonSpace.csv -o SampleSynthonSpace_Library.smi 1395 1396 To generate Morgan fingerprints for a synthon space in a text file, employing 1397 radius of 2 and bit vector size of 2048, and write out a binary database file, 1398 type: 1399 1400 % RDKitPerformSynthonSpaceSearch.py -m FingerprintsGeneration 1401 -i SampleSynthonSpace.csv -o SampleSynthonSpace_MorganFPs.spc 1402 1403 To perform a similarity search using Morgan fingerprints for query molecules 1404 in an input file, against a binary data base file synthon space containing 1405 Morgan fingerprints, employing radius 2 and bit vector size of 2048, finding 1406 a maximum of 1000 hits for each query molecule, and write out a single output 1407 file containing hit molecules, type: 1408 1409 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch 1410 -i SampleSynthonSpace_MorganFPs.spc 1411 --queryFile SampleSynthonSpaceQuery.sdf 1412 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf 1413 1414 or only count hits without building hits and writing them to an output 1415 file: 1416 1417 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch 1418 -i SampleSynthonSpace_MorganFPs.spc 1419 --queryFile SampleSynthonSpaceQuery.sdf 1420 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf 1421 --synthonSearchParams "buildHits,No" 1422 1423 To run previous example for writing individual output files for each query 1424 molecule, type: 1425 1426 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch 1427 -i SampleSynthonSpace_MorganFPs.spc 1428 --queryFile SampleSynthonSpaceQuery.sdf 1429 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf 1430 --outfileMode MultipleFiles 1431 1432 To run previous example for retrieving all possible hits for query molecules 1433 and write out individual output files for each query molecules, type: 1434 1435 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch 1436 -i SampleSynthonSpace_MorganFPs.spc 1437 --queryFile SampleSynthonSpaceQuery.sdf 1438 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf 1439 --outfileMode MultipleFiles 1440 --synthonSearchParams "maxHits,-1" 1441 1442 To run the previous example using multi-threading employing all available 1443 threads on your machine, retrieve maximum of 1000 hits for each query 1444 molecule and generate various output files, type: 1445 1446 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch 1447 -i SampleSynthonSpace_MorganFPs.spc 1448 --queryFile SampleSynthonSpaceQuery.smi 1449 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.smi 1450 --outfileMode MultipleFiles 1451 --synthonSearchParams "maxHits, 1000, numThreads, 0" 1452 1453 To run the previous example using multi-threading employing all but one 1454 available threads on your machine, type: 1455 1456 % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch 1457 -i SampleSynthonSpace_MorganFPs.spc 1458 --queryFile SampleSynthonSpaceQuery.smi 1459 -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.smi 1460 --outfileMode MultipleFiles 1461 --synthonSearchParams "maxHits, 1000, numThreads, -1" 1462 1463 To perform a substructure search using query pattern SMARTS against a synthon 1464 space file, finding a maximum of 1000 hits for each query pattern and write out 1465 a single output file containing hit molecules, type: 1466 1467 % RDKitPerformSynthonSpaceSearch.py -m SubstructureSearch 1468 -i SampleSynthonSpace.spc 1469 --queryPattern "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1" 1470 -o SampleSynthonSpace_SubstructureSearchResults.sdf 1471 1472 % RDKitPerformSynthonSpaceSearch.py -m SubstructureSearch 1473 -i SampleSynthonSpace.csv 1474 --queryPattern 'c1c[n,s,o][n,s,o,c]c1C(=O)[$(N1CCCCC1),$(N1CCCC1)]' 1475 -o SampleSynthonSpace_SubstructureSearchResults.sdf 1476 1477 To run previous example for retrieving for writing out individual output files 1478 for each query molecules, type: 1479 1480 % RDKitPerformSynthonSpaceSearch.py -m SubstructureSearch 1481 -i SampleSynthonSpace.spc 1482 --queryPattern "CCN(C(=O)c1cc2cc(OC)ccc2nc1C)C1CCCN(C(=O)OC(C)(C)C)C1 1483 C=CCc1c(N[C@H](C)c2cccc(C)c2)ncnc1N(C)CCCC(=O)OC" 1484 -o SampleSynthonSpace_SubstructureSearchResults.sdf 1485 --outfileMode MultipleFiles 1486 1487 To perform RASCAL similarity search for query molecules in an input file, 1488 against a binary data base file synthon space, finding a maximum of 1000 hits 1489 for each query molecule, using multi-threadsing employing all available CPUs, 1490 timing out after 3600 seconds, and write out a single output file containing 1491 hit molecules, type: 1492 1493 % RDKitPerformSynthonSpaceSearch.py -m RASCALSimilaritySearch 1494 -i SampleSynthonSpace.spc 1495 --queryFile SampleSynthonSpaceQuery.sdf 1496 -o SampleSynthonSpace_RASCALSimilaritySearchResults.sdf 1497 --synthonSearchParams "maxHits, 1000, numThreads, 0, timeOut, 3600" 1498 1499 Author: 1500 Manish Sud(msud@san.rr.com) 1501 1502 Acknowledgments: 1503 Dave Cosgrove 1504 1505 See also: 1506 RDKitConvertFileFormat.py, RDKitPickDiverseMolecules.py, RDKitSearchFunctionalGroups.py, 1507 RDKitSearchSMARTS.py 1508 1509 Copyright: 1510 Copyright (C) 2025 Manish Sud. All rights reserved. 1511 1512 The functionality available in this script is implemented using RDKit, an 1513 open source toolkit for cheminformatics developed by Greg Landrum. 1514 1515 This file is part of MayaChemTools. 1516 1517 MayaChemTools is free software; you can redistribute it and/or modify it under 1518 the terms of the GNU Lesser General Public License as published by the Free 1519 Software Foundation; either version 3 of the License, or (at your option) any 1520 later version. 1521 1522 """ 1523 1524 if __name__ == "__main__": 1525 main()