MayaChemTools

    1 #!/bin/env python
    2 #
    3 # File: RDKitPerformSynthonSpaceSearch.py
    4 # Author: Manish Sud <msud@san.rr.com>
    5 #
    6 # Acknowledgment: Dave Cosgrove
    7 #
    8 # Copyright (C) 2026 Manish Sud. All rights reserved.
    9 #
   10 # The functionality available in this script is implemented using RDKit, an
   11 # open source toolkit for cheminformatics developed by Greg Landrum.
   12 #
   13 # This file is part of MayaChemTools.
   14 #
   15 # MayaChemTools is free software; you can redistribute it and/or modify it under
   16 # the terms of the GNU Lesser General Public License as published by the Free
   17 # Software Foundation; either version 3 of the License, or (at your option) any
   18 # later version.
   19 #
   20 # MayaChemTools is distributed in the hope that it will be useful, but without
   21 # any warranty; without even the implied warranty of merchantability of fitness
   22 # for a particular purpose.  See the GNU Lesser General Public License for more
   23 # details.
   24 #
   25 # You should have received a copy of the GNU Lesser General Public License
   26 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
   27 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
   28 # Boston, MA, 02111-1307, USA.
   29 #
   30 
   31 from __future__ import print_function
   32 
   33 import os
   34 import sys
   35 import time
   36 import re
   37 import multiprocessing as mp
   38 
   39 # RDKit imports...
   40 try:
   41     from rdkit import rdBase
   42     from rdkit import Chem
   43     from rdkit.Chem import AllChem
   44     from rdkit.Chem import rdSynthonSpaceSearch
   45     from rdkit.Chem import rdFingerprintGenerator
   46     from rdkit.Chem import rdRascalMCES
   47 except ImportError as ErrMsg:
   48     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
   49     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
   50     sys.exit(1)
   51 
   52 # MayaChemTools imports...
   53 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
   54 try:
   55     from docopt import docopt
   56     import MiscUtil
   57     import RDKitUtil
   58 except ImportError as ErrMsg:
   59     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
   60     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
   61     sys.exit(1)
   62 
   63 ScriptName = os.path.basename(sys.argv[0])
   64 Options = {}
   65 OptionsInfo = {}
   66 
   67 
   68 def main():
   69     """Start execution of the script."""
   70 
   71     MiscUtil.PrintInfo(
   72         "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
   73         % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
   74     )
   75 
   76     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
   77 
   78     # Retrieve command line arguments and options...
   79     RetrieveOptions()
   80 
   81     if Options and Options["--list"]:
   82         # Process list option...
   83         ProcessListSynthonSearchSpace()
   84     else:
   85         # Process and validate command line arguments and options...
   86         ProcessOptions()
   87 
   88         # Perform actions required by the script...
   89         PerformSynthonSpaceSearch()
   90 
   91     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
   92     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
   93 
   94 
   95 def PerformSynthonSpaceSearch():
   96     """Perform synthon space search."""
   97 
   98     Mode = OptionsInfo["Mode"]
   99     if re.match("^FingerprintsGeneration$", Mode, re.I):
  100         GenerateFingerprints()
  101     elif re.match("^BinaryDBFileGeneration$", Mode, re.I):
  102         GenerateBinaryDatabaseFile()
  103     elif re.match("^LibraryEnumeration$", Mode, re.I):
  104         PerformLibraryEnumeration()
  105     elif re.match("^RascalSimilaritySearch$", Mode, re.I):
  106         PerformRascalSimilaritySearch()
  107     elif re.match("^SimilaritySearch$", Mode, re.I):
  108         PerformSimilaritySearch()
  109     elif re.match("^SubstructureSearch$", Mode, re.I):
  110         PerformSubtructureSearch()
  111     else:
  112         MiscUtil.PrintError('The value specified, %s, for option "--mode" is not valid.' % Mode)
  113 
  114 
  115 def GenerateFingerprints():
  116     """Generate fingerprints for synthons and write out a binary file."""
  117 
  118     MiscUtil.PrintInfo("\nGenerating fingerprints (Mode: %s)..." % OptionsInfo["Mode"])
  119 
  120     SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])
  121 
  122     StartTime = time.perf_counter()
  123 
  124     MiscUtil.PrintInfo("\nGenerating fingerprints (Type: %s)..." % OptionsInfo["SpecifiedFingerprints"])
  125     FPGenerator = InitializeFingerprintsGenerator()
  126     SynthonSpace.BuildSynthonFingerprints(FPGenerator)
  127 
  128     TotalTime = time.perf_counter() - StartTime
  129     MiscUtil.PrintInfo("Total time: %.2f secs" % TotalTime)
  130 
  131     WriteSynthonSpaceBinaryFile(SynthonSpace, OptionsInfo["Outfile"])
  132 
  133 
  134 def GenerateBinaryDatabaseFile():
  135     """Write out a binary file for synthons."""
  136 
  137     MiscUtil.PrintInfo("\nGenerating binary database file (Mode: %s)..." % OptionsInfo["Mode"])
  138 
  139     SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])
  140     WriteSynthonSpaceBinaryFile(SynthonSpace, OptionsInfo["Outfile"])
  141 
  142 
  143 def PerformLibraryEnumeration():
  144     """Enumerate library using synthons and write out a SMILES file."""
  145 
  146     MiscUtil.PrintInfo("\nPerforming library enumeration (Mode: %s)..." % OptionsInfo["Mode"])
  147 
  148     SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])
  149 
  150     MiscUtil.PrintInfo("\nWriting file %s ..." % OptionsInfo["Outfile"])
  151     SynthonSpace.WriteEnumeratedFile(OptionsInfo["Outfile"])
  152 
  153 
  154 def PerformSimilaritySearch():
  155     """Perform similarity search."""
  156 
  157     SingleOutFileMode = OptionsInfo["SingleOutFileMode"]
  158     CountHitsMode = OptionsInfo["CountHitsMode"]
  159     SynthonSearchParams = OptionsInfo["SynthonSearchParams"]
  160 
  161     MiscUtil.PrintInfo(
  162         "\nPerforming similiarity search (Fingerprints: %s; SimilarityCutoff: %s; MaxHits: %s)..."
  163         % (
  164             OptionsInfo["SpecifiedFingerprints"],
  165             SynthonSearchParams["SimilarityCutoff"],
  166             SynthonSearchParams["MaxHits"],
  167         )
  168     )
  169 
  170     # Setup synthon space...
  171     SynthonSpace, FPGenerator = SetupSynthonSpaceForSimilaritySearch()
  172 
  173     # Setup out file writers...
  174     SingleOutFileWriter, HitsInfoWriter = SetupOutfileWriters()
  175 
  176     # Setup a molecule reader...
  177     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["QueryFile"])
  178     QueryMols = RDKitUtil.ReadMolecules(OptionsInfo["QueryFile"], **OptionsInfo["QueryFileParams"])
  179 
  180     # Process query molecules...
  181     (QueryMolCount, ValidQueryMolCount) = [0] * 2
  182     for QueryMol in QueryMols:
  183         QueryMolCount += 1
  184         if QueryMol is None or RDKitUtil.IsMolEmpty(QueryMol):
  185             continue
  186 
  187         ValidQueryMolCount += 1
  188         QueryMolName = RDKitUtil.GetMolName(QueryMol, QueryMolCount)
  189 
  190         HitMols, HitMolsCount, MaxPossibleHits = PerformSynthonSpaceSimilaritySearch(
  191             SynthonSpace, FPGenerator, QueryMol
  192         )
  193 
  194         if CountHitsMode:
  195             WriteHitsInfo(HitsInfoWriter, [QueryMolName, MaxPossibleHits])
  196         else:
  197             WriteHitsInfo(HitsInfoWriter, [QueryMolName, HitMolsCount, MaxPossibleHits])
  198 
  199             Writer = SingleOutFileWriter if SingleOutFileMode else SetupMoleculeWriter(SingleOutFileMode, QueryMolCount)
  200             WriteMolecules(Writer, QueryMolName, HitMols)
  201 
  202             if not SingleOutFileMode:
  203                 if Writer is not None:
  204                     Writer.close()
  205 
  206     if SingleOutFileWriter is not None:
  207         SingleOutFileWriter.close()
  208 
  209     if HitsInfoWriter is not None:
  210         HitsInfoWriter.close()
  211 
  212     MiscUtil.PrintInfo("\nTotal number of query molecules: %d" % QueryMolCount)
  213     MiscUtil.PrintInfo("Number of valid query  molecules: %d" % ValidQueryMolCount)
  214     MiscUtil.PrintInfo("Number of ignored query molecules: %d" % (QueryMolCount - ValidQueryMolCount))
  215 
  216 
  217 def PerformSubtructureSearch():
  218     """Perform substructure search."""
  219 
  220     SingleOutFileMode = OptionsInfo["SingleOutFileMode"]
  221     CountHitsMode = OptionsInfo["CountHitsMode"]
  222     SynthonSearchParams = OptionsInfo["SynthonSearchParams"]
  223 
  224     MiscUtil.PrintInfo("\nPerforming substructue search (MaxHits: %s)..." % (SynthonSearchParams["MaxHits"]))
  225 
  226     # Setup synthon space...
  227     SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])
  228 
  229     # Setup out file writers...
  230     SingleOutFileWriter, HitsInfoWriter = SetupOutfileWriters()
  231 
  232     # Process query pattern molecules...
  233     MiscUtil.PrintInfo("\nProcessing query patterns...")
  234 
  235     QueryMolCount = 0
  236     for QueryMol in OptionsInfo["QueryPatternMols"]:
  237         QueryMolCount += 1
  238         QueryMolName = "Pattern%s" % QueryMolCount
  239 
  240         HitMols, HitMolsCount, MaxPossibleHits = PerformSynthonSpaceSubstructureSearch(SynthonSpace, QueryMol)
  241 
  242         if CountHitsMode:
  243             WriteHitsInfo(HitsInfoWriter, [QueryMolName, MaxPossibleHits])
  244         else:
  245             WriteHitsInfo(HitsInfoWriter, [QueryMolName, HitMolsCount, MaxPossibleHits])
  246 
  247             Writer = SingleOutFileWriter if SingleOutFileMode else SetupMoleculeWriter(SingleOutFileMode, QueryMolCount)
  248             WriteMolecules(Writer, QueryMolName, HitMols)
  249 
  250             if not SingleOutFileMode:
  251                 if Writer is not None:
  252                     Writer.close()
  253 
  254     if SingleOutFileWriter is not None:
  255         SingleOutFileWriter.close()
  256 
  257     if HitsInfoWriter is not None:
  258         HitsInfoWriter.close()
  259 
  260     MiscUtil.PrintInfo("\nTotal number of query patterns: %d" % QueryMolCount)
  261 
  262 
  263 def PerformRascalSimilaritySearch():
  264     """Perform RASCAL similarity search."""
  265 
  266     SingleOutFileMode = OptionsInfo["SingleOutFileMode"]
  267     CountHitsMode = OptionsInfo["CountHitsMode"]
  268     RascalSearchParams = OptionsInfo["RascalSearchParams"]
  269     SynthonSearchParams = OptionsInfo["SynthonSearchParams"]
  270 
  271     MiscUtil.PrintInfo(
  272         "\nPerforming RASCAL similiarity search (SimilarityThreshold: %s; MaxHits: %s)..."
  273         % (RascalSearchParams["SimilarityThreshold"], SynthonSearchParams["MaxHits"])
  274     )
  275 
  276     # Setup synthon space...
  277     SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])
  278 
  279     # Setup out file writers...
  280     SingleOutFileWriter, HitsInfoWriter = SetupOutfileWriters()
  281 
  282     # Setup a molecule reader...
  283     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["QueryFile"])
  284     QueryMols = RDKitUtil.ReadMolecules(OptionsInfo["QueryFile"], **OptionsInfo["QueryFileParams"])
  285 
  286     # Process query molecules...
  287     (QueryMolCount, ValidQueryMolCount) = [0] * 2
  288     for QueryMol in QueryMols:
  289         QueryMolCount += 1
  290         if QueryMol is None or RDKitUtil.IsMolEmpty(QueryMol):
  291             continue
  292 
  293         ValidQueryMolCount += 1
  294         QueryMolName = RDKitUtil.GetMolName(QueryMol, QueryMolCount)
  295 
  296         HitMols, HitMolsCount, MaxPossibleHits = PerformSynthonSpaceRascalSimilaritySearch(SynthonSpace, QueryMol)
  297 
  298         if CountHitsMode:
  299             WriteHitsInfo(HitsInfoWriter, [QueryMolName, MaxPossibleHits])
  300         else:
  301             WriteHitsInfo(HitsInfoWriter, [QueryMolName, HitMolsCount, MaxPossibleHits])
  302 
  303             Writer = SingleOutFileWriter if SingleOutFileMode else SetupMoleculeWriter(SingleOutFileMode, QueryMolCount)
  304             WriteMolecules(Writer, QueryMolName, HitMols)
  305 
  306             if not SingleOutFileMode:
  307                 if Writer is not None:
  308                     Writer.close()
  309 
  310     if SingleOutFileWriter is not None:
  311         SingleOutFileWriter.close()
  312 
  313     if HitsInfoWriter is not None:
  314         HitsInfoWriter.close()
  315 
  316     MiscUtil.PrintInfo("\nTotal number of query molecules: %d" % QueryMolCount)
  317     MiscUtil.PrintInfo("Number of valid query  molecules: %d" % ValidQueryMolCount)
  318     MiscUtil.PrintInfo("Number of ignored query molecules: %d" % (QueryMolCount - ValidQueryMolCount))
  319 
  320 
  321 def ProcessListSynthonSearchSpace():
  322     """Process list synthon search space information."""
  323 
  324     MiscUtil.PrintInfo("\nListing information...")
  325 
  326     # Validate infile..
  327     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
  328     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "txt csv spc")
  329 
  330     # Process infile..
  331     OptionsInfo["Infile"] = Options["--infile"]
  332 
  333     SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])
  334 
  335     MiscUtil.PrintInfo("\nSummary of synthon space:\n")
  336     SynthonSpace.Summarise()
  337 
  338     ListSynthonSpaceFingerprintsType(SynthonSpace)
  339 
  340 
  341 def PerformSynthonSpaceSimilaritySearch(SynthonSpace, FPGenerator, QueryMol):
  342     """Perform synthon space similarity search."""
  343 
  344     try:
  345         Results = SynthonSpace.FingerprintSearch(QueryMol, FPGenerator, params=OptionsInfo["RDKitSynthonSearchParams"])
  346     except Exception as ErrMsg:
  347         MiscUtil.PrintInfo("")
  348         MiscUtil.PrintError("Failed to perform synthon space fingerprints seach:\n%s\n" % (ErrMsg))
  349 
  350     HitMols, HitMolsCount, MaxPossibleHits = GetSynthonSpaceHitMolecules(Results)
  351 
  352     return (HitMols, HitMolsCount, MaxPossibleHits)
  353 
  354 
  355 def PerformSynthonSpaceRascalSimilaritySearch(SynthonSpace, QueryMol):
  356     """Perform synthon space RASCAL similarity search."""
  357 
  358     try:
  359         Results = SynthonSpace.RascalSearch(
  360             QueryMol, OptionsInfo["RDKitRascalSearchParams"], params=OptionsInfo["RDKitSynthonSearchParams"]
  361         )
  362     except Exception as ErrMsg:
  363         MiscUtil.PrintInfo("")
  364         MiscUtil.PrintError("Failed to perform synthon space RASCAL similarity seach:\n%s\n" % (ErrMsg))
  365 
  366     HitMols, HitMolsCount, MaxPossibleHits = GetSynthonSpaceHitMolecules(Results)
  367 
  368     return (HitMols, HitMolsCount, MaxPossibleHits)
  369 
  370 
  371 def PerformSynthonSpaceSubstructureSearch(SynthonSpace, QueryMol):
  372     """Perform synthon space substructure search."""
  373 
  374     try:
  375         Results = SynthonSpace.SubstructureSearch(
  376             QueryMol,
  377             substructMatchParams=OptionsInfo["RDKitSubstructureMatchParams"],
  378             params=OptionsInfo["RDKitSynthonSearchParams"],
  379         )
  380     except Exception as ErrMsg:
  381         MiscUtil.PrintInfo("")
  382         MiscUtil.PrintError("Failed to perform synthon space substructure seach:\n%s\n" % (ErrMsg))
  383 
  384     HitMols, HitMolsCount, MaxPossibleHits = GetSynthonSpaceHitMolecules(Results)
  385 
  386     return (HitMols, HitMolsCount, MaxPossibleHits)
  387 
  388 
  389 def GetSynthonSpaceHitMolecules(Results):
  390     """Retrieve synthon space hit molecues."""
  391 
  392     HitMols = Results.GetHitMolecules()
  393 
  394     HitMolsCount = len(HitMols)
  395     if HitMolsCount == 0:
  396         HitMols = None
  397         HitMolsCount = None
  398 
  399     MaxPossibleHits = Results.GetMaxNumResults()
  400 
  401     return (HitMols, HitMolsCount, MaxPossibleHits)
  402 
  403 
  404 def SetupSynthonSpaceForSimilaritySearch():
  405     """Setup synthon space for similarity search."""
  406 
  407     SynthonSpace = ReadSynthonSpaceFile(OptionsInfo["Infile"])
  408 
  409     FPType, FPInfo = GetSynthonFingerprintsInfo(SynthonSpace)
  410     if FPType is None:
  411         MiscUtil.PrintInfo("")
  412         MiscUtil.PrintError(
  413             "The synthon space input file, %s, doesn't contain any fingerprints. You must specify a synthon space binary database file containing appropriate fingerprints for similarity search.."
  414             % OptionsInfo["Infile"]
  415         )
  416 
  417     if not re.search("%s" % OptionsInfo["SpecifiedFingerprints"], FPType, re.I):
  418         MiscUtil.PrintInfo("")
  419         MiscUtil.PrintWarning(
  420             'The fingerprints type, %s, in synthon space input file, %s, doesn\'t appear to match fingerprints, %s, specified using "--fingerprints" option for similarity search.'
  421             % (FPType, OptionsInfo["Infile"], OptionsInfo["SpecifiedFingerprints"])
  422         )
  423 
  424     FPGenerator = InitializeFingerprintsGenerator()
  425 
  426     return (SynthonSpace, FPGenerator)
  427 
  428 
  429 def InitializeFingerprintsGenerator():
  430     """Initialize fingerprints generator."""
  431 
  432     FPGenerator = None
  433     SpecifiedFingerprints = OptionsInfo["SpecifiedFingerprints"]
  434     if re.match("^AtomPairs$", SpecifiedFingerprints, re.I):
  435         FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["AtomPairs"]
  436         FPGenerator = rdFingerprintGenerator.GetAtomPairGenerator(
  437             minDistance=FPParamsInfo["MinLength"],
  438             maxDistance=FPParamsInfo["MaxLength"],
  439             includeChirality=FPParamsInfo["UseChirality"],
  440             use2D=FPParamsInfo["Use2D"],
  441             fpSize=FPParamsInfo["FPSize"],
  442         )
  443     elif re.match("^Morgan$", SpecifiedFingerprints, re.I):
  444         FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["Morgan"]
  445         FPGenerator = rdFingerprintGenerator.GetMorganGenerator(
  446             radius=FPParamsInfo["Radius"],
  447             includeChirality=FPParamsInfo["UseChirality"],
  448             useBondTypes=FPParamsInfo["UseBondTypes"],
  449             includeRingMembership=FPParamsInfo["UseRingMembership"],
  450             fpSize=FPParamsInfo["FPSize"],
  451         )
  452     elif re.match("^MorganFeatures$", SpecifiedFingerprints, re.I):
  453         FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["MorganFeatures"]
  454         FPGenerator = rdFingerprintGenerator.GetMorganGenerator(
  455             radius=FPParamsInfo["Radius"],
  456             includeChirality=FPParamsInfo["UseChirality"],
  457             useBondTypes=FPParamsInfo["UseBondTypes"],
  458             includeRingMembership=FPParamsInfo["UseRingMembership"],
  459             fpSize=FPParamsInfo["FPSize"],
  460             atomInvariantsGenerator=rdFingerprintGenerator.GetMorganAtomInvGen(),
  461         )
  462     elif re.match("^PathLength$", SpecifiedFingerprints, re.I):
  463         FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["PathLength"]
  464         FPGenerator = rdFingerprintGenerator.GetRDKitFPGenerator(
  465             minPath=FPParamsInfo["MinPath"],
  466             maxPath=FPParamsInfo["MaxPath"],
  467             useHs=FPParamsInfo["UseExplicitHs"],
  468             branchedPaths=FPParamsInfo["UseBranchedPaths"],
  469             useBondOrder=FPParamsInfo["UseBondOrder"],
  470             fpSize=FPParamsInfo["FPSize"],
  471             numBitsPerFeature=FPParamsInfo["BitsPerHash"],
  472         )
  473     elif re.match("^TopologicalTorsions$", SpecifiedFingerprints, re.I):
  474         FPParamsInfo = OptionsInfo["FingerprintsParamsInfo"]["TopologicalTorsions"]
  475         FPGenerator = rdFingerprintGenerator.GetTopologicalTorsionGenerator(
  476             includeChirality=FPParamsInfo["UseChirality"], fpSize=FPParamsInfo["FPSize"]
  477         )
  478     else:
  479         MiscUtil.PrintError('The value specified, %s, for option "--fingerprints" is not valid.')
  480 
  481     return FPGenerator
  482 
  483 
  484 def ReadSynthonSpaceFile(Infile):
  485     """Read synthon space file."""
  486 
  487     MiscUtil.PrintInfo("\nReading synthon space file %s..." % Infile)
  488     SynthonSpace = rdSynthonSpaceSearch.SynthonSpace()
  489 
  490     StartTime = time.perf_counter()
  491 
  492     try:
  493         if MiscUtil.CheckFileExt(Infile, "spc"):
  494             SynthonSpace.ReadDBFile(Infile)
  495         else:
  496             SynthonSpace.ReadTextFile(Infile)
  497     except Exception as ErrMsg:
  498         MiscUtil.PrintInfo("")
  499         MiscUtil.PrintError("Failed to read synthon space file:\n%s\n" % (ErrMsg))
  500 
  501     TotalTime = time.perf_counter() - StartTime
  502     MiscUtil.PrintInfo("Total time: %.2f secs" % TotalTime)
  503 
  504     return SynthonSpace
  505 
  506 
  507 def WriteSynthonSpaceBinaryFile(SynthonSpace, Outfile):
  508     """Write synthon space binary file."""
  509 
  510     MiscUtil.PrintInfo("\nWriting synthon space file %s..." % Outfile)
  511     StartTime = time.perf_counter()
  512 
  513     try:
  514         SynthonSpace.WriteDBFile(Outfile)
  515     except Exception as ErrMsg:
  516         MiscUtil.PrintInfo("")
  517         MiscUtil.PrintError("Failed to write synthon space file:\n%s\n" % (ErrMsg))
  518 
  519     TotalTime = time.perf_counter() - StartTime
  520     MiscUtil.PrintInfo("Total time: %.2f secs" % TotalTime)
  521 
  522     return SynthonSpace
  523 
  524 
  525 def ListSynthonSpaceFingerprintsType(SynthonSpace):
  526     """List synthon space fingerprints type."""
  527 
  528     FPType, FPInfo = GetSynthonFingerprintsInfo(SynthonSpace)
  529 
  530     if FPInfo is None:
  531         MiscUtil.PrintInfo("\nFingerprints type: %s" % (FPInfo))
  532     else:
  533         MiscUtil.PrintInfo("\nFingerprints type: %s\nFingerprints Info: %s" % (FPType, FPInfo))
  534 
  535 
  536 def GetSynthonFingerprintsInfo(SynthonSpace):
  537     """Get synthon fingerprints information."""
  538 
  539     FPInfo = SynthonSpace.GetSynthonFingerprintType()
  540     if len(FPInfo) == 0:
  541         return (None, None)
  542 
  543     if re.search("AtomPairArguments", FPInfo, re.I):
  544         FPType = "AtomPairs"
  545     elif re.search("MorganArguments", FPInfo, re.I):
  546         FPType = "Morgan or MorganFeatures"
  547     elif re.search("RDKitFPArguments", FPInfo, re.I):
  548         FPType = "PathLength"
  549     elif re.search("TopologicalTorsionArguments", FPInfo, re.I):
  550         FPType = "TopologicalTorsions"
  551     else:
  552         FPType = "Unknown"
  553 
  554     return (FPType, FPInfo)
  555 
  556 
  557 def SetupMoleculeWriter(SIngleOutFile, MolCount=0):
  558     """Setup molecule writer."""
  559 
  560     TextOutFileMode = OptionsInfo["TextOutFileMode"]
  561     TextOutFileDelim = OptionsInfo["TextOutFileDelim"]
  562     TextOutFileTitleLine = OptionsInfo["TextOutFileTitleLine"]
  563 
  564     if SIngleOutFile:
  565         Outfile = OptionsInfo["Outfile"]
  566     else:
  567         Outfile = "%s_%s%s.%s" % (
  568             OptionsInfo["OutFileRoot"],
  569             OptionsInfo["OutFileSuffix"],
  570             MolCount,
  571             OptionsInfo["OutFileExt"],
  572         )
  573 
  574     if TextOutFileMode:
  575         Writer = open(Outfile, "w")
  576     else:
  577         Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
  578     if Writer is None:
  579         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
  580 
  581     if TextOutFileMode:
  582         if TextOutFileTitleLine:
  583             WriteTextFileHeaderLine(Writer, TextOutFileDelim)
  584 
  585     return Writer
  586 
  587 
  588 def WriteTextFileHeaderLine(Writer, TextOutFileDelim):
  589     """Write out a header line for text files including SMILES file."""
  590 
  591     Line = ""
  592     if OptionsInfo["SubstructureSearchMode"]:
  593         Line = TextOutFileDelim.join(["SMILES", "Name", "QueryPatternNumber"])
  594     elif OptionsInfo["SimilaritySearchMode"]:
  595         Line = TextOutFileDelim.join(["SMILES", "Name", "Similarity", "QueryMolName"])
  596     elif OptionsInfo["RascalSimilaritySearchMode"]:
  597         Line = TextOutFileDelim.join(["SMILES", "Name", "Similarity", "QueryMolName"])
  598 
  599     Writer.write("%s\n" % Line)
  600 
  601 
  602 def WriteMolecules(Writer, QueryMolName, HitMols):
  603     """Write hit molecules for similarity and substructure search."""
  604 
  605     RascalSimilaritySearchMode = OptionsInfo["RascalSimilaritySearchMode"]
  606     SimilaritySearchMode = OptionsInfo["SimilaritySearchMode"]
  607     SubstructureSearchMode = OptionsInfo["SubstructureSearchMode"]
  608 
  609     TextOutFileMode = OptionsInfo["TextOutFileMode"]
  610     TextOutFileDelim = OptionsInfo["TextOutFileDelim"]
  611 
  612     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
  613 
  614     SMILESIsomeric = OptionsInfo["OutfileParams"]["SMILESIsomeric"]
  615     SMILESKekulize = OptionsInfo["OutfileParams"]["SMILESKekulize"]
  616 
  617     HitMolCount = 0
  618     for HitMol in HitMols:
  619         HitMolCount += 1
  620 
  621         if TextOutFileMode:
  622             # Write out text file including SMILES file...
  623             LineWords = []
  624             LineWords.append(Chem.MolToSmiles(HitMol, isomericSmiles=SMILESIsomeric, kekuleSmiles=SMILESKekulize))
  625             LineWords.append(RDKitUtil.GetMolName(HitMol, HitMolCount))
  626 
  627             if SimilaritySearchMode or RascalSimilaritySearchMode:
  628                 Similarity = "%.2f" % float(HitMol.GetProp("Similarity"))
  629                 LineWords.append(Similarity)
  630 
  631             LineWords.append(QueryMolName)
  632 
  633             Line = TextOutFileDelim.join(LineWords)
  634             Writer.write("%s\n" % Line)
  635         else:
  636             # Write out SD file...
  637             if SimilaritySearchMode or RascalSimilaritySearchMode:
  638                 HitMol.SetProp("QueryMolName", QueryMolName)
  639             elif SubstructureSearchMode:
  640                 HitMol.SetProp("QueryPatternNum", QueryMolName)
  641 
  642             if SimilaritySearchMode or RascalSimilaritySearchMode:
  643                 Similarity = "%.2f" % float(HitMol.GetProp("Similarity"))
  644                 HitMol.SetProp("Similarity", Similarity)
  645 
  646             if Compute2DCoords:
  647                 AllChem.Compute2DCoords(HitMol)
  648             Writer.write(HitMol)
  649 
  650 
  651 def SetupOutfileWriters():
  652     """Setup outfile writers."""
  653 
  654     SingleOutFileWriter, HitsInfoWriter = [None] * 2
  655 
  656     if OptionsInfo["CountHitsMode"]:
  657         MiscUtil.PrintInfo(
  658             "\nSkipping generation of output files containing hit structures and only counting hits (BuildHits: No)..."
  659         )
  660     else:
  661         if OptionsInfo["SingleOutFileMode"]:
  662             SingleOutFileWriter = SetupMoleculeWriter(OptionsInfo["SingleOutFileMode"])
  663             MiscUtil.PrintInfo("\nGenerating output file %s..." % OptionsInfo["Outfile"])
  664         else:
  665             MiscUtil.PrintInfo(
  666                 "\nGenerating output file(s) %s_%s*.%s..."
  667                 % (OptionsInfo["OutFileRoot"], OptionsInfo["OutFileSuffix"], OptionsInfo["OutFileExt"])
  668             )
  669 
  670     HitsInfoWriter = SetupHitsInfoWriter()
  671 
  672     return (SingleOutFileWriter, HitsInfoWriter)
  673 
  674 
  675 def SetupHitsInfoWriter():
  676     """Setup hits info writer."""
  677 
  678     HitsInfoOutFile = OptionsInfo["HitsInfoOutFile"]
  679     HitsInfoOutFileDelim = OptionsInfo["HitsInfoOutFileDelim"]
  680 
  681     MiscUtil.PrintInfo("\nGenerating output file %s..." % HitsInfoOutFile)
  682 
  683     Writer = open(HitsInfoOutFile, "w")
  684 
  685     # Setup and write out header...
  686     MolIDColName = "MolID"
  687     if OptionsInfo["SubstructureSearchMode"]:
  688         MolIDColName = "QueryPatternNumber"
  689     elif OptionsInfo["SimilaritySearchMode"]:
  690         MolIDColName = "QueryMolName"
  691     elif OptionsInfo["RascalSimilaritySearchMode"]:
  692         MolIDColName = "QueryMolName"
  693 
  694     if OptionsInfo["CountHitsMode"]:
  695         Line = HitsInfoOutFileDelim.join([MolIDColName, "MaxPossibleHits"])
  696     else:
  697         Line = HitsInfoOutFileDelim.join([MolIDColName, "HitsCount", "MaxPossibleHits"])
  698 
  699     Writer.write("%s\n" % Line)
  700 
  701     return Writer
  702 
  703 
  704 def WriteHitsInfo(Writer, HitsInfo):
  705     """Write hits info."""
  706 
  707     HitsInfoWords = ["%s" % HitInfo for HitInfo in HitsInfo]
  708 
  709     HitsInfoOutFileDelim = OptionsInfo["HitsInfoOutFileDelim"]
  710     Line = HitsInfoOutFileDelim.join(HitsInfoWords)
  711 
  712     Writer.write("%s\n" % Line)
  713 
  714 
  715 def ProcessFingerprintsParameters():
  716     """Set up and process fingerprints parameters."""
  717 
  718     SetupFingerprintsNamesAndParameters()
  719 
  720     ProcessSpecifiedFingerprintsName()
  721     ProcessSpecifiedFingerprintsParameters()
  722 
  723 
  724 def SetupFingerprintsNamesAndParameters():
  725     """Set up fingerprints parameters."""
  726 
  727     OptionsInfo["FingerprintsNames"] = ["AtomPairs", "Morgan", "MorganFeatures", "PathLength", "TopologicalTorsions"]
  728 
  729     OptionsInfo["FingerprintsParamsInfo"] = {}
  730     OptionsInfo["FingerprintsParamsInfo"]["AtomPairs"] = {
  731         "MinLength": 1,
  732         "MaxLength": 30,
  733         "UseChirality": False,
  734         "Use2D": True,
  735         "FPSize": 2048,
  736     }
  737     OptionsInfo["FingerprintsParamsInfo"]["Morgan"] = {
  738         "Radius": 2,
  739         "UseChirality": False,
  740         "UseBondTypes": True,
  741         "UseRingMembership": True,
  742         "FPSize": 2048,
  743     }
  744     OptionsInfo["FingerprintsParamsInfo"]["MorganFeatures"] = {
  745         "Radius": 2,
  746         "UseChirality": False,
  747         "UseBondTypes": True,
  748         "UseRingMembership": True,
  749         "FPSize": 2048,
  750     }
  751     OptionsInfo["FingerprintsParamsInfo"]["PathLength"] = {
  752         "MinPath": 1,
  753         "MaxPath": 7,
  754         "UseExplicitHs": True,
  755         "UseBranchedPaths": True,
  756         "UseBondOrder": True,
  757         "FPSize": 2048,
  758         "BitsPerHash": 2,
  759     }
  760     OptionsInfo["FingerprintsParamsInfo"]["TopologicalTorsions"] = {"UseChirality": False, "FPSize": 2048}
  761 
  762 
  763 def ProcessSpecifiedFingerprintsName():
  764     """Process specified fingerprints name."""
  765 
  766     #  Set up a canonical fingerprints name map...
  767     CanonicalFingerprintsNamesMap = {}
  768     for Name in OptionsInfo["FingerprintsNames"]:
  769         CanonicalName = Name.lower()
  770         CanonicalFingerprintsNamesMap[CanonicalName] = Name
  771 
  772     # Validate specified fingerprints name...
  773     CanonicalFingerprintsName = OptionsInfo["Fingerprints"].lower()
  774     if CanonicalFingerprintsName not in CanonicalFingerprintsNamesMap:
  775         MiscUtil.PrintError(
  776             'The fingerprints name, %s, specified using "-f, --fingerprints" option is not a valid name.'
  777             % (OptionsInfo["Fingerprints"])
  778         )
  779 
  780     OptionsInfo["SpecifiedFingerprints"] = CanonicalFingerprintsNamesMap[CanonicalFingerprintsName]
  781 
  782 
  783 def ProcessSpecifiedFingerprintsParameters():
  784     """Process specified fingerprints parameters."""
  785 
  786     if re.match("^auto$", OptionsInfo["FingerprintsParams"], re.I):
  787         # Nothing to process...
  788         return
  789 
  790     SpecifiedFingerprintsName = OptionsInfo["SpecifiedFingerprints"]
  791 
  792     # Parse specified fingerprints parameters...
  793     FingerprintsParams = re.sub(" ", "", OptionsInfo["FingerprintsParams"])
  794     if not FingerprintsParams:
  795         MiscUtil.PrintError(
  796             'No valid parameter name and value pairs specified using "--fingerprintsParams" option corrresponding to fingerprints %s.'
  797             % (SpecifiedFingerprintsName)
  798         )
  799 
  800     FingerprintsParamsWords = FingerprintsParams.split(",")
  801     if len(FingerprintsParamsWords) % 2:
  802         MiscUtil.PrintError(
  803             'The number of comma delimited paramater names and values, %d, specified using "--fingerprintsParams" option must be an even number.'
  804             % (len(FingerprintsParamsWords))
  805         )
  806 
  807     # Setup canonical parameter names for specified fingerprints...
  808     ValidParamNames = []
  809     CanonicalParamNamesMap = {}
  810     for ParamName in sorted(OptionsInfo["FingerprintsParamsInfo"][SpecifiedFingerprintsName]):
  811         ValidParamNames.append(ParamName)
  812         CanonicalParamNamesMap[ParamName.lower()] = ParamName
  813 
  814     # Validate and set paramater names and value...
  815     for Index in range(0, len(FingerprintsParamsWords), 2):
  816         Name = FingerprintsParamsWords[Index]
  817         Value = FingerprintsParamsWords[Index + 1]
  818 
  819         CanonicalName = Name.lower()
  820         if CanonicalName not in CanonicalParamNamesMap:
  821             MiscUtil.PrintError(
  822                 'The parameter name, %s, specified using "--fingerprintsParams" option for fingerprints, %s, is not a valid name. Supported parameter names: %s'
  823                 % (Name, SpecifiedFingerprintsName, " ".join(ValidParamNames))
  824             )
  825 
  826         ParamName = CanonicalParamNamesMap[CanonicalName]
  827         if re.match(
  828             "^(UseChirality|Use2D|UseBondTypes|UseRingMembership|UseExplicitHs|UseBranchedPaths|UseBondOrder)$",
  829             ParamName,
  830             re.I,
  831         ):
  832             if not re.match("^(Yes|No|True|False)$", Value, re.I):
  833                 MiscUtil.PrintError(
  834                     'The parameter value, %s, specified using "--fingerprintsParams" option for fingerprints, %s, is not a valid value. Supported values: Yes No True False'
  835                     % (Value, SpecifiedFingerprintsName)
  836                 )
  837             ParamValue = False
  838             if re.match("^(Yes|True)$", Value, re.I):
  839                 ParamValue = True
  840         else:
  841             ParamValue = int(Value)
  842             if ParamValue <= 0:
  843                 MiscUtil.PrintError(
  844                     'The parameter value, %s, specified using "--fingerprintsParams" option for fingerprints, %s, is not a valid value. Supported values: > 0'
  845                     % (Value, SpecifiedFingerprintsName)
  846                 )
  847 
  848         # Set value...
  849         OptionsInfo["FingerprintsParamsInfo"][SpecifiedFingerprintsName][ParamName] = ParamValue
  850 
  851 
  852 def ProcessOutfileParameters():
  853     """Process outfile related parameters"""
  854 
  855     Mode = OptionsInfo["Mode"]
  856 
  857     OptionsInfo["Outfile"] = Options["--outfile"]
  858     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters(
  859         "--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]
  860     )
  861 
  862     # OutfileMode is only used for similarity and substructure search...
  863     OptionsInfo["OutFileMode"] = Options["--outfileMode"]
  864     SingleOutFileMode = True
  865     if not re.match("^SingleFile$", Options["--outfileMode"], re.I):
  866         SingleOutFileMode = False
  867     OptionsInfo["SingleOutFileMode"] = SingleOutFileMode
  868 
  869     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
  870     OptionsInfo["OutFileRoot"] = FileName
  871     OptionsInfo["OutFileExt"] = FileExt
  872 
  873     OutFileSuffix = ""
  874     if re.match("^SubstructureSearch$", Mode, re.I):
  875         OutFileSuffix = "Pattern"
  876     elif re.match("^SimilaritySearch$", Mode, re.I):
  877         OutFileSuffix = "Mol"
  878     OptionsInfo["OutFileSuffix"] = OutFileSuffix
  879 
  880     OptionsInfo["HitsInfoOutFile"] = "%s_HitCount.csv" % OptionsInfo["OutFileRoot"]
  881     OptionsInfo["HitsInfoOutFileDelim"] = ","
  882 
  883     TextOutFileMode, TextOutFileDelim, TextOutFileTitleLine = [None] * 3
  884     if re.match("^(SimilaritySearch|SubstructureSearch)$", Mode, re.I):
  885         TextOutFileMode = False
  886         TextOutFileDelim = ""
  887         TextOutFileTitleLine = True
  888 
  889         if MiscUtil.CheckFileExt(Options["--outfile"], "csv"):
  890             TextOutFileMode = True
  891             TextOutFileDelim = ","
  892         elif MiscUtil.CheckFileExt(Options["--outfile"], "tsv txt"):
  893             TextOutFileMode = True
  894             TextOutFileDelim = "\t"
  895         elif MiscUtil.CheckFileExt(Options["--outfile"], "smi"):
  896             TextOutFileMode = True
  897             TextOutFileDelim = OptionsInfo["OutfileParams"]["SMILESDelimiter"]
  898             TextOutFileTitleLine = OptionsInfo["OutfileParams"]["SMILESTitleLine"]
  899 
  900     OptionsInfo["TextOutFileMode"] = TextOutFileMode
  901     OptionsInfo["TextOutFileDelim"] = TextOutFileDelim
  902     OptionsInfo["TextOutFileTitleLine"] = TextOutFileTitleLine
  903 
  904     if not OptionsInfo["SingleOutFileMode"]:
  905         FilesSpec = "%s_%s*.%s" % (OptionsInfo["OutFileRoot"], OptionsInfo["OutFileSuffix"], OptionsInfo["OutFileExt"])
  906         FileNames = MiscUtil.ExpandFileNames(FilesSpec)
  907         if len(FileNames):
  908             if not Options["--overwrite"]:
  909                 MiscUtil.PrintError(
  910                     'The output files, %s, corresponding to output file specified, %s, for option "-o, --outfile" already exist. Use option "--ov" or "--overwrite" and try again.'
  911                     % (FilesSpec, OptionsInfo["Outfile"])
  912                 )
  913 
  914 
  915 def ProcessRascalSearchParametersOption():
  916     """Process option for RASCAL similarity search."""
  917 
  918     ParamsOptionName = "--rascalSearchParams"
  919     ParamsOptionValue = Options[ParamsOptionName]
  920 
  921     ParamsDefaultInfo = {
  922         "AllBestMCESs": ["bool", False],
  923         "CompleteAromaticRings": ["bool", True],
  924         "CompleteSmallestRings": ["bool", False],
  925         "ExactConnectionsMatch": ["bool", False],
  926         "IgnoreAtomAromaticity": ["bool", True],
  927         "IgnoreBondOrders": ["bool", False],
  928         "MaxBondMatchPairs": ["int", 1000],
  929         "MaxFragSeparation": ["int", -1],
  930         "MinCliqueSize": ["int", 0],
  931         "MinFragSize": ["int", -1],
  932         "ReturnEmptyMCES": ["bool", False],
  933         "RingMatchesRingOnly": ["bool", False],
  934         "SimilarityThreshold": ["float", 0.7],
  935         "SingleLargestFrag": ["bool", False],
  936         "Timeout": ["int", 60],
  937     }
  938 
  939     # Update default values to match RDKit default values...
  940     RDKitRascalSearchParams = rdRascalMCES.RascalOptions()
  941     for ParamName in ParamsDefaultInfo.keys():
  942         RDKitParamName = LowercaseFirstLetter(ParamName)
  943         if hasattr(RDKitRascalSearchParams, RDKitParamName):
  944             RDKitParamValue = getattr(RDKitRascalSearchParams, RDKitParamName)
  945             ParamsDefaultInfo[ParamName][1] = RDKitParamValue
  946         else:
  947             MiscUtil.PrintWarning(
  948                 "The RASCAL search parameter, %s, is not available in RDKit. Ignoring parameter..." % ParamName
  949             )
  950 
  951     RascalSearchParams = MiscUtil.ProcessOptionNameValuePairParameters(
  952         ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo
  953     )
  954 
  955     for ParamName in ["MaxBondMatchPairs"]:
  956         ParamValue = RascalSearchParams[ParamName]
  957         if ParamValue <= 0:
  958             MiscUtil.PrintError(
  959                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
  960                 % (ParamValue, ParamName, ParamsOptionName)
  961             )
  962 
  963     for ParamName in ["MinCliqueSize", "SimilarityThreshold"]:
  964         ParamValue = RascalSearchParams[ParamName]
  965         if ParamValue < 0:
  966             MiscUtil.PrintError(
  967                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: >= 0\n'
  968                 % (ParamValue, ParamName, ParamsOptionName)
  969             )
  970         if re.match("^SimilarityThreshold$", ParamName, re.I):
  971             if ParamValue > 1:
  972                 MiscUtil.PrintError(
  973                     'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: <= 1\n'
  974                     % (ParamValue, ParamName, ParamsOptionName)
  975                 )
  976 
  977     for ParamName in ["MaxFragSeparation", "MinFragSize", "Timeout"]:
  978         ParamValue = RascalSearchParams[ParamName]
  979         if not (ParamValue == -1 or ParamValue > 0):
  980             MiscUtil.PrintError(
  981                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: -1 or > 0\n'
  982                 % (ParamValue, ParamName, ParamsOptionName)
  983             )
  984 
  985     # Setup RDKit object for RASCAL match parameters...
  986     RDKitRascalSearchParams = rdRascalMCES.RascalOptions()
  987     for ParamName in RascalSearchParams.keys():
  988         ParamValue = RascalSearchParams[ParamName]
  989 
  990         # Convert first letter to lower case for RDKit param name and set its value...
  991         RDKitParamName = LowercaseFirstLetter(ParamName)
  992         if hasattr(RDKitRascalSearchParams, RDKitParamName):
  993             setattr(RDKitRascalSearchParams, RDKitParamName, ParamValue)
  994         else:
  995             MiscUtil.PrintWarning(
  996                 "The RASCAL searh parameter, %s, is not available in RDKit. Ignoring parameter..." % ParamName
  997             )
  998 
  999     OptionsInfo["RascalSearchParams"] = RascalSearchParams
 1000     OptionsInfo["RDKitRascalSearchParams"] = RDKitRascalSearchParams
 1001 
 1002 
 1003 def ProcessSubstructureMatchParametersOption():
 1004     """Process option for substructure match parameters."""
 1005 
 1006     ParamsOptionName = "--substructureMatchParams"
 1007     ParamsOptionValue = Options[ParamsOptionName]
 1008 
 1009     ParamsDefaultInfo = {
 1010         "AromaticMatchesConjugated": ["bool", False],
 1011         "MaxMatches": ["int", 1000],
 1012         "MaxRecursiveMatches": ["int", 1000],
 1013         "RecursionPossible": ["bool", True],
 1014         "SpecifiedStereoQueryMatchesUnspecified": ["bool", False],
 1015         "Uniquify": ["bool", True],
 1016         "UseChirality": ["bool", False],
 1017         "UseEnhancedStereo": ["bool", False],
 1018         "UseGenericMatchers": ["bool", False],
 1019     }
 1020 
 1021     # Update default values to match RDKit default values...
 1022     RDKitSubstructureMatchParams = Chem.SubstructMatchParameters()
 1023     for ParamName in ParamsDefaultInfo.keys():
 1024         RDKitParamName = LowercaseFirstLetter(ParamName)
 1025         if hasattr(RDKitSubstructureMatchParams, RDKitParamName):
 1026             RDKitParamValue = getattr(RDKitSubstructureMatchParams, RDKitParamName)
 1027             ParamsDefaultInfo[ParamName][1] = RDKitParamValue
 1028         else:
 1029             MiscUtil.PrintWarning(
 1030                 "The substructure match parameter, %s, is not available in RDKit. Ignoring parameter..." % ParamName
 1031             )
 1032 
 1033     SubstructureMatchParams = MiscUtil.ProcessOptionNameValuePairParameters(
 1034         ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo
 1035     )
 1036 
 1037     for ParamName in ["MaxMatches", "MaxRecursiveMatches"]:
 1038         ParamValue = SubstructureMatchParams[ParamName]
 1039         if ParamValue <= 0:
 1040             MiscUtil.PrintError(
 1041                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
 1042                 % (ParamValue, ParamName, ParamsOptionName)
 1043             )
 1044 
 1045     # Setup RDKit object for substructure match parameters...
 1046     RDKitSubstructureMatchParams = Chem.SubstructMatchParameters()
 1047     for ParamName in SubstructureMatchParams.keys():
 1048         ParamValue = SubstructureMatchParams[ParamName]
 1049 
 1050         # Convert first letter to lower case for RDKit param name and set its value...
 1051         RDKitParamName = LowercaseFirstLetter(ParamName)
 1052         if hasattr(RDKitSubstructureMatchParams, RDKitParamName):
 1053             setattr(RDKitSubstructureMatchParams, RDKitParamName, ParamValue)
 1054         else:
 1055             MiscUtil.PrintWarning(
 1056                 "The substructure match parameter, %s, is not available in RDKit. Ignoring parameter..." % ParamName
 1057             )
 1058 
 1059     OptionsInfo["SubstructureMatchParams"] = SubstructureMatchParams
 1060     OptionsInfo["RDKitSubstructureMatchParams"] = RDKitSubstructureMatchParams
 1061 
 1062 
 1063 def ProcessSynthonSearchParamatersOption():
 1064     """Process option for synthon search parameters."""
 1065 
 1066     ParamsOptionName = "--synthonSearchParams"
 1067     ParamsOptionValue = Options[ParamsOptionName]
 1068 
 1069     ParamsDefaultInfo = {
 1070         "ApproxSimilarityAdjuster": ["float", 0.1],
 1071         "BuildHits": ["bool", True],
 1072         "FragSimilarityAdjuster": ["float", 0.1],
 1073         "HitStart": ["int", 0],
 1074         "MaxHits": ["int", 1000],
 1075         "MaxNumFrags": ["int", 100000],
 1076         "NumThreads": ["int", 1],
 1077         "RandomSample": ["bool", False],
 1078         "RandomSeed": ["int", -1],
 1079         "SimilarityCutoff": ["float", 0.5],
 1080         "TimeOut": ["int", 600],
 1081     }
 1082 
 1083     # Update default values to match RDKit default values...
 1084     RDKitSynthonSearchParams = rdSynthonSpaceSearch.SynthonSpaceSearchParams()
 1085     for ParamName in ParamsDefaultInfo.keys():
 1086         RDKitParamName = LowercaseFirstLetter(ParamName)
 1087         if hasattr(RDKitSynthonSearchParams, RDKitParamName):
 1088             RDKitParamValue = getattr(RDKitSynthonSearchParams, RDKitParamName)
 1089             ParamsDefaultInfo[ParamName][1] = RDKitParamValue
 1090         else:
 1091             MiscUtil.PrintWarning(
 1092                 "The synthon space search paramater, %s, is not available in RDKit. Ignoring parameter..." % ParamName
 1093             )
 1094 
 1095     SynthonSearchParams = MiscUtil.ProcessOptionNameValuePairParameters(
 1096         ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo
 1097     )
 1098 
 1099     for ParamName in ["ApproxSimilarityAdjuster", "FragSimilarityAdjuster", "SimilarityCutoff", "HitStart"]:
 1100         ParamValue = SynthonSearchParams[ParamName]
 1101         if ParamValue < 0:
 1102             MiscUtil.PrintError(
 1103                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: >= 0\n'
 1104                 % (ParamValue, ParamName, ParamsOptionName)
 1105             )
 1106         if re.match("^SimilarityCutoff$", ParamName, re.I):
 1107             if ParamValue > 1:
 1108                 MiscUtil.PrintError(
 1109                     'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: <= 1\n'
 1110                     % (ParamValue, ParamName, ParamsOptionName)
 1111                 )
 1112 
 1113     for ParamName in ["MaxNumFrags", "TimeOut"]:
 1114         ParamValue = SynthonSearchParams[ParamName]
 1115         if ParamValue <= 0:
 1116             MiscUtil.PrintError(
 1117                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
 1118                 % (ParamValue, ParamName, ParamsOptionName)
 1119             )
 1120 
 1121     for ParamName in ["MaxHits", "RandomSeed"]:
 1122         ParamValue = SynthonSearchParams[ParamName]
 1123         if not (ParamValue == -1 or ParamValue > 0):
 1124             MiscUtil.PrintError(
 1125                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: -1 or > 0\n'
 1126                 % (ParamValue, ParamName, ParamsOptionName)
 1127             )
 1128 
 1129     ParamName = "NumThreads"
 1130     ParamValue = SynthonSearchParams[ParamName]
 1131     if ParamValue > 0:
 1132         if ParamValue > mp.cpu_count():
 1133             MiscUtil.PrintWarning(
 1134                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is greater than number of CPUs, %s, returned by mp.cpu_count().'
 1135                 % (ParamValue, ParamName, ParamsOptionName, mp.cpu_count())
 1136             )
 1137     elif ParamValue < 0:
 1138         if abs(ParamValue) > mp.cpu_count():
 1139             MiscUtil.PrintWarning(
 1140                 'The absolute parameter value, %s, specified for parameter name, %s, using "%s" option is greater than number of CPUs, %s, returned by mp.cpu_count().'
 1141                 % (abs(ParamValue), ParamName, ParamsOptionName, mp.cpu_count())
 1142             )
 1143 
 1144     # Setup RDKit object for synthon space search parameters...
 1145     RDKitSynthonSearchParams = rdSynthonSpaceSearch.SynthonSpaceSearchParams()
 1146     for ParamName in SynthonSearchParams.keys():
 1147         ParamValue = SynthonSearchParams[ParamName]
 1148 
 1149         # Convert first letter to lower case for RDKit param name and set its value...
 1150         RDKitParamName = LowercaseFirstLetter(ParamName)
 1151         if hasattr(RDKitSynthonSearchParams, RDKitParamName):
 1152             setattr(RDKitSynthonSearchParams, RDKitParamName, ParamValue)
 1153         else:
 1154             MiscUtil.PrintWarning(
 1155                 "The synthon space search paramater, %s, is not available in RDKit. Ignoring parameter..." % ParamName
 1156             )
 1157 
 1158     OptionsInfo["CountHitsMode"] = False if SynthonSearchParams["BuildHits"] else True
 1159 
 1160     OptionsInfo["SynthonSearchParams"] = SynthonSearchParams
 1161     OptionsInfo["RDKitSynthonSearchParams"] = RDKitSynthonSearchParams
 1162 
 1163 
 1164 def LowercaseFirstLetter(Text):
 1165     """Convert first letter of a string to lowercase."""
 1166 
 1167     if Text is None or len(Text) == 0:
 1168         return Text
 1169 
 1170     return Text[0].lower() + Text[1:]
 1171 
 1172 
 1173 def ProcessQueryPatternOption():
 1174     """Process query pattern option."""
 1175 
 1176     QueryPattern = None if re.match("^None$", Options["--queryPattern"], re.I) else Options["--queryPattern"]
 1177     QueryPatternMols = None
 1178 
 1179     if QueryPattern is not None:
 1180         QueryPatternMols = []
 1181         Patterns = QueryPattern.split()
 1182         for Pattern in Patterns:
 1183             PatternMol = Chem.MolFromSmarts(Pattern)
 1184             if PatternMol is None:
 1185                 MiscUtil.PrintError(
 1186                     'The value specified, %s, using option "--queryPattern" is not a valid SMARTS: Failed to create pattern molecule'
 1187                     % (Pattern)
 1188                 )
 1189             QueryPatternMols.append(PatternMol)
 1190 
 1191     OptionsInfo["QueryPattern"] = QueryPattern
 1192     OptionsInfo["QueryPatternMols"] = QueryPatternMols
 1193 
 1194 
 1195 def ProcessOptions():
 1196     """Process and validate command line arguments and options."""
 1197 
 1198     MiscUtil.PrintInfo("Processing options...")
 1199 
 1200     # Validate options...
 1201     ValidateOptions()
 1202 
 1203     OptionsInfo["Mode"] = Options["--mode"]
 1204     OptionsInfo["RascalSimilaritySearchMode"] = (
 1205         True if re.match("^RASCALSimilaritySearch$", Options["--mode"], re.I) else False
 1206     )
 1207     OptionsInfo["SimilaritySearchMode"] = True if re.match("^SimilaritySearch$", Options["--mode"], re.I) else False
 1208     OptionsInfo["SubstructureSearchMode"] = True if re.match("^SubstructureSearch$", Options["--mode"], re.I) else False
 1209 
 1210     OptionsInfo["Fingerprints"] = Options["--fingerprints"]
 1211 
 1212     OptionsInfo["FingerprintsParams"] = Options["--fingerprintsParams"]
 1213     ProcessFingerprintsParameters()
 1214 
 1215     OptionsInfo["Infile"] = Options["--infile"]
 1216 
 1217     ProcessOutfileParameters()
 1218 
 1219     OptionsInfo["Overwrite"] = Options["--overwrite"]
 1220 
 1221     ProcessQueryPatternOption()
 1222 
 1223     OptionsInfo["QueryFile"] = None if re.match("^none$", Options["--queryFile"]) else Options["--queryFile"]
 1224     if OptionsInfo["QueryFile"] is None:
 1225         OptionsInfo["QueryFileParams"] = None
 1226     else:
 1227         OptionsInfo["QueryFileParams"] = MiscUtil.ProcessOptionInfileParameters(
 1228             "--queryFileParams", Options["--queryFileParams"], Options["--queryFile"]
 1229         )
 1230 
 1231     ProcessRascalSearchParametersOption()
 1232 
 1233     ProcessSubstructureMatchParametersOption()
 1234     ProcessSynthonSearchParamatersOption()
 1235 
 1236     OptionsInfo["Overwrite"] = Options["--overwrite"]
 1237 
 1238 
 1239 def RetrieveOptions():
 1240     """Retrieve command line arguments and options."""
 1241 
 1242     # Get options...
 1243     global Options
 1244     Options = docopt(_docoptUsage_)
 1245 
 1246     # Set current working directory to the specified directory...
 1247     WorkingDir = Options["--workingdir"]
 1248     if WorkingDir:
 1249         os.chdir(WorkingDir)
 1250 
 1251     # Handle examples option...
 1252     if "--examples" in Options and Options["--examples"]:
 1253         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 1254         sys.exit(0)
 1255 
 1256 
 1257 def ValidateOptions():
 1258     """Validate option values."""
 1259 
 1260     MiscUtil.ValidateOptionTextValue(
 1261         "-m, --mode",
 1262         Options["--mode"],
 1263         "FingerprintsGeneration BinaryDBFileGeneration LibraryEnumeration RASCALSimilaritySearch SimilaritySearch SubstructureSearch",
 1264     )
 1265 
 1266     MiscUtil.ValidateOptionTextValue(
 1267         "-f, --fingerprints",
 1268         Options["--fingerprints"],
 1269         "AtomPairs Morgan MorganFeatures PathLength TopologicalTorsions",
 1270     )
 1271 
 1272     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 1273     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "txt csv spc")
 1274 
 1275     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi csv tsv txt spc")
 1276     if re.match("^SingleFile$", Options["--outfileMode"], re.I):
 1277         MiscUtil.ValidateOptionsOutputFileOverwrite(
 1278             "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
 1279         )
 1280     MiscUtil.ValidateOptionsDistinctFileNames(
 1281         "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]
 1282     )
 1283 
 1284     if re.match("^(FingerprintsGeneration|BinaryDBFileGeneration)$", Options["--mode"], re.I):
 1285         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "spc")
 1286         if not MiscUtil.CheckFileExt(Options["--outfile"], "spc"):
 1287             MiscUtil.PrintError(
 1288                 'The file name specified , %s, for option "--outfile" is not valid during, %s, value of "--mode" option. Supported file formats: spc\n'
 1289                 % (Options["--outfile"], Options["--mode"])
 1290             )
 1291     elif re.match("^LibraryEnumeration$", Options["--mode"], re.I):
 1292         if not MiscUtil.CheckFileExt(Options["--outfile"], "smi"):
 1293             MiscUtil.PrintError(
 1294                 'The file name specified , %s, for option "--outfile" is not valid during, %s, value of "--mode" option. Supported file formats: smi\n'
 1295                 % (Options["--outfile"], Options["--mode"])
 1296             )
 1297     elif re.match("^(RASCALSimilaritySearch|SimilaritySearch|SubstructureSearch)$", Options["--mode"], re.I):
 1298         if not MiscUtil.CheckFileExt(Options["--outfile"], "sdf sd smi csv tsv txt"):
 1299             MiscUtil.PrintError(
 1300                 'The file name specified , %s, for option "--outfile" is not valid during, %s, value of "--mode" option. Supported file formats: sdf sd smi csv tsv txt\n'
 1301                 % (Options["--outfile"], Options["--mode"])
 1302             )
 1303 
 1304     MiscUtil.ValidateOptionTextValue("--outfileMode", Options["--outfileMode"], "SingleFile or MultipleFiles")
 1305 
 1306     QueryPattern = Options["--queryPattern"]
 1307     if re.match("^SubstructureSearch$", Options["--mode"], re.I):
 1308         if re.match("^None$", QueryPattern, re.I):
 1309             MiscUtil.PrintError(
 1310                 'You must specify a valid SMARTS pattern(s) for option "--queryPattern" during, SubstructureSearch, value of "-m, --mode" option.'
 1311             )
 1312 
 1313     PatternMols = []
 1314     if not re.match("^None$", QueryPattern, re.I):
 1315         Patterns = QueryPattern.split()
 1316         for Pattern in Patterns:
 1317             PatternMol = Chem.MolFromSmarts(Pattern)
 1318             if PatternMol is None:
 1319                 MiscUtil.PrintError(
 1320                     'The value specified, %s, using option "--queryPattern" is not a valid SMARTS: Failed to create pattern molecule'
 1321                     % (Pattern)
 1322                 )
 1323             PatternMols.append(PatternMol)
 1324 
 1325     if re.match("^SubstructureSearch$", Options["--mode"], re.I):
 1326         if len(PatternMols) == 0:
 1327             MiscUtil.PrintError(
 1328                 'You must specify a valid SMARTS pattern(s) for option "--queryPattern" during, SubstructureSearch, value of "-m, --mode" option.'
 1329             )
 1330 
 1331     if re.match("^(RASCALSimilaritySearch|SimilaritySearch)$", Options["--mode"], re.I):
 1332         if re.match("^None$", Options["--queryFile"], re.I):
 1333             MiscUtil.PrintError(
 1334                 'You must specify a valid filename for option "--queryFile" during, SimilaritySearch, value of "-m, --mode" option.'
 1335             )
 1336 
 1337     if not re.match("^None$", Options["--queryFile"], re.I):
 1338         MiscUtil.ValidateOptionFilePath("--queryFile", Options["--queryFile"])
 1339         MiscUtil.ValidateOptionFileExt("--queryFile", Options["--queryFile"], "sdf sd smi csv tsv")
 1340 
 1341 
 1342 # Setup a usage string for docopt...
 1343 _docoptUsage_ = """
 1344 RDKitPerformSynthonSpaceSearch.py - Perform a synthon space search
 1345 
 1346 Usage:
 1347     RDKitPerformSynthonSpaceSearch.py [--fingerprints <Morgan, PathLength...>] [--fingerprintsParams <Name,Value,...>]
 1348                                       [--mode <SubstructureSearch...>] [ --outfileParams <Name,Value,...>] [--outfileMode <SingleFile or MultipleFiles>]
 1349                                       [--overwrite] [--queryPattern <SMARTS>] [--queryFileParams <Name,Value,...>] [--queryFile <filename>]
 1350                                       [--rascalSearchParams <Name,Value,...>] [--substructureMatchParams <Name,Value,...>]
 1351                                       [--synthonSearchParams <Name,Value,...>] [-w <dir>] -i <infile> -o <outfile>
 1352     RDKitPerformSynthonSpaceSearch.py -l | --list -i <infile>
 1353     RDKitPerformSynthonSpaceSearch.py -h | --help | -e | --examples
 1354 
 1355 Description:
 1356     Perform a similarity or substructure search, using query molecules or SMARTS
 1357     patterns, against a synthon space [ Ref 174 ] in an input file, and write out the
 1358     hit molecules to output file(s). You may optionally count the hits without
 1359     building and writing them out.
 1360 
 1361     In addition, you may enumerate a combinatorial library corresponding to a
 1362     synthon space, generate fingerprints for a synthon space, or list information
 1363     about a synthon space.
 1364 
 1365     You must provide a valid synthon space text or binary database file supported
 1366     by RDKit module rdSynthonSpaceSearch.
 1367 
 1368     You may perform similarity search using fingerprints or employ RASCAL (RApid
 1369     Similarity CALculations using Maximum Edge Subgraphs) methodology [ Ref 175 ].
 1370 
 1371     A number of fingerprints are available for performing similarity search. The
 1372     similarity metric, however, is calculated using Tanimoto similarity on hashed
 1373     fingerprints. 
 1374 
 1375     The RASCAL similarity between two molecules is calculated based on MCES
 1376     (Maximum Common Edge Subgraphs) and corresponds to Johnson similarity.
 1377 
 1378     The supported input file formats are: CSV/TXT synthon space (.csv, .txt) or
 1379     binary synthon space (.spc).
 1380 
 1381     The supported outfile formats, for different '--mode' values, are shown
 1382     below:
 1383         
 1384         BinaryDBFileGeneration: Binary database file (.spc)
 1385         FingerprintsGeneration: Binary database file (.spc)
 1386         LibraryEnumeration: SMILES (.smi)
 1387         SimilaritySearch or SubstructureSearch: SD (.sdf, .sd), SMILES (.smi),
 1388             CSV/TSV (.csv or .tsv)
 1389         
 1390     Possible output files:
 1391          
 1392         <OutfileRoot>.<sdf,sd,smi,csv,tsv>
 1393          
 1394         <OutfileRoot>_Mol<Num>.<sdf,sd,smi,csv,tsv>
 1395         <OutfileRoot>_Pattern<Num>.<sdf,sd,smi,csv,tsv>
 1396          
 1397          <OutfileRoot>_HitCount.csv
 1398          
 1399     The <OutfileRoot>_HitCount.csv contains additional information regarding hit
 1400     counts and is written out for both similarity and substructure search.
 1401 
 1402 Options:
 1403     -f, --fingerprints <Morgan, PathLength...>  [default: Morgan]
 1404         Fingerprints to use for performing synthon space similarity search.
 1405         Supported values: AtomPairs, Morgan, MorganFeatures, PathLength,
 1406         TopologicalTorsions. The PathLength fingerprints are Daylight like
 1407         fingerprints. The Morgan and MorganFeatures fingerprints are circular
 1408         fingerprints, corresponding to Scitegic's Extended Connectivity Fingerprints
 1409         (ECFP) and Features Connectivity Fingerprints (FCFP). The values of
 1410         default parameters for generating fingerprints can be modified using
 1411         '--fingerprintsParams' option.
 1412     --fingerprintsParams <Name,Value,...>  [default: auto]
 1413         Parameter values to use for generating fingerprints. The default values
 1414         are dependent on the value of '-f, --fingerprints' option. In general, it is a
 1415         comma delimited list of parameter name and value pairs for the name of
 1416         fingerprints specified using '-f, --fingerprints' option. The supported
 1417         parameter names along with their default values for valid fingerprints
 1418         names are shown below:
 1419             
 1420             AtomPairs: minLength,1 ,maxLength,30, useChirality,No,
 1421                 use2D, yes, fpSize, 2048
 1422             Morgan: radius,2, useChirality,No, useBondTypes, yes,
 1423                 useRingMembership, yes, fpSize, 2048
 1424             MorganFeatures: radius,2, useChirality,No, useBondTypes, yes,
 1425                 useRingMembership, yes, fpSize, 2048
 1426             PathLength: minPath,1, maxPath,7, useExplicitHs, yes,
 1427                 useBranchedPaths, yes,useBondOrder,yes, fpSize, 2048,
 1428                 bitsPerHash,2
 1429             TopologicalTorsions: useChirality,No, fpSize, 2048
 1430             
 1431         A brief description of parameters, taken from RDKit documentation, is
 1432         provided below:
 1433             
 1434             AtomPairs:
 1435             
 1436             minLength: Minimum distance between atoms.
 1437             maxLength: Maximum distance between atoms.
 1438             useChirality: Use chirality for atom invariants.
 1439             use2D: Use topological distance matrix.
 1440             fpSize: Size of the fingerprints bit vector.
 1441             
 1442             Morgan and MorganFeatures:
 1443             
 1444             radius: Neighborhood radius.
 1445             useChirality: Use chirality to generate fingerprints.
 1446             useBondTypes: Use bond type for the bond invariants.
 1447             useRingMembership: Use ring membership.
 1448             fpSize: Size of the fingerprints bit vector.
 1449             
 1450             PathLength:
 1451             
 1452             minPath: Minimum bond path length.
 1453             maxPath: Maximum bond path length.
 1454             useExplicitHs: Use explicit hydrogens.
 1455             useBranchedPaths: Use branched paths along with linear paths.
 1456             useBondOrder: Use bond order in the path hashes.
 1457             fpSize: Size of the fingerprints bit vector.
 1458             bitsPerHash: Number of bits set per path.
 1459             
 1460             TopologicalTorsions:
 1461             
 1462             useChirality: Use chirality to generate fingerprints.
 1463             fpSize: Size of the fingerprints bit vector.
 1464             
 1465     -e, --examples
 1466         Print examples.
 1467     -h, --help
 1468         Print this help message.
 1469     -i, --infile <infile>
 1470         Synthon space Input file name.
 1471     -l, --list
 1472         List information about synthon space.
 1473     -m, --mode <SubstructureSearch...>  [default: SimilaritySearch]
 1474         Perform similarity or substructure search, enumerate synthon space,
 1475         or list information about a synthon space. The supported values along
 1476         with a brief explanation of the expected behavior are shown below:
 1477             
 1478             BinaryDBFileGeneration: Write out a binary database file for a
 1479                 synthon space.
 1480             FingerprintsGeneration: Generate fingerprints for a synthon space and
 1481                 write out a binary database file along with fingerprints.
 1482             LibraryEnumeration: Enumerate a combinatorial library for a synthon
 1483                 space and write out a SMILES file.
 1484             RASCALSimilaritySearch: Perform a RASCAL (RApid Similarity
 1485                 CALculations using Maximum Edge Subgrahps) similarity search.
 1486             SimilaritySearch: Perform a similarity search using fingerprints.
 1487             SubstructureSearch: Perform a substructure search using specified
 1488                 SMARTS patterns.
 1489             
 1490     -o, --outfile <outfile>
 1491         Output file name. The <OutfileRoot> and <OutfileExt> are used to generate
 1492         file names during 'MultipleFiles' value for '--outfileMode' option.
 1493     --outfileMode <SingleFile or MultipleFiles>  [default: SingleFile]
 1494         Write out a single file containing hit molecules for substructure or
 1495         similarity search or generate an individual file for each query pattern
 1496         or molecule. Possible values: SingleFile or MultipleFiles. The query
 1497         pattern number or molecule name is written to output file(s). The query
 1498         pattern or molecule number is also appended to output file names during
 1499         the generation of multiple output files.
 1500     --outfileParams <Name,Value,...>  [default: auto]
 1501         A comma delimited list of parameter name and value pairs for writing
 1502         molecules to files during similarity and substructure search. The supported
 1503         parameter names for different file formats, along with their default values,
 1504         are shown below:
 1505             
 1506             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 1507             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 1508                 smilesTitleLine,yes
 1509             
 1510         Default value for compute2DCoords: yes for SMILES input file; no for all other
 1511         file types. The kekulize and smilesIsomeric parameters are also used during
 1512         generation of SMILES strings for CSV/TSV files.
 1513     --queryPattern <SMARTS SMARTS ...>  [default: none]
 1514         A space delimited list of SMARTS patterns for performing substructure
 1515         search. This is required for 'SubstructureSearch' value of '--mode' option.
 1516     --queryFile <filename>  [default: none]
 1517         Input file containing query molecules for performing similarity search. This
 1518         is required for 'SimilaritySearch' value of '--mode' option.
 1519     --queryFileParams <Name,Value,...>  [default: auto]
 1520         A comma delimited list of parameter name and value pairs for reading 
 1521         molecules from query files during similarity search. The supported
 1522         parameter names for different file formats, along with their default
 1523         values, are shown below:
 1524             
 1525             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 1526             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 1527                 smilesTitleLine,auto,sanitize,yes
 1528             
 1529         Possible values for smilesDelimiter: space, comma or tab.
 1530     --rascalSearchParams <Name,Value,...>  [default: auto]
 1531         Parameter values to use for RASCAL similarity search.
 1532         
 1533         The default values are automatically updated to match RDKit default values.
 1534         The supported parameter names along with their default values are
 1535         shown below:
 1536             
 1537             allBestMCESs, no, completeAromaticRings, yes,
 1538             completeSmallestRings, no, exactConnectionsMatch, no, 
 1539             ignoreAtomAromaticity, yes, ignoreBondOrders, no,
 1540             maxBondMatchPairs, 1000, maxFragSeparation, -1, minCliqueSize, 0,
 1541             minFragSize, -1, returnEmptyMCES, false, ringMatchesRingOnly, false,
 1542             similarityThreshold, 0.7, singleLargestFrag, no,
 1543             timeout, 60
 1544             
 1545         A brief description of parameters, taken from RDKit documentation, is
 1546         provided below:
 1547             
 1548             allBestMCESs: Find all Maximum Common Edge Subgraphs (MCES).
 1549             completeAromaticRings: Use only complete aromatic rings.
 1550             completeSmallestRings: Only complete rings present in both
 1551                 molecules.
 1552             exactConnectionsMatch: Match atoms only when they have the same
 1553                 number of explicit connections.
 1554             ignoreAtomAromaticity: Ignore aromaticity during atom matching.
 1555             ignoreBondOrders: Ignore bond orders during atom matching.
 1556             maxBondMatchPairs: Maximum number of matching bond pairs.
 1557             maxFragSeparation: Maximum bond distance that bonds can match.
 1558                 A value of -1 implies no maximum.
 1559             minCliqueSize: A value of > 0 overrides the similarityThreshold.
 1560                 This refers to the minimum number of bonds in the MCES.
 1561             minFragSize: Minimum number of atoms in a fragment. A value of -1
 1562                 implies no minimum.
 1563             returnEmptyMCES: Return empty MCES results.
 1564             ringMatchesRingOnly: Match ring bonds to only ring bonds.
 1565             similarityThreshold: Similarity threshold for matching and
 1566                 evaluating MCES.
 1567             singleLargestFrag: Find only a single fragment for the MCES. By
 1568                 default, multiple fragments are generated as necessary.
 1569             timeout: Max run time in seconds. A value of -1 implies no max.
 1570             
 1571     --substructureMatchParams <Name,Value,...>  [default: auto]
 1572         Parameter values to use for substructure match during synthon substructure
 1573         search.
 1574         
 1575         The default values are automatically updated to match RDKit default values.
 1576         The supported parameter names along with their default values are
 1577         shown below:
 1578             
 1579             aromaticMatchesConjugated, no, maxMatches, 1000,
 1580             maxRecursiveMatches, 1000, recursionPossible, yes,
 1581             specifiedStereoQueryMatchesUnspecified, no,  uniquify, yes,
 1582             useChirality, no, useEnhancedStereo, no, useGenericMatchers, no,
 1583             
 1584         A brief description of parameters, taken from RDKit documentation, is
 1585         provided below:
 1586             
 1587             aromaticMatchesConjugated: Match aromatic and conjugated bonds.
 1588             maxMatches: Maximum number of matches.
 1589             maxRecursiveMatches: Maximum number of recursive matches.
 1590             recursionPossible: Allow recursive queries.
 1591             specifiedStereoQueryMatchesUnspecified: Match query atoms and bonds
 1592                 with specified stereochemistry to atoms and bonds with unspecified
 1593                 stereochemistry.
 1594             uniquify: Uniquify match results using atom indices.
 1595             useChirality: Use chirality to match atoms and bonds.
 1596             useEnhancedStereo: Use enhanced stereochemistry during the use
 1597                 of chirality.
 1598             useGenericMatchers: Use generic groups as a post-filtering step.
 1599             
 1600     --synthonSearchParams <Name,Value,...>  [default: auto]
 1601         Parameter values to use for performing synthon substructure and similarity
 1602         search.
 1603         
 1604         The default values are automatically updated to match RDKit default values.
 1605         The supported parameter names along with their default values are
 1606         shown below:
 1607             
 1608             approxSimilarityAdjuster, 0.1, [ Default value for Morgan FPs ]
 1609             buildHits, yes, fragSimilarityAdjuster, 0.1, hitStart, 0,
 1610             maxHits, 1000, [ A value of -1 retrieves all hits ]
 1611             maxNumFrags, 100000,
 1612             numThreads, 1 [ 0: Use maximum number of threads supported by the
 1613                 hardware; Negative value: Added to the maximum number of
 1614                 threads supported by the hardware ]
 1615             randomSample, no,
 1616             randomSeed, -1 [  Default value implies use random seed ]
 1617             similarityCutoff, 0.5, [ Default for Morgan FPs. Ignored during RASCAL
 1618                 similarity search; instead, RASCAL parameter similarityThreshold is
 1619                 used.  ]
 1620             timeOut, 600 [ Unit: sec. The RASCAL searches take longer and may
 1621                 need a higher value for timeOut. For example: 3600 ]
 1622             
 1623         A brief description of parameters, taken from RDKit documentation, is
 1624         provided below:
 1625             
 1626             approxSimilarityAdjuster: Value used for reducing similarity cutoff
 1627                 during approximate similarity check for fingerprint search. A
 1628                 lower value leads to faster run times at the risk of missing
 1629                 some hits.
 1630             buildHits: A no value implies to report the maximum number of hits a
 1631                 search could generate without returning any hits.
 1632             fragSimilarityAdjuster: Value used for reducing fragment matching
 1633                 similarity cutoff to accommodate low bit densities for fragments.
 1634             hitStart: Return hits starting from the specified sequence number
 1635                 to support retrieval of hits in batches.
 1636             maxHits: Maximum number of hits to return. A value of -1 implies
 1637                 retrieve all hits.
 1638             maxNumFrags: Maximum number of fragments for breaking a query. 
 1639             numThreads: Number of threads to use for search. A value of 0 
 1640                 implies the use of all available hardware threads. A negative
 1641                 value is added to the number of available hardware threads to
 1642                 calculate number of threads to use.
 1643             randomSample: Return a random sample of hits up to maxHits.
 1644             randomSeed: Random number seed to use during search. A value of -1
 1645                 implies the use of a random seed.
 1646             similarityCutoff: Similarity cutoff for returning hits by fingerprint
 1647                 similarity search. A default value of 0.5 is set for Morgan
 1648                 fingerprints.
 1649             timeOut: Time limit for search, in seconds. A value of 0 implies
 1650                 no timeout.
 1651             
 1652     --overwrite
 1653         Overwrite existing files.
 1654     -w, --workingdir <dir>
 1655         Location of working directory which defaults to the current directory.
 1656 
 1657 Examples:
 1658     To list information about a synthon space in a text file, type:
 1659 
 1660         % RDKitPerformSynthonSpaceSearch.py --list -i SampleSynthonSpace.csv
 1661 
 1662     To generate a binary database file for a synthon space in a text file, type:
 1663 
 1664         % RDKitPerformSynthonSpaceSearch.py -m BinaryDBFileGeneration
 1665           -i SampleSynthonSpace.csv -o SampleSynthonSpace.spc
 1666 
 1667     To enumerate a combinatorial library for a synthon space in a text file and
 1668     write out a SMILES file, type:
 1669 
 1670         % RDKitPerformSynthonSpaceSearch.py -m LibraryEnumeration
 1671           -i SampleSynthonSpace.csv -o SampleSynthonSpace_Library.smi
 1672 
 1673     To generate Morgan fingerprints for a synthon space in a text file, employing
 1674     radius of 2 and bit vector size of 2048, and write out a binary database file,
 1675     type:
 1676 
 1677         % RDKitPerformSynthonSpaceSearch.py -m FingerprintsGeneration
 1678           -i SampleSynthonSpace.csv -o SampleSynthonSpace_MorganFPs.spc
 1679 
 1680     To perform a similarity search using Morgan fingerprints for query molecules
 1681     in an input file, against a binary database file synthon space containing
 1682     Morgan fingerprints, employing radius 2 and bit vector size of 2048, finding
 1683     a maximum of 1000 hits for each query molecule, and write out a single output
 1684     file containing hit molecules, type:
 1685 
 1686         % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
 1687           -i SampleSynthonSpace_MorganFPs.spc
 1688           --queryFile SampleSynthonSpaceQuery.sdf
 1689           -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf
 1690 
 1691     or only count hits without building hits and writing them to an output
 1692     file:
 1693 
 1694         % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
 1695           -i SampleSynthonSpace_MorganFPs.spc
 1696           --queryFile SampleSynthonSpaceQuery.sdf
 1697           -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf
 1698           --synthonSearchParams "buildHits,No"
 1699 
 1700     To run previous example for writing individual output files for each query
 1701     molecule, type:
 1702 
 1703         % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
 1704           -i SampleSynthonSpace_MorganFPs.spc
 1705           --queryFile SampleSynthonSpaceQuery.sdf
 1706           -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf
 1707           --outfileMode MultipleFiles
 1708 
 1709     To run previous example for retrieving all possible hits for query molecules
 1710     and write out individual output files for each query molecule, type:
 1711 
 1712         % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
 1713           -i SampleSynthonSpace_MorganFPs.spc
 1714           --queryFile SampleSynthonSpaceQuery.sdf
 1715           -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.sdf
 1716           --outfileMode MultipleFiles
 1717           --synthonSearchParams "maxHits,-1"
 1718 
 1719     To run the previous example using multi-threading employing all available
 1720     threads on your machine, retrieve maximum of 1000 hits for each query
 1721     molecule and generate various output files, type:
 1722 
 1723         % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
 1724           -i SampleSynthonSpace_MorganFPs.spc
 1725           --queryFile SampleSynthonSpaceQuery.smi
 1726           -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.smi
 1727           --outfileMode MultipleFiles
 1728           --synthonSearchParams "maxHits, 1000, numThreads, 0"
 1729 
 1730     To run the previous example using multi-threading employing all but one
 1731     available threads on your machine, type:
 1732 
 1733         % RDKitPerformSynthonSpaceSearch.py -m SimilaritySearch
 1734           -i SampleSynthonSpace_MorganFPs.spc
 1735           --queryFile SampleSynthonSpaceQuery.smi
 1736           -o SampleSynthonSpace_SimilaritySearchResultsMorganFPs.smi
 1737           --outfileMode MultipleFiles
 1738           --synthonSearchParams "maxHits, 1000, numThreads, -1"
 1739 
 1740     To perform a substructure search using query pattern SMARTS against a synthon
 1741     space file, finding a maximum of 1000 hits for each query pattern and write out
 1742     a single output file containing hit molecules, type:
 1743 
 1744         % RDKitPerformSynthonSpaceSearch.py -m SubstructureSearch
 1745           -i SampleSynthonSpace.spc
 1746           --queryPattern "c12ccc(C)cc1[nH]nc2C(=O)NCc1cncs1"
 1747           -o SampleSynthonSpace_SubstructureSearchResults.sdf
 1748 
 1749         % RDKitPerformSynthonSpaceSearch.py -m SubstructureSearch
 1750           -i SampleSynthonSpace.csv
 1751           --queryPattern 'c1c[n,s,o][n,s,o,c]c1C(=O)[$(N1CCCCC1),$(N1CCCC1)]'
 1752           -o SampleSynthonSpace_SubstructureSearchResults.sdf
 1753 
 1754     To run the previous example for writing out individual output files
 1755     for each query pattern, type:
 1756 
 1757         % RDKitPerformSynthonSpaceSearch.py -m SubstructureSearch
 1758           -i SampleSynthonSpace.spc
 1759           --queryPattern "CCN(C(=O)c1cc2cc(OC)ccc2nc1C)C1CCCN(C(=O)OC(C)(C)C)C1 
 1760           C=CCc1c(N[C@H](C)c2cccc(C)c2)ncnc1N(C)CCCC(=O)OC"
 1761           -o SampleSynthonSpace_SubstructureSearchResults.sdf
 1762           --outfileMode MultipleFiles
 1763 
 1764     To perform RASCAL similarity search for query molecules in an input file,
 1765     against a binary database file synthon space, finding a maximum of 1000 hits
 1766     for each query molecule, using multi-threading employing all available CPUs,
 1767     timing out after 3600 seconds, and write out a single output file containing
 1768     hit molecules, type:
 1769 
 1770         % RDKitPerformSynthonSpaceSearch.py -m RASCALSimilaritySearch
 1771           -i SampleSynthonSpace.spc
 1772           --queryFile SampleSynthonSpaceQuery.sdf
 1773           -o SampleSynthonSpace_RASCALSimilaritySearchResults.sdf
 1774           --synthonSearchParams "maxHits, 1000, numThreads, 0, timeOut, 3600"
 1775 
 1776 Author:
 1777     Manish Sud(msud@san.rr.com)
 1778 
 1779 Acknowledgment:
 1780     Dave Cosgrove
 1781 
 1782 See also:
 1783     RDKitConvertFileFormat.py, RDKitPickDiverseMolecules.py, RDKitSearchFunctionalGroups.py,
 1784     RDKitSearchSMARTS.py
 1785 
 1786 Copyright:
 1787     Copyright (C) 2026 Manish Sud. All rights reserved.
 1788 
 1789     The functionality available in this script is implemented using RDKit, an
 1790     open source toolkit for cheminformatics developed by Greg Landrum.
 1791 
 1792     This file is part of MayaChemTools.
 1793 
 1794     MayaChemTools is free software; you can redistribute it and/or modify it under
 1795     the terms of the GNU Lesser General Public License as published by the Free
 1796     Software Foundation; either version 3 of the License, or (at your option) any
 1797     later version.
 1798 
 1799 """
 1800 
 1801 if __name__ == "__main__":
 1802     main()