MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitGenerateConformers.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem import AllChem
  43     from rdkit.Chem import Descriptors
  44 except ImportError as ErrMsg:
  45     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  46     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  47     sys.exit(1)
  48 
  49 # MayaChemTools imports...
  50 try:
  51     from docopt import docopt
  52     import MiscUtil
  53     import RDKitUtil
  54 except ImportError as ErrMsg:
  55     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  56     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  57     sys.exit(1)
  58 
  59 ScriptName = os.path.basename(sys.argv[0])
  60 Options = {}
  61 OptionsInfo = {}
  62 
  63 def main():
  64     """Start execution of the script"""
  65     
  66     MiscUtil.PrintInfo("\n%s (RDK v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, time.asctime()))
  67     
  68     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  69     
  70     # Retrieve command line arguments and options...
  71     RetrieveOptions()
  72     
  73     # Process and validate command line arguments and options...
  74     ProcessOptions()
  75     
  76     # Perform actions required by the script...
  77     GenerateConformers()
  78     
  79     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  80     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  81 
  82 def GenerateConformers():
  83     """Generate conformers."""
  84     
  85     # Setup a molecule reader...
  86     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  87     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  88     
  89     # Set up a molecule writer...
  90     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
  91     if Writer is None:
  92         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
  93     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
  94 
  95     MolCount, ValidMolCount, ConfGenFailedCount = ProcessMolecules(Mols, Writer)
  96 
  97     if Writer is not None:
  98         Writer.close()
  99     
 100     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 101     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 102     MiscUtil.PrintInfo("Number of molecules failed during conformation generation or minimization: %d" % ConfGenFailedCount)
 103     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + ConfGenFailedCount))
 104 
 105 def ProcessMolecules(Mols, Writer):
 106     """Process molecules to generate comformers."""
 107     
 108     if OptionsInfo["MPMode"]:
 109         return ProcessMoleculesUsingMultipleProcesses(Mols, Writer)
 110     else:
 111         return ProcessMoleculesUsingSingleProcess(Mols, Writer)
 112 
 113 def ProcessMoleculesUsingSingleProcess(Mols, Writer):
 114     """Process molecules to generate conformers using a single process."""
 115     
 116     if OptionsInfo["SkipForceFieldMinimization"]:
 117         MiscUtil.PrintInfo("\nGenerating conformers without performing energy minimization...")
 118     else:
 119         MiscUtil.PrintInfo("\nGenerating conformers and performing energy minimization...")
 120     
 121     (MolCount, ValidMolCount, ConfGenFailedCount) = [0] * 3
 122     for Mol in Mols:
 123         MolCount += 1
 124         
 125         if Mol is None:
 126             continue
 127         
 128         if RDKitUtil.IsMolEmpty(Mol):
 129             if not OptionsInfo["QuietMode"]:
 130                 MolName = RDKitUtil.GetMolName(Mol, MolCount)
 131                 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 132             continue
 133         ValidMolCount += 1
 134 
 135         ConformerMol, CalcStatus, ConfIDs = GenerateMolConformers(Mol, MolCount)
 136         
 137         if not CalcStatus:
 138             ConfGenFailedCount += 1
 139             continue
 140         
 141         WriteMolConformers(Writer, ConformerMol, MolCount, ConfIDs)
 142     
 143     return (MolCount, ValidMolCount, ConfGenFailedCount)
 144 
 145 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
 146     """Process and minimize molecules using multiprocessing."""
 147     
 148     if OptionsInfo["SkipForceFieldMinimization"]:
 149         MiscUtil.PrintInfo("\nGenerating conformers without performing energy minimization using multiprocessing...")
 150     else:
 151         MiscUtil.PrintInfo("\nGenerating conformers and performing energy minimization using multiprocessing...")
 152     
 153     MPParams = OptionsInfo["MPParams"]
 154     
 155     # Setup data for initializing a worker process...
 156     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 157     
 158     # Setup a encoded mols data iterable for a worker process...
 159     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 160 
 161     # Setup process pool along with data initialization for each process...
 162     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 163     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 164     
 165     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 166     
 167     # Start processing...
 168     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 169         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 170     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 171         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 172     else:
 173         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 174     
 175     (MolCount, ValidMolCount, ConfGenFailedCount) = [0] * 3
 176     for Result in Results:
 177         MolCount += 1
 178         MolIndex, EncodedMol, CalcStatus, ConfIDs = Result
 179         
 180         if EncodedMol is None:
 181             continue
 182         ValidMolCount += 1
 183         
 184         if not CalcStatus:
 185             ConfGenFailedCount += 1
 186             continue
 187             
 188         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 189         WriteMolConformers(Writer, Mol, MolCount, ConfIDs)
 190     
 191     return (MolCount, ValidMolCount, ConfGenFailedCount)
 192 
 193 def InitializeWorkerProcess(*EncodedArgs):
 194     """Initialize data for a worker process."""
 195     
 196     global Options, OptionsInfo
 197 
 198     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 199     
 200     # Decode Options and OptionInfo...
 201     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 202     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 203     
 204 def WorkerProcess(EncodedMolInfo):
 205     """Process data for a worker process."""
 206 
 207     MolIndex, EncodedMol = EncodedMolInfo
 208     
 209     CalcStatus = False
 210     ConfIDs = None
 211     
 212     if EncodedMol is None:
 213         return [MolIndex, None, CalcStatus, ConfIDs]
 214 
 215     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 216     if RDKitUtil.IsMolEmpty(Mol):
 217         if not OptionsInfo["QuietMode"]:
 218             MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 219             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 220         return [MolIndex, None, CalcStatus, ConfIDs]
 221     
 222     Mol, CalcStatus, ConfIDs = GenerateMolConformers(Mol, (MolIndex + 1))
 223 
 224     return [MolIndex, RDKitUtil.MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps), CalcStatus, ConfIDs]
 225 
 226 def GenerateMolConformers(Mol, MolNum = None):
 227     "Generate conformers for a molecule"
 228     
 229     if OptionsInfo["SkipForceFieldMinimization"]:
 230         return GenerateMolConformersWithoutMinimization(Mol, MolNum)
 231     else:
 232         return GenerateMolConformersWithMinimization(Mol, MolNum)
 233 
 234 def GenerateMolConformersWithoutMinimization(Mol, MolNum = None):
 235     "Generate conformers for a molecule without performing minimization."
 236     
 237     ConfIDs = EmbedMolecule(Mol, MolNum)
 238     if not len(ConfIDs):
 239         if not OptionsInfo["QuietMode"]:
 240             MolName = RDKitUtil.GetMolName(Mol, MolNum)
 241             MiscUtil.PrintWarning("Conformation generation couldn't be performed for molecule %s: Embedding failed...\n" % MolName)
 242         return [Mol, False, None]
 243     
 244     if OptionsInfo["AlignConformers"]:
 245         AllChem.AlignMolConformers(Mol)
 246     
 247     if not OptionsInfo["QuietMode"]:
 248         MolName = RDKitUtil.GetMolName(Mol, MolNum)
 249         MiscUtil.PrintInfo("\nNumber of conformations generated for %s: %d" % (MolName, len(ConfIDs)))
 250 
 251     # Convert ConfIDs into a list...
 252     ConfIDsList = [ConfID for ConfID in ConfIDs]
 253     
 254     return [Mol, True, ConfIDsList]
 255 
 256 def GenerateMolConformersWithMinimization(Mol, MolNum):
 257     "Generate and mininize conformers for a molecule."
 258     
 259     if  OptionsInfo["AddHydrogens"]:
 260         Mol = Chem.AddHs(Mol)
 261     
 262     ConfIDs = EmbedMolecule(Mol, MolNum)
 263     if not len(ConfIDs):
 264         if not OptionsInfo["QuietMode"]:
 265             MolName = RDKitUtil.GetMolName(Mol, MolNum)
 266             MiscUtil.PrintWarning("Conformation generation couldn't be performed for molecule %s: Embedding failed...\n" % MolName)
 267         return [Mol, False, None]
 268 
 269     CalcEnergyMap = {}
 270     for ConfID in ConfIDs:
 271         try:
 272             if OptionsInfo["UseUFF"]:
 273                 Status = AllChem.UFFOptimizeMolecule(Mol, confId = ConfID, maxIters = OptionsInfo["MaxIters"])
 274             elif OptionsInfo["UseMMFF"]:
 275                 Status = AllChem.MMFFOptimizeMolecule(Mol, confId = ConfID, maxIters = OptionsInfo["MaxIters"])
 276             else:
 277                 MiscUtil.PrintError("Minimization couldn't be performed: Specified forcefield, %s, is not supported" % OptionsInfo["ForceField"])
 278         except RuntimeError as ErrMsg:
 279             if not OptionsInfo["QuietMode"]:
 280                 MolName = RDKitUtil.GetMolName(Mol, MolNum)
 281                 MiscUtil.PrintWarning("Minimization couldn't be performed for molecule %s:\n%s\n" % (MolName, ErrMsg))
 282             return [Mol, False, None]
 283         
 284         EnergyStatus, Energy = GetConformerEnergy(Mol, ConfID)
 285         if not EnergyStatus:
 286             if not OptionsInfo["QuietMode"]:
 287                 MolName = RDKitUtil.GetMolName(Mol, MolNum)
 288                 MiscUtil.PrintWarning("Failed to retrieve calculated energy for conformation number %d of molecule %s. Try again after removing any salts or cleaing up the molecule...\n" % (ConfID, MolName))
 289             return [Mol, False, None]
 290         
 291         if Status != 0:
 292             if not OptionsInfo["QuietMode"]:
 293                 MolName = RDKitUtil.GetMolName(Mol, MolNum)
 294                 MiscUtil.PrintWarning("Minimization failed to converge for conformation number %d of molecule %s in %d steps. Try using higher value for \"--maxIters\" option...\n" % (ConfID, MolName, OptionsInfo["MaxIters"]))
 295             
 296         CalcEnergyMap[ConfID] = Energy
 297 
 298     if  OptionsInfo["RemoveHydrogens"]:
 299         Mol = Chem.RemoveHs(Mol)
 300 
 301     # Align molecules after minimization...
 302     if OptionsInfo["AlignConformers"]:
 303         AllChem.AlignMolConformers(Mol)
 304     
 305     SortedConfIDs = sorted(ConfIDs, key = lambda ConfID: CalcEnergyMap[ConfID])
 306     
 307     MinEnergyConfID = SortedConfIDs[0]
 308     MinConfEnergy = CalcEnergyMap[MinEnergyConfID]
 309     EnergyWindow = OptionsInfo["EnergyWindow"]
 310 
 311     EnergyRMSDCutoff = OptionsInfo["EnergyRMSDCutoff"]
 312     ApplyEnergyRMSDCutoff = False
 313     if EnergyRMSDCutoff > 0:
 314         ApplyEnergyRMSDCutoff = True
 315 
 316     # Calculate RMSD values for conformers...
 317     PreAligned = False
 318     if OptionsInfo["AlignConformers"]:
 319         PreAligned = True
 320     
 321     CalcRMSDMap = {}
 322     if ApplyEnergyRMSDCutoff:
 323         for ConfID in SortedConfIDs:
 324             RMSD = AllChem.GetConformerRMS(Mol, MinEnergyConfID, ConfID, prealigned=PreAligned)
 325             CalcRMSDMap[ConfID] = RMSD
 326     
 327     # Track conformers with in the specified energy window  from the lowest
 328     # energy conformation along with applying RMSD cutoff as needed...
 329     #
 330     SelectedConfIDs = []
 331     
 332     ConfCount = 0
 333     IgnoredByEnergyConfCount = 0
 334     IgnoredByRMSDConfCount = 0
 335     
 336     FirstConf = True
 337     
 338     for ConfID in SortedConfIDs:
 339         if FirstConf:
 340             FirstConf = False
 341             SelectedConfIDs.append(ConfID)
 342             continue
 343             
 344         ConfEnergyDiff = abs(CalcEnergyMap[ConfID] - MinConfEnergy )
 345         if  ConfEnergyDiff > EnergyWindow:
 346             IgnoredByEnergyConfCount += 1
 347             continue
 348         
 349         if ApplyEnergyRMSDCutoff:
 350             if CalcRMSDMap[ConfID] < EnergyRMSDCutoff:
 351                 IgnoredByRMSDConfCount += 1
 352                 continue
 353 
 354         ConfCount += 1
 355         SelectedConfIDs.append(ConfID)
 356     
 357     if not OptionsInfo["QuietMode"]:
 358         MolName = RDKitUtil.GetMolName(Mol, MolNum)
 359         MiscUtil.PrintInfo("\nTotal Number of conformations generated for %s: %d" % (MolName, ConfCount))
 360         MiscUtil.PrintInfo("Number of conformations ignored due to energy window cutoff: %d" % (IgnoredByEnergyConfCount))
 361         if ApplyEnergyRMSDCutoff:
 362             MiscUtil.PrintInfo("Number of conformations ignored due to energy RMSD cutoff:  %d" % (IgnoredByRMSDConfCount))
 363 
 364     return [Mol, True, SelectedConfIDs]
 365     
 366 def GetConformerEnergy(Mol, ConfID):
 367     "Calculate conformer energy"
 368 
 369     Status = True
 370     Energy = 0.0
 371     
 372     if OptionsInfo["UseUFF"]:
 373         UFFMoleculeForcefield = AllChem.UFFGetMoleculeForceField(Mol, confId = ConfID)
 374         if UFFMoleculeForcefield is None:
 375             Status = False
 376         else:
 377             Energy = UFFMoleculeForcefield.CalcEnergy()
 378     elif OptionsInfo["UseMMFF"]:
 379         MMFFMoleculeProperties = AllChem.MMFFGetMoleculeProperties(Mol)
 380         MMFFMoleculeForcefield = AllChem.MMFFGetMoleculeForceField(Mol, MMFFMoleculeProperties, confId = ConfID)
 381         if MMFFMoleculeForcefield is None:
 382             Status = False
 383         else:
 384             Energy = MMFFMoleculeForcefield.CalcEnergy()
 385     else:
 386         MiscUtil.PrintError("Couldn't retrieve conformer energy: Specified forcefield, %s, is not supported" % OptionsInfo["ForceField"])
 387 
 388     return (Status, Energy)
 389     
 390 def EmbedMolecule(Mol, MolNum = None):
 391     "Embed conformations"
 392     
 393     ConfIDs = []
 394     
 395     # Figure out the number of conformations to embded...
 396     if re.match("^Auto$", OptionsInfo["MaxConfs"], re.I):
 397         NumOfRotBonds = Descriptors.NumRotatableBonds(Mol)
 398         if NumOfRotBonds <= 5:
 399             MaxConfs = 100
 400         elif NumOfRotBonds >=6 and NumOfRotBonds <= 10:
 401             MaxConfs = 200
 402         else:
 403             MaxConfs = 300
 404     else:
 405         MaxConfs = int(OptionsInfo["MaxConfs"])
 406         
 407     RandomSeed = OptionsInfo["RandomSeed"]
 408     EnforceChirality = OptionsInfo["EnforceChirality"]
 409     UseExpTorsionAnglePrefs = OptionsInfo["UseExpTorsionAnglePrefs"]
 410     UseBasicKnowledge = OptionsInfo["UseBasicKnowledge"]
 411     EmbedRMSDCutoff = OptionsInfo["EmbedRMSDCutoff"]
 412     
 413     try:
 414         ConfIDs = AllChem.EmbedMultipleConfs(Mol, numConfs = MaxConfs, randomSeed = RandomSeed, pruneRmsThresh = EmbedRMSDCutoff, enforceChirality = EnforceChirality, useExpTorsionAnglePrefs = UseExpTorsionAnglePrefs, useBasicKnowledge = UseBasicKnowledge)
 415     except ValueError as ErrMsg:
 416         if not OptionsInfo["QuietMode"]:
 417             MolName = RDKitUtil.GetMolName(Mol, MolNum)
 418             MiscUtil.PrintWarning("Embedding failed  for molecule %s:\n%s\n" % (MolName, ErrMsg))
 419         ConfIDs = []
 420 
 421     return ConfIDs
 422 
 423 def WriteMolConformers(Writer, Mol, MolNum, ConfIDs):
 424     """Write molecule coformers. """
 425 
 426     if ConfIDs is None:
 427         return
 428     
 429     MolName = RDKitUtil.GetMolName(Mol, MolNum)
 430     
 431     for ConfID in ConfIDs:
 432         SetConfMolName(Mol, MolName, ConfID)
 433         Writer.write(Mol, confId = ConfID)
 434     
 435 def SetConfMolName(Mol, MolName, ConfCount):
 436     """Set conf mol name"""
 437 
 438     ConfName = "%s_Conf%d" % (MolName, ConfCount)
 439     Mol.SetProp("_Name", ConfName)
 440 
 441 def ProcessOptions():
 442     """Process and validate command line arguments and options"""
 443     
 444     MiscUtil.PrintInfo("Processing options...")
 445     
 446     # Validate options...
 447     ValidateOptions()
 448     
 449     OptionsInfo["Infile"] = Options["--infile"]
 450     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 451     
 452     OptionsInfo["Outfile"] = Options["--outfile"]
 453     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"])
 454     
 455     OptionsInfo["Overwrite"] = Options["--overwrite"]
 456 
 457     OptionsInfo["AddHydrogens"] = True
 458     if re.match("^no$", Options["--addHydrogens"], re.I):
 459         OptionsInfo["AddHydrogens"] = False
 460 
 461     OptionsInfo["AlignConformers"] = True
 462     if re.match("^no$", Options["--alignConformers"], re.I):
 463         OptionsInfo["AlignConformers"] = False
 464 
 465     if re.match("^ETDG$", Options["--conformerGenerator"], re.I):
 466         ConformerGenerator = "ETDG"
 467         UseExpTorsionAnglePrefs = True
 468         UseBasicKnowledge = False
 469     elif re.match("^KDG$", Options["--conformerGenerator"], re.I):
 470         ConformerGenerator = "KDG"
 471         UseExpTorsionAnglePrefs = False
 472         UseBasicKnowledge = True
 473     elif re.match("^ETKDG$", Options["--conformerGenerator"], re.I):
 474         ConformerGenerator = "ETKDG"
 475         UseExpTorsionAnglePrefs = True
 476         UseBasicKnowledge = True
 477     elif re.match("^SDG$", Options["--conformerGenerator"], re.I):
 478         ConformerGenerator = "SDG"
 479         UseExpTorsionAnglePrefs = False
 480         UseBasicKnowledge = False
 481     
 482     OptionsInfo["ConformerGenerator"] = ConformerGenerator
 483     OptionsInfo["UseExpTorsionAnglePrefs"] = UseExpTorsionAnglePrefs
 484     OptionsInfo["UseBasicKnowledge"] = UseBasicKnowledge
 485 
 486     if re.match("^UFF$", Options["--forceField"], re.I):
 487         ForceField = "UFF"
 488         UseUFF = True
 489         UseMMFF = False
 490         SkipForceFieldMinimization = False
 491     elif re.match("^MMFF$", Options["--forceField"], re.I):
 492         ForceField = "MMFF"
 493         UseUFF = False
 494         UseMMFF = True
 495         SkipForceFieldMinimization = False
 496     else:
 497         ForceField = "None"
 498         UseUFF = False
 499         UseMMFF = False
 500         SkipForceFieldMinimization = True
 501         
 502     OptionsInfo["SkipForceFieldMinimization"] = SkipForceFieldMinimization
 503     OptionsInfo["ForceField"] = ForceField
 504     OptionsInfo["UseMMFF"] = UseMMFF
 505     OptionsInfo["UseUFF"] = UseUFF
 506         
 507     OptionsInfo["EnforceChirality"] = True
 508     if re.match("^no$", Options["--enforceChirality"], re.I):
 509         OptionsInfo["EnforceChirality"] = False
 510     
 511     OptionsInfo["EnergyWindow"] = float(Options["--energyWindow"])
 512     
 513     EmbedRMSDCutoff = -1.0
 514     if not re.match("^none$", Options["--embedRMSDCutoff"], re.I):
 515         EmbedRMSDCutoff = float(Options["--embedRMSDCutoff"])
 516     OptionsInfo["EmbedRMSDCutoff"] = EmbedRMSDCutoff
 517     
 518     EnergyRMSDCutoff = -1.0
 519     if not re.match("^none$", Options["--energyRMSDCutoff"], re.I):
 520         EnergyRMSDCutoff = float(Options["--energyRMSDCutoff"])
 521     OptionsInfo["EnergyRMSDCutoff"] = EnergyRMSDCutoff
 522     
 523     OptionsInfo["MaxIters"] = int(Options["--maxIters"])
 524     OptionsInfo["MaxConfs"] = Options["--maxConfs"]
 525     
 526     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 527     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 528     
 529     OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False
 530     
 531     RandomSeed = -1
 532     if not re.match("^auto$", Options["--randomSeed"], re.I):
 533         RandomSeed = int(Options["--randomSeed"])
 534     OptionsInfo["RandomSeed"] = RandomSeed
 535     
 536     OptionsInfo["RemoveHydrogens"] = True
 537     if re.match("^no$", Options["--removeHydrogens"], re.I):
 538         OptionsInfo["RemoveHydrogens"] = False
 539     
 540 def RetrieveOptions():
 541     """Retrieve command line arguments and options"""
 542     
 543     # Get options...
 544     global Options
 545     Options = docopt(_docoptUsage_)
 546 
 547     # Set current working directory to the specified directory...
 548     WorkingDir = Options["--workingdir"]
 549     if WorkingDir:
 550         os.chdir(WorkingDir)
 551     
 552     # Handle examples option...
 553     if "--examples" in Options and Options["--examples"]:
 554         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 555         sys.exit(0)
 556 
 557 def ValidateOptions():
 558     """Validate option values"""
 559     
 560     MiscUtil.ValidateOptionTextValue("-a, --addHydrogens", Options["--addHydrogens"], "yes no")
 561     MiscUtil.ValidateOptionTextValue("--alignConformers", Options["--alignConformers"], "yes no")
 562     MiscUtil.ValidateOptionTextValue("-c, --conformerGenerator", Options["--conformerGenerator"], "SDG ETDG KDG ETKDG")
 563     MiscUtil.ValidateOptionTextValue("-f, --forceField", Options["--forceField"], "UFF MMFF None")
 564     
 565     MiscUtil.ValidateOptionTextValue("--enforceChirality ", Options["--enforceChirality"], "yes no")
 566     MiscUtil.ValidateOptionFloatValue("--energyWindow", Options["--energyWindow"], {">": 0.0})
 567     
 568     if not re.match("^none$", Options["--embedRMSDCutoff"], re.I):
 569         MiscUtil.ValidateOptionFloatValue("--embedRMSDCutoff", Options["--embedRMSDCutoff"], {">": 0.0})
 570     
 571     if not re.match("^none$", Options["--energyRMSDCutoff"], re.I):
 572         MiscUtil.ValidateOptionFloatValue("--energyRMSDCutoff", Options["--energyRMSDCutoff"], {">": 0.0})
 573         # Make sure that the alignConformers option is being used...
 574         if not re.match("^yes$", Options["--alignConformers"], re.I):
 575             MiscUtil.PrintError("%s value of \"--alignConformers\" is not allowed for %s value of \"--energyRMSDCutoff\" option " % (Options["--alignConformers"], Options["--energyRMSDCutoff"]))
 576         
 577     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 578     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv")
 579     
 580     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd")
 581     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 582     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 583         
 584     if not re.match("^auto$", Options["--maxConfs"], re.I):
 585         MiscUtil.ValidateOptionIntegerValue("--maxConfs", Options["--maxConfs"], {">": 0})
 586     
 587     MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0})
 588     
 589     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 590     MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no")
 591     
 592     if not re.match("^auto$", Options["--randomSeed"], re.I):
 593         MiscUtil.ValidateOptionIntegerValue("--randomSeed", Options["--randomSeed"], {})
 594         RandomSeed = int(Options["--randomSeed"])
 595     
 596     MiscUtil.ValidateOptionTextValue("-r, --removeHydrogens", Options["--removeHydrogens"], "yes no")
 597 
 598 # Setup a usage string for docopt...
 599 _docoptUsage_ = """
 600 RDKitGenerateConformers.py - Generate molecular conformations
 601 
 602 Usage:
 603     RDKitGenerateConformers.py [--alignConformers <yes or no>] [--addHydrogens <yes or no>]
 604                                [--conformerGenerator <SDG, ETDG, KDG, ETKDG>] [--embedRMSDCutoff <number>]
 605                                [--enforceChirality <yes or no>] [--energyRMSDCutoff <number>]
 606                                [--energyWindow <number> ]  [--forceField <UFF, MMFF, None>] 
 607                                [--infileParams <Name,Value,...>] [--maxConfs <number>]
 608                                [--mp <yes or no>] [--mpParams <Name.Value,...>]
 609                                [--maxIters <number>]  [ --outfileParams <Name,Value,...> ]  [--overwrite]
 610                                [--quiet <yes or no>] [ --removeHydrogens <yes or no>] [--randomSeed <number>]
 611                                [-w <dir>] -i <infile> -o <outfile> 
 612     RDKitGenerateConformers.py -h | --help | -e | --examples
 613 
 614 Description:
 615     Generate molecular conformations using a combination of distance geometry and
 616     forcefield minimization. The forcefield minimization may be skipped to only generate
 617     conformations by available distance geometry based methodologies.
 618 
 619     The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi,
 620     .csv, .tcsv .txt)
 621 
 622     The supported output file format are: SD (.sdf, .sd)
 623 
 624 Options:
 625     -a, --addHydrogens <yes or no>  [default: yes]
 626         Add hydrogens before minimization.
 627     --alignConformers <yes or no>  [default: yes]
 628         Align conformers for each molecule.
 629     -c, --conformerGenerator <SDG, ETDG, KDG, ETKDG>  [default: ETKDG]
 630         Conformation generation methodology for generating initial 3D coordinates of a
 631         molecule. Possible values: Standard Distance Geometry, (SDG), Experimental
 632         Torsion-angle preference with Distance Geometry (ETDG)  [Ref 129] , basic Knowledge-terms
 633         with Distance Geometry (KDG), and Experimental Torsion-angle preference along
 634         with basic Knowledge-terms and Distance Geometry (ETKDG). 
 635     --embedRMSDCutoff <number>  [default: none]
 636         RMSD cutoff for retaining conformations after embedding and before energy minimization.
 637         All embedded conformations are kept by default. Otherwise, only those conformations
 638         which are different from each other by the specified RMSD cutoff are kept. The first
 639         embedded conformation is always retained.
 640     --enforceChirality <yes or no>  [default: Yes]
 641         Enforce chirality for defined chiral centers.
 642     --energyRMSDCutoff <number>  [default: none]
 643         RMSD cutoff for retaining conformations after energy minimization. By default,
 644         all minimized conformations with in the specified energy window from the lowest energy
 645         conformation are kept. Otherwise, only those conformations which are different from
 646         the lowest energy conformation by the specified RMSD cutoff and are with in the
 647         specified energy window are kept. The lowest energy conformation is always retained.
 648     --energyWindow <number>  [default: 20]
 649         Energy window in kcal/mol for selecting conformers. This option is ignored during
 650         'None' value of '-f, --forcefield' option.
 651     -e, --examples
 652         Print examples.
 653     -f, --forceField <UFF, MMFF, None>  [default: MMFF]
 654         Forcefield method to use for energy minimization. Possible values: Universal Force
 655         Field (UFF) [Ref 81],  Merck Molecular Mechanics Force Field (MMFF) [Ref 83-87] or
 656         None.
 657     -h, --help
 658         Print this help message.
 659     -i, --infile <infile>
 660         Input file name.
 661     --infileParams <Name,Value,...>  [default: auto]
 662         A comma delimited list of parameter name and value pairs for reading
 663         molecules from files. The supported parameter names for different file
 664         formats, along with their default values, are shown below:
 665             
 666             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 667             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 668                 smilesTitleLine,auto,sanitize,yes
 669             
 670         Possible values for smilesDelimiter: space, comma or tab.
 671     --maxConfs <number>  [default: auto]
 672         Maximum number of conformations to generate for each molecule by conformation
 673         generation methodology. The conformations are minimized using the specified
 674         forcefield as needed and written to the output file. The default value for maximum
 675         number of conformations is dependent on the number of rotatable bonds in molecules:
 676         RotBonds <= 5, maxConfs = 100; RotBonds >=6 and <= 10, MaxConfs = 200;
 677         RotBonds >= 11, maxConfs = 300
 678     --maxIters <number>  [default: 250]
 679         Maximum number of iterations to perform for each molecule during forcefield
 680         minimization.
 681     --mp <yes or no>  [default: no]
 682         Use multiprocessing.
 683          
 684         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 685         function employing lazy RDKit data iterable. This allows processing of
 686         arbitrary large data sets without any additional requirements memory.
 687         
 688         All input data may be optionally loaded into memory by mp.Pool.map()
 689         before starting worker processes in a process pool by setting the value
 690         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 691         
 692         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 693         data mode may adversely impact the performance. The '--mpParams' section
 694         provides additional information to tune the value of 'chunkSize'.
 695     --mpParams <Name,Value,...>  [default: auto]
 696         A comma delimited list of parameter name and value pairs for to
 697         configure multiprocessing.
 698         
 699         The supported parameter names along with their default and possible
 700         values are shown below:
 701         
 702             chunkSize, auto
 703             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 704             numProcesses, auto   [ Default: mp.cpu_count() ]
 705         
 706         These parameters are used by the following functions to configure and
 707         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 708         mp.Pool.imap().
 709         
 710         The chunkSize determines chunks of input data passed to each worker
 711         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 712         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 713         
 714         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 715         automatically converts RDKit data iterable into a list, loads all data into
 716         memory, and calculates the default chunkSize using the following method
 717         as shown in its code:
 718         
 719             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 720             if extra: chunkSize += 1
 721         
 722         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 723         and 100 data items.
 724         
 725         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 726         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 727         data into memory. Consequently, the size of input data is not known a priori.
 728         It's not possible to estimate an optimal value for the chunkSize. The default 
 729         chunkSize is set to 1.
 730         
 731         The default value for the chunkSize during 'Lazy' data mode may adversely
 732         impact the performance due to the overhead associated with exchanging
 733         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 734         a larger value during 'Lazy' input data mode, based on the size of your input
 735         data and number of processes in the process pool.
 736         
 737         The mp.Pool.map() function waits for all worker processes to process all
 738         the data and return the results. The mp.Pool.imap() function, however,
 739         returns the the results obtained from worker processes as soon as the
 740         results become available for specified chunks of data.
 741         
 742         The order of data in the results returned by both mp.Pool.map() and 
 743         mp.Pool.imap() functions always corresponds to the input data.
 744     -o, --outfile <outfile>
 745         Output file name.
 746     --outfileParams <Name,Value,...>  [default: auto]
 747         A comma delimited list of parameter name and value pairs for writing
 748         molecules to files. The supported parameter names for different file
 749         formats, along with their default values, are shown below:
 750             
 751             SD: kekulize,no
 752             
 753     --overwrite
 754         Overwrite existing files.
 755     -q, --quiet <yes or no>  [default: no]
 756         Use quiet mode. The warning and information messages will not be printed.
 757     -r, --removeHydrogens <yes or no>  [default: Yes]
 758         Remove hydrogens after minimization.
 759     --randomSeed <number>  [default: auto]
 760         Seed for the random number generator for reproducing 3D coordinates.
 761         Default is to use a random seed.
 762     -w, --workingdir <dir>
 763         Location of working directory which defaults to the current directory.
 764 
 765 Examples:
 766     To generate conformers using Experimental Torsion-angle preference along
 767     with basic Knowledge-terms and Distance Geometry (ETKDG) followed by
 768     MMFF minimization with automatic determination of maximum number of
 769     conformers for each molecule and write out a SD file, type:
 770 
 771         % RDKitGenerateConformers.py  -i Sample.smi -o SampleOut.sdf
 772 
 773     To rerun the first example in a quiet mode and write out a SD file, type:
 774 
 775         % RDKitGenerateConformers.py -q yes -i Sample.smi -o SampleOut.sdf
 776 
 777     To rerun the first example in multiprocessing mode on all available CPUs
 778     without loading all data into memory and write out a SD file, type:
 779 
 780         % RDKitGenerateConformers.py --mp yes -i Sample.smi -o SampleOut.sdf
 781 
 782     To run the first example in multiprocessing mode on all available CPUs
 783     by loading all data into memory and write out a SD file, type:
 784 
 785         % RDKitGenerateConformers.py --mp yes --mpParams "inputDataMode,
 786           InMemory" -i Sample.smi -o SampleOut.sdf
 787 
 788     To rerun the first example in multiprocessing mode on specific number of
 789     CPUs and chunk size without loading all data into memory and write out a SD file,
 790     type:
 791 
 792         % RDKitGenerateConformers.py --mp yes --mpParams "inputDataMode,Lazy,
 793           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.sdf
 794 
 795     To generate up to 150 conformers for each molecule using ETKDG and UFF forcefield
 796     minimization along with conformers within 25 kcal/mol energy window and write out a
 797     SD file, type:
 798 
 799         % RDKitGenerateConformers.py  --energyWindow 25 -f UFF --maxConfs 150
 800           -i Sample.smi -o SampleOut.sdf
 801 
 802     To generate up to 50 conformers for each molecule using KDG without any forcefield
 803     minimization and alignment of conformers and write out a SD file, type:
 804 
 805         % RDKitGenerateConformers.py  -f none --maxConfs 50 --alignConformers no
 806           -i Sample.sdf -o SampleOut.sdf
 807 
 808     To generate up to 50 conformers using SDG without any forcefield minimization
 809     and alignment of conformers for molecules in a  CSV SMILES file, SMILES strings
 810     in column 1, name in column 2, and write out a SD file, type:
 811 
 812         % RDKitGenerateConformers.py  --maxConfs 50  --maxIters 50 -c SDG
 813           --alignConformers no -f none --infileParams "smilesDelimiter,comma,
 814           smilesTitleLine,yes, smilesColumn,1,smilesNameColumn,2"
 815           -i SampleSMILES.csv -o SampleOut.sdf
 816 
 817 Author:
 818     Manish Sud(msud@san.rr.com)
 819 
 820 See also:
 821     RDKitCalculateRMSD.py, RDKitCalculateMolecularDescriptors.py,
 822     RDKitCompareMoleculeShapes.py, RDKitConvertFileFormat.py,
 823     RDKitPerformMinimization.py
 824 
 825 Copyright:
 826     Copyright (C) 2019 Manish Sud. All rights reserved.
 827 
 828     The functionality available in this script is implemented using RDKit, an
 829     open source toolkit for cheminformatics developed by Greg Landrum.
 830 
 831     This file is part of MayaChemTools.
 832 
 833     MayaChemTools is free software; you can redistribute it and/or modify it under
 834     the terms of the GNU Lesser General Public License as published by the Free
 835     Software Foundation; either version 3 of the License, or (at your option) any
 836     later version.
 837 
 838 """
 839 
 840 if __name__ == "__main__":
 841     main()