1 #!/bin/env python 2 # 3 # File: RDKitGenerateConformers.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem import AllChem 43 from rdkit.Chem import Descriptors 44 except ImportError as ErrMsg: 45 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 46 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 47 sys.exit(1) 48 49 # MayaChemTools imports... 50 try: 51 from docopt import docopt 52 import MiscUtil 53 import RDKitUtil 54 except ImportError as ErrMsg: 55 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 56 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 57 sys.exit(1) 58 59 ScriptName = os.path.basename(sys.argv[0]) 60 Options = {} 61 OptionsInfo = {} 62 63 def main(): 64 """Start execution of the script.""" 65 66 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 67 68 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 69 70 # Retrieve command line arguments and options... 71 RetrieveOptions() 72 73 # Process and validate command line arguments and options... 74 ProcessOptions() 75 76 # Perform actions required by the script... 77 GenerateConformers() 78 79 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 80 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 81 82 def GenerateConformers(): 83 """Generate conformers.""" 84 85 # Setup a molecule reader... 86 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"]) 87 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"]) 88 89 # Set up a molecule writer... 90 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 91 if Writer is None: 92 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 93 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"]) 94 95 MolCount, ValidMolCount, ConfGenFailedCount = ProcessMolecules(Mols, Writer) 96 97 if Writer is not None: 98 Writer.close() 99 100 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 101 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 102 MiscUtil.PrintInfo("Number of molecules failed during conformation generation or minimization: %d" % ConfGenFailedCount) 103 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + ConfGenFailedCount)) 104 105 def ProcessMolecules(Mols, Writer): 106 """Process molecules to generate conformers.""" 107 108 if OptionsInfo["MPMode"]: 109 return ProcessMoleculesUsingMultipleProcesses(Mols, Writer) 110 else: 111 return ProcessMoleculesUsingSingleProcess(Mols, Writer) 112 113 def ProcessMoleculesUsingSingleProcess(Mols, Writer): 114 """Process molecules to generate conformers using a single process.""" 115 116 if OptionsInfo["SkipForceFieldMinimization"]: 117 MiscUtil.PrintInfo("\nGenerating conformers without performing energy minimization...") 118 else: 119 MiscUtil.PrintInfo("\nGenerating conformers and performing energy minimization...") 120 121 (MolCount, ValidMolCount, ConfGenFailedCount) = [0] * 3 122 for Mol in Mols: 123 MolCount += 1 124 125 if Mol is None: 126 continue 127 128 if RDKitUtil.IsMolEmpty(Mol): 129 if not OptionsInfo["QuietMode"]: 130 MolName = RDKitUtil.GetMolName(Mol, MolCount) 131 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 132 continue 133 ValidMolCount += 1 134 135 ConformerMol, CalcStatus, ConfIDs, ConfEnergies = GenerateMolConformers(Mol, MolCount) 136 137 if not CalcStatus: 138 ConfGenFailedCount += 1 139 continue 140 141 WriteMolConformers(Writer, ConformerMol, MolCount, ConfIDs, ConfEnergies) 142 143 return (MolCount, ValidMolCount, ConfGenFailedCount) 144 145 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer): 146 """Process and minimize molecules using multiprocessing.""" 147 148 if OptionsInfo["SkipForceFieldMinimization"]: 149 MiscUtil.PrintInfo("\nGenerating conformers without performing energy minimization using multiprocessing...") 150 else: 151 MiscUtil.PrintInfo("\nGenerating conformers and performing energy minimization using multiprocessing...") 152 153 MPParams = OptionsInfo["MPParams"] 154 155 # Setup data for initializing a worker process... 156 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo)) 157 158 # Setup a encoded mols data iterable for a worker process... 159 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols) 160 161 # Setup process pool along with data initialization for each process... 162 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 163 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 164 165 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs) 166 167 # Start processing... 168 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 169 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 170 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 171 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 172 else: 173 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 174 175 (MolCount, ValidMolCount, ConfGenFailedCount) = [0] * 3 176 for Result in Results: 177 MolCount += 1 178 MolIndex, EncodedMol, CalcStatus, ConfIDs, ConfEnergies = Result 179 180 if EncodedMol is None: 181 continue 182 ValidMolCount += 1 183 184 if not CalcStatus: 185 ConfGenFailedCount += 1 186 continue 187 188 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 189 WriteMolConformers(Writer, Mol, MolCount, ConfIDs, ConfEnergies) 190 191 return (MolCount, ValidMolCount, ConfGenFailedCount) 192 193 def InitializeWorkerProcess(*EncodedArgs): 194 """Initialize data for a worker process.""" 195 196 global Options, OptionsInfo 197 198 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 199 200 # Decode Options and OptionInfo... 201 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 202 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 203 204 def WorkerProcess(EncodedMolInfo): 205 """Process data for a worker process.""" 206 207 MolIndex, EncodedMol = EncodedMolInfo 208 209 CalcStatus = False 210 ConfIDs = None 211 ConfEnergies = None 212 213 if EncodedMol is None: 214 return [MolIndex, None, CalcStatus, ConfIDs, ConfEnergies] 215 216 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 217 if RDKitUtil.IsMolEmpty(Mol): 218 if not OptionsInfo["QuietMode"]: 219 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 220 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 221 return [MolIndex, None, CalcStatus, ConfIDs, ConfEnergies] 222 223 Mol, CalcStatus, ConfIDs, ConfEnergies = GenerateMolConformers(Mol, (MolIndex + 1)) 224 225 return [MolIndex, RDKitUtil.MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps), CalcStatus, ConfIDs, ConfEnergies] 226 227 def GenerateMolConformers(Mol, MolNum = None): 228 """Generate conformers for a molecule.""" 229 230 if OptionsInfo["SkipForceFieldMinimization"]: 231 return GenerateMolConformersWithoutMinimization(Mol, MolNum) 232 else: 233 return GenerateMolConformersWithMinimization(Mol, MolNum) 234 235 def GenerateMolConformersWithoutMinimization(Mol, MolNum = None): 236 """Generate conformers for a molecule without performing minimization.""" 237 238 ConfIDs = EmbedMolecule(Mol, MolNum) 239 if not len(ConfIDs): 240 if not OptionsInfo["QuietMode"]: 241 MolName = RDKitUtil.GetMolName(Mol, MolNum) 242 MiscUtil.PrintWarning("Conformation generation couldn't be performed for molecule %s: Embedding failed...\n" % MolName) 243 return [Mol, False, None, None] 244 245 if OptionsInfo["AlignConformers"]: 246 AllChem.AlignMolConformers(Mol) 247 248 if not OptionsInfo["QuietMode"]: 249 MolName = RDKitUtil.GetMolName(Mol, MolNum) 250 MiscUtil.PrintInfo("\nNumber of conformations generated for %s: %d" % (MolName, len(ConfIDs))) 251 252 # Convert ConfIDs into a list... 253 ConfIDsList = [ConfID for ConfID in ConfIDs] 254 255 # Setup conformation energies... 256 ConfEnergies = None 257 if OptionsInfo["EnergyOut"]: 258 ConfEnergies = [] 259 for ConfID in ConfIDsList: 260 EnergyStatus, Energy = GetConformerEnergy(Mol, ConfID) 261 262 Energy = "%.2f" % Energy if EnergyStatus else "NotAvailable" 263 ConfEnergies.append(Energy) 264 265 if not EnergyStatus: 266 if not OptionsInfo["QuietMode"]: 267 MolName = RDKitUtil.GetMolName(Mol, MolNum) 268 MiscUtil.PrintWarning("Failed to retrieve calculated energy for conformation number %d of molecule %s. Try again after removing any salts or cleaing up the molecule...\n" % (ConfID, MolName)) 269 270 return [Mol, True, ConfIDsList, ConfEnergies] 271 272 def GenerateMolConformersWithMinimization(Mol, MolNum): 273 """Generate and mininize conformers for a molecule.""" 274 275 if OptionsInfo["AddHydrogens"]: 276 Mol = Chem.AddHs(Mol) 277 278 ConfIDs = EmbedMolecule(Mol, MolNum) 279 if not len(ConfIDs): 280 if not OptionsInfo["QuietMode"]: 281 MolName = RDKitUtil.GetMolName(Mol, MolNum) 282 MiscUtil.PrintWarning("Conformation generation couldn't be performed for molecule %s: Embedding failed...\n" % MolName) 283 return [Mol, False, None, None] 284 285 CalcEnergyMap = {} 286 for ConfID in ConfIDs: 287 try: 288 if OptionsInfo["UseUFF"]: 289 Status = AllChem.UFFOptimizeMolecule(Mol, confId = ConfID, maxIters = OptionsInfo["MaxIters"]) 290 elif OptionsInfo["UseMMFF"]: 291 Status = AllChem.MMFFOptimizeMolecule(Mol, confId = ConfID, maxIters = OptionsInfo["MaxIters"], mmffVariant = OptionsInfo["MMFFVariant"]) 292 else: 293 MiscUtil.PrintError("Minimization couldn't be performed: Specified forcefield, %s, is not supported" % OptionsInfo["ForceField"]) 294 except (RuntimeError, Chem.rdchem.KekulizeException) as ErrMsg: 295 if not OptionsInfo["QuietMode"]: 296 MolName = RDKitUtil.GetMolName(Mol, MolNum) 297 MiscUtil.PrintWarning("Minimization couldn't be performed for molecule %s:\n%s\n" % (MolName, ErrMsg)) 298 return [Mol, False, None, None] 299 300 EnergyStatus, Energy = GetConformerEnergy(Mol, ConfID) 301 if not EnergyStatus: 302 if not OptionsInfo["QuietMode"]: 303 MolName = RDKitUtil.GetMolName(Mol, MolNum) 304 MiscUtil.PrintWarning("Failed to retrieve calculated energy for conformation number %d of molecule %s. Try again after removing any salts or cleaing up the molecule...\n" % (ConfID, MolName)) 305 return [Mol, False, None, None] 306 307 if Status != 0: 308 if not OptionsInfo["QuietMode"]: 309 MolName = RDKitUtil.GetMolName(Mol, MolNum) 310 MiscUtil.PrintWarning("Minimization failed to converge for conformation number %d of molecule %s in %d steps. Try using higher value for \"--maxIters\" option...\n" % (ConfID, MolName, OptionsInfo["MaxIters"])) 311 312 CalcEnergyMap[ConfID] = Energy 313 314 if OptionsInfo["RemoveHydrogens"]: 315 Mol = Chem.RemoveHs(Mol) 316 317 # Align molecules after minimization... 318 if OptionsInfo["AlignConformers"]: 319 AllChem.AlignMolConformers(Mol) 320 321 SortedConfIDs = sorted(ConfIDs, key = lambda ConfID: CalcEnergyMap[ConfID]) 322 323 MinEnergyConfID = SortedConfIDs[0] 324 MinConfEnergy = CalcEnergyMap[MinEnergyConfID] 325 EnergyWindow = OptionsInfo["EnergyWindow"] 326 327 EnergyRMSDCutoff = OptionsInfo["EnergyRMSDCutoff"] 328 ApplyEnergyRMSDCutoff = False 329 if EnergyRMSDCutoff > 0: 330 ApplyEnergyRMSDCutoff = True 331 332 EnergyRMSDCutoffLowest = OptionsInfo["EnergyRMSDCutoffModeLowest"] 333 EnergyRMSDCalcModeBest = OptionsInfo["EnergyRMSDCalcModeBest"] 334 335 PreAligned = False 336 if OptionsInfo["AlignConformers"]: 337 PreAligned = True 338 339 RefMol, ProbeMol = [None] * 2 340 if EnergyRMSDCalcModeBest: 341 # Copy molecules for best RMSD calculations to avoid change in the coordinates 342 # of the conformations... 343 RefMol = AllChem.Mol(Mol) 344 ProbeMol = AllChem.Mol(Mol) 345 346 # Track conformers with in the specified energy window from the lowest 347 # energy conformation along with applying RMSD cutoff as needed... 348 # 349 SelectedConfIDs = [] 350 351 ConfCount = 0 352 IgnoredByEnergyConfCount = 0 353 IgnoredByRMSDConfCount = 0 354 355 FirstConf = True 356 for ConfID in SortedConfIDs: 357 if FirstConf: 358 FirstConf = False 359 ConfCount += 1 360 SelectedConfIDs.append(ConfID) 361 continue 362 363 ConfEnergyDiff = abs(CalcEnergyMap[ConfID] - MinConfEnergy ) 364 if ConfEnergyDiff > EnergyWindow: 365 IgnoredByEnergyConfCount += 1 366 continue 367 368 if ApplyEnergyRMSDCutoff: 369 IgnoreConf = False 370 if EnergyRMSDCutoffLowest: 371 # Compare RMSD with the lowest energy conformation... 372 if EnergyRMSDCalcModeBest: 373 CalcRMSD = AllChem.GetBestRMS(ProbeMol, RefMol, prbId = ConfID, refId = MinEnergyConfID) 374 else: 375 CalcRMSD = AllChem.GetConformerRMS(Mol, MinEnergyConfID, ConfID, prealigned=PreAligned) 376 if CalcRMSD < EnergyRMSDCutoff: 377 IgnoreConf = True 378 else: 379 for SelectedConfID in SelectedConfIDs: 380 if EnergyRMSDCalcModeBest: 381 CalcRMSD = AllChem.GetBestRMS(ProbeMol, RefMol, prbId = ConfID, refId = SelectedConfID) 382 else: 383 CalcRMSD = AllChem.GetConformerRMS(Mol, SelectedConfID, ConfID, prealigned=PreAligned) 384 if CalcRMSD < EnergyRMSDCutoff: 385 IgnoreConf = True 386 break 387 if IgnoreConf: 388 IgnoredByRMSDConfCount += 1 389 continue 390 391 ConfCount += 1 392 SelectedConfIDs.append(ConfID) 393 394 if not OptionsInfo["QuietMode"]: 395 MolName = RDKitUtil.GetMolName(Mol, MolNum) 396 MiscUtil.PrintInfo("\nTotal Number of conformations generated for %s: %d" % (MolName, ConfCount)) 397 MiscUtil.PrintInfo("Number of conformations ignored due to energy window cutoff: %d" % (IgnoredByEnergyConfCount)) 398 if ApplyEnergyRMSDCutoff: 399 MiscUtil.PrintInfo("Number of conformations ignored due to energy RMSD cutoff: %d" % (IgnoredByRMSDConfCount)) 400 401 SelectedConfEnergies = None 402 if OptionsInfo["EnergyOut"]: 403 SelectedConfEnergies = ["%.2f" % CalcEnergyMap[ConfID] for ConfID in SelectedConfIDs] 404 405 return [Mol, True, SelectedConfIDs, SelectedConfEnergies] 406 407 def GetConformerEnergy(Mol, ConfID): 408 """Calculate conformer energy.""" 409 410 Status = True 411 Energy = 0.0 412 413 if OptionsInfo["UseUFF"]: 414 UFFMoleculeForcefield = AllChem.UFFGetMoleculeForceField(Mol, confId = ConfID) 415 if UFFMoleculeForcefield is None: 416 Status = False 417 else: 418 Energy = UFFMoleculeForcefield.CalcEnergy() 419 elif OptionsInfo["UseMMFF"]: 420 MMFFMoleculeProperties = AllChem.MMFFGetMoleculeProperties(Mol, mmffVariant = OptionsInfo["MMFFVariant"]) 421 MMFFMoleculeForcefield = AllChem.MMFFGetMoleculeForceField(Mol, MMFFMoleculeProperties, confId = ConfID) 422 if MMFFMoleculeForcefield is None: 423 Status = False 424 else: 425 Energy = MMFFMoleculeForcefield.CalcEnergy() 426 else: 427 MiscUtil.PrintError("Couldn't retrieve conformer energy: Specified forcefield, %s, is not supported" % OptionsInfo["ForceField"]) 428 429 return (Status, Energy) 430 431 def EmbedMolecule(Mol, MolNum = None): 432 """Embed conformations.""" 433 434 ConfIDs = [] 435 436 # Figure out the number of conformations to embded... 437 if re.match("^Auto$", OptionsInfo["MaxConfs"], re.I): 438 NumOfRotBonds = Descriptors.NumRotatableBonds(Mol) 439 if NumOfRotBonds <= 5: 440 MaxConfs = 100 441 elif NumOfRotBonds >=6 and NumOfRotBonds <= 10: 442 MaxConfs = 200 443 else: 444 MaxConfs = 300 445 else: 446 MaxConfs = int(OptionsInfo["MaxConfs"]) 447 448 RandomSeed = OptionsInfo["RandomSeed"] 449 EnforceChirality = OptionsInfo["EnforceChirality"] 450 UseExpTorsionAnglePrefs = OptionsInfo["UseExpTorsionAnglePrefs"] 451 UseBasicKnowledge = OptionsInfo["UseBasicKnowledge"] 452 ETVersion = OptionsInfo["ETVersion"] 453 EmbedRMSDCutoff = OptionsInfo["EmbedRMSDCutoff"] 454 455 try: 456 ConfIDs = AllChem.EmbedMultipleConfs(Mol, numConfs = MaxConfs, randomSeed = RandomSeed, pruneRmsThresh = EmbedRMSDCutoff, enforceChirality = EnforceChirality, useExpTorsionAnglePrefs = UseExpTorsionAnglePrefs, useBasicKnowledge = UseBasicKnowledge, ETversion = ETVersion) 457 except ValueError as ErrMsg: 458 if not OptionsInfo["QuietMode"]: 459 MolName = RDKitUtil.GetMolName(Mol, MolNum) 460 MiscUtil.PrintWarning("Embedding failed for molecule %s:\n%s\n" % (MolName, ErrMsg)) 461 ConfIDs = [] 462 463 if not OptionsInfo["QuietMode"]: 464 if EmbedRMSDCutoff > 0: 465 MiscUtil.PrintInfo("Generating initial conformation ensemble by distance geometry for %s - EmbedRMSDCutoff: %s; Size: %s; Size after RMSD filtering: %s" % (RDKitUtil.GetMolName(Mol, MolNum), EmbedRMSDCutoff, MaxConfs, len(ConfIDs))) 466 else: 467 MiscUtil.PrintInfo("Generating initial conformation ensemble by distance geometry for %s - EmbedRMSDCutoff: None; Size: %s" % (RDKitUtil.GetMolName(Mol, MolNum), len(ConfIDs))) 468 469 return ConfIDs 470 471 def WriteMolConformers(Writer, Mol, MolNum, ConfIDs, ConfEnergies = None): 472 """Write molecule coformers.""" 473 474 if ConfIDs is None: 475 return 476 477 MolName = RDKitUtil.GetMolName(Mol, MolNum) 478 479 for Index, ConfID in enumerate(ConfIDs): 480 SetConfMolName(Mol, MolName, ConfID) 481 482 if ConfEnergies is not None: 483 Mol.SetProp(OptionsInfo["EnergyLabel"], ConfEnergies[Index]) 484 485 Writer.write(Mol, confId = ConfID) 486 487 def SetConfMolName(Mol, MolName, ConfCount): 488 """Set conf mol name.""" 489 490 ConfName = "%s_Conf%d" % (MolName, ConfCount) 491 Mol.SetProp("_Name", ConfName) 492 493 def ProcesssConformerGeneratorOption(): 494 """Process comformer generator option. """ 495 496 ConfGenParams = MiscUtil.ProcessOptionConformerGenerator("--conformerGenerator", Options["--conformerGenerator"]) 497 498 OptionsInfo["ConformerGenerator"] = ConfGenParams["ConformerGenerator"] 499 OptionsInfo["UseBasicKnowledge"] = ConfGenParams["UseBasicKnowledge"] 500 OptionsInfo["UseExpTorsionAnglePrefs"] = ConfGenParams["UseExpTorsionAnglePrefs"] 501 OptionsInfo["ETVersion"] = ConfGenParams["ETVersion"] 502 503 def ProcessOptions(): 504 """Process and validate command line arguments and options.""" 505 506 MiscUtil.PrintInfo("Processing options...") 507 508 # Validate options... 509 ValidateOptions() 510 511 OptionsInfo["Infile"] = Options["--infile"] 512 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 513 514 OptionsInfo["Outfile"] = Options["--outfile"] 515 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"]) 516 517 OptionsInfo["Overwrite"] = Options["--overwrite"] 518 519 OptionsInfo["AddHydrogens"] = True 520 if re.match("^no$", Options["--addHydrogens"], re.I): 521 OptionsInfo["AddHydrogens"] = False 522 523 OptionsInfo["AlignConformers"] = True 524 if re.match("^no$", Options["--alignConformers"], re.I): 525 OptionsInfo["AlignConformers"] = False 526 527 ProcesssConformerGeneratorOption() 528 529 if re.match("^UFF$", Options["--forceField"], re.I): 530 ForceField = "UFF" 531 UseUFF = True 532 UseMMFF = False 533 SkipForceFieldMinimization = False 534 elif re.match("^MMFF$", Options["--forceField"], re.I): 535 ForceField = "MMFF" 536 UseUFF = False 537 UseMMFF = True 538 SkipForceFieldMinimization = False 539 else: 540 ForceField = "None" 541 UseUFF = False 542 UseMMFF = False 543 SkipForceFieldMinimization = True 544 545 MMFFVariant = "MMFF94" if re.match("^MMFF94$", Options["--forceFieldMMFFVariant"], re.I) else "MMFF94s" 546 547 OptionsInfo["SkipForceFieldMinimization"] = SkipForceFieldMinimization 548 OptionsInfo["ForceField"] = ForceField 549 OptionsInfo["MMFFVariant"] = MMFFVariant 550 OptionsInfo["UseMMFF"] = UseMMFF 551 OptionsInfo["UseUFF"] = UseUFF 552 553 OptionsInfo["EnergyOut"] = True if re.match("^yes$", Options["--energyOut"], re.I) else False 554 if UseUFF: 555 EnergyLabel = "UFF_Energy" 556 elif UseMMFF: 557 EnergyLabel = "%s_Energy" % MMFFVariant 558 else: 559 EnergyLabel = "Energy" 560 OptionsInfo["EnergyLabel"] = EnergyLabel 561 562 OptionsInfo["EnforceChirality"] = True 563 if re.match("^no$", Options["--enforceChirality"], re.I): 564 OptionsInfo["EnforceChirality"] = False 565 566 OptionsInfo["EnergyWindow"] = float(Options["--energyWindow"]) 567 568 EmbedRMSDCutoff = -1.0 569 if not re.match("^none$", Options["--embedRMSDCutoff"], re.I): 570 EmbedRMSDCutoff = float(Options["--embedRMSDCutoff"]) 571 OptionsInfo["EmbedRMSDCutoff"] = EmbedRMSDCutoff 572 573 OptionsInfo["EnergyRMSDCalcMode"] = Options["--energyRMSDCalcMode"] 574 OptionsInfo["EnergyRMSDCalcModeBest"] = True if re.match("^BestRMSD$", Options["--energyRMSDCalcMode"], re.I) else False 575 576 EnergyRMSDCutoff = -1.0 577 if not re.match("^none$", Options["--energyRMSDCutoff"], re.I): 578 EnergyRMSDCutoff = float(Options["--energyRMSDCutoff"]) 579 OptionsInfo["EnergyRMSDCutoff"] = EnergyRMSDCutoff 580 581 OptionsInfo["EnergyRMSDCutoffMode"] = Options["--energyRMSDCutoffMode"] 582 OptionsInfo["EnergyRMSDCutoffModeLowest"] = True if re.match("^Lowest$", Options["--energyRMSDCutoffMode"], re.I) else False 583 584 OptionsInfo["MaxIters"] = int(Options["--maxIters"]) 585 OptionsInfo["MaxConfs"] = Options["--maxConfs"] 586 587 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 588 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 589 590 OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False 591 592 RandomSeed = -1 593 if not re.match("^auto$", Options["--randomSeed"], re.I): 594 RandomSeed = int(Options["--randomSeed"]) 595 OptionsInfo["RandomSeed"] = RandomSeed 596 597 OptionsInfo["RemoveHydrogens"] = True 598 if re.match("^no$", Options["--removeHydrogens"], re.I): 599 OptionsInfo["RemoveHydrogens"] = False 600 601 def RetrieveOptions(): 602 """Retrieve command line arguments and options.""" 603 604 # Get options... 605 global Options 606 Options = docopt(_docoptUsage_) 607 608 # Set current working directory to the specified directory... 609 WorkingDir = Options["--workingdir"] 610 if WorkingDir: 611 os.chdir(WorkingDir) 612 613 # Handle examples option... 614 if "--examples" in Options and Options["--examples"]: 615 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 616 sys.exit(0) 617 618 def ValidateOptions(): 619 """Validate option values.""" 620 621 MiscUtil.ValidateOptionTextValue("-a, --addHydrogens", Options["--addHydrogens"], "yes no") 622 MiscUtil.ValidateOptionTextValue("--alignConformers", Options["--alignConformers"], "yes no") 623 MiscUtil.ValidateOptionTextValue("-c, --conformerGenerator", Options["--conformerGenerator"], "SDG KDG ETDG ETKDG ETKDGv2") 624 625 MiscUtil.ValidateOptionTextValue("-f, --forceField", Options["--forceField"], "UFF MMFF None") 626 MiscUtil.ValidateOptionTextValue(" --forceFieldMMFFVariant", Options["--forceFieldMMFFVariant"], "MMFF94 MMFF94s") 627 628 MiscUtil.ValidateOptionTextValue("--energyOut", Options["--energyOut"], "yes no") 629 MiscUtil.ValidateOptionTextValue("--enforceChirality ", Options["--enforceChirality"], "yes no") 630 MiscUtil.ValidateOptionFloatValue("--energyWindow", Options["--energyWindow"], {">": 0.0}) 631 632 MiscUtil.ValidateOptionTextValue(" --energyRMSDCalcMode", Options["--energyRMSDCalcMode"], "RMSD BestRMSD") 633 634 if not re.match("^none$", Options["--embedRMSDCutoff"], re.I): 635 MiscUtil.ValidateOptionFloatValue("--embedRMSDCutoff", Options["--embedRMSDCutoff"], {">": 0.0}) 636 MiscUtil.ValidateOptionTextValue(" --energyRMSDCutoffMode", Options["--energyRMSDCutoffMode"], "All Lowest") 637 638 if not re.match("^none$", Options["--energyRMSDCutoff"], re.I): 639 MiscUtil.ValidateOptionFloatValue("--energyRMSDCutoff", Options["--energyRMSDCutoff"], {">": 0.0}) 640 # Make sure that the alignConformers option is being used... 641 if not re.match("^yes$", Options["--alignConformers"], re.I): 642 MiscUtil.PrintError("\"%s\" value of \"--alignConformers\" is not allowed for %s value of \"--energyRMSDCutoff\" option. It must be set to \"yes\"." % (Options["--alignConformers"], Options["--energyRMSDCutoff"])) 643 644 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 645 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv") 646 647 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd") 648 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 649 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 650 651 if not re.match("^auto$", Options["--maxConfs"], re.I): 652 MiscUtil.ValidateOptionIntegerValue("--maxConfs", Options["--maxConfs"], {">": 0}) 653 654 MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0}) 655 656 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 657 MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no") 658 659 if not re.match("^auto$", Options["--randomSeed"], re.I): 660 MiscUtil.ValidateOptionIntegerValue("--randomSeed", Options["--randomSeed"], {}) 661 RandomSeed = int(Options["--randomSeed"]) 662 663 MiscUtil.ValidateOptionTextValue("-r, --removeHydrogens", Options["--removeHydrogens"], "yes no") 664 665 # Setup a usage string for docopt... 666 _docoptUsage_ = """ 667 RDKitGenerateConformers.py - Generate molecular conformations 668 669 Usage: 670 RDKitGenerateConformers.py [--alignConformers <yes or no>] [--addHydrogens <yes or no>] 671 [--conformerGenerator <SDG, ETDG, KDG, ETKDG, ETKDGv2>] [--embedRMSDCutoff <number>] 672 [--energyOut <yes or no>] [--enforceChirality <yes or no>] [--energyRMSDCalcMode <RMSD or BestRMSD>] 673 [--energyRMSDCutoff <number>] [--energyRMSDCutoffMode <All or Lowest>] [--energyWindow <number>] 674 [--forceField <UFF, MMFF, None>] [--forceFieldMMFFVariant <MMFF94 or MMFF94s>] 675 [--infileParams <Name,Value,...>] [--maxConfs <number>] 676 [--mp <yes or no>] [--mpParams <Name,Value,...>] 677 [--maxIters <number>] [ --outfileParams <Name,Value,...> ] [--overwrite] 678 [--quiet <yes or no>] [ --removeHydrogens <yes or no>] [--randomSeed <number>] 679 [-w <dir>] -i <infile> -o <outfile> 680 RDKitGenerateConformers.py -h | --help | -e | --examples 681 682 Description: 683 Generate 3D conformes of molecules using a combination of distance geometry and 684 forcefield minimization. The forcefield minimization may be skipped to only generate 685 conformations by available distance geometry based methodologies. 686 687 The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi, 688 .csv, .tsv, .txt) 689 690 The supported output file format are: SD (.sdf, .sd) 691 692 Options: 693 -a, --addHydrogens <yes or no> [default: yes] 694 Add hydrogens before minimization. 695 --alignConformers <yes or no> [default: yes] 696 Align conformers for each molecule. 697 -c, --conformerGenerator <text> [default: ETKDGv2] 698 Conformation generation methodology for generating initial 3D coordinates. The 699 possible values along with a brief description are shown below: 700 701 SDG: Standard Distance Geometry 702 KDG: basic Knowledge-terms with Distance Geometry 703 ETDG: Experimental Torsion-angle preference with Distance Geometry 704 ETKDG: Experimental Torsion-angle preference along with basic 705 Knowledge-terms and Distance Geometry [Ref 129] 706 ETKDGv2: Experimental Torsion-angle preference along with basic 707 Knowledge-terms and Distance Geometry [Ref 167] 708 --embedRMSDCutoff <number> [default: none] 709 RMSD cutoff for retaining conformations after embedding and before energy minimization. 710 All embedded conformations are kept by default. Otherwise, only those conformations 711 which are different from each other by the specified RMSD cutoff are kept. The first 712 embedded conformation is always retained. 713 --energyOut <yes or no> [default: No] 714 Write out energy values. 715 --enforceChirality <yes or no> [default: Yes] 716 Enforce chirality for defined chiral centers. 717 --energyRMSDCalcMode <RMSD or BestRMSD> [default: RMSD] 718 Methodology for calculating RMSD values during the application of RMSD 719 cutoff for retaining conformations after energy minimization. Possible 720 values: RMSD or BestRMSD. This option is ignore during 'None' value of 721 '--energyRMSDCutoff' option. 722 723 During BestRMSMode mode, the RDKit 'function AllChem.GetBestRMS' is used to 724 align and calculate RMSD. This function calculates optimal RMSD for aligning two 725 molecules, taking symmetry into account. Otherwise, the RMSD value is calculated 726 using 'AllChem.GetConformerRMS' without changing the atom order. A word to the 727 wise from RDKit documentation: The AllChem.GetBestRMS function will attempt to 728 align all permutations of matching atom orders in both molecules, for some molecules 729 it will lead to 'combinatorial explosion'. 730 --energyRMSDCutoff <number> [default: none] 731 RMSD cutoff for retaining conformations after energy minimization. By default, 732 all minimized conformations with in the specified energy window from the lowest 733 energy conformation are kept. Otherwise, only those conformations which are 734 different from the lowest energy conformation or all selected conformations 735 by the specified RMSD cutoff and are with in the specified energy window are 736 kept. The lowest energy conformation is always retained. 737 --energyRMSDCutoffMode <All or Lowest> [default: All] 738 RMSD cutoff mode for retaining conformations after energy minimization. 739 Possible values: All or Lowest. The RMSD values are compared against all 740 the selected conformations or the lowest energy conformation during 'All' 741 and 'Lowest' value of '--energyRMSDCutoffMode'. This option is ignored 742 during 'None' value of '--energyRMSDCutoff' option. 743 744 By default, only those conformations which all different from all selected 745 conformations by the specified RMSD cutoff and are with in the specified 746 energy window are kept. 747 --energyWindow <number> [default: 20] 748 Energy window in kcal/mol for selecting conformers. This option is ignored during 749 'None' value of '-f, --forcefield' option. 750 -e, --examples 751 Print examples. 752 -f, --forceField <UFF, MMFF, None> [default: MMFF] 753 Forcefield method to use for energy minimization. Possible values: Universal Force 754 Field (UFF) [Ref 81], Merck Molecular Mechanics Force Field (MMFF) [Ref 83-87] or 755 None. 756 --forceFieldMMFFVariant <MMFF94 or MMFF94s> [default: MMFF94] 757 Variant of MMFF forcefield to use for energy minimization. 758 -h, --help 759 Print this help message. 760 -i, --infile <infile> 761 Input file name. 762 --infileParams <Name,Value,...> [default: auto] 763 A comma delimited list of parameter name and value pairs for reading 764 molecules from files. The supported parameter names for different file 765 formats, along with their default values, are shown below: 766 767 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 768 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 769 smilesTitleLine,auto,sanitize,yes 770 771 Possible values for smilesDelimiter: space, comma or tab. 772 --maxConfs <number> [default: auto] 773 Maximum number of conformations to generate for each molecule by conformation 774 generation methodology. The conformations are minimized using the specified 775 forcefield as needed and written to the output file. The default value for maximum 776 number of conformations is dependent on the number of rotatable bonds in molecules: 777 RotBonds <= 5, maxConfs = 100; RotBonds >=6 and <= 10, MaxConfs = 200; 778 RotBonds >= 11, maxConfs = 300 779 --maxIters <number> [default: 250] 780 Maximum number of iterations to perform for each molecule during forcefield 781 minimization. 782 --mp <yes or no> [default: no] 783 Use multiprocessing. 784 785 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 786 function employing lazy RDKit data iterable. This allows processing of 787 arbitrary large data sets without any additional requirements memory. 788 789 All input data may be optionally loaded into memory by mp.Pool.map() 790 before starting worker processes in a process pool by setting the value 791 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 792 793 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 794 data mode may adversely impact the performance. The '--mpParams' section 795 provides additional information to tune the value of 'chunkSize'. 796 --mpParams <Name,Value,...> [default: auto] 797 A comma delimited list of parameter name and value pairs to configure 798 multiprocessing. 799 800 The supported parameter names along with their default and possible 801 values are shown below: 802 803 chunkSize, auto 804 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 805 numProcesses, auto [ Default: mp.cpu_count() ] 806 807 These parameters are used by the following functions to configure and 808 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 809 mp.Pool.imap(). 810 811 The chunkSize determines chunks of input data passed to each worker 812 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 813 The default value of chunkSize is dependent on the value of 'inputDataMode'. 814 815 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 816 automatically converts RDKit data iterable into a list, loads all data into 817 memory, and calculates the default chunkSize using the following method 818 as shown in its code: 819 820 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 821 if extra: chunkSize += 1 822 823 For example, the default chunkSize will be 7 for a pool of 4 worker processes 824 and 100 data items. 825 826 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 827 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 828 data into memory. Consequently, the size of input data is not known a priori. 829 It's not possible to estimate an optimal value for the chunkSize. The default 830 chunkSize is set to 1. 831 832 The default value for the chunkSize during 'Lazy' data mode may adversely 833 impact the performance due to the overhead associated with exchanging 834 small chunks of data. It is generally a good idea to explicitly set chunkSize to 835 a larger value during 'Lazy' input data mode, based on the size of your input 836 data and number of processes in the process pool. 837 838 The mp.Pool.map() function waits for all worker processes to process all 839 the data and return the results. The mp.Pool.imap() function, however, 840 returns the the results obtained from worker processes as soon as the 841 results become available for specified chunks of data. 842 843 The order of data in the results returned by both mp.Pool.map() and 844 mp.Pool.imap() functions always corresponds to the input data. 845 -o, --outfile <outfile> 846 Output file name. 847 --outfileParams <Name,Value,...> [default: auto] 848 A comma delimited list of parameter name and value pairs for writing 849 molecules to files. The supported parameter names for different file 850 formats, along with their default values, are shown below: 851 852 SD: kekulize,yes,forceV3000,no 853 854 --overwrite 855 Overwrite existing files. 856 -q, --quiet <yes or no> [default: no] 857 Use quiet mode. The warning and information messages will not be printed. 858 -r, --removeHydrogens <yes or no> [default: Yes] 859 Remove hydrogens after minimization. 860 --randomSeed <number> [default: auto] 861 Seed for the random number generator for reproducing 3D coordinates. 862 Default is to use a random seed. 863 -w, --workingdir <dir> 864 Location of working directory which defaults to the current directory. 865 866 Examples: 867 To generate conformers using Experimental Torsion-angle preference along 868 with basic Knowledge-terms and Distance Geometry (ETKDG) followed by 869 MMFF minimization with automatic determination of maximum number of 870 conformers for each molecule and write out a SD file, type: 871 872 % RDKitGenerateConformers.py -i Sample.smi -o SampleOut.sdf 873 874 To rerun the first example in a quiet mode and write out a SD file, type: 875 876 % RDKitGenerateConformers.py -q yes -i Sample.smi -o SampleOut.sdf 877 878 To rerun the first example in multiprocessing mode on all available CPUs 879 without loading all data into memory and write out a SD file, type: 880 881 % RDKitGenerateConformers.py --mp yes -i Sample.smi -o SampleOut.sdf 882 883 To run the first example in multiprocessing mode on all available CPUs 884 by loading all data into memory and write out a SD file, type: 885 886 % RDKitGenerateConformers.py --mp yes --mpParams "inputDataMode, 887 InMemory" -i Sample.smi -o SampleOut.sdf 888 889 To rerun the first example in multiprocessing mode on specific number of 890 CPUs and chunk size without loading all data into memory and write out a SD file, 891 type: 892 893 % RDKitGenerateConformers.py --mp yes --mpParams "inputDataMode,Lazy, 894 numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.sdf 895 896 To generate up to 150 conformers for each molecule using ETKDG and UFF forcefield 897 minimization along with conformers within 25 kcal/mol energy window and write out a 898 SD file, type: 899 900 % RDKitGenerateConformers.py --energyWindow 25 -f UFF --maxConfs 150 901 -i Sample.smi -o SampleOut.sdf 902 903 To generate up to 50 conformers for each molecule using KDG without any forcefield 904 minimization and alignment of conformers and write out a SD file, type: 905 906 % RDKitGenerateConformers.py -f none --maxConfs 50 --alignConformers no 907 -i Sample.sdf -o SampleOut.sdf 908 909 To generate up to 50 conformers using SDG without any forcefield minimization 910 and alignment of conformers for molecules in a CSV SMILES file, SMILES strings 911 in column 1, name in column 2, and write out a SD file, type: 912 913 % RDKitGenerateConformers.py --maxConfs 50 --maxIters 50 -c SDG 914 --alignConformers no -f none --infileParams "smilesDelimiter,comma, 915 smilesTitleLine,yes, smilesColumn,1,smilesNameColumn,2" 916 -i SampleSMILES.csv -o SampleOut.sdf 917 918 Author: 919 Manish Sud(msud@san.rr.com) 920 921 See also: 922 RDKitCalculateRMSD.py, RDKitCalculateMolecularDescriptors.py, 923 RDKitCompareMoleculeShapes.py, RDKitConvertFileFormat.py, 924 RDKitGenerateConstrainedConformers.py, RDKitPerformMinimization.py 925 926 Copyright: 927 Copyright (C) 2024 Manish Sud. All rights reserved. 928 929 The functionality available in this script is implemented using RDKit, an 930 open source toolkit for cheminformatics developed by Greg Landrum. 931 932 This file is part of MayaChemTools. 933 934 MayaChemTools is free software; you can redistribute it and/or modify it under 935 the terms of the GNU Lesser General Public License as published by the Free 936 Software Foundation; either version 3 of the License, or (at your option) any 937 later version. 938 939 """ 940 941 if __name__ == "__main__": 942 main()