1 #!/bin/env python 2 # 3 # File: Psi4GenerateConformers.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using Psi4, an 9 # open source quantum chemistry software package, and RDKit, an open 10 # source toolkit for cheminformatics developed by Greg Landrum. 11 # 12 # This file is part of MayaChemTools. 13 # 14 # MayaChemTools is free software; you can redistribute it and/or modify it under 15 # the terms of the GNU Lesser General Public License as published by the Free 16 # Software Foundation; either version 3 of the License, or (at your option) any 17 # later version. 18 # 19 # MayaChemTools is distributed in the hope that it will be useful, but without 20 # any warranty; without even the implied warranty of merchantability of fitness 21 # for a particular purpose. See the GNU Lesser General Public License for more 22 # details. 23 # 24 # You should have received a copy of the GNU Lesser General Public License 25 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 26 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 27 # Boston, MA, 02111-1307, USA. 28 # 29 30 from __future__ import print_function 31 32 # Add local python path to the global path and import standard library modules... 33 import os 34 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 35 import time 36 import re 37 import shutil 38 import multiprocessing as mp 39 40 # Psi4 imports... 41 if (hasattr(shutil, 'which') and shutil.which("psi4") is None): 42 sys.stderr.write("\nWarning: Failed to find 'psi4' in your PATH indicating potential issues with your\n") 43 sys.stderr.write("Psi4 environment. The 'import psi4' directive in the global scope of the script\n") 44 sys.stderr.write("interferes with the multiprocessing functionality. It is imported later in the\n") 45 sys.stderr.write("local scope during the execution of the script and may fail. Check/update your\n") 46 sys.stderr.write("Psi4 environment and try again.\n\n") 47 48 # RDKit imports... 49 try: 50 from rdkit import rdBase 51 from rdkit import Chem 52 from rdkit.Chem import AllChem 53 except ImportError as ErrMsg: 54 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 55 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 56 sys.exit(1) 57 58 # MayaChemTools imports... 59 try: 60 from docopt import docopt 61 import MiscUtil 62 import Psi4Util 63 import RDKitUtil 64 except ImportError as ErrMsg: 65 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 66 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 67 sys.exit(1) 68 69 ScriptName = os.path.basename(sys.argv[0]) 70 Options = {} 71 OptionsInfo = {} 72 73 def main(): 74 """Start execution of the script.""" 75 76 MiscUtil.PrintInfo("\n%s (Psi4: Imported later; RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 77 78 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 79 80 # Retrieve command line arguments and options... 81 RetrieveOptions() 82 83 # Process and validate command line arguments and options... 84 ProcessOptions() 85 86 # Perform actions required by the script... 87 GenerateConformers() 88 89 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 90 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 91 92 def GenerateConformers(): 93 """Generate conformers.""" 94 95 # Setup a molecule reader... 96 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"]) 97 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"]) 98 99 # Set up a molecule writer... 100 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 101 if Writer is None: 102 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 103 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"]) 104 105 MolCount, ValidMolCount, ConfGenFailedCount = ProcessMolecules(Mols, Writer) 106 107 if Writer is not None: 108 Writer.close() 109 110 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 111 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 112 MiscUtil.PrintInfo("Number of molecules failed during conformation generation or minimization: %d" % ConfGenFailedCount) 113 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + ConfGenFailedCount)) 114 115 def ProcessMolecules(Mols, Writer): 116 """Process molecules to generate conformers.""" 117 118 if OptionsInfo["MPMode"]: 119 return ProcessMoleculesUsingMultipleProcesses(Mols, Writer) 120 else: 121 return ProcessMoleculesUsingSingleProcess(Mols, Writer) 122 123 def ProcessMoleculesUsingSingleProcess(Mols, Writer): 124 """Process molecules to generate conformers using a single process.""" 125 126 # Intialize Psi4... 127 MiscUtil.PrintInfo("\nInitializing Psi4...") 128 Psi4Handle = Psi4Util.InitializePsi4(Psi4RunParams = OptionsInfo["Psi4RunParams"], Psi4OptionsParams = OptionsInfo["Psi4OptionsParams"], PrintVersion = True, PrintHeader = True) 129 OptionsInfo["psi4"] = Psi4Handle 130 131 # Setup max iterations global variable... 132 Psi4Util.UpdatePsi4OptionsParameters(Psi4Handle, {'GEOM_MAXITER': OptionsInfo["MaxIters"]}) 133 134 # Setup conversion factor for energy units... 135 SetupEnergyConversionFactor(Psi4Handle) 136 137 MiscUtil.PrintInfo("\nGenerating conformers and performing energy minimization...") 138 139 (MolCount, ValidMolCount, ConfGenFailedCount) = [0] * 3 140 for Mol in Mols: 141 MolCount += 1 142 143 if not CheckAndValidateMolecule(Mol, MolCount): 144 continue 145 146 # Setup 2D coordinates for SMILES input file... 147 if OptionsInfo["SMILESInfileStatus"]: 148 AllChem.Compute2DCoords(Mol) 149 150 ValidMolCount += 1 151 152 ConformerMol, CalcStatus, ConfIDs, ConfEnergies = GenerateMolConformers(Psi4Handle, Mol, MolCount) 153 154 if not CalcStatus: 155 if not OptionsInfo["QuietMode"]: 156 MiscUtil.PrintWarning("Failed to calculate energy for molecule %s" % RDKitUtil.GetMolName(Mol, MolCount)) 157 158 ConfGenFailedCount += 1 159 continue 160 161 WriteMolConformers(Writer, ConformerMol, MolCount, ConfIDs, ConfEnergies) 162 163 return (MolCount, ValidMolCount, ConfGenFailedCount) 164 165 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer): 166 """Process and minimize molecules using multiprocessing.""" 167 168 if OptionsInfo["MPLevelConformersMode"]: 169 return ProcessMoleculesUsingMultipleProcessesAtConformersLevel(Mols, Writer) 170 elif OptionsInfo["MPLevelMoleculesMode"]: 171 return ProcessMoleculesUsingMultipleProcessesAtMoleculesLevel(Mols, Writer) 172 else: 173 MiscUtil.PrintError("The value, %s, option \"--mpLevel\" is not supported." % (OptionsInfo["MPLevel"])) 174 175 def ProcessMoleculesUsingMultipleProcessesAtMoleculesLevel(Mols, Writer): 176 """Process molecules to generate conformers using multiprocessing at molecules level.""" 177 178 MiscUtil.PrintInfo("\nGenerating conformers and performing energy minimization using multiprocessing at molecules level...") 179 180 MPParams = OptionsInfo["MPParams"] 181 182 # Setup data for initializing a worker process... 183 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo)) 184 185 # Setup a encoded mols data iterable for a worker process... 186 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols) 187 188 # Setup process pool along with data initialization for each process... 189 if not OptionsInfo["QuietMode"]: 190 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 191 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 192 193 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs) 194 195 # Start processing... 196 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 197 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 198 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 199 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 200 else: 201 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 202 203 # Print out Psi4 version in the main process... 204 MiscUtil.PrintInfo("\nInitializing Psi4...\n") 205 Psi4Handle = Psi4Util.InitializePsi4(PrintVersion = True, PrintHeader = False) 206 OptionsInfo["psi4"] = Psi4Handle 207 208 (MolCount, ValidMolCount, ConfGenFailedCount) = [0] * 3 209 for Result in Results: 210 MolCount += 1 211 MolIndex, EncodedMol, CalcStatus, ConfIDs, ConfEnergies = Result 212 213 if EncodedMol is None: 214 continue 215 ValidMolCount += 1 216 217 if not CalcStatus: 218 ConfGenFailedCount += 1 219 continue 220 221 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 222 WriteMolConformers(Writer, Mol, MolCount, ConfIDs, ConfEnergies) 223 224 return (MolCount, ValidMolCount, ConfGenFailedCount) 225 226 def InitializeWorkerProcess(*EncodedArgs): 227 """Initialize data for a worker process.""" 228 229 global Options, OptionsInfo 230 231 if not OptionsInfo["QuietMode"]: 232 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 233 234 # Decode Options and OptionInfo... 235 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 236 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 237 238 # Psi4 is initialized in the worker process to avoid creation of redundant Psi4 239 # output files for each process... 240 OptionsInfo["Psi4Initialized"] = False 241 242 def WorkerProcess(EncodedMolInfo): 243 """Process data for a worker process.""" 244 245 if not OptionsInfo["Psi4Initialized"]: 246 InitializePsi4ForWorkerProcess() 247 248 MolIndex, EncodedMol = EncodedMolInfo 249 MolNum = MolIndex + 1 250 251 CalcStatus = False 252 ConfIDs = None 253 ConfEnergies = None 254 255 if EncodedMol is None: 256 return [MolIndex, None, CalcStatus, ConfIDs, ConfEnergies] 257 258 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 259 260 if not CheckAndValidateMolecule(Mol, MolNum): 261 return [MolIndex, None, CalcStatus, ConfIDs, ConfEnergies] 262 263 # Setup 2D coordinates for SMILES input file... 264 if OptionsInfo["SMILESInfileStatus"]: 265 AllChem.Compute2DCoords(Mol) 266 267 Mol, CalcStatus, ConfIDs, ConfEnergies = GenerateMolConformers(OptionsInfo["psi4"], Mol, MolNum) 268 269 return [MolIndex, RDKitUtil.MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps), CalcStatus, ConfIDs, ConfEnergies] 270 271 def ProcessMoleculesUsingMultipleProcessesAtConformersLevel(Mols, Writer): 272 """Process molecules to generate conformers using multiprocessing at conformers level.""" 273 274 MiscUtil.PrintInfo("\nPerforming minimization with generation of conformers using multiprocessing at conformers level...") 275 276 (MolCount, ValidMolCount, ConfGenFailedCount) = [0] * 3 277 for Mol in Mols: 278 MolCount += 1 279 280 if not CheckAndValidateMolecule(Mol, MolCount): 281 continue 282 283 # Setup 2D coordinates for SMILES input file... 284 if OptionsInfo["SMILESInfileStatus"]: 285 AllChem.Compute2DCoords(Mol) 286 287 ValidMolCount += 1 288 289 Mol, CalcStatus, ConfIDs, ConfEnergies = ProcessConformersUsingMultipleProcesses(Mol, MolCount) 290 291 if not CalcStatus: 292 ConfGenFailedCount += 1 293 continue 294 295 WriteMolConformers(Writer, Mol, MolCount, ConfIDs, ConfEnergies) 296 297 return (MolCount, ValidMolCount, ConfGenFailedCount) 298 299 def ProcessConformersUsingMultipleProcesses(Mol, MolNum): 300 """Generate coformers and minimize them using multiple processes.""" 301 302 # Add hydrogens... 303 Mol = Chem.AddHs(Mol) 304 305 # Setup conformers... 306 ConfIDs = EmbedMolecule(Mol, MolNum) 307 if not len(ConfIDs): 308 if not OptionsInfo["QuietMode"]: 309 MolName = RDKitUtil.GetMolName(Mol, MolNum) 310 MiscUtil.PrintWarning("Minimization couldn't be performed for molecule %s: Embedding failed...\n" % MolName) 311 return (Mol, False, None, None) 312 313 MPParams = OptionsInfo["MPParams"] 314 315 # Setup data for initializing a worker process... 316 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo)) 317 318 # Setup a encoded mols data iterable for a worker process... 319 MolIndex = MolNum - 1 320 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStringWithConfIDs(Mol, MolIndex, ConfIDs) 321 322 # Setup process pool along with data initialization for each process... 323 if not OptionsInfo["QuietMode"]: 324 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 325 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 326 327 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeConformerWorkerProcess, InitializeWorkerProcessArgs) 328 329 # Start processing... 330 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 331 Results = ProcessPool.imap(ConformerWorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 332 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 333 Results = ProcessPool.map(ConformerWorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 334 else: 335 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 336 337 CalcEnergyMap = {} 338 CalcFailedCount = 0 339 for Result in Results: 340 MolIndex, EncodedMol, CalcStatus, ConfID, Energy = Result 341 342 if EncodedMol is None: 343 CalcFailedCount += 1 344 continue 345 346 if not CalcStatus: 347 CalcFailedCount += 1 348 continue 349 350 # Retrieve minimized atom positions... 351 MinimizedMol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 352 AtomPositions = RDKitUtil.GetAtomPositions(MinimizedMol, ConfID = ConfID) 353 354 # Update atom positions... 355 RDKitUtil.SetAtomPositions(Mol, AtomPositions, ConfID = ConfID) 356 357 CalcEnergyMap[ConfID] = Energy 358 359 if CalcFailedCount: 360 return (Mol, False, None, None) 361 362 # Align molecules after minimization... 363 if OptionsInfo["ConfGenerationParams"]["AlignConformers"]: 364 AllChem.AlignMolConformers(Mol) 365 366 # Filter conformers... 367 SelectedConfIDs, SelectedConfEnergies = FilterMolConformers(Mol, MolNum, ConfIDs, CalcEnergyMap) 368 369 return [Mol, True, SelectedConfIDs, SelectedConfEnergies] 370 371 def InitializeConformerWorkerProcess(*EncodedArgs): 372 """Initialize data for a conformer worker process.""" 373 374 global Options, OptionsInfo 375 376 if not OptionsInfo["QuietMode"]: 377 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 378 379 # Decode Options and OptionInfo... 380 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 381 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 382 383 # Psi4 is initialized in the worker process to avoid creation of redundant Psi4 384 # output files for each process... 385 OptionsInfo["Psi4Initialized"] = False 386 387 def ConformerWorkerProcess(EncodedMolInfo): 388 """Process data for a conformer worker process.""" 389 390 if not OptionsInfo["Psi4Initialized"]: 391 InitializePsi4ForWorkerProcess() 392 393 MolIndex, EncodedMol, ConfID = EncodedMolInfo 394 395 MolNum = MolIndex + 1 396 397 CalcStatus = False 398 Energy = None 399 400 if EncodedMol is None: 401 return [MolIndex, None, CalcStatus, ConfID, Energy] 402 403 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 404 MolName = RDKitUtil.GetMolName(Mol, MolNum) 405 406 if not OptionsInfo["QuietMode"]: 407 MiscUtil.PrintInfo("Processing conformer ID %s for molecule %s..." % (ConfID, MolName)) 408 409 Status, ConvergeStatus = MinimizeMoleculeUsingForceField(Mol, MolNum, ConfID) 410 if not Status: 411 return [MolIndex, EncodedMol, CalcStatus, ConfID, Energy] 412 413 if ConvergeStatus != 0: 414 if not OptionsInfo["QuietMode"]: 415 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 416 MiscUtil.PrintWarning("Minimization using forcefield failed to converge for molecule %s in %d steps. Try using higher value for \"maxIters\" in \"--confParams\" option...\n" % (MolName, OptionsInfo["ConfGenerationParams"]["MaxIters"])) 417 418 # Perform Psi4 minimization... 419 CalcStatus, Energy = MinimizeMoleculeUsingPsi4(OptionsInfo["psi4"], Mol, MolNum, ConfID) 420 if not CalcStatus: 421 if not OptionsInfo["QuietMode"]: 422 MiscUtil.PrintWarning("Minimization couldn't be performed for molecule %s\n" % (MolName)) 423 return [MolIndex, EncodedMol, False, ConfID, Energy] 424 425 return [MolIndex, RDKitUtil.MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps), CalcStatus, ConfID, Energy] 426 427 def InitializePsi4ForWorkerProcess(): 428 """Initialize Psi4 for a worker process.""" 429 430 if OptionsInfo["Psi4Initialized"]: 431 return 432 433 OptionsInfo["Psi4Initialized"] = True 434 435 if OptionsInfo["MPLevelConformersMode"] and re.match("auto", OptionsInfo["Psi4RunParams"]["OutputFileSpecified"], re.I): 436 # Run Psi4 in quiet mode during multiprocessing at Conformers level for 'auto' OutputFile... 437 OptionsInfo["Psi4RunParams"]["OutputFile"] = "quiet" 438 else: 439 # Update output file... 440 OptionsInfo["Psi4RunParams"]["OutputFile"] = Psi4Util.UpdatePsi4OutputFileUsingPID(OptionsInfo["Psi4RunParams"]["OutputFile"], os.getpid()) 441 442 # Intialize Psi4... 443 OptionsInfo["psi4"] = Psi4Util.InitializePsi4(Psi4RunParams = OptionsInfo["Psi4RunParams"], Psi4OptionsParams = OptionsInfo["Psi4OptionsParams"], PrintVersion = False, PrintHeader = True) 444 445 # Setup max iterations global variable... 446 Psi4Util.UpdatePsi4OptionsParameters(OptionsInfo["psi4"], {'GEOM_MAXITER': OptionsInfo["MaxIters"]}) 447 448 # Setup conversion factor for energy units... 449 SetupEnergyConversionFactor(OptionsInfo["psi4"]) 450 451 def GenerateMolConformers(Psi4Handle, Mol, MolNum): 452 """Generate and mininize conformers for a molecule.""" 453 454 # Add hydrogens.. 455 Mol = Chem.AddHs(Mol) 456 457 MolName = RDKitUtil.GetMolName(Mol, MolNum) 458 459 # Setup conformers... 460 ConfIDs = EmbedMolecule(Mol, MolNum) 461 if not len(ConfIDs): 462 if not OptionsInfo["QuietMode"]: 463 MiscUtil.PrintWarning("Conformation generation couldn't be performed for molecule %s: Embedding failed...\n" % MolName) 464 return (Mol, False, None, None) 465 466 # Minimize conformers... 467 CalcEnergyMap = {} 468 for ConfID in ConfIDs: 469 if not OptionsInfo["QuietMode"]: 470 MiscUtil.PrintInfo("Processing conformer ID %s for molecule %s..." % (ConfID, MolName)) 471 472 # Perform forcefield minimization... 473 Status, ConvergeStatus = MinimizeMoleculeUsingForceField(Mol, MolNum, ConfID) 474 if not Status: 475 return (Mol, False, None, None) 476 477 if ConvergeStatus != 0: 478 if not OptionsInfo["QuietMode"]: 479 MiscUtil.PrintWarning("Minimization using forcefield failed to converge for molecule %s in %d steps. Try using higher value for \"maxIters\" in \"--confParams\" option...\n" % (MolName, OptionsInfo["ConfGenerationParams"]["MaxIters"])) 480 481 # Perform Psi4 minimization... 482 CalcStatus, Energy = MinimizeMoleculeUsingPsi4(Psi4Handle, Mol, MolNum, ConfID) 483 if not CalcStatus: 484 if not OptionsInfo["QuietMode"]: 485 MiscUtil.PrintWarning("Minimization couldn't be performed for molecule %s\n" % (MolName)) 486 return (Mol, False, None, None) 487 488 CalcEnergyMap[ConfID] = Energy 489 490 # Align molecules after minimization... 491 if OptionsInfo["ConfGenerationParams"]["AlignConformers"]: 492 AllChem.AlignMolConformers(Mol) 493 494 # Filter conformers... 495 SelectedConfIDs, SelectedConfEnergies = FilterMolConformers(Mol, MolNum, ConfIDs, CalcEnergyMap) 496 497 return [Mol, True, SelectedConfIDs, SelectedConfEnergies] 498 499 def FilterMolConformers(Mol, MolNum, ConfIDs, CalcEnergyMap): 500 """Filter conformers for a molecule.""" 501 502 SortedConfIDs = sorted(ConfIDs, key = lambda ConfID: CalcEnergyMap[ConfID]) 503 504 MinEnergyConfID = SortedConfIDs[0] 505 MinConfEnergy = CalcEnergyMap[MinEnergyConfID] 506 EnergyWindow = OptionsInfo["EnergyWindow"] 507 508 EnergyRMSDCutoff = OptionsInfo["EnergyRMSDCutoff"] 509 ApplyEnergyRMSDCutoff = False 510 if EnergyRMSDCutoff > 0: 511 ApplyEnergyRMSDCutoff = True 512 513 EnergyRMSDCutoffLowest = OptionsInfo["EnergyRMSDCutoffModeLowest"] 514 EnergyRMSDCalcModeBest = OptionsInfo["EnergyRMSDCalcModeBest"] 515 516 PreAligned = False 517 if OptionsInfo["ConfGenerationParams"]["AlignConformers"]: 518 PreAligned = True 519 520 RefMol, ProbeMol = [None] * 2 521 if EnergyRMSDCalcModeBest: 522 # Copy molecules for best RMSD calculations to avoid change in the coordinates 523 # of the conformations... 524 RefMol = AllChem.Mol(Mol) 525 ProbeMol = AllChem.Mol(Mol) 526 527 # Track conformers with in the specified energy window from the lowest 528 # energy conformation along with applying RMSD cutoff as needed... 529 # 530 SelectedConfIDs = [] 531 532 ConfCount = 0 533 IgnoredByEnergyConfCount = 0 534 IgnoredByRMSDConfCount = 0 535 536 FirstConf = True 537 538 for ConfID in SortedConfIDs: 539 if FirstConf: 540 FirstConf = False 541 ConfCount += 1 542 SelectedConfIDs.append(ConfID) 543 continue 544 545 ConfEnergyDiff = abs(CalcEnergyMap[ConfID] - MinConfEnergy ) 546 if ConfEnergyDiff > EnergyWindow: 547 IgnoredByEnergyConfCount += 1 548 continue 549 550 if ApplyEnergyRMSDCutoff: 551 IgnoreConf = False 552 if EnergyRMSDCutoffLowest: 553 # Compare RMSD with the lowest energy conformation... 554 if EnergyRMSDCalcModeBest: 555 CalcRMSD = AllChem.GetBestRMS(ProbeMol, RefMol, prbId = ConfID, refId = MinEnergyConfID) 556 else: 557 CalcRMSD = AllChem.GetConformerRMS(Mol, MinEnergyConfID, ConfID, prealigned=PreAligned) 558 if CalcRMSD < EnergyRMSDCutoff: 559 IgnoreConf = True 560 else: 561 for SelectedConfID in SelectedConfIDs: 562 if EnergyRMSDCalcModeBest: 563 CalcRMSD = AllChem.GetBestRMS(ProbeMol, RefMol, prbId = ConfID, refId = SelectedConfID) 564 else: 565 CalcRMSD = AllChem.GetConformerRMS(Mol, SelectedConfID, ConfID, prealigned=PreAligned) 566 if CalcRMSD < EnergyRMSDCutoff: 567 IgnoreConf = True 568 break 569 if IgnoreConf: 570 IgnoredByRMSDConfCount += 1 571 continue 572 573 ConfCount += 1 574 SelectedConfIDs.append(ConfID) 575 576 if not OptionsInfo["QuietMode"]: 577 MiscUtil.PrintInfo("\nTotal Number of conformations generated for %s: %d" % (RDKitUtil.GetMolName(Mol, MolNum), ConfCount)) 578 MiscUtil.PrintInfo("Number of conformations ignored due to energy window cutoff: %d" % (IgnoredByEnergyConfCount)) 579 if ApplyEnergyRMSDCutoff: 580 MiscUtil.PrintInfo("Number of conformations ignored due to energy RMSD cutoff: %d" % (IgnoredByRMSDConfCount)) 581 582 SelectedConfEnergies = None 583 if OptionsInfo["EnergyOut"]: 584 SelectedConfEnergies = [] 585 for ConfID in SelectedConfIDs: 586 Energy = "%.*f" % (OptionsInfo["Precision"], CalcEnergyMap[ConfID]) 587 SelectedConfEnergies.append(Energy) 588 589 return [SelectedConfIDs, SelectedConfEnergies] 590 591 def MinimizeMoleculeUsingPsi4(Psi4Handle, Mol, MolNum, ConfID = -1): 592 """Minimize molecule using Psi4.""" 593 594 # Setup a Psi4Mol... 595 Psi4Mol = SetupPsi4Mol(Psi4Handle, Mol, MolNum, ConfID) 596 if Psi4Mol is None: 597 return (False, None) 598 599 # Setup reference wave function... 600 Reference = SetupReferenceWavefunction(Mol) 601 Psi4Handle.set_options({'Reference': Reference}) 602 603 # Setup method name and basis set... 604 MethodName, BasisSet = SetupMethodNameAndBasisSet(Mol) 605 606 # Optimize geometry... 607 Status, Energy, WaveFunction = Psi4Util.PerformGeometryOptimization(Psi4Handle, Psi4Mol, MethodName, BasisSet, ReturnWaveFunction = True, Quiet = OptionsInfo["QuietMode"]) 608 609 if not Status: 610 PerformPsi4Cleanup(Psi4Handle) 611 return (False, None) 612 613 # Update atom positions... 614 AtomPositions = Psi4Util.GetAtomPositions(Psi4Handle, WaveFunction, InAngstroms = True) 615 RDKitUtil.SetAtomPositions(Mol, AtomPositions, ConfID = ConfID) 616 617 # Convert energy units... 618 if OptionsInfo["ApplyEnergyConversionFactor"]: 619 Energy = Energy * OptionsInfo["EnergyConversionFactor"] 620 621 # Clean up 622 PerformPsi4Cleanup(Psi4Handle) 623 624 return (True, Energy) 625 626 def MinimizeMoleculeUsingForceField(Mol, MolNum, ConfID = -1): 627 """Minimize molecule using forcefield available in RDKit.""" 628 629 try: 630 if OptionsInfo["ConfGenerationParams"]["UseUFF"]: 631 ConvergeStatus = AllChem.UFFOptimizeMolecule(Mol, confId = ConfID, maxIters = OptionsInfo["ConfGenerationParams"]["MaxIters"]) 632 elif OptionsInfo["ConfGenerationParams"]["UseMMFF"]: 633 ConvergeStatus = AllChem.MMFFOptimizeMolecule(Mol, confId = ConfID, maxIters = OptionsInfo["ConfGenerationParams"]["MaxIters"], mmffVariant = OptionsInfo["ConfGenerationParams"]["MMFFVariant"]) 634 else: 635 MiscUtil.PrintError("Minimization couldn't be performed: Specified forcefield, %s, is not supported" % OptionsInfo["ConfGenerationParams"]["ForceField"]) 636 except (ValueError, RuntimeError, Chem.rdchem.KekulizeException) as ErrMsg: 637 if not OptionsInfo["QuietMode"]: 638 MolName = RDKitUtil.GetMolName(Mol, MolNum) 639 MiscUtil.PrintWarning("Minimization using forcefield couldn't be performed for molecule %s:\n%s\n" % (MolName, ErrMsg)) 640 return (False, None) 641 642 return (True, ConvergeStatus) 643 644 def EmbedMolecule(Mol, MolNum = None): 645 """Embed conformations""" 646 647 ConfIDs = [] 648 649 MaxConfs = OptionsInfo["ConfGenerationParams"]["MaxConfs"] 650 RandomSeed = OptionsInfo["ConfGenerationParams"]["RandomSeed"] 651 EnforceChirality = OptionsInfo["ConfGenerationParams"]["EnforceChirality"] 652 UseExpTorsionAnglePrefs = OptionsInfo["ConfGenerationParams"]["UseExpTorsionAnglePrefs"] 653 ETVersion = OptionsInfo["ConfGenerationParams"]["ETVersion"] 654 UseBasicKnowledge = OptionsInfo["ConfGenerationParams"]["UseBasicKnowledge"] 655 EmbedRMSDCutoff = OptionsInfo["ConfGenerationParams"]["EmbedRMSDCutoff"] 656 657 try: 658 ConfIDs = AllChem.EmbedMultipleConfs(Mol, numConfs = MaxConfs, randomSeed = RandomSeed, pruneRmsThresh = EmbedRMSDCutoff, enforceChirality = EnforceChirality, useExpTorsionAnglePrefs = UseExpTorsionAnglePrefs, useBasicKnowledge = UseBasicKnowledge, ETversion = ETVersion) 659 except ValueError as ErrMsg: 660 if not OptionsInfo["QuietMode"]: 661 MolName = RDKitUtil.GetMolName(Mol, MolNum) 662 MiscUtil.PrintWarning("Embedding failed for molecule %s:\n%s\n" % (MolName, ErrMsg)) 663 ConfIDs = [] 664 665 if not OptionsInfo["QuietMode"]: 666 if EmbedRMSDCutoff > 0: 667 MiscUtil.PrintInfo("Generating initial conformation ensemble by distance geometry for %s - EmbedRMSDCutoff: %s; Size: %s; Size after RMSD filtering: %s" % (RDKitUtil.GetMolName(Mol, MolNum), EmbedRMSDCutoff, MaxConfs, len(ConfIDs))) 668 else: 669 MiscUtil.PrintInfo("Generating initial conformation ensemble by distance geometry for %s - EmbedRMSDCutoff: None; Size: %s" % (RDKitUtil.GetMolName(Mol, MolNum), len(ConfIDs))) 670 671 return ConfIDs 672 673 def SetupPsi4Mol(Psi4Handle, Mol, MolNum, ConfID = -1): 674 """Setup a Psi4 molecule object.""" 675 676 if OptionsInfo["RecenterAndReorient"]: 677 MolGeometry = RDKitUtil.GetPsi4XYZFormatString(Mol, ConfID = ConfID, NoCom = False, NoReorient = False) 678 else: 679 MolGeometry = RDKitUtil.GetPsi4XYZFormatString(Mol, ConfID = ConfID, NoCom = True, NoReorient = True) 680 681 try: 682 Psi4Mol = Psi4Handle.geometry(MolGeometry) 683 except Exception as ErrMsg: 684 Psi4Mol = None 685 if not OptionsInfo["QuietMode"]: 686 MiscUtil.PrintWarning("Failed to create Psi4 molecule from geometry string: %s\n" % ErrMsg) 687 MolName = RDKitUtil.GetMolName(Mol, MolNum) 688 MiscUtil.PrintWarning("Ignoring molecule: %s" % MolName) 689 690 if OptionsInfo["Symmetrize"]: 691 Psi4Mol.symmetrize(OptionsInfo["SymmetrizeTolerance"]) 692 693 return Psi4Mol 694 695 def PerformPsi4Cleanup(Psi4Handle): 696 """Perform clean up.""" 697 698 # Clean up after Psi4 run... 699 Psi4Handle.core.clean() 700 701 # Clean up any leftover scratch files... 702 if OptionsInfo["MPMode"]: 703 Psi4Util.RemoveScratchFiles(Psi4Handle, OptionsInfo["Psi4RunParams"]["OutputFile"]) 704 705 def CheckAndValidateMolecule(Mol, MolCount = None): 706 """Validate molecule for Psi4 calculations.""" 707 708 if Mol is None: 709 if not OptionsInfo["QuietMode"]: 710 MiscUtil.PrintInfo("\nProcessing molecule number %s..." % MolCount) 711 return False 712 713 MolName = RDKitUtil.GetMolName(Mol, MolCount) 714 if not OptionsInfo["QuietMode"]: 715 MiscUtil.PrintInfo("\nProcessing molecule %s..." % MolName) 716 717 if RDKitUtil.IsMolEmpty(Mol): 718 if not OptionsInfo["QuietMode"]: 719 MiscUtil.PrintWarning("Ignoring empty molecule: %s\n" % MolName) 720 return False 721 722 if not RDKitUtil.ValidateElementSymbols(RDKitUtil.GetAtomSymbols(Mol)): 723 if not OptionsInfo["QuietMode"]: 724 MiscUtil.PrintWarning("Ignoring molecule containing invalid element symbols: %s\n" % MolName) 725 return False 726 727 return True 728 729 def SetupMethodNameAndBasisSet(Mol): 730 """Setup method name and basis set.""" 731 732 MethodName = OptionsInfo["MethodName"] 733 if OptionsInfo["MethodNameAuto"]: 734 MethodName = "B3LYP" 735 736 BasisSet = OptionsInfo["BasisSet"] 737 if OptionsInfo["BasisSetAuto"]: 738 BasisSet = "6-31+G**" if RDKitUtil.IsAtomSymbolPresentInMol(Mol, "S") else "6-31G**" 739 740 return (MethodName, BasisSet) 741 742 def SetupReferenceWavefunction(Mol): 743 """Setup reference wavefunction.""" 744 745 Reference = OptionsInfo["Reference"] 746 if OptionsInfo["ReferenceAuto"]: 747 Reference = 'UHF' if (RDKitUtil.GetSpinMultiplicity(Mol) > 1) else 'RHF' 748 749 return Reference 750 751 def SetupEnergyConversionFactor(Psi4Handle): 752 """Setup converstion factor for energt units. The Psi4 energy units are Hartrees.""" 753 754 EnergyUnits = OptionsInfo["EnergyUnits"] 755 756 ApplyConversionFactor = True 757 if re.match("^kcal\/mol$", EnergyUnits, re.I): 758 ConversionFactor = Psi4Handle.constants.hartree2kcalmol 759 elif re.match("^kJ\/mol$", EnergyUnits, re.I): 760 ConversionFactor = Psi4Handle.constants.hartree2kJmol 761 elif re.match("^eV$", EnergyUnits, re.I): 762 ConversionFactor = Psi4Handle.constants.hartree2ev 763 else: 764 ApplyConversionFactor = False 765 ConversionFactor = 1.0 766 767 OptionsInfo["ApplyEnergyConversionFactor"] = ApplyConversionFactor 768 OptionsInfo["EnergyConversionFactor"] = ConversionFactor 769 770 def WriteMolConformers(Writer, Mol, MolNum, ConfIDs, ConfEnergies = None): 771 """Write molecule coformers.""" 772 773 if ConfIDs is None: 774 return True 775 776 MolName = RDKitUtil.GetMolName(Mol, MolNum) 777 778 for Index, ConfID in enumerate(ConfIDs): 779 SetConfMolName(Mol, MolName, ConfID) 780 781 if OptionsInfo["EnergyOut"] and ConfEnergies is not None: 782 Mol.SetProp(OptionsInfo["EnergyDataFieldLabel"], ConfEnergies[Index]) 783 784 Writer.write(Mol, confId = ConfID) 785 786 def SetConfMolName(Mol, MolName, ConfCount): 787 """Set conf mol name.""" 788 789 ConfName = "%s_Conf%d" % (MolName, ConfCount) 790 Mol.SetProp("_Name", ConfName) 791 792 793 def ProcessOptions(): 794 """Process and validate command line arguments and options.""" 795 796 MiscUtil.PrintInfo("Processing options...") 797 798 # Validate options... 799 ValidateOptions() 800 801 OptionsInfo["Infile"] = Options["--infile"] 802 OptionsInfo["SMILESInfileStatus"] = True if MiscUtil.CheckFileExt(Options["--infile"], "smi csv tsv txt") else False 803 ParamsDefaultInfoOverride = {"RemoveHydrogens": False} 804 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], InfileName = Options["--infile"], ParamsDefaultInfo = ParamsDefaultInfoOverride) 805 806 OptionsInfo["Outfile"] = Options["--outfile"] 807 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"]) 808 809 OptionsInfo["Overwrite"] = Options["--overwrite"] 810 811 # Method, basis set, and reference wavefunction... 812 OptionsInfo["BasisSet"] = Options["--basisSet"] 813 OptionsInfo["BasisSetAuto"] = True if re.match("^auto$", Options["--basisSet"], re.I) else False 814 815 OptionsInfo["MethodName"] = Options["--methodName"] 816 OptionsInfo["MethodNameAuto"] = True if re.match("^auto$", Options["--methodName"], re.I) else False 817 818 OptionsInfo["Reference"] = Options["--reference"] 819 OptionsInfo["ReferenceAuto"] = True if re.match("^auto$", Options["--reference"], re.I) else False 820 821 # Run and options parameters... 822 OptionsInfo["Psi4OptionsParams"] = Psi4Util.ProcessPsi4OptionsParameters("--psi4OptionsParams", Options["--psi4OptionsParams"]) 823 OptionsInfo["Psi4RunParams"] = Psi4Util.ProcessPsi4RunParameters("--psi4RunParams", Options["--psi4RunParams"], InfileName = OptionsInfo["Infile"]) 824 825 # Conformer generation paramaters... 826 ParamsDefaultInfoOverride = {"MaxConfs": 50, "MaxIters": 250} 827 OptionsInfo["ConfGenerationParams"] = MiscUtil.ProcessOptionConformerParameters("--confParams", Options["--confParams"], ParamsDefaultInfoOverride) 828 829 # Energy parameters... 830 OptionsInfo["EnergyOut"] = True if re.match("^yes$", Options["--energyOut"], re.I) else False 831 OptionsInfo["EnergyUnits"] = Options["--energyUnits"] 832 833 EnergyDataFieldLabel = Options["--energyDataFieldLabel"] 834 if re.match("^auto$", EnergyDataFieldLabel, re.I): 835 EnergyDataFieldLabel = "Psi4_Energy (%s)" % Options["--energyUnits"] 836 OptionsInfo["EnergyDataFieldLabel"] = EnergyDataFieldLabel 837 838 OptionsInfo["EnergyRMSDCalcMode"] = Options["--energyRMSDCalcMode"] 839 OptionsInfo["EnergyRMSDCalcModeBest"] = True if re.match("^BestRMSD$", Options["--energyRMSDCalcMode"], re.I) else False 840 841 OptionsInfo["EnergyRMSDCutoff"] = float(Options["--energyRMSDCutoff"]) 842 843 OptionsInfo["EnergyRMSDCutoffMode"] = Options["--energyRMSDCutoffMode"] 844 OptionsInfo["EnergyRMSDCutoffModeLowest"] = True if re.match("^Lowest$", Options["--energyRMSDCutoffMode"], re.I) else False 845 846 if OptionsInfo["EnergyRMSDCutoff"] > 0: 847 # Make sure that the alignConformers option is being used... 848 if not OptionsInfo["ConfGenerationParams"]["AlignConformers"]: 849 MiscUtil.PrintError("The value for \"alignConformers\" specified using \"--confParams\" must be set to \" yes\" for non-zero values of \"--energyRMSDCutoff\" ") 850 851 # Process energy window... 852 EnergyWindow = Options["--energyWindow"] 853 if re.match("^auto$", EnergyWindow, re.I): 854 # Set default energy window based on units... 855 EnergyUnits = Options["--energyUnits"] 856 if re.match("^kcal\/mol$", EnergyUnits, re.I): 857 EnergyWindow = 20 858 elif re.match("^kJ\/mol$", EnergyUnits, re.I): 859 EnergyWindow = 83.68 860 elif re.match("^eV$", EnergyUnits, re.I): 861 EnergyWindow = 0.8673 862 elif re.match("^Hartrees$", EnergyUnits, re.I): 863 EnergyWindow = 0.03188 864 else: 865 MiscUtil.PrintError("Failed to set default value for \"--energyWindow\". The value, %s, specified for option \"--energyUnits\" is not valid. " % EnergyUnits) 866 else: 867 EnergyWindow = float(EnergyWindow) 868 OptionsInfo["EnergyWindow"] = EnergyWindow 869 870 OptionsInfo["MaxIters"] = int(Options["--maxIters"]) 871 872 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 873 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 874 875 # Multiprocessing level... 876 MPLevelMoleculesMode = False 877 MPLevelConformersMode = False 878 MPLevel = Options["--mpLevel"] 879 if re.match("^Molecules$", MPLevel, re.I): 880 MPLevelMoleculesMode = True 881 elif re.match("^Conformers$", MPLevel, re.I): 882 MPLevelConformersMode = True 883 else: 884 MiscUtil.PrintError("The value, %s, specified for option \"--mpLevel\" is not valid. " % MPLevel) 885 OptionsInfo["MPLevel"] = MPLevel 886 OptionsInfo["MPLevelMoleculesMode"] = MPLevelMoleculesMode 887 OptionsInfo["MPLevelConformersMode"] = MPLevelConformersMode 888 889 OptionsInfo["Precision"] = int(Options["--precision"]) 890 OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False 891 892 OptionsInfo["RecenterAndReorient"] = True if re.match("^yes$", Options["--recenterAndReorient"], re.I) else False 893 894 Symmetrize = Options["--symmetrize"] 895 if re.match("^auto$", Symmetrize, re.I): 896 Symmetrize = True if OptionsInfo["RecenterAndReorient"] else False 897 else: 898 Symmetrize = True if re.match("^yes$", Symmetrize, re.I) else False 899 OptionsInfo["Symmetrize"] = Symmetrize 900 901 OptionsInfo["SymmetrizeTolerance"] = float(Options["--symmetrizeTolerance"]) 902 903 def RetrieveOptions(): 904 """Retrieve command line arguments and options.""" 905 906 # Get options... 907 global Options 908 Options = docopt(_docoptUsage_) 909 910 # Set current working directory to the specified directory... 911 WorkingDir = Options["--workingdir"] 912 if WorkingDir: 913 os.chdir(WorkingDir) 914 915 # Handle examples option... 916 if "--examples" in Options and Options["--examples"]: 917 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 918 sys.exit(0) 919 920 def ValidateOptions(): 921 """Validate option values.""" 922 923 MiscUtil.ValidateOptionTextValue("--energyOut", Options["--energyOut"], "yes no") 924 MiscUtil.ValidateOptionTextValue("--energyUnits", Options["--energyUnits"], "Hartrees kcal/mol kJ/mol eV") 925 926 MiscUtil.ValidateOptionTextValue(" --energyRMSDCalcMode", Options["--energyRMSDCalcMode"], "RMSD BestRMSD") 927 928 MiscUtil.ValidateOptionFloatValue("--energyRMSDCutoff", Options["--energyRMSDCutoff"], {">=": 0}) 929 MiscUtil.ValidateOptionTextValue(" --energyRMSDCutoffMode", Options["--energyRMSDCutoffMode"], "All Lowest") 930 931 if not re.match("^auto$", Options["--energyWindow"], re.I): 932 MiscUtil.ValidateOptionFloatValue("--energyWindow", Options["--energyWindow"], {">": 0}) 933 934 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 935 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv") 936 937 MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0}) 938 939 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd") 940 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 941 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 942 943 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 944 MiscUtil.ValidateOptionTextValue("--mpLevel", Options["--mpLevel"], "Molecules Conformers") 945 946 MiscUtil.ValidateOptionIntegerValue("-p, --precision", Options["--precision"], {">": 0}) 947 MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no") 948 949 MiscUtil.ValidateOptionTextValue("--recenterAndReorient", Options["--recenterAndReorient"], "yes no") 950 MiscUtil.ValidateOptionTextValue("--symmetrize", Options["--symmetrize"], "yes no auto") 951 MiscUtil.ValidateOptionFloatValue("--symmetrizeTolerance", Options["--symmetrizeTolerance"], {">": 0}) 952 953 # Setup a usage string for docopt... 954 _docoptUsage_ = """ 955 Psi4GenerateConformers.py - Generate molecular conformations 956 957 Usage: 958 Psi4GenerateConformers.py [--basisSet <text>] [--confParams <Name,Value,...>] [--energyOut <yes or no>] 959 [--energyDataFieldLabel <text>] [--energyUnits <text>] [--energyRMSDCalcMode <RMSD or BestRMSD>] 960 [--energyRMSDCutoff <number>] [--energyRMSDCutoffMode <All or Lowest>] [--energyWindow <number>] 961 [--infileParams <Name,Value,...>] [--maxIters <number>] 962 [--methodName <text>] [--mp <yes or no>] [--mpLevel <Molecules or Conformers>] 963 [--mpParams <Name, Value,...>] [ --outfileParams <Name,Value,...> ] [--overwrite] [--precision <number>] 964 [--psi4OptionsParams <Name,Value,...>] [--psi4RunParams <Name,Value,...>] 965 [--quiet <yes or no>] [--reference <text>] [--recenterAndReorient <yes or no>] 966 [--symmetrize <yes or no>] [--symmetrizeTolerance <number>] [-w <dir>] -i <infile> -o <outfile> 967 Psi4GenerateConformers.py -h | --help | -e | --examples 968 969 Description: 970 Generate 3D conformers of molecules using a combination of distance geometry 971 and forcefield minimization followed by geometry optimization using a quantum 972 chemistry method. A set of initial 3D structures are generated for a molecule 973 employing distance geometry. The 3D structures in the conformation ensemble 974 are sequentially minimized using forcefield and a quantum chemistry method. 975 976 A Psi4 XYZ format geometry string is automatically generated for each molecule 977 in input file. It contains atom symbols and 3D coordinates for each atom in a 978 molecule. In addition, the formal charge and spin multiplicity are present in the 979 the geometry string. These values are either retrieved from molecule properties 980 named 'FormalCharge' and 'SpinMultiplicty' or dynamically calculated for a 981 molecule. 982 983 The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi, 984 .csv, .tsv .txt) 985 986 The supported output file formats are: SD (.sdf, .sd) 987 988 Options: 989 -b, --basisSet <text> [default: auto] 990 Basis set to use for energy minimization. Default: 6-31+G** for sulfur 991 containing molecules; Otherwise, 6-31G** [ Ref 150 ]. The specified 992 value must be a valid Psi4 basis set. No validation is performed. 993 994 The following list shows a representative sample of basis sets available 995 in Psi4: 996 997 STO-3G, 6-31G, 6-31+G, 6-31++G, 6-31G*, 6-31+G*, 6-31++G*, 998 6-31G**, 6-31+G**, 6-31++G**, 6-311G, 6-311+G, 6-311++G, 999 6-311G*, 6-311+G*, 6-311++G*, 6-311G**, 6-311+G**, 6-311++G**, 1000 cc-pVDZ, cc-pCVDZ, aug-cc-pVDZ, cc-pVDZ-DK, cc-pCVDZ-DK, def2-SVP, 1001 def2-SVPD, def2-TZVP, def2-TZVPD, def2-TZVPP, def2-TZVPPD 1002 1003 --confParams <Name,Value,...> [default: auto] 1004 Generate an initial 3D conformation ensemble using distance geometry and 1005 forcefield minimization before final geometry optimization by a specified 1006 method name and basis set. Possible values: yes or no. 1007 1008 A comma delimited list of parameter name and value pairs for generating 1009 initial sets of 3D conformations for molecules. The 3D conformation ensemble 1010 is generated using distance geometry and forcefield functionality available 1011 in RDKit. The 3D structures in the conformation ensemble are subsequently 1012 minimized by a quantum chemistry method available in Psi4. 1013 1014 The supported parameter names along with their default values are shown 1015 below: 1016 1017 confMethod,ETKDGv2, 1018 forceField,MMFF, forceFieldMMFFVariant,MMFF94, 1019 enforceChirality,yes,alignConformers,yes, embedRMSDCutoff,0.5, 1020 maxConfs,50,maxIters,250,randomSeed,auto 1021 1022 confMethod,ETKDGv2 [ Possible values: SDG, KDG, ETDG, 1023 ETKDG , or ETKDGv2] 1024 forceField, MMFF [ Possible values: UFF or MMFF ] 1025 forceFieldMMFFVariant,MMFF94 [ Possible values: MMFF94 or MMFF94s ] 1026 enforceChirality,yes [ Possible values: yes or no ] 1027 alignConformers,yes [ Possible values: yes or no ] 1028 embedRMSDCutoff,0.5 [ Possible values: number or None] 1029 1030 confMethod: Conformation generation methodology for generating initial 3D 1031 coordinates. Possible values: Standard Distance Geometry (SDG), Experimental 1032 Torsion-angle preference with Distance Geometry (ETDG), basic Knowledge-terms 1033 with Distance Geometry (KDG) and Experimental Torsion-angle preference 1034 along with basic Knowledge-terms with Distance Geometry (ETKDG or 1035 ETKDGv2) [Ref 129, 167] . 1036 1037 forceField: Forcefield method to use for energy minimization. Possible 1038 values: Universal Force Field (UFF) [ Ref 81 ] or Merck Molecular Mechanics 1039 Force Field [ Ref 83-87 ] . 1040 1041 enforceChirality: Enforce chirality for defined chiral centers during 1042 forcefield minimization. 1043 1044 alignConformers: Align conformers for each molecule. 1045 1046 maxConfs: Maximum number of conformations to generate for each molecule 1047 during the generation of an initial 3D conformation ensemble using 1048 conformation generation methodology. The conformations are minimized 1049 using the specified forcefield and a quantum chemistry method. The lowest 1050 energy conformation is written to the output file. 1051 1052 embedRMSDCutoff: RMSD cutoff for retaining initial set of conformers embedded 1053 using distance geometry and before forcefield minimization. All embedded 1054 conformers are kept for 'None' value. Otherwise, only those conformers which 1055 are different from each other by the specified RMSD cutoff, 0.5 by default, 1056 are kept. The first embedded conformer is always retained. 1057 1058 maxIters: Maximum number of iterations to perform for each conformation 1059 during forcefield minimization. 1060 1061 randomSeed: Seed for the random number generator for reproducing initial 1062 3D coordinates in a conformation ensemble. Default is to use a random seed. 1063 --energyOut <yes or no> [default: yes] 1064 Write out energy values. 1065 --energyDataFieldLabel <text> [default: auto] 1066 Energy data field label for writing energy values. Default: Psi4_Energy (<Units>). 1067 --energyUnits <text> [default: kcal/mol] 1068 Energy units. Possible values: Hartrees, kcal/mol, kJ/mol, or eV. 1069 --energyRMSDCalcMode <RMSD or BestRMSD> [default: RMSD] 1070 Methodology for calculating RMSD values during the application of RMSD 1071 cutoff for retaining conformations after the final energy minimization. Possible 1072 values: RMSD or BestRMSD. This option is ignore during 'None' value of 1073 '--energyRMSDCutoff' option. 1074 1075 During BestRMSMode mode, the RDKit 'function AllChem.GetBestRMS' is used to 1076 align and calculate RMSD. This function calculates optimal RMSD for aligning two 1077 molecules, taking symmetry into account. Otherwise, the RMSD value is calculated 1078 using 'AllChem.GetConformerRMS' without changing the atom order. A word to the 1079 wise from RDKit documentation: The AllChem.GetBestRMS function will attempt to 1080 align all permutations of matching atom orders in both molecules, for some molecules 1081 it will lead to 'combinatorial explosion'. 1082 --energyRMSDCutoff <number> [default: 0.5] 1083 RMSD cutoff for retaining conformations after the final energy minimization. 1084 By default, only those conformations which are different from the lowest 1085 energy conformation by the specified RMSD cutoff and are with in the 1086 specified energy window are kept. The lowest energy conformation is always 1087 retained. A value of zero keeps all minimized conformations with in the 1088 specified energy window from the lowest energy. 1089 --energyRMSDCutoffMode <All or Lowest> [default: All] 1090 RMSD cutoff mode for retaining conformations after the final energy 1091 minimization. Possible values: All or Lowest. The RMSD values are compared 1092 against all the selected conformations or the lowest energy conformation during 1093 'All' and 'Lowest' value of '--energyRMSDCutoffMode'. This option is ignored 1094 during 'None' value of --energyRMSDCutoff. 1095 1096 By default, only those conformations which all different from all selected 1097 conformations by the specified RMSD cutoff and are with in the specified 1098 energy window are kept. 1099 --energyWindow <number> [default: auto] 1100 Psi4 Energy window for selecting conformers after the final energy minimization. 1101 The default value is dependent on '--energyUnits': 20 kcal/mol, 83.68 kJ/mol, 1102 0.8673 ev, or 0.03188 Hartrees. The specified value must be in '--energyUnits'. 1103 -e, --examples 1104 Print examples. 1105 -h, --help 1106 Print this help message. 1107 -i, --infile <infile> 1108 Input file name. 1109 --infileParams <Name,Value,...> [default: auto] 1110 A comma delimited list of parameter name and value pairs for reading 1111 molecules from files. The supported parameter names for different file 1112 formats, along with their default values, are shown below: 1113 1114 SD, MOL: removeHydrogens,no,sanitize,yes,strictParsing,yes 1115 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 1116 smilesTitleLine,auto,sanitize,yes 1117 1118 Possible values for smilesDelimiter: space, comma or tab. 1119 --maxIters <number> [default: 50] 1120 Maximum number of iterations to perform for each molecule or conformer 1121 during energy minimization by a quantum chemistry method. 1122 -m, --methodName <text> [default: auto] 1123 Method to use for energy minimization. Default: B3LYP [ Ref 150 ]. The 1124 specified value must be a valid Psi4 method name. No validation is 1125 performed. 1126 1127 The following list shows a representative sample of methods available 1128 in Psi4: 1129 1130 B1LYP, B2PLYP, B2PLYP-D3BJ, B2PLYP-D3MBJ, B3LYP, B3LYP-D3BJ, 1131 B3LYP-D3MBJ, CAM-B3LYP, CAM-B3LYP-D3BJ, HF, HF-D3BJ, HF3c, M05, 1132 M06, M06-2x, M06-HF, M06-L, MN12-L, MN15, MN15-D3BJ,PBE, PBE0, 1133 PBEH3c, PW6B95, PW6B95-D3BJ, WB97, WB97X, WB97X-D, WB97X-D3BJ 1134 1135 --mp <yes or no> [default: no] 1136 Use multiprocessing. 1137 1138 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 1139 function employing lazy RDKit data iterable. This allows processing of 1140 arbitrary large data sets without any additional requirements memory. 1141 1142 All input data may be optionally loaded into memory by mp.Pool.map() 1143 before starting worker processes in a process pool by setting the value 1144 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 1145 1146 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 1147 data mode may adversely impact the performance. The '--mpParams' section 1148 provides additional information to tune the value of 'chunkSize'. 1149 --mpLevel <Molecules or Conformers> [default: Molecules] 1150 Perform multiprocessing at molecules or conformers level. Possible values: 1151 Molecules or Conformers. The 'Molecules' value starts a process pool at the 1152 molecules level. All conformers of a molecule are processed in a single 1153 process. The 'Conformers' value, however, starts a process pool at the 1154 conformers level. Each conformer of a molecule is processed in an individual 1155 process in the process pool. The default Psi4 'OutputFile' is set to 'quiet' 1156 using '--psi4RunParams' for 'Conformers' level. Otherwise, it may generate 1157 a large number of Psi4 output files. 1158 --mpParams <Name,Value,...> [default: auto] 1159 A comma delimited list of parameter name and value pairs to configure 1160 multiprocessing. 1161 1162 The supported parameter names along with their default and possible 1163 values are shown below: 1164 1165 chunkSize, auto 1166 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 1167 numProcesses, auto [ Default: mp.cpu_count() ] 1168 1169 These parameters are used by the following functions to configure and 1170 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 1171 mp.Pool.imap(). 1172 1173 The chunkSize determines chunks of input data passed to each worker 1174 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 1175 The default value of chunkSize is dependent on the value of 'inputDataMode'. 1176 1177 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 1178 automatically converts RDKit data iterable into a list, loads all data into 1179 memory, and calculates the default chunkSize using the following method 1180 as shown in its code: 1181 1182 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 1183 if extra: chunkSize += 1 1184 1185 For example, the default chunkSize will be 7 for a pool of 4 worker processes 1186 and 100 data items. 1187 1188 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 1189 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 1190 data into memory. Consequently, the size of input data is not known a priori. 1191 It's not possible to estimate an optimal value for the chunkSize. The default 1192 chunkSize is set to 1. 1193 1194 The default value for the chunkSize during 'Lazy' data mode may adversely 1195 impact the performance due to the overhead associated with exchanging 1196 small chunks of data. It is generally a good idea to explicitly set chunkSize to 1197 a larger value during 'Lazy' input data mode, based on the size of your input 1198 data and number of processes in the process pool. 1199 1200 The mp.Pool.map() function waits for all worker processes to process all 1201 the data and return the results. The mp.Pool.imap() function, however, 1202 returns the the results obtained from worker processes as soon as the 1203 results become available for specified chunks of data. 1204 1205 The order of data in the results returned by both mp.Pool.map() and 1206 mp.Pool.imap() functions always corresponds to the input data. 1207 -o, --outfile <outfile> 1208 Output file name. 1209 --outfileParams <Name,Value,...> [default: auto] 1210 A comma delimited list of parameter name and value pairs for writing 1211 molecules to files. The supported parameter names for different file 1212 formats, along with their default values, are shown below: 1213 1214 SD: kekulize,yes,forceV3000,no 1215 1216 --overwrite 1217 Overwrite existing files. 1218 --precision <number> [default: 6] 1219 Floating point precision for writing energy values. 1220 --psi4OptionsParams <Name,Value,...> [default: none] 1221 A comma delimited list of Psi4 option name and value pairs for setting 1222 global and module options. The names are 'option_name' for global options 1223 and 'module_name__option_name' for options local to a module. The 1224 specified option names must be valid Psi4 names. No validation is 1225 performed. 1226 1227 The specified option name and value pairs are processed and passed to 1228 psi4.set_options() as a dictionary. The supported value types are float, 1229 integer, boolean, or string. The float value string is converted into a float. 1230 The valid values for a boolean string are yes, no, true, false, on, or off. 1231 --psi4RunParams <Name,Value,...> [default: auto] 1232 A comma delimited list of parameter name and value pairs for configuring 1233 Psi4 jobs. 1234 1235 The supported parameter names along with their default and possible 1236 values are shown below: 1237 1238 MemoryInGB, 1 1239 NumThreads, 1 1240 OutputFile, auto [ Possible values: stdout, quiet, or FileName ] 1241 ScratchDir, auto [ Possivle values: DirName] 1242 RemoveOutputFile, yes [ Possible values: yes, no, true, or false] 1243 1244 These parameters control the runtime behavior of Psi4. 1245 1246 The default file name for 'OutputFile' is <InFileRoot>_Psi4.out. The PID 1247 is appended to output file name during multiprocessing as shown: 1248 <InFileRoot>_Psi4_<PIDNum>.out. The 'stdout' value for 'OutputType' 1249 sends Psi4 output to stdout. The 'quiet' or 'devnull' value suppresses 1250 all Psi4 output. The 'OutputFile' is set to 'quiet' for 'auto' value during 1251 'Conformers' of '--mpLevel' option. 1252 1253 The default 'Yes' value of 'RemoveOutputFile' option forces the removal 1254 of any existing Psi4 before creating new files to append output from 1255 multiple Psi4 runs. 1256 1257 The option 'ScratchDir' is a directory path to the location of scratch 1258 files. The default value corresponds to Psi4 default. It may be used to 1259 override the deafult path. 1260 -q, --quiet <yes or no> [default: no] 1261 Use quiet mode. The warning and information messages will not be printed. 1262 -r, --reference <text> [default: auto] 1263 Reference wave function to use for energy calculation. Default: RHF or UHF. 1264 The default values are Restricted Hartree-Fock (RHF) for closed-shell molecules 1265 with all electrons paired and Unrestricted Hartree-Fock (UHF) for open-shell 1266 molecules with unpaired electrons. 1267 1268 The specified value must be a valid Psi4 reference wave function. No validation 1269 is performed. For example: ROHF, CUHF, RKS, etc. 1270 1271 The spin multiplicity determines the default value of reference wave function 1272 for input molecules. It is calculated from number of free radical electrons using 1273 Hund's rule of maximum multiplicity defined as 2S + 1 where S is the total 1274 electron spin. The total spin is 1/2 the number of free radical electrons in a 1275 molecule. The value of 'SpinMultiplicity' molecule property takes precedence 1276 over the calculated value of spin multiplicity. 1277 --recenterAndReorient <yes or no> [default: yes] 1278 Recenter and reorient a molecule during creation of a Psi4 molecule from 1279 a geometry string. 1280 1281 The 'No' values allows the minimization of a molecule in its initial 3D 1282 coordinate space generated by RDKit. 1283 --symmetrize <yes or no> [default: auto] 1284 Symmetrize molecules before energy minimization. Default: 'Yes' during 1285 'Yes' value of '--recenterAndReorient'; Otherwise, 'No'. The psi4 function, 1286 psi4mol.symmetrize( SymmetrizeTolerance), is called to symmetrize 1287 the molecule before calling psi4.optimize(). 1288 1289 The 'No' value of '--symmetrize' during 'Yes' value of '--recenterAndReorient' 1290 may cause psi4.optimize() to fail with a 'Point group changed...' error 1291 message. 1292 --symmetrizeTolerance <number> [default: 0.01] 1293 Symmetry tolerance for '--symmetrize'. 1294 -w, --workingdir <dir> 1295 Location of working directory which defaults to the current directory. 1296 1297 Examples: 1298 To generate an initial conformer ensemble of up to 50 conformations using a 1299 combination of ETKDGv2 distance geometry methodology, applying embed RMSD 1300 cutoff of 0.5 and MMFF forcefield minimization, followed by energy minimization 1301 using B3LYP/6-31G** or B3LYP/6-31+G** (sulfur containing), selecting a final set 1302 of minimized conformers for molecules in a SMILES file, applying energy RMSD 1303 cutoff of 0.5 and energy window value value of 20 kcal/mol, and write out a SD 1304 file containing minimized conformers, type: 1305 1306 % Psi4GenerateConformers.py -i Psi4Sample.smi -o Psi4SampleOut.sdf 1307 1308 To run the first example in a quiet mode and write out a SD file, type: 1309 1310 % Psi4GenerateConformers.py -q yes -i Psi4Sample.smi -o 1311 Psi4SampleOut.sdf 1312 1313 To run the first example in multiprocessing mode at molecules level on all 1314 available CPUs without loading all data into memory and write out a SD file, 1315 type: 1316 1317 % Psi4GenerateConformers.py --mp yes -i Psi4Sample.smi -o 1318 Psi4SampleOut.sdf 1319 1320 To run the first example in multiprocessing mode at conformers level on all 1321 available CPUs without loading all data into memory and write out a SD file, 1322 type: 1323 1324 % Psi4GenerateConformers.py --mp yes --mpLevel Conformers 1325 -i Psi4Sample.smi -o Psi4SampleOut.sdf 1326 1327 To run the first example in multiprocessing mode at molecules level on specific 1328 number of CPUs and chunk size without loading all data into memory and write 1329 out a SD file, type: 1330 1331 % Psi4GenerateConformers.py --mp yes --mpParams "inputDataMode,Lazy, 1332 numProcesses,4,chunkSize,8" -i Psi4Sample.smi -o Psi4SampleOut.sdf 1333 1334 To run the first example by using an explicit set of specific parameters, and 1335 write out a SD file, type 1336 1337 % Psi4GenerateConformers.py --confParams "confMethod,ETKDGv2, 1338 forceField,MMFF, forceFieldMMFFVariant,MMFF94s, maxConfs,20, 1339 embedRMSDCutoff,0.25" --energyUnits "kJ/mol" -m B3LYP 1340 -b "6-31+G**" --maxIters 20 -i Psi4Sample.smi -o Psi4SampleOut.sdf 1341 1342 To run the first example for molecules in a CSV SMILES file, SMILES strings 1343 in column 1, name column 2, and write out a SD file, type: 1344 1345 % Psi4GenerateConformers.py --infileParams "smilesDelimiter,comma, 1346 smilesTitleLine,yes,smilesColumn,1,smilesNameColumn,2" 1347 -i Psi4Sample.csv -o Psi4SampleOut.sdf 1348 1349 Author: 1350 1351 Manish Sud(msud@san.rr.com) 1352 1353 See also: 1354 Psi4CalculateEnergy.py, Psi4CalculatePartialCharges.py, Psi4PerformMinimization.py 1355 1356 Copyright: 1357 Copyright (C) 2024 Manish Sud. All rights reserved. 1358 1359 The functionality available in this script is implemented using Psi4, an 1360 open source quantum chemistry software package, and RDKit, an open 1361 source toolkit for cheminformatics developed by Greg Landrum. 1362 1363 This file is part of MayaChemTools. 1364 1365 MayaChemTools is free software; you can redistribute it and/or modify it under 1366 the terms of the GNU Lesser General Public License as published by the Free 1367 Software Foundation; either version 3 of the License, or (at your option) any 1368 later version. 1369 1370 """ 1371 1372 if __name__ == "__main__": 1373 main()