1 #!/bin/env python 2 # 3 # File: RDKitCalculateEnergy.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem import AllChem 43 except ImportError as ErrMsg: 44 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 45 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 46 sys.exit(1) 47 48 # MayaChemTools imports... 49 try: 50 from docopt import docopt 51 import MiscUtil 52 import RDKitUtil 53 except ImportError as ErrMsg: 54 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 55 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 56 sys.exit(1) 57 58 ScriptName = os.path.basename(sys.argv[0]) 59 Options = {} 60 OptionsInfo = {} 61 62 def main(): 63 """Start execution of the script.""" 64 65 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 66 67 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 68 69 # Retrieve command line arguments and options... 70 RetrieveOptions() 71 72 # Process and validate command line arguments and options... 73 ProcessOptions() 74 75 # Perform actions required by the script... 76 CalculateEnergy() 77 78 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 79 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 80 81 def CalculateEnergy(): 82 """Calculate single point energy.""" 83 84 # Setup a molecule reader... 85 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"]) 86 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"]) 87 88 # Set up a molecule writer... 89 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 90 if Writer is None: 91 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 92 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"]) 93 94 MolCount, ValidMolCount, EnergyFailedCount = ProcessMolecules(Mols, Writer) 95 96 if Writer is not None: 97 Writer.close() 98 99 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 100 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 101 MiscUtil.PrintInfo("Number of molecules failed during energy calculation: %d" % EnergyFailedCount) 102 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + EnergyFailedCount)) 103 104 def ProcessMolecules(Mols, Writer): 105 """Process and calculate energy of molecules.""" 106 107 if OptionsInfo["MPMode"]: 108 return ProcessMoleculesUsingMultipleProcesses(Mols, Writer) 109 else: 110 return ProcessMoleculesUsingSingleProcess(Mols, Writer) 111 112 def ProcessMoleculesUsingSingleProcess(Mols, Writer): 113 """Process and calculate energy of molecules using a single process.""" 114 115 MiscUtil.PrintInfo("\nCalculating energy...") 116 117 (MolCount, ValidMolCount, EnergyFailedCount) = [0] * 3 118 for Mol in Mols: 119 MolCount += 1 120 121 if Mol is None: 122 continue 123 124 if RDKitUtil.IsMolEmpty(Mol): 125 if not OptionsInfo["QuietMode"]: 126 MolName = RDKitUtil.GetMolName(Mol, MolCount) 127 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 128 continue 129 130 ValidMolCount += 1 131 132 CalcStatus, Energy = CalculateMoleculeEnergy(Mol, MolCount) 133 if CalcStatus: 134 Energy = "%.2f" % Energy 135 else: 136 if not OptionsInfo["QuietMode"]: 137 MolName = RDKitUtil.GetMolName(Mol, MolCount) 138 MiscUtil.PrintWarning("Failed to calculate energy for molecule %s" % MolName) 139 140 EnergyFailedCount += 1 141 continue 142 143 WriteMolecule(Writer, Mol, Energy) 144 145 return (MolCount, ValidMolCount, EnergyFailedCount) 146 147 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer): 148 """Process and calculate energy of molecules using process.""" 149 150 MiscUtil.PrintInfo("\nCalculating energy using multiprocessing...") 151 152 MPParams = OptionsInfo["MPParams"] 153 154 # Setup data for initializing a worker process... 155 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo)) 156 157 # Setup a encoded mols data iterable for a worker process... 158 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols) 159 160 # Setup process pool along with data initialization for each process... 161 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 162 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 163 164 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs) 165 166 # Start processing... 167 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 168 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 169 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 170 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 171 else: 172 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 173 174 (MolCount, ValidMolCount, EnergyFailedCount) = [0] * 3 175 for Result in Results: 176 MolCount += 1 177 MolIndex, EncodedMol, CalcStatus, Energy = Result 178 179 if EncodedMol is None: 180 continue 181 ValidMolCount += 1 182 183 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 184 if CalcStatus: 185 Energy = "%.2f" % Energy 186 else: 187 if not OptionsInfo["QuietMode"]: 188 MolName = RDKitUtil.GetMolName(Mol, MolCount) 189 MiscUtil.PrintWarning("Failed to calculate energy for molecule %s" % MolName) 190 191 EnergyFailedCount += 1 192 continue 193 194 WriteMolecule(Writer, Mol, Energy) 195 196 return (MolCount, ValidMolCount, EnergyFailedCount) 197 198 def InitializeWorkerProcess(*EncodedArgs): 199 """Initialize data for a worker process.""" 200 201 global Options, OptionsInfo 202 203 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 204 205 # Decode Options and OptionInfo... 206 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 207 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 208 209 def WorkerProcess(EncodedMolInfo): 210 """Process data for a worker process.""" 211 212 MolIndex, EncodedMol = EncodedMolInfo 213 214 CalcStatus = False 215 Energy = None 216 217 if EncodedMol is None: 218 return [MolIndex, None, CalcStatus, Energy] 219 220 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 221 if RDKitUtil.IsMolEmpty(Mol): 222 if not OptionsInfo["QuietMode"]: 223 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 224 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 225 return [MolIndex, None, CalcStatus, Energy] 226 227 CalcStatus, Energy = CalculateMoleculeEnergy(Mol, (MolIndex + 1)) 228 229 return [MolIndex, RDKitUtil.MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps), CalcStatus, Energy] 230 231 def CalculateMoleculeEnergy(Mol, MolNum = None): 232 "Calculate energy." 233 234 Status = True 235 Energy = None 236 ConfID = -1 237 238 if OptionsInfo["UseUFF"]: 239 UFFMoleculeForcefield = AllChem.UFFGetMoleculeForceField(Mol, confId = ConfID) 240 if UFFMoleculeForcefield is None: 241 Status = False 242 else: 243 Energy = UFFMoleculeForcefield.CalcEnergy() 244 elif OptionsInfo["UseMMFF"]: 245 MMFFMoleculeProperties = AllChem.MMFFGetMoleculeProperties(Mol, mmffVariant = OptionsInfo["MMFFVariant"]) 246 MMFFMoleculeForcefield = AllChem.MMFFGetMoleculeForceField(Mol, MMFFMoleculeProperties, confId = ConfID) 247 if MMFFMoleculeForcefield is None: 248 Status = False 249 else: 250 Energy = MMFFMoleculeForcefield.CalcEnergy() 251 else: 252 MiscUtil.PrintError("Couldn't calculate nergy: Specified forcefield, %s, is not supported" % OptionsInfo["ForceField"]) 253 254 return (Status, Energy) 255 256 def WriteMolecule(Writer, Mol, Energy): 257 """Write molecule.""" 258 259 Mol.SetProp(OptionsInfo["EnergyLabel"], Energy) 260 Writer.write(Mol) 261 262 def ProcessOptions(): 263 """Process and validate command line arguments and options.""" 264 265 MiscUtil.PrintInfo("Processing options...") 266 267 # Validate options... 268 ValidateOptions() 269 270 OptionsInfo["Infile"] = Options["--infile"] 271 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 272 273 OptionsInfo["Outfile"] = Options["--outfile"] 274 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"]) 275 276 OptionsInfo["Overwrite"] = Options["--overwrite"] 277 278 if re.match("^UFF$", Options["--forceField"], re.I): 279 ForceField = "UFF" 280 UseUFF = True 281 UseMMFF = False 282 elif re.match("^MMFF$", Options["--forceField"], re.I): 283 ForceField = "MMFF" 284 UseUFF = False 285 UseMMFF = True 286 else: 287 MiscUtil.PrintError("The value, %s, specified for \"--forceField\" is not supported." % (Options["--forceField"],)) 288 289 MMFFVariant = "MMFF94" if re.match("^MMFF94$", Options["--forceFieldMMFFVariant"], re.I) else "MMFF94s" 290 291 OptionsInfo["ForceField"] = ForceField 292 OptionsInfo["MMFFVariant"] = MMFFVariant 293 OptionsInfo["UseMMFF"] = UseMMFF 294 OptionsInfo["UseUFF"] = UseUFF 295 296 if UseMMFF: 297 OptionsInfo["EnergyLabel"] = "%s_Energy" % MMFFVariant 298 else: 299 OptionsInfo["EnergyLabel"] = "%s_Energy" % ForceField 300 301 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 302 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 303 304 OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False 305 306 def RetrieveOptions(): 307 """Retrieve command line arguments and options.""" 308 309 # Get options... 310 global Options 311 Options = docopt(_docoptUsage_) 312 313 # Set current working directory to the specified directory... 314 WorkingDir = Options["--workingdir"] 315 if WorkingDir: 316 os.chdir(WorkingDir) 317 318 # Handle examples option... 319 if "--examples" in Options and Options["--examples"]: 320 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 321 sys.exit(0) 322 323 def ValidateOptions(): 324 """Validate option values.""" 325 326 MiscUtil.ValidateOptionTextValue("-f, --forceField", Options["--forceField"], "UFF MMFF") 327 MiscUtil.ValidateOptionTextValue(" --forceFieldMMFFVariant", Options["--forceFieldMMFFVariant"], "MMFF94 MMFF94s") 328 329 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 330 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol") 331 332 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd") 333 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 334 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 335 336 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 337 MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no") 338 339 # Setup a usage string for docopt... 340 _docoptUsage_ = """ 341 RDKitCalculateEnergy.py - Calculate energy 342 343 Usage: 344 RDKitCalculateEnergy.py [--forceField <UFF, or MMFF>] [--forceFieldMMFFVariant <MMFF94 or MMFF94s>] 345 [--infileParams <Name,Value,...>] [--mp <yes or no>] [--mpParams <Name,Value,...>] 346 [ --outfileParams <Name,Value,...> ] [--overwrite] [--quiet <yes or no>] [-w <dir>] -i <infile> -o <outfile> 347 RDKitCalculateEnergy.py -h | --help | -e | --examples 348 349 Description: 350 Calculate single point energy for molecules using a specified forcefield. The 351 molecules must have 3D coordinates in input file. 352 353 The supported input file formats are: Mol (.mol), SD (.sdf, .sd) 354 355 The supported output file formats are: SD (.sdf, .sd) 356 357 Options: 358 -f, --forceField <UFF, MMFF> [default: MMFF] 359 Forcefield method to use for energy calculation. Possible values: Universal Force 360 Field (UFF) [ Ref 81 ] or Merck Molecular Mechanics Force Field [ Ref 83-87 ] . 361 --forceFieldMMFFVariant <MMFF94 or MMFF94s> [default: MMFF94] 362 Variant of MMFF forcefield to use for energy calculation. 363 -e, --examples 364 Print examples. 365 -h, --help 366 Print this help message. 367 -i, --infile <infile> 368 Input file name. 369 --infileParams <Name,Value,...> [default: auto] 370 A comma delimited list of parameter name and value pairs for reading 371 molecules from files. The supported parameter names for different file 372 formats, along with their default values, are shown below: 373 374 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 375 376 --mp <yes or no> [default: no] 377 Use multiprocessing. 378 379 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 380 function employing lazy RDKit data iterable. This allows processing of 381 arbitrary large data sets without any additional requirements memory. 382 383 All input data may be optionally loaded into memory by mp.Pool.map() 384 before starting worker processes in a process pool by setting the value 385 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 386 387 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 388 data mode may adversely impact the performance. The '--mpParams' section 389 provides additional information to tune the value of 'chunkSize'. 390 --mpParams <Name,Value,...> [default: auto] 391 A comma delimited list of parameter name and value pairs to configure 392 multiprocessing. 393 394 The supported parameter names along with their default and possible 395 values are shown below: 396 397 chunkSize, auto 398 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 399 numProcesses, auto [ Default: mp.cpu_count() ] 400 401 These parameters are used by the following functions to configure and 402 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 403 mp.Pool.imap(). 404 405 The chunkSize determines chunks of input data passed to each worker 406 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 407 The default value of chunkSize is dependent on the value of 'inputDataMode'. 408 409 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 410 automatically converts RDKit data iterable into a list, loads all data into 411 memory, and calculates the default chunkSize using the following method 412 as shown in its code: 413 414 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 415 if extra: chunkSize += 1 416 417 For example, the default chunkSize will be 7 for a pool of 4 worker processes 418 and 100 data items. 419 420 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 421 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 422 data into memory. Consequently, the size of input data is not known a priori. 423 It's not possible to estimate an optimal value for the chunkSize. The default 424 chunkSize is set to 1. 425 426 The default value for the chunkSize during 'Lazy' data mode may adversely 427 impact the performance due to the overhead associated with exchanging 428 small chunks of data. It is generally a good idea to explicitly set chunkSize to 429 a larger value during 'Lazy' input data mode, based on the size of your input 430 data and number of processes in the process pool. 431 432 The mp.Pool.map() function waits for all worker processes to process all 433 the data and return the results. The mp.Pool.imap() function, however, 434 returns the the results obtained from worker processes as soon as the 435 results become available for specified chunks of data. 436 437 The order of data in the results returned by both mp.Pool.map() and 438 mp.Pool.imap() functions always corresponds to the input data. 439 -o, --outfile <outfile> 440 Output file name. 441 --outfileParams <Name,Value,...> [default: auto] 442 A comma delimited list of parameter name and value pairs for writing 443 molecules to files. The supported parameter names for different file 444 formats, along with their default values, are shown below: 445 446 SD: kekulize,yes,forceV3000,no 447 448 --overwrite 449 Overwrite existing files. 450 -q, --quiet <yes or no> [default: no] 451 Use quiet mode. The warning and information messages will not be printed. 452 -w, --workingdir <dir> 453 Location of working directory which defaults to the current directory. 454 455 Examples: 456 To calculate single point energy using MMFF forcefield for molecules in a SD file 457 containing 3D structures and write a new SD file, type: 458 459 % RDKitCalculateEnergy.py -i Sample3D.sdf -o Sample3DOut.sdf 460 461 To run the first example in multiprocessing mode on all available CPUs 462 without loading all data into memory and write out a SD file, type: 463 464 % RDKitCalculateEnergy.py --mp yes -i Sample3D.sdf -o Sample3DOut.sdf 465 466 To run the first example in multiprocessing mode on all available CPUs 467 by loading all data into memory and write out a SD file, type: 468 469 % RDKitCalculateEnergy.py --mp yes --mpParams "inputDataMode, 470 InMemory" -i Sample3D.sdf -o Sample3DOut.sdf 471 472 To run the first example in multiprocessing mode on specific number of 473 CPUs and chunk size without loading all data into memory and write out a SD file, 474 type: 475 476 % RDKitCalculateEnergy.py --mp yes --mpParams "inputDataMode,Lazy, 477 numProcesses,4,chunkSize,8" -i Sample3D.sdf -o Sample3DOut.sdf 478 479 To calculate single point energy using UFF forcefield for molecules in a SD file 480 containing 3D structures and write a new SD file, type: 481 482 % RDKitCalculateEnergy.py -f UFF -i Sample3D.sdf -o Sample3DOut.sdf 483 484 To calculate single point energy using MMFF94s variant of MMFF forcefield for 485 molecules in a SD file containing 3D structures and write a new SD file, type: 486 487 % RDKitCalculateEnergy.py --forceField MMFF --forceFieldMMFFVariant 488 MMFF94s -i Sample3D.sdf -o Sample3DOut.sdf 489 490 Author: 491 Manish Sud(msud@san.rr.com) 492 493 See also: 494 RDKitCalculateRMSD.py, RDKitCalculateMolecularDescriptors.py, RDKitCompareMoleculeShapes.py, 495 RDKitConvertFileFormat.py, RDKitGenerateConformers.py, RDKitPerformMinimization.py 496 497 Copyright: 498 Copyright (C) 2024 Manish Sud. All rights reserved. 499 500 The functionality available in this script is implemented using RDKit, an 501 open source toolkit for cheminformatics developed by Greg Landrum. 502 503 This file is part of MayaChemTools. 504 505 MayaChemTools is free software; you can redistribute it and/or modify it under 506 the terms of the GNU Lesser General Public License as published by the Free 507 Software Foundation; either version 3 of the License, or (at your option) any 508 later version. 509 510 """ 511 512 if __name__ == "__main__": 513 main()