1 #!/bin/env python 2 # 3 # File: RDKitCalculateMolecularDescriptors.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2025 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem import AllChem 43 from rdkit.Chem import rdMolDescriptors 44 from rdkit.Chem import Descriptors 45 from rdkit.Chem import Descriptors3D 46 except ImportError as ErrMsg: 47 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 48 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 49 sys.exit(1) 50 51 # RDKit dependency imports... 52 import numpy 53 54 # MayaChemTools imports... 55 try: 56 from docopt import docopt 57 import MiscUtil 58 import RDKitUtil 59 except ImportError as ErrMsg: 60 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 61 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 62 sys.exit(1) 63 64 ScriptName = os.path.basename(sys.argv[0]) 65 Options = {} 66 OptionsInfo = {} 67 68 DescriptorNamesMap = {} 69 70 def main(): 71 """Start execution of the script.""" 72 73 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 74 75 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 76 77 # Retrieve command line arguments and options... 78 RetrieveOptions() 79 80 # Process and validate command line arguments and options... 81 ProcessOptions() 82 83 # Perform actions required by the script... 84 CalculateMolecularDescriptors() 85 86 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 87 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 88 89 def CalculateMolecularDescriptors(): 90 """Calculate molecular descriptors.""" 91 92 ProcessMolecularDescriptorsInfo() 93 PerformCalculations() 94 95 def ProcessMolecularDescriptorsInfo(): 96 """Process descriptors information.""" 97 98 RetrieveMolecularDescriptorsInfo() 99 ProcessSpecifiedDescriptorNames() 100 101 def PerformCalculations(): 102 """Calculate descriptors for a specified list of descriptors.""" 103 104 # Setup a molecule reader... 105 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"]) 106 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"]) 107 108 # Setup a writer... 109 Writer = SetupMoleculeWriter() 110 111 # Process molecules... 112 MolCount, ValidMolCount = ProcessMolecules(Mols, Writer) 113 114 if Writer is not None: 115 Writer.close() 116 117 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 118 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 119 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount)) 120 121 def ProcessMolecules(Mols, Writer): 122 """Process and filter molecules.""" 123 124 if OptionsInfo["MPMode"]: 125 return ProcessMoleculesUsingMultipleProcesses(Mols, Writer) 126 else: 127 return ProcessMoleculesUsingSingleProcess(Mols, Writer) 128 129 def ProcessMoleculesUsingSingleProcess(Mols, Writer): 130 """Process molecules and calculate descriptors using a single process.""" 131 132 DescriptorsCount = len(OptionsInfo["SpecifiedDescriptorNames"]) 133 MiscUtil.PrintInfo("\nCalculating %d molecular %s for each molecule..." % (DescriptorsCount, ("descroptors" if DescriptorsCount > 1 else "descriptor"))) 134 135 (MolCount, ValidMolCount) = [0] * 2 136 for MolIndex, Mol in enumerate(Mols): 137 MolCount += 1 138 if Mol is None: 139 continue 140 141 if RDKitUtil.IsMolEmpty(Mol): 142 MolName = RDKitUtil.GetMolName(Mol, MolCount) 143 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 144 continue 145 ValidMolCount += 1 146 147 # Calculate and write descriptor values... 148 CalculatedValues = CalculateDescriptorValues(MolIndex, Mol) 149 WriteDescriptorValues(Mol, MolCount, Writer, CalculatedValues) 150 151 return (MolCount, ValidMolCount) 152 153 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer): 154 """Process molecules and calculate descriptors using multiprocessing.""" 155 156 DescriptorsCount = len(OptionsInfo["SpecifiedDescriptorNames"]) 157 MiscUtil.PrintInfo("\nCalculating %d molecular %s for each molecule using multiprocessing......" % (DescriptorsCount, ("descroptors" if DescriptorsCount > 1 else "descriptor"))) 158 159 MPParams = OptionsInfo["MPParams"] 160 161 # Setup data for initializing a worker process... 162 MiscUtil.PrintInfo("Encoding options info...") 163 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo)) 164 165 # Setup a encoded mols data iterable for a worker process... 166 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols) 167 168 # Setup process pool along with data initialization for each process... 169 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 170 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 171 172 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs) 173 174 # Start processing... 175 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 176 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 177 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 178 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 179 else: 180 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 181 182 (MolCount, ValidMolCount) = [0] * 2 183 for Result in Results: 184 MolCount += 1 185 MolIndex, EncodedMol, CalculatedValues = Result 186 187 if EncodedMol is None: 188 continue 189 ValidMolCount += 1 190 191 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 192 193 # Write descriptor values... 194 WriteDescriptorValues(Mol, MolCount, Writer, CalculatedValues) 195 196 return (MolCount, ValidMolCount) 197 198 def InitializeWorkerProcess(*EncodedArgs): 199 """Initialize data for a worker process.""" 200 201 global Options, OptionsInfo 202 203 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 204 205 # Decode Options and OptionInfo... 206 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 207 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 208 209 RetrieveMolecularDescriptorsInfo(PrintInfo = False) 210 211 def WorkerProcess(EncodedMolInfo): 212 """Process data for a worker process.""" 213 214 MolIndex, EncodedMol = EncodedMolInfo 215 216 CalculatedValues = [] 217 if EncodedMol is None: 218 return [MolIndex, None, CalculatedValues] 219 220 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 221 if RDKitUtil.IsMolEmpty(Mol): 222 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 223 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 224 return [MolIndex, None, CalculatedValues] 225 226 return [MolIndex, EncodedMol, CalculateDescriptorValues(MolIndex, Mol)] 227 228 def CalculateDescriptorValues(MolIndex, Mol): 229 """Calculate descriptor values.""" 230 231 return [FormatCalculatedValue(CalculateDescriptorValue(MolIndex, Mol, Name), OptionsInfo["Precision"]) for Name in OptionsInfo["SpecifiedDescriptorNames"]] 232 233 def CalculateDescriptorValue(MolIndex, Mol, Name): 234 """Calculate value for a specific descriptor along with handling any calculation failure.""" 235 236 try: 237 Value = DescriptorNamesMap["ComputeFunction"][Name](Mol) 238 except ValueError as ErrMsg: 239 MiscUtil.PrintWarning("Failed to calculate descriptor %s for molecule %s:\n%s\n" % (Name, (MolIndex + 1), ErrMsg)) 240 Value = "NA" 241 242 return Value 243 244 def WriteDescriptorValues(Mol, MolNum, Writer, CalculatedValues): 245 """Write out calculated descriptor values.""" 246 247 if OptionsInfo["TextOutFileMode"]: 248 LineWords = [] 249 if OptionsInfo["SMILESOut"]: 250 SMILES = Chem.MolToSmiles(Mol, isomericSmiles = True, canonical = True) 251 LineWords.append(SMILES) 252 253 MolName = RDKitUtil.GetMolName(Mol, MolNum) 254 LineWords.append(MolName) 255 LineWords.extend(CalculatedValues) 256 Line = OptionsInfo["TextOutFileDelim"].join(LineWords) 257 Writer.write("%s\n" % Line) 258 else: 259 for Index, Name in enumerate(OptionsInfo["SpecifiedDescriptorNames"]): 260 Mol.SetProp(Name, CalculatedValues[Index]) 261 262 if OptionsInfo["OutfileParams"]["Compute2DCoords"]: 263 AllChem.Compute2DCoords(Mol) 264 265 Writer.write(Mol) 266 267 def SetupMoleculeWriter(): 268 """Setup a molecule writer. """ 269 270 if OptionsInfo["TextOutFileMode"]: 271 Writer = open(OptionsInfo["Outfile"], "w") 272 else: 273 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 274 275 if Writer is None: 276 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 277 278 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"]) 279 280 # Wite out headers for a text file... 281 if OptionsInfo["TextOutFileMode"]: 282 LineWords = [] 283 if OptionsInfo["SMILESOut"]: 284 LineWords.append("SMILES") 285 LineWords.append("MolID") 286 LineWords.extend(OptionsInfo["SpecifiedDescriptorNames"]) 287 Line = OptionsInfo["TextOutFileDelim"].join(LineWords) 288 Writer.write("%s\n" % Line) 289 290 return Writer 291 292 def FormatCalculatedValue(Value, Precision): 293 """Format calculated value of descriptor based on its type.""" 294 295 if (type(Value) is float) or (type(Value) is numpy.float64): 296 FormattedValue = "%.*f" % (Precision, Value) 297 if not re.search("[1-9]", FormattedValue): 298 FormattedValue = "0.0" 299 elif type(Value) is list: 300 FormattedValue = "%s" % Value 301 FormattedValue = re.sub('\[|\]|,', '', FormattedValue) 302 else: 303 FormattedValue = "%s" % Value 304 305 return FormattedValue 306 307 def ProcessSpecifiedDescriptorNames(): 308 """Process and validate specified decriptor names.""" 309 310 OptionsInfo["SpecifiedDescriptorNames"] = [] 311 312 if not re.match("^(2D|3D|All|FragmentCountOnly|Specify)$", OptionsInfo["Mode"], re.I): 313 MiscUtil.PrintError("Mode value, %s, using \"-m, --mode\" option is not a valid value." % OptionsInfo["Mode"]) 314 315 if re.match("^2D$", OptionsInfo["Mode"], re.I): 316 OptionsInfo["SpecifiedDescriptorNames"] = DescriptorNamesMap["2D"]["Names"] 317 if OptionsInfo["FragmentCount"]: 318 OptionsInfo["SpecifiedDescriptorNames"].extend(DescriptorNamesMap["FragmentCount"]["Names"]) 319 return 320 elif re.match("^3D$", OptionsInfo["Mode"], re.I): 321 OptionsInfo["SpecifiedDescriptorNames"] = DescriptorNamesMap["3D"]["Names"] 322 return 323 elif re.match("^All$", OptionsInfo["Mode"], re.I): 324 OptionsInfo["SpecifiedDescriptorNames"] = DescriptorNamesMap["2D"]["Names"] 325 if OptionsInfo["FragmentCount"]: 326 OptionsInfo["SpecifiedDescriptorNames"].extend(DescriptorNamesMap["FragmentCount"]["Names"]) 327 OptionsInfo["SpecifiedDescriptorNames"].extend(DescriptorNamesMap["3D"]["Names"]) 328 return 329 elif re.match("^FragmentCountOnly$", OptionsInfo["Mode"], re.I): 330 OptionsInfo["SpecifiedDescriptorNames"] = DescriptorNamesMap["FragmentCount"]["Names"] 331 return 332 333 # Set up a canonical descriptor names map for checking specified names... 334 CanonicalNameMap = {} 335 for Name in DescriptorNamesMap["ComputeFunction"]: 336 CanonicalNameMap[Name.lower()] = Name 337 338 # Parse and validate specified names... 339 DescriptorNames = re.sub(" ", "", OptionsInfo["DescriptorNames"]) 340 if not DescriptorNames: 341 MiscUtil.PrintError("No descriptor names specified for \"-d, --descriptorNames\" option") 342 343 SMILESInfile = MiscUtil.CheckFileExt(Options["--infile"], "smi") 344 Canonical3DNameMap = {} 345 if SMILESInfile: 346 for Name in DescriptorNamesMap["3D"]["Names"]: 347 Canonical3DNameMap[Name.lower()] = Name 348 349 SpecifiedDescriptorNames = [] 350 for Name in DescriptorNames.split(","): 351 CanonicalName = Name.lower() 352 if CanonicalName in CanonicalNameMap: 353 SpecifiedDescriptorNames.append(CanonicalNameMap[CanonicalName]) 354 else: 355 MiscUtil.PrintError("The descriptor name, %s, specified using \"-d, --descriptorNames\" option is not a valid name." % (Name)) 356 if SMILESInfile: 357 if CanonicalName in Canonical3DNameMap: 358 MiscUtil.PrintError("The 3D descriptor name, %s, specified using \"-d, --descriptorNames\" option is not a valid for SMILES input file." % (Name)) 359 360 if not len(SpecifiedDescriptorNames): 361 MiscUtil.PrintError("No valid descriptor name specified for \"-d, --descriptorNames\" option") 362 363 OptionsInfo["SpecifiedDescriptorNames"] = SpecifiedDescriptorNames 364 365 def RetrieveMolecularDescriptorsInfo(PrintInfo = True): 366 """Retrieve descriptors information.""" 367 368 if PrintInfo: 369 MiscUtil.PrintInfo("\nRetrieving information for avalible molecular descriptors...") 370 371 # Initialze data for 2D, FragmentCount and 3D descriptors... 372 DescriptorNamesMap["Types"] = ["2D", "FragmentCount", "3D"] 373 DescriptorNamesMap["ComputeFunction"] = {} 374 375 Autocorr2DExclude = OptionsInfo["Autocorr2DExclude"] if "Autocorr2DExclude" in OptionsInfo else False 376 377 for Type in DescriptorNamesMap["Types"]: 378 DescriptorNamesMap[Type] = {} 379 DescriptorNamesMap[Type]["Names"] = [] 380 381 # Setup data for 2D and FragmentCount... 382 DescriptorsInfo = Descriptors.descList 383 for DescriptorInfo in DescriptorsInfo: 384 Name = DescriptorInfo[0] 385 ComputeFunction = DescriptorInfo[1] 386 387 Type = "2D" 388 if re.match("^fr_", Name, re.I): 389 Type = "FragmentCount" 390 elif re.match("^Autocorr2D$", Name, re.I): 391 if Autocorr2DExclude: 392 continue 393 394 if Name in DescriptorNamesMap["ComputeFunction"]: 395 if PrintInfo: 396 MiscUtil.PrintWarning("Ignoring duplicate descriptor name: %s..." % Name) 397 else: 398 DescriptorNamesMap[Type]["Names"].append(Name) 399 DescriptorNamesMap["ComputeFunction"][Name] = ComputeFunction 400 401 # Add new 2D decriptor name to the list... 402 Type = "2D" 403 if not Autocorr2DExclude: 404 try: 405 Name = "Autocorr2D" 406 ComputeFunction = rdMolDescriptors.CalcAUTOCORR2D 407 if not Name in DescriptorNamesMap["ComputeFunction"]: 408 DescriptorNamesMap[Type]["Names"].append(Name) 409 DescriptorNamesMap["ComputeFunction"][Name] = ComputeFunction 410 except (AttributeError): 411 if PrintInfo: 412 MiscUtil.PrintInfo("2D descriptor, %s, is not available in your current version of RDKit." % Name) 413 414 # Set data for 3D descriptors... 415 Type = "3D" 416 NameToComputeFunctionMap = {'PMI1' : Descriptors3D.PMI1, 'PMI2' : Descriptors3D.PMI2, 'PMI3' : Descriptors3D.PMI3, 'NPR1' : Descriptors3D.NPR1, 'NPR2' : Descriptors3D.NPR2, 'RadiusOfGyration' : Descriptors3D.RadiusOfGyration, 'InertialShapeFactor' : Descriptors3D.InertialShapeFactor, 'Eccentricity' : Descriptors3D.Eccentricity, 'Asphericity' : Descriptors3D.Asphericity, 'SpherocityIndex' : Descriptors3D.SpherocityIndex} 417 418 for Name in NameToComputeFunctionMap: 419 ComputeFunction = NameToComputeFunctionMap[Name] 420 if Name in DescriptorNamesMap["ComputeFunction"]: 421 if PrintInfo: 422 MiscUtil.PrintWarning("Ignoring duplicate descriptor name: %s..." % Name) 423 else: 424 DescriptorNamesMap[Type]["Names"].append(Name) 425 DescriptorNamesMap["ComputeFunction"][Name] = ComputeFunction 426 427 # Check and add new 3D descriptors not directly available through Descriptors3D module... 428 Type = "3D" 429 AvailableName3DMap = {} 430 for Name in ['Autocorr3D', 'RDF', 'MORSE', 'WHIM', 'GETAWAY']: 431 ComputeFunction = None 432 try: 433 if re.match("^Autocorr3D$", Name, re.I): 434 ComputeFunction = rdMolDescriptors.CalcAUTOCORR3D 435 elif re.match("^RDF$", Name, re.I): 436 ComputeFunction = rdMolDescriptors.CalcRDF 437 elif re.match("^MORSE$", Name, re.I): 438 ComputeFunction = rdMolDescriptors.CalcMORSE 439 elif re.match("^WHIM$", Name, re.I): 440 ComputeFunction = rdMolDescriptors.CalcWHIM 441 elif re.match("^GETAWAY$", Name, re.I): 442 ComputeFunction = rdMolDescriptors.CalcGETAWAY 443 else: 444 ComputeFunction = None 445 except (AttributeError): 446 if PrintInfo: 447 MiscUtil.PrintWarning("3D descriptor, %s, is not available in your current version of RDKit" % Name) 448 449 if ComputeFunction is not None: 450 AvailableName3DMap[Name] = ComputeFunction 451 452 for Name in AvailableName3DMap: 453 ComputeFunction = AvailableName3DMap[Name] 454 if not Name in DescriptorNamesMap["ComputeFunction"]: 455 DescriptorNamesMap[Type]["Names"].append(Name) 456 DescriptorNamesMap["ComputeFunction"][Name] = ComputeFunction 457 458 Count = 0 459 TypesCount = [] 460 for Type in DescriptorNamesMap["Types"]: 461 TypeCount = len(DescriptorNamesMap[Type]["Names"]) 462 TypesCount.append(TypeCount) 463 Count += TypeCount 464 465 if not Count: 466 if PrintInfo: 467 MiscUtil.PrintError("Failed to retrieve any molecular descriptors...") 468 469 # Sort descriptor names... 470 for Type in DescriptorNamesMap["Types"]: 471 DescriptorNamesMap[Type]["Names"] = sorted(DescriptorNamesMap[Type]["Names"]) 472 473 if PrintInfo: 474 MiscUtil.PrintInfo("\nTotal number of availble molecular descriptors: %d" % Count) 475 for Index in range(0, len(DescriptorNamesMap["Types"])): 476 Type = DescriptorNamesMap["Types"][Index] 477 TypeCount = TypesCount[Index] 478 if PrintInfo: 479 MiscUtil.PrintInfo("Number of %s molecular descriptors: %d" % (Type, TypeCount)) 480 481 def ListMolecularDescriptorsInfo(): 482 """List descriptors information.""" 483 484 MiscUtil.PrintInfo("\nListing information for avalible molecular descriptors...") 485 486 Delimiter = ", " 487 for Type in DescriptorNamesMap["Types"]: 488 Names = DescriptorNamesMap[Type]["Names"] 489 MiscUtil.PrintInfo("\n%s descriptors: %s" % (Type, Delimiter.join(Names))) 490 491 MiscUtil.PrintInfo("") 492 493 def ProcessOptions(): 494 """Process and validate command line arguments and options.""" 495 496 MiscUtil.PrintInfo("Processing options...") 497 498 # Validate options... 499 ValidateOptions() 500 501 OptionsInfo["Autocorr2DExclude"] = True 502 if not re.match("^Yes$", Options["--autocorr2DExclude"], re.I): 503 OptionsInfo["Autocorr2DExclude"] = False 504 505 OptionsInfo["FragmentCount"] = True 506 if not re.match("^Yes$", Options["--fragmentCount"], re.I): 507 OptionsInfo["FragmentCount"] = False 508 509 OptionsInfo["DescriptorNames"] = Options["--descriptorNames"] 510 OptionsInfo["Mode"] = Options["--mode"] 511 512 OptionsInfo["Infile"] = Options["--infile"] 513 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 514 515 OptionsInfo["Outfile"] = Options["--outfile"] 516 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 517 518 OptionsInfo["Overwrite"] = Options["--overwrite"] 519 520 TextOutFileMode = False 521 TextOutFileDelim = "" 522 if MiscUtil.CheckFileExt(Options["--outfile"], "csv"): 523 TextOutFileMode = True 524 TextOutFileDelim = "," 525 elif MiscUtil.CheckFileExt(Options["--outfile"], "tsv txt"): 526 TextOutFileMode = True 527 TextOutFileDelim = "\t" 528 OptionsInfo["TextOutFileMode"] = TextOutFileMode 529 OptionsInfo["TextOutFileDelim"] = TextOutFileDelim 530 531 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 532 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 533 534 OptionsInfo["Precision"] = int(Options["--precision"]) 535 536 OptionsInfo["SMILESOut"] = False 537 if re.match("^Yes$", Options["--smilesOut"], re.I): 538 OptionsInfo["SMILESOut"] = True 539 540 def RetrieveOptions(): 541 """Retrieve command line arguments and options.""" 542 543 # Get options... 544 global Options 545 Options = docopt(_docoptUsage_) 546 547 # Set current working directory to the specified directory... 548 WorkingDir = Options["--workingdir"] 549 if WorkingDir: 550 os.chdir(WorkingDir) 551 552 # Handle examples option... 553 if "--examples" in Options and Options["--examples"]: 554 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 555 sys.exit(0) 556 557 # Handle listing of descriptor information... 558 if Options["--list"]: 559 ProcessListMolecularDescriptorsOption() 560 sys.exit(0) 561 562 def ProcessListMolecularDescriptorsOption(): 563 """Process list descriptors information.""" 564 565 RetrieveMolecularDescriptorsInfo() 566 ListMolecularDescriptorsInfo() 567 568 def ValidateOptions(): 569 """Validate option values.""" 570 571 MiscUtil.ValidateOptionTextValue("-a, --autocorr2DExclude", Options["--autocorr2DExclude"], "yes no") 572 MiscUtil.ValidateOptionTextValue("-f, --fragmentCount", Options["--fragmentCount"], "yes no") 573 574 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "2D 3D All FragmentCountOnly Specify") 575 576 if re.match("^Specify$", Options["--mode"], re.I): 577 if re.match("^none$", Options["--descriptorNames"], re.I): 578 MiscUtil.PrintError("The name(s) of molecular descriptors must be specified using \"-d, --descriptorNames\" option during \"Specify\" value of \"-m, --mode\" option.") 579 580 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 581 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi csv tsv txt") 582 583 if re.match("^3D|All$", Options["--mode"], re.I): 584 if MiscUtil.CheckFileExt(Options["--infile"], "smi"): 585 MiscUtil.PrintError("The input SMILES file, %s, is not valid for \"3D or All\" value of \"-m, --mode\" option.") 586 587 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd csv tsv txt") 588 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 589 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 590 591 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 592 593 MiscUtil.ValidateOptionIntegerValue("-p, --precision", Options["--precision"], {">": 0}) 594 MiscUtil.ValidateOptionTextValue("-s, --smilesOut", Options["--smilesOut"], "yes no") 595 596 # Setup a usage string for docopt... 597 _docoptUsage_ = """ 598 RDKitCalculateMolecularDescriptors.py - Calculate 2D/3D molecular descriptors 599 600 Usage: 601 RDKitCalculateMolecularDescriptors.py [--autocorr2DExclude <yes or no>] [--fragmentCount <yes or no>] 602 [--descriptorNames <Name1,Name2,...>] [--infileParams <Name,Value,...>] 603 [--mode <2D, 3D, All...>] [--mp <yes or no>] [--mpParams <Name,Value,...>] 604 [--outfileParams <Name,Value,...>] [--overwrite] [--precision <number>] 605 [--smilesOut <yes or no>] [-w <dir>] -i <infile> -o <outfile> 606 RDKitCalculateMolecularDescriptors.py -l | --list 607 RDKitCalculateMolecularDescriptors.py -h | --help | -e | --examples 608 609 Description: 610 Calculate 2D/3D molecular descriptors for molecules and write them out to a SD or 611 CSV/TSV text file. 612 613 The complete list of currently available molecular descriptors may be obtained by 614 using '-l, --list' option. The names of valid 2D, fragment count, and 3D molecular 615 descriptors are shown below: 616 617 2D descriptors: Autocorr2D, BalabanJ, BertzCT, Chi0, Chi1, Chi0n - Chi4n, Chi0v - Chi4v, 618 EState_VSA1 - EState_VSA11, ExactMolWt, FpDensityMorgan1, FpDensityMorgan2, FpDensityMorgan3, 619 FractionCSP3, HallKierAlpha, HeavyAtomCount, HeavyAtomMolWt, Ipc, Kappa1 - Kappa3, 620 LabuteASA, MaxAbsEStateIndex, MaxAbsPartialCharge, MaxEStateIndex, MaxPartialCharge, 621 MinAbsEStateIndex, MinAbsPartialCharge, MinEStateIndex, MinPartialCharge, MolLogP, 622 MolMR, MolWt, NHOHCount, NOCount, NumAliphaticCarbocycles, NumAliphaticHeterocycles, 623 NumAliphaticRings, NumAromaticCarbocycles, NumAromaticHeterocycles, NumAromaticRings, 624 NumHAcceptors, NumHDonors, NumHeteroatoms, NumRadicalElectrons, NumRotatableBonds, 625 NumSaturatedCarbocycles, NumSaturatedHeterocycles, NumSaturatedRings, NumValenceElectrons, 626 PEOE_VSA1 - PEOE_VSA14, RingCount, SMR_VSA1 - SMR_VSA10, SlogP_VSA1 - SlogP_VSA12, 627 TPSA, VSA_EState1 - VSA_EState10, qed 628 629 FragmentCount 2D descriptors: fr_Al_COO, fr_Al_OH, fr_Al_OH_noTert, fr_ArN, fr_Ar_COO, 630 fr_Ar_N, fr_Ar_NH, fr_Ar_OH, fr_COO, fr_COO2, fr_C_O, fr_C_O_noCOO, fr_C_S, fr_HOCCN, 631 fr_Imine, fr_NH0, fr_NH1, fr_NH2, fr_N_O, fr_Ndealkylation1, fr_Ndealkylation2, fr_Nhpyrrole, 632 fr_SH, fr_aldehyde, fr_alkyl_carbamate, fr_alkyl_halide, fr_allylic_oxid, fr_amide, fr_amidine, 633 fr_aniline, fr_aryl_methyl, fr_azide, fr_azo, fr_barbitur, fr_benzene, fr_benzodiazepine, 634 fr_bicyclic, fr_diazo, fr_dihydropyridine, fr_epoxide, fr_ester, fr_ether, fr_furan, fr_guanido, 635 fr_halogen, fr_hdrzine, fr_hdrzone, fr_imidazole, fr_imide, fr_isocyan, fr_isothiocyan, fr_ketone, 636 fr_ketone_Topliss, fr_lactam, fr_lactone, fr_methoxy, fr_morpholine, fr_nitrile, fr_nitro, 637 fr_nitro_arom, fr_nitro_arom_nonortho, fr_nitroso, fr_oxazole, fr_oxime, fr_para_hydroxylation, 638 fr_phenol, fr_phenol_noOrthoHbond, fr_phos_acid, fr_phos_ester, fr_piperdine, fr_piperzine, 639 fr_priamide, fr_prisulfonamd, fr_pyridine, fr_quatN, fr_sulfide, fr_sulfonamd, fr_sulfone, 640 fr_term_acetylene, fr_tetrazole, fr_thiazole, fr_thiocyan, fr_thiophene, fr_unbrch_alkane, fr_urea 641 642 3D descriptors: Asphericity, Autocorr3D, Eccentricity, GETAWAY, InertialShapeFactor, MORSE, 643 NPR1, NPR2, PMI1, PMI2, PMI3, RDF, RadiusOfGyration, SpherocityIndex, WHIM 644 645 The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi, 646 .txt, .csv, .tsv) 647 648 The supported output file formats are: SD File (.sdf, .sd), CSV/TSV (.csv, .tsv, .txt) 649 650 Options: 651 -a, --autocorr2DExclude <yes or no> [default: yes] 652 Exclude Autocorr2D descriptor from the calculation of 2D descriptors. 653 -f, --fragmentCount <yes or no> [default: yes] 654 Include 2D fragment count descriptors during the calculation. These descriptors are 655 counted using SMARTS patterns specified in FragmentDescriptors.csv file distributed 656 with RDKit. This option is only used during '2D' or 'All' value of '-m, --mode' option. 657 -d, --descriptorNames <Name1,Name2,...> [default: none] 658 A comma delimited list of supported molecular descriptor names to calculate. 659 This option is only used during 'Specify' value of '-m, --mode' option. 660 -e, --examples 661 Print examples. 662 -h, --help 663 Print this help message. 664 -i, --infile <infile> 665 Input file name. 666 --infileParams <Name,Value,...> [default: auto] 667 A comma delimited list of parameter name and value pairs for reading 668 molecules from files. The supported parameter names for different file 669 formats, along with their default values, are shown below: 670 671 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 672 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 673 smilesTitleLine,auto,sanitize,yes 674 675 Possible values for smilesDelimiter: space, comma or tab. 676 -l, --list 677 List molecular descriptors without performing any calculations. 678 -m, --mode <2D, 3D, All, FragmentCountOnly, or Specify> [default: 2D] 679 Type of molecular descriptors to calculate. Possible values: 2D, 3D, 680 All or Specify. The name of molecular descriptors must be specified using 681 '-d, --descriptorNames' for 'Specify'. 2D descriptors also include 1D descriptors. 682 The structure of molecules must contain 3D coordinates for the calculation 683 of 3D descriptors. 684 --mp <yes or no> [default: no] 685 Use multiprocessing. 686 687 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 688 function employing lazy RDKit data iterable. This allows processing of 689 arbitrary large data sets without any additional requirements memory. 690 691 All input data may be optionally loaded into memory by mp.Pool.map() 692 before starting worker processes in a process pool by setting the value 693 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 694 695 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 696 data mode may adversely impact the performance. The '--mpParams' section 697 provides additional information to tune the value of 'chunkSize'. 698 --mpParams <Name,Value,...> [default: auto] 699 A comma delimited list of parameter name and value pairs to configure 700 multiprocessing. 701 702 The supported parameter names along with their default and possible 703 values are shown below: 704 705 chunkSize, auto 706 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 707 numProcesses, auto [ Default: mp.cpu_count() ] 708 709 These parameters are used by the following functions to configure and 710 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 711 mp.Pool.imap(). 712 713 The chunkSize determines chunks of input data passed to each worker 714 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 715 The default value of chunkSize is dependent on the value of 'inputDataMode'. 716 717 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 718 automatically converts RDKit data iterable into a list, loads all data into 719 memory, and calculates the default chunkSize using the following method 720 as shown in its code: 721 722 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 723 if extra: chunkSize += 1 724 725 For example, the default chunkSize will be 7 for a pool of 4 worker processes 726 and 100 data items. 727 728 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 729 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 730 data into memory. Consequently, the size of input data is not known a priori. 731 It's not possible to estimate an optimal value for the chunkSize. The default 732 chunkSize is set to 1. 733 734 The default value for the chunkSize during 'Lazy' data mode may adversely 735 impact the performance due to the overhead associated with exchanging 736 small chunks of data. It is generally a good idea to explicitly set chunkSize to 737 a larger value during 'Lazy' input data mode, based on the size of your input 738 data and number of processes in the process pool. 739 740 The mp.Pool.map() function waits for all worker processes to process all 741 the data and return the results. The mp.Pool.imap() function, however, 742 returns the the results obtained from worker processes as soon as the 743 results become available for specified chunks of data. 744 745 The order of data in the results returned by both mp.Pool.map() and 746 mp.Pool.imap() functions always corresponds to the input data. 747 -o, --outfile <outfile> 748 Output file name. 749 --outfileParams <Name,Value,...> [default: auto] 750 A comma delimited list of parameter name and value pairs for writing 751 molecules to files. The supported parameter names for different file 752 formats, along with their default values, are shown below: 753 754 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 755 756 Default value for compute2DCoords: yes for SMILES input file; no for all other 757 file types. 758 -p, --precision <number> [default: 3] 759 Floating point precision for writing the calculated descriptor values. 760 -s, --smilesOut <yes or no> [default: no] 761 Write out SMILES string to CSV/TSV text output file. 762 --overwrite 763 Overwrite existing files. 764 -w, --workingdir <dir> 765 Location of working directory which defaults to the current directory. 766 767 Examples: 768 To compute all available 2D descriptors except Autocorr2D descriptor and 769 write out a CSV file, type: 770 771 % RDKitCalculateMolecularDescriptors.py -i Sample.smi -o SampleOut.csv 772 773 To compute all available 2D descriptors except Autocorr2D descriptor in 774 multiprocessing mode on all available CPUs without loading all data into 775 memory, and write out a CSV file, type: 776 777 % RDKitCalculateMolecularDescriptors.py --mp yes -i Sample.smi 778 -o SampleOut.csv 779 780 To compute all available 2D descriptors except Autocorr2D descriptor in 781 multiprocessing mode on all available CPUs by loading all data into memory, 782 and write out a CSV file, type: 783 784 % RDKitCalculateMolecularDescriptors.py --mp yes --mpParams 785 "inputDataMode,InMemory" -i Sample.smi -o SampleOut.csv 786 787 To compute all available 2D descriptors except Autocorr2D descriptor in 788 multiprocessing mode on specific number of CPUs and chunk size without 789 loading all data into memory, and write out a SDF file, type: 790 791 % RDKitCalculateMolecularDescriptors.py --mp yes --mpParams 792 "inputDataMode,Lazy,numProcesses,4,chunkSize,8" -i Sample.smi 793 -o SampleOut.sdf 794 795 To compute all available 2D descriptors including Autocorr2D descriptor and 796 excluding fragment count descriptors, and write out a TSV file, type: 797 798 % RDKitCalculateMolecularDescriptors.py -m 2D -a no -f no 799 -i Sample.smi -o SampleOut.tsv 800 801 To compute all available 3D descriptors and write out a SD file, type: 802 803 % RDKitCalculateMolecularDescriptors.py -m 3D -i Sample3D.sdf 804 -o Sample3DOut.sdf 805 806 To compute only fragment count 2D descriptors and write out a SD 807 file file, type: 808 809 % RDKitCalculateMolecularDescriptors.py -m FragmentCountOnly 810 -i Sample.sdf -o SampleOut.sdf 811 812 To compute all available 2D and 3D descriptors including fragment count and 813 Autocorr2D and write out a CSV file, type: 814 815 % RDKitCalculateMolecularDescriptors.py -m All -a no -i Sample.sdf 816 -o SampleOut.csv 817 818 To compute a specific set of 2D and 3D descriptors and write out a 819 write out a TSV file, type: 820 821 % RDKitCalculateMolecularDescriptors.py -m specify 822 -d 'MolWt,MolLogP,NHOHCount, NOCount,RadiusOfGyration' 823 -i Sample3D.sdf -o SampleOut.csv 824 825 To compute all available 2D descriptors except Autocorr2D descriptor for 826 molecules in a CSV SMILES file, SMILES strings in column 1, name in 827 column 2, and write out a SD file without calculation of 2D coordinates, type: 828 829 % RDKitCalculateMolecularDescriptors.py --infileParams 830 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 831 smilesNameColumn,2" --outfileParams "compute2DCoords,no" 832 -i SampleSMILES.csv -o SampleOut.sdf 833 834 Author: 835 Manish Sud(msud@san.rr.com) 836 837 See also: 838 RDKitCalculateRMSD.py, RDKitCompareMoleculeShapes.py, RDKitConvertFileFormat.py, 839 RDKitGenerateConformers.py, RDKitPerformMinimization.py 840 841 Copyright: 842 Copyright (C) 2025 Manish Sud. All rights reserved. 843 844 The functionality available in this script is implemented using RDKit, an 845 open source toolkit for cheminformatics developed by Greg Landrum. 846 847 This file is part of MayaChemTools. 848 849 MayaChemTools is free software; you can redistribute it and/or modify it under 850 the terms of the GNU Lesser General Public License as published by the Free 851 Software Foundation; either version 3 of the License, or (at your option) any 852 later version. 853 854 """ 855 856 if __name__ == "__main__": 857 main()