MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitCalculateMolecularDescriptors.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem import AllChem
  43     from rdkit.Chem import rdMolDescriptors
  44     from rdkit.Chem import Descriptors
  45     from rdkit.Chem import Descriptors3D
  46 except ImportError as ErrMsg:
  47     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  48     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  49     sys.exit(1)
  50 
  51 # RDKit dependency imports...
  52 import numpy
  53 
  54 # MayaChemTools imports...
  55 try:
  56     from docopt import docopt
  57     import MiscUtil
  58     import RDKitUtil
  59 except ImportError as ErrMsg:
  60     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  61     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  62     sys.exit(1)
  63 
  64 ScriptName = os.path.basename(sys.argv[0])
  65 Options = {}
  66 OptionsInfo = {}
  67 
  68 DescriptorNamesMap = {}
  69 
  70 def main():
  71     """Start execution of the script"""
  72     
  73     MiscUtil.PrintInfo("\n%s (RDK v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, time.asctime()))
  74     
  75     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  76     
  77     # Retrieve command line arguments and options...
  78     RetrieveOptions()
  79     
  80     # Process and validate command line arguments and options...
  81     ProcessOptions()
  82     
  83     # Perform actions required by the script...
  84     CalculateMolecularDescriptors()
  85     
  86     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  87     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  88 
  89 def CalculateMolecularDescriptors():
  90     """Calculate molecular descriptors."""
  91 
  92     ProcessMolecularDescriptorsInfo()
  93     PerformCalculations()
  94 
  95 def ProcessMolecularDescriptorsInfo():
  96     """Process descriptors information."""
  97 
  98     RetrieveMolecularDescriptorsInfo()
  99     ProcessSpecifiedDescriptorNames()
 100 
 101 def PerformCalculations():
 102     """Calculate descriptors for a specified list of descriptors."""
 103 
 104     # Setup a molecule reader...
 105     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
 106     Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
 107     
 108     # Setup a writer...
 109     Writer = SetupMoleculeWriter()
 110     
 111     # Process molecules...
 112     MolCount, ValidMolCount = ProcessMolecules(Mols, Writer)
 113 
 114     if Writer is not None:
 115         Writer.close()
 116         
 117     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 118     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 119     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 120 
 121 def ProcessMolecules(Mols, Writer):
 122     """Process and filter molecules. """
 123     
 124     if OptionsInfo["MPMode"]:
 125         return ProcessMoleculesUsingMultipleProcesses(Mols, Writer)
 126     else:
 127         return ProcessMoleculesUsingSingleProcess(Mols, Writer)
 128 
 129 def ProcessMoleculesUsingSingleProcess(Mols, Writer):
 130     """Process molecules and calculate descriptors using a single process."""
 131 
 132     DescriptorsCount = len(OptionsInfo["SpecifiedDescriptorNames"])
 133     MiscUtil.PrintInfo("\nCalculating %d molecular %s for each molecule..." % (DescriptorsCount, ("descroptors" if DescriptorsCount > 1 else "descriptor")))
 134     
 135     (MolCount, ValidMolCount) = [0] * 2
 136     for MolIndex, Mol in enumerate(Mols):
 137         MolCount += 1
 138         if Mol is None:
 139             continue
 140         
 141         if RDKitUtil.IsMolEmpty(Mol):
 142             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 143             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 144             continue
 145         ValidMolCount += 1
 146 
 147         # Calculate and write descriptor values...
 148         CalculatedValues = CalculateDescriptorValues(MolIndex, Mol)
 149         WriteDescriptorValues(Mol, MolCount, Writer, CalculatedValues)
 150     
 151     return (MolCount, ValidMolCount)
 152     
 153 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
 154     """Process molecules and calculate descriptors using multiprocessing."""
 155 
 156     DescriptorsCount = len(OptionsInfo["SpecifiedDescriptorNames"])
 157     MiscUtil.PrintInfo("\nCalculating %d molecular %s for each molecule using multiprocessing......" % (DescriptorsCount, ("descroptors" if DescriptorsCount > 1 else "descriptor")))
 158 
 159     MPParams = OptionsInfo["MPParams"]
 160     
 161     # Setup data for initializing a worker process...
 162     MiscUtil.PrintInfo("Encoding options info...")
 163     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 164     
 165     # Setup a encoded mols data iterable for a worker process...
 166     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 167     
 168     # Setup process pool along with data initialization for each process...
 169     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 170     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 171     
 172     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 173     
 174     # Start processing...
 175     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 176         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 177     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 178         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 179     else:
 180         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 181 
 182     (MolCount, ValidMolCount) = [0] * 2
 183     for Result in Results:
 184         MolCount += 1
 185         MolIndex, EncodedMol, CalculatedValues = Result
 186         
 187         if EncodedMol is None:
 188             continue
 189         ValidMolCount += 1
 190         
 191         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 192         
 193         # Write descriptor values...
 194         WriteDescriptorValues(Mol, MolCount, Writer, CalculatedValues)
 195     
 196     return (MolCount, ValidMolCount)
 197 
 198 def InitializeWorkerProcess(*EncodedArgs):
 199     """Initialize data for a worker process."""
 200     
 201     global Options, OptionsInfo
 202 
 203     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 204     
 205     # Decode Options and OptionInfo...
 206     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 207     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 208     
 209     RetrieveMolecularDescriptorsInfo(PrintInfo = False)
 210     
 211 def WorkerProcess(EncodedMolInfo):
 212     """Process data for a worker process."""
 213 
 214     MolIndex, EncodedMol = EncodedMolInfo
 215     
 216     CalculatedValues = []
 217     if EncodedMol is None:
 218         return [MolIndex, None, CalculatedValues]
 219 
 220     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 221     if RDKitUtil.IsMolEmpty(Mol):
 222         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 223         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 224         return [MolIndex, None, CalculatedValues]
 225 
 226     return [MolIndex, EncodedMol, CalculateDescriptorValues(MolIndex, Mol)]
 227 
 228 def CalculateDescriptorValues(MolIndex, Mol):
 229     """Calculate descriptor values. """
 230 
 231     return [FormatCalculatedValue(CalculateDescriptorValue(MolIndex, Mol, Name), OptionsInfo["Precision"]) for Name in OptionsInfo["SpecifiedDescriptorNames"]]
 232 
 233 def CalculateDescriptorValue(MolIndex, Mol, Name):
 234     """Calculate value for a specific descriptor along with handling any calculation failure."""
 235 
 236     try:
 237         Value = DescriptorNamesMap["ComputeFunction"][Name](Mol)
 238     except ValueError as ErrMsg:
 239         MiscUtil.PrintWarning("Failed to calculate descriptor %s for molecule %s:\n%s\n" % (Name, (MolIndex + 1), ErrMsg))
 240         Value = "NA"
 241     
 242     return Value
 243     
 244 def WriteDescriptorValues(Mol, MolNum, Writer, CalculatedValues):
 245     """Write out calculated descriptor values."""
 246 
 247     if OptionsInfo["TextOutFileMode"]:
 248         LineWords = []
 249         if OptionsInfo["SMILESOut"]:
 250             SMILES = Chem.MolToSmiles(Mol, isomericSmiles = True, canonical = True)
 251             LineWords.append(SMILES)
 252             
 253         MolName = RDKitUtil.GetMolName(Mol, MolNum)
 254         LineWords.append(MolName)
 255         LineWords.extend(CalculatedValues)
 256         Line = OptionsInfo["TextOutFileDelim"].join(LineWords)
 257         Writer.write("%s\n" % Line)
 258     else:
 259         for Index, Name in enumerate(OptionsInfo["SpecifiedDescriptorNames"]):
 260             Mol.SetProp(Name, CalculatedValues[Index])
 261 
 262         if OptionsInfo["OutfileParams"]["Compute2DCoords"]:
 263             AllChem.Compute2DCoords(Mol)
 264         
 265         Writer.write(Mol)
 266 
 267 def SetupMoleculeWriter():
 268     """Setup a molecule writer. """
 269     
 270     if OptionsInfo["TextOutFileMode"]:
 271         Writer = open(OptionsInfo["Outfile"], "w")
 272     else:
 273         Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 274         
 275     if Writer is None:
 276         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 277 
 278     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 279 
 280     # Wite out headers for a text file...
 281     if OptionsInfo["TextOutFileMode"]:
 282         LineWords = []
 283         if OptionsInfo["SMILESOut"]:
 284             LineWords.append("SMILES")
 285         LineWords.append("MolID")
 286         LineWords.extend(OptionsInfo["SpecifiedDescriptorNames"])
 287         Line = OptionsInfo["TextOutFileDelim"].join(LineWords)
 288         Writer.write("%s\n" % Line)
 289     
 290     return Writer
 291 
 292 def FormatCalculatedValue(Value, Precision):
 293     """Format calculated value of descriptor based on its type."""
 294 
 295     if (type(Value) is float) or (type(Value) is numpy.float64):
 296         FormattedValue = "%.*f" % (Precision, Value)
 297         if not re.search("[1-9]", FormattedValue):
 298             FormattedValue = "0.0"
 299     elif type(Value) is list:
 300         FormattedValue = "%s" % Value
 301         FormattedValue = re.sub('\[|\]|,', '', FormattedValue)
 302     else:
 303         FormattedValue = "%s" % Value
 304 
 305     return FormattedValue
 306 
 307 def ProcessSpecifiedDescriptorNames():
 308     """Process and validate specified decriptor names."""
 309 
 310     OptionsInfo["SpecifiedDescriptorNames"] = []
 311 
 312     if not re.match("^(2D|3D|All|FragmentCountOnly|Specify)$", OptionsInfo["Mode"], re.I):
 313         MiscUtil.PrintError("Mode value, %s, using \"-m, --mode\" option is not a valid value." % OptionsInfo["Mode"])
 314     
 315     if re.match("^2D$", OptionsInfo["Mode"], re.I):
 316         OptionsInfo["SpecifiedDescriptorNames"] = DescriptorNamesMap["2D"]["Names"]
 317         if OptionsInfo["FragmentCount"]:
 318             OptionsInfo["SpecifiedDescriptorNames"].extend(DescriptorNamesMap["FragmentCount"]["Names"])
 319         return
 320     elif re.match("^3D$", OptionsInfo["Mode"], re.I):
 321         OptionsInfo["SpecifiedDescriptorNames"] = DescriptorNamesMap["3D"]["Names"]
 322         return
 323     elif re.match("^All$", OptionsInfo["Mode"], re.I):
 324         OptionsInfo["SpecifiedDescriptorNames"] = DescriptorNamesMap["2D"]["Names"]
 325         if OptionsInfo["FragmentCount"]:
 326             OptionsInfo["SpecifiedDescriptorNames"].extend(DescriptorNamesMap["FragmentCount"]["Names"])
 327         OptionsInfo["SpecifiedDescriptorNames"].extend(DescriptorNamesMap["3D"]["Names"])
 328         return
 329     elif re.match("^FragmentCountOnly$", OptionsInfo["Mode"], re.I):
 330         OptionsInfo["SpecifiedDescriptorNames"] = DescriptorNamesMap["FragmentCount"]["Names"]
 331         return
 332 
 333     # Set up a canonical descriptor names map for checking specified names...
 334     CanonicalNameMap = {}
 335     for Name in  DescriptorNamesMap["ComputeFunction"]:
 336         CanonicalNameMap[Name.lower()] = Name
 337     
 338     # Parse and validate specified names...
 339     DescriptorNames = re.sub(" ", "", OptionsInfo["DescriptorNames"])
 340     if not DescriptorNames:
 341         MiscUtil.PrintError("No descriptor names specified for \"-d, --descriptorNames\" option")
 342 
 343     SMILESInfile = MiscUtil.CheckFileExt(Options["--infile"], "smi")
 344     Canonical3DNameMap = {}
 345     if SMILESInfile:
 346         for Name in DescriptorNamesMap["3D"]["Names"]:
 347             Canonical3DNameMap[Name.lower()] = Name
 348             
 349     SpecifiedDescriptorNames = []
 350     for Name in DescriptorNames.split(","):
 351         CanonicalName = Name.lower()
 352         if CanonicalName in CanonicalNameMap:
 353             SpecifiedDescriptorNames.append(CanonicalNameMap[CanonicalName])
 354         else:
 355             MiscUtil.PrintError("The descriptor name, %s, specified using \"-d, --descriptorNames\" option is not a valid name." % (Name))
 356         if SMILESInfile:
 357             if CanonicalName in Canonical3DNameMap:
 358                 MiscUtil.PrintError("The 3D descriptor name, %s, specified using \"-d, --descriptorNames\" option is not a valid for SMILES input file." % (Name))
 359                 
 360     if not len(SpecifiedDescriptorNames):
 361         MiscUtil.PrintError("No valid descriptor name specified for \"-d, --descriptorNames\" option")
 362     
 363     OptionsInfo["SpecifiedDescriptorNames"] = SpecifiedDescriptorNames
 364 
 365 def RetrieveMolecularDescriptorsInfo(PrintInfo = True):
 366     """Retrieve descriptors information."""
 367 
 368     if PrintInfo:
 369         MiscUtil.PrintInfo("\nRetrieving information for avalible molecular descriptors...")
 370     
 371     # Initialze data for 2D, FragmentCount and 3D descriptors...
 372     DescriptorNamesMap["Types"] = ["2D", "FragmentCount", "3D"]
 373     DescriptorNamesMap["ComputeFunction"] = {}
 374 
 375     Autocorr2DExclude = OptionsInfo["Autocorr2DExclude"]
 376     
 377     for Type in DescriptorNamesMap["Types"]:
 378         DescriptorNamesMap[Type] = {}
 379         DescriptorNamesMap[Type]["Names"] = []
 380     
 381     # Setup data for 2D and FragmentCount...
 382     DescriptorsInfo = Descriptors.descList
 383     for DescriptorInfo in DescriptorsInfo:
 384         Name = DescriptorInfo[0]
 385         ComputeFunction = DescriptorInfo[1]
 386 
 387         Type = "2D"
 388         if re.match("^fr_", Name, re.I):
 389             Type = "FragmentCount"
 390         elif re.match("^Autocorr2D$", Name, re.I):
 391             if Autocorr2DExclude:
 392                 continue
 393         
 394         if Name in DescriptorNamesMap["ComputeFunction"]:
 395             if PrintInfo:
 396                 MiscUtil.PrintWarning("Ignoring duplicate descriptor name: %s..." % Name)
 397         else:
 398             DescriptorNamesMap[Type]["Names"].append(Name)
 399             DescriptorNamesMap["ComputeFunction"][Name] = ComputeFunction
 400 
 401     # Add new 2D decriptor name to the list...
 402     Type = "2D"
 403     if not Autocorr2DExclude:
 404         try:
 405             Name = "Autocorr2D"
 406             ComputeFunction =  rdMolDescriptors.CalcAUTOCORR2D
 407             if not Name in DescriptorNamesMap["ComputeFunction"]:
 408                 DescriptorNamesMap[Type]["Names"].append(Name)
 409                 DescriptorNamesMap["ComputeFunction"][Name] = ComputeFunction
 410         except (AttributeError):
 411             if PrintInfo:
 412                 MiscUtil.PrintInfo("2D descriptor, %s, is not available in your current version of RDKit." % Name)
 413         
 414     # Set data for 3D descriptors...
 415     Type = "3D"
 416     NameToComputeFunctionMap = {'PMI1' : Descriptors3D.PMI1, 'PMI2' : Descriptors3D.PMI2, 'PMI3' : Descriptors3D.PMI3, 'NPR1' : Descriptors3D.NPR1, 'NPR2' : Descriptors3D.NPR2, 'RadiusOfGyration' : Descriptors3D.RadiusOfGyration, 'InertialShapeFactor' :  Descriptors3D.InertialShapeFactor, 'Eccentricity' : Descriptors3D.Eccentricity, 'Asphericity' : Descriptors3D.Asphericity, 'SpherocityIndex' : Descriptors3D.SpherocityIndex}
 417     
 418     for Name in NameToComputeFunctionMap:
 419         ComputeFunction = NameToComputeFunctionMap[Name]
 420         if Name in DescriptorNamesMap["ComputeFunction"]:
 421             if PrintInfo:
 422                 MiscUtil.PrintWarning("Ignoring duplicate descriptor name: %s..." % Name)
 423         else:
 424             DescriptorNamesMap[Type]["Names"].append(Name)
 425             DescriptorNamesMap["ComputeFunction"][Name] = ComputeFunction
 426 
 427     # Check and add new 3D descriptors not directly available through Descriptors3D module...
 428     Type = "3D"
 429     AvailableName3DMap = {}
 430     for Name in ['Autocorr3D', 'RDF', 'MORSE', 'WHIM', 'GETAWAY']:
 431         ComputeFunction = None
 432         try:
 433             if re.match("^Autocorr3D$", Name, re.I):
 434                 ComputeFunction =  rdMolDescriptors.CalcAUTOCORR3D
 435             elif re.match("^RDF$", Name, re.I):
 436                 ComputeFunction =  rdMolDescriptors.CalcRDF
 437             elif re.match("^MORSE$", Name, re.I):
 438                 ComputeFunction =  rdMolDescriptors.CalcMORSE
 439             elif re.match("^WHIM$", Name, re.I):
 440                 ComputeFunction =  rdMolDescriptors.CalcWHIM
 441             elif re.match("^GETAWAY$", Name, re.I):
 442                 ComputeFunction =  rdMolDescriptors.CalcGETAWAY
 443             else:
 444                 ComputeFunction = None
 445         except (AttributeError):
 446             if PrintInfo:
 447                 MiscUtil.PrintWarning("3D descriptor, %s, is not available in your current version of RDKit" % Name)
 448         
 449         if ComputeFunction is not None:
 450             AvailableName3DMap[Name] = ComputeFunction
 451 
 452     for Name in AvailableName3DMap:
 453         ComputeFunction = AvailableName3DMap[Name]
 454         if not Name in DescriptorNamesMap["ComputeFunction"]:
 455             DescriptorNamesMap[Type]["Names"].append(Name)
 456             DescriptorNamesMap["ComputeFunction"][Name] = ComputeFunction
 457 
 458     Count = 0
 459     TypesCount = []
 460     for Type in DescriptorNamesMap["Types"]:
 461         TypeCount = len(DescriptorNamesMap[Type]["Names"])
 462         TypesCount.append(TypeCount)
 463         Count += TypeCount
 464 
 465     if not Count:
 466         if PrintInfo:
 467             MiscUtil.PrintError("Failed to retrieve any  molecular descriptors...")
 468 
 469     # Sort descriptor names...
 470     for Type in DescriptorNamesMap["Types"]:
 471         DescriptorNamesMap[Type]["Names"] = sorted(DescriptorNamesMap[Type]["Names"])
 472     
 473     if PrintInfo:
 474         MiscUtil.PrintInfo("\nTotal number of availble molecular descriptors: %d" % Count)
 475     for Index in range(0, len(DescriptorNamesMap["Types"])):
 476         Type = DescriptorNamesMap["Types"][Index]
 477         TypeCount = TypesCount[Index]
 478         if PrintInfo:
 479             MiscUtil.PrintInfo("Number of %s molecular descriptors: %d" % (Type, TypeCount))
 480     
 481 def ListMolecularDescriptorsInfo():
 482     """List descriptors information."""
 483 
 484     MiscUtil.PrintInfo("\nListing information for avalible molecular descriptors...")
 485 
 486     Delimiter = ", "
 487     for Type in DescriptorNamesMap["Types"]:
 488         Names = DescriptorNamesMap[Type]["Names"]
 489         MiscUtil.PrintInfo("\n%s descriptors: %s" % (Type, Delimiter.join(Names)))
 490 
 491     MiscUtil.PrintInfo("")
 492 
 493 def ProcessOptions():
 494     """Process and validate command line arguments and options"""
 495     
 496     MiscUtil.PrintInfo("Processing options...")
 497     
 498     # Validate options...
 499     ValidateOptions()
 500 
 501     OptionsInfo["Autocorr2DExclude"] = True
 502     if not re.match("^Yes$", Options["--autocorr2DExclude"], re.I):
 503         OptionsInfo["Autocorr2DExclude"] = False
 504     
 505     OptionsInfo["FragmentCount"] = True
 506     if not re.match("^Yes$", Options["--fragmentCount"], re.I):
 507         OptionsInfo["FragmentCount"] = False
 508     
 509     OptionsInfo["DescriptorNames"] = Options["--descriptorNames"]
 510     OptionsInfo["Mode"] = Options["--mode"]
 511     
 512     OptionsInfo["Infile"] = Options["--infile"]
 513     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 514     
 515     OptionsInfo["Outfile"] = Options["--outfile"]
 516     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 517     
 518     OptionsInfo["Overwrite"] = Options["--overwrite"]
 519     
 520     TextOutFileMode = False
 521     TextOutFileDelim = ""
 522     if MiscUtil.CheckFileExt(Options["--outfile"], "csv"):
 523         TextOutFileMode = True
 524         TextOutFileDelim = ","
 525     elif MiscUtil.CheckFileExt(Options["--outfile"], "tsv txt"):
 526         TextOutFileMode = True
 527         TextOutFileDelim = "\t"
 528     OptionsInfo["TextOutFileMode"] = TextOutFileMode
 529     OptionsInfo["TextOutFileDelim"] = TextOutFileDelim
 530     
 531     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 532     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 533     
 534     OptionsInfo["Precision"] = int(Options["--precision"])
 535     
 536     OptionsInfo["SMILESOut"] = False
 537     if re.match("^Yes$", Options["--smilesOut"], re.I):
 538         OptionsInfo["SMILESOut"] = True
 539 
 540 def RetrieveOptions():
 541     """Retrieve command line arguments and options"""
 542     
 543     # Get options...
 544     global Options
 545     Options = docopt(_docoptUsage_)
 546     
 547     # Set current working directory to the specified directory...
 548     WorkingDir = Options["--workingdir"]
 549     if WorkingDir:
 550         os.chdir(WorkingDir)
 551     
 552     # Handle examples option...
 553     if "--examples" in Options and Options["--examples"]:
 554         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 555         sys.exit(0)
 556         
 557     # Handle listing of descriptor information...
 558     if  Options["--list"]:
 559         ProcessListMolecularDescriptorsOption()
 560         sys.exit(0)
 561 
 562 def ProcessListMolecularDescriptorsOption():
 563     """Process list descriptors information."""
 564 
 565     RetrieveMolecularDescriptorsInfo()
 566     ListMolecularDescriptorsInfo()
 567 
 568 def ValidateOptions():
 569     """Validate option values"""
 570 
 571     MiscUtil.ValidateOptionTextValue("-a, --autocorr2DExclude", Options["--autocorr2DExclude"], "yes no")
 572     MiscUtil.ValidateOptionTextValue("-f, --fragmentCount", Options["--fragmentCount"], "yes no")
 573     
 574     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "2D 3D All FragmentCountOnly Specify")
 575     
 576     if re.match("^Specify$", Options["--mode"], re.I):
 577         if re.match("^none$", Options["--descriptorNames"], re.I):
 578             MiscUtil.PrintError("The name(s) of molecular descriptors must be specified using \"-d, --descriptorNames\" option during \"Specify\" value of \"-m, --mode\" option.")
 579     
 580     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 581     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi csv tsv txt")
 582     
 583     if re.match("^3D|All$", Options["--mode"], re.I):
 584         if MiscUtil.CheckFileExt(Options["--infile"], "smi"):
 585             MiscUtil.PrintError("The input SMILES file, %s, is not valid for  \"3D or All\" value of \"-m, --mode\" option.")
 586     
 587     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd csv tsv txt")
 588     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 589     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 590     
 591     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 592     
 593     MiscUtil.ValidateOptionIntegerValue("-p, --precision", Options["--precision"], {">": 0})
 594     MiscUtil.ValidateOptionTextValue("-s, --smilesOut", Options["--smilesOut"], "yes no")
 595 
 596 # Setup a usage string for docopt...
 597 _docoptUsage_ = """
 598 RDKitCalculateMolecularDescriptors.py - Calculate 2D/3D molecular descriptors
 599 
 600 Usage:
 601     RDKitCalculateMolecularDescriptors.py [--autocorr2DExclude <yes or no>] [--fragmentCount <yes or no>]
 602                                           [--descriptorNames <Name1,Name2,...>] [--infileParams <Name,Value,...>]
 603                                           [--mode <2D, 3D, All...>] [--mp <yes or no>] [--mpParams <Name.Value,...>]
 604                                           [--outfileParams <Name,Value,...>] [--overwrite] [--precision <number>]
 605                                           [--smilesOut <yes or no>] [-w <dir>] -i <infile> -o <outfile> 
 606     RDKitCalculateMolecularDescriptors.py -l | --list
 607     RDKitCalculateMolecularDescriptors.py -h | --help | -e | --examples
 608 
 609 Description:
 610     Calculate 2D/3D molecular descriptors for molecules and write them out to a SD or
 611     CSV/TSV text file.
 612 
 613     The complete list of currently available molecular descriptors may be obtained by
 614     using '-l, --list' option. The names of valid 2D, fragment count, and 3D molecular
 615     descriptors are shown below:
 616 
 617     2D descriptors: Autocorr2D, BalabanJ, BertzCT, Chi0, Chi1, Chi0n - Chi4n, Chi0v - Chi4v,
 618     EState_VSA1 - EState_VSA11, ExactMolWt, FpDensityMorgan1, FpDensityMorgan2, FpDensityMorgan3,
 619     FractionCSP3, HallKierAlpha, HeavyAtomCount, HeavyAtomMolWt, Ipc, Kappa1 - Kappa3,
 620     LabuteASA, MaxAbsEStateIndex, MaxAbsPartialCharge, MaxEStateIndex, MaxPartialCharge,
 621     MinAbsEStateIndex, MinAbsPartialCharge, MinEStateIndex, MinPartialCharge, MolLogP,
 622     MolMR, MolWt, NHOHCount, NOCount, NumAliphaticCarbocycles, NumAliphaticHeterocycles,
 623     NumAliphaticRings, NumAromaticCarbocycles, NumAromaticHeterocycles, NumAromaticRings,
 624     NumHAcceptors, NumHDonors, NumHeteroatoms, NumRadicalElectrons, NumRotatableBonds,
 625     NumSaturatedCarbocycles, NumSaturatedHeterocycles, NumSaturatedRings, NumValenceElectrons,
 626     PEOE_VSA1 - PEOE_VSA14,  RingCount, SMR_VSA1 - SMR_VSA10, SlogP_VSA1 - SlogP_VSA12,
 627     TPSA, VSA_EState1 - VSA_EState10, qed
 628 
 629     FragmentCount 2D descriptors: fr_Al_COO, fr_Al_OH, fr_Al_OH_noTert, fr_ArN, fr_Ar_COO,
 630     fr_Ar_N, fr_Ar_NH, fr_Ar_OH, fr_COO, fr_COO2, fr_C_O, fr_C_O_noCOO, fr_C_S, fr_HOCCN,
 631     fr_Imine, fr_NH0, fr_NH1, fr_NH2, fr_N_O, fr_Ndealkylation1, fr_Ndealkylation2, fr_Nhpyrrole,
 632     fr_SH, fr_aldehyde, fr_alkyl_carbamate, fr_alkyl_halide, fr_allylic_oxid, fr_amide, fr_amidine,
 633     fr_aniline, fr_aryl_methyl, fr_azide, fr_azo, fr_barbitur, fr_benzene, fr_benzodiazepine,
 634     fr_bicyclic, fr_diazo, fr_dihydropyridine, fr_epoxide, fr_ester, fr_ether, fr_furan, fr_guanido,
 635     fr_halogen, fr_hdrzine, fr_hdrzone, fr_imidazole, fr_imide, fr_isocyan, fr_isothiocyan, fr_ketone,
 636     fr_ketone_Topliss, fr_lactam, fr_lactone, fr_methoxy, fr_morpholine, fr_nitrile, fr_nitro,
 637     fr_nitro_arom, fr_nitro_arom_nonortho, fr_nitroso, fr_oxazole, fr_oxime, fr_para_hydroxylation,
 638     fr_phenol, fr_phenol_noOrthoHbond, fr_phos_acid, fr_phos_ester, fr_piperdine, fr_piperzine,
 639     fr_priamide, fr_prisulfonamd, fr_pyridine, fr_quatN, fr_sulfide, fr_sulfonamd, fr_sulfone,
 640     fr_term_acetylene, fr_tetrazole, fr_thiazole, fr_thiocyan, fr_thiophene, fr_unbrch_alkane, fr_urea
 641 
 642     3D descriptors: Asphericity, Autocorr3D, Eccentricity, GETAWAY, InertialShapeFactor, MORSE,
 643     NPR1, NPR2, PMI1, PMI2, PMI3, RDF, RadiusOfGyration, SpherocityIndex, WHIM
 644 
 645     The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi,
 646     .txt, .csv, .tsv)
 647 
 648     The supported output file formats are: SD File (.sdf, .sd), CSV/TSV (.csv, .tsv, .txt)
 649 
 650 Options:
 651     -a, --autocorr2DExclude <yes or no>  [default: yes]
 652         Exclude Autocorr2D descriptor from the calculation of 2D descriptors. 
 653     -f, --fragmentCount <yes or no>  [default: yes]
 654         Include 2D fragment count descriptors during the calculation. These descriptors are
 655         counted using SMARTS patterns specified in FragmentDescriptors.csv file distributed
 656         with RDKit. This option is only used during '2D' or 'All' value of '-m, --mode' option.
 657     -d, --descriptorNames <Name1,Name2,...>  [default: none]
 658         A comma delimited list of supported molecular descriptor names to calculate.
 659         This option is only used during 'Specify' value of '-m, --mode' option.
 660     -e, --examples
 661         Print examples.
 662     -h, --help
 663         Print this help message.
 664     -i, --infile <infile>
 665         Input file name.
 666     --infileParams <Name,Value,...>  [default: auto]
 667         A comma delimited list of parameter name and value pairs for reading
 668         molecules from files. The supported parameter names for different file
 669         formats, along with their default values, are shown below:
 670             
 671             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 672             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 673                 smilesTitleLine,auto,sanitize,yes
 674             
 675         Possible values for smilesDelimiter: space, comma or tab.
 676     -l, --list
 677         List molecular descriptors without performing any calculations.
 678     -m, --mode <2D, 3D, All, FragmentCountOnly, or Specify>  [default: 2D]
 679         Type of molecular descriptors to calculate. Possible values: 2D, 3D,
 680         All or Specify. The name of molecular descriptors must be specified using
 681         '-d, --descriptorNames' for 'Specify'. 2D descriptors also include 1D descriptors.
 682         The structure  of molecules must contain 3D coordinates for the  calculation
 683         of 3D descriptors.
 684     --mp <yes or no>  [default: no]
 685         Use multiprocessing.
 686          
 687         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 688         function employing lazy RDKit data iterable. This allows processing of
 689         arbitrary large data sets without any additional requirements memory.
 690         
 691         All input data may be optionally loaded into memory by mp.Pool.map()
 692         before starting worker processes in a process pool by setting the value
 693         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 694         
 695         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 696         data mode may adversely impact the performance. The '--mpParams' section
 697         provides additional information to tune the value of 'chunkSize'.
 698     --mpParams <Name,Value,...>  [default: auto]
 699         A comma delimited list of parameter name and value pairs for to
 700         configure multiprocessing.
 701         
 702         The supported parameter names along with their default and possible
 703         values are shown below:
 704         
 705             chunkSize, auto
 706             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 707             numProcesses, auto   [ Default: mp.cpu_count() ]
 708         
 709         These parameters are used by the following functions to configure and
 710         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 711         mp.Pool.imap().
 712         
 713         The chunkSize determines chunks of input data passed to each worker
 714         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 715         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 716         
 717         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 718         automatically converts RDKit data iterable into a list, loads all data into
 719         memory, and calculates the default chunkSize using the following method
 720         as shown in its code:
 721         
 722             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 723             if extra: chunkSize += 1
 724         
 725         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 726         and 100 data items.
 727         
 728         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 729         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 730         data into memory. Consequently, the size of input data is not known a priori.
 731         It's not possible to estimate an optimal value for the chunkSize. The default 
 732         chunkSize is set to 1.
 733         
 734         The default value for the chunkSize during 'Lazy' data mode may adversely
 735         impact the performance due to the overhead associated with exchanging
 736         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 737         a larger value during 'Lazy' input data mode, based on the size of your input
 738         data and number of processes in the process pool.
 739         
 740         The mp.Pool.map() function waits for all worker processes to process all
 741         the data and return the results. The mp.Pool.imap() function, however,
 742         returns the the results obtained from worker processes as soon as the
 743         results become available for specified chunks of data.
 744         
 745         The order of data in the results returned by both mp.Pool.map() and 
 746         mp.Pool.imap() functions always corresponds to the input data.
 747     -o, --outfile <outfile>
 748         Output file name.
 749     --outfileParams <Name,Value,...>  [default: auto]
 750         A comma delimited list of parameter name and value pairs for writing
 751         molecules to files. The supported parameter names for different file
 752         formats, along with their default values, are shown below:
 753             
 754             SD: compute2DCoords,auto,kekulize,no
 755             
 756         Default value for compute2DCoords: yes for SMILES input file; no for all other
 757         file types.
 758     -p, --precision <number>  [default: 3]
 759         Floating point precision for writing the calculated descriptor values.
 760     -s, --smilesOut <yes or no>  [default: no]
 761         Write out SMILES string to CSV/TSV text output file.
 762     --overwrite
 763         Overwrite existing files.
 764     -w, --workingdir <dir>
 765         Location of working directory which defaults to the current directory.
 766 
 767 Examples:
 768     To compute all available 2D descriptors except Autocorr2D descriptor and
 769     write out a CSV file, type:
 770 
 771         % RDKitCalculateMolecularDescriptors.py  -i Sample.smi -o SampleOut.csv
 772 
 773     To compute all available 2D descriptors except Autocorr2D descriptor in
 774     multiprocessing mode on all available CPUs without loading all data into
 775     memory, and write out a CSV file, type:
 776 
 777         % RDKitCalculateMolecularDescriptors.py  --mp yes -i Sample.smi
 778           -o SampleOut.csv
 779 
 780     To compute all available 2D descriptors except Autocorr2D descriptor in
 781     multiprocessing mode on all available CPUs by loading all data into memory,
 782     and write out a CSV file, type:
 783 
 784         % RDKitCalculateMolecularDescriptors.py  --mp yes --mpParams
 785           "inputDataMode,InMemory" -i Sample.smi -o SampleOut.csv
 786 
 787     To compute all available 2D descriptors except Autocorr2D descriptor in
 788     multiprocessing mode on specific number of CPUs and chunk size without
 789     loading all data into memory, and write out a SDF file, type:
 790 
 791         % RDKitCalculateMolecularDescriptors.py  --mp yes --mpParams
 792           "inputDataMode,Lazy,numProcesses,4,chunkSize,8" -i Sample.smi
 793           -o SampleOut.sdf
 794 
 795     To compute all available 2D descriptors including Autocorr2D descriptor and
 796     excluding fragment count descriptors, and write out a TSV file, type:
 797 
 798         % RDKitCalculateMolecularDescriptors.py  -m 2D -a no -f no
 799           -i Sample.smi -o SampleOut.tsv
 800 
 801     To compute all available 3D descriptors and write out a SD file, type:
 802 
 803         % RDKitCalculateMolecularDescriptors.py  -m 3D -i Sample3D.sdf
 804           -o Sample3DOut.sdf
 805 
 806     To compute only fragment count 2D descriptors and write out a SD
 807     file file, type:
 808 
 809         % RDKitCalculateMolecularDescriptors.py  -m FragmentCountOnly
 810           -i Sample.sdf -o SampleOut.sdf
 811 
 812     To compute all available 2D and 3D descriptors including fragment count and
 813     Autocorr2D and write out a CSV file, type:
 814 
 815         % RDKitCalculateMolecularDescriptors.py  -m All -a no -i Sample.sdf
 816           -o SampleOut.csv
 817 
 818     To compute a specific set of 2D and 3D descriptors and write out a
 819     write out a TSV file, type:
 820 
 821         % RDKitCalculateMolecularDescriptors.py  -m specify
 822           -d 'MolWt,MolLogP,NHOHCount, NOCount,RadiusOfGyration'
 823           -i Sample3D.sdf -o SampleOut.csv
 824 
 825     To compute all available 2D descriptors except Autocorr2D descriptor for 
 826     molecules in a CSV SMILES file, SMILES strings in column 1, name in
 827     column 2, and write out a SD file without calculation of 2D coordinates, type:
 828 
 829         % RDKitCalculateMolecularDescriptors.py --infileParams
 830           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 831           smilesNameColumn,2" --outfileParams "compute2DCoords,no"
 832           -i SampleSMILES.csv -o SampleOut.sdf
 833 
 834 Author:
 835     Manish Sud(msud@san.rr.com)
 836 
 837 See also:
 838     RDKitCalculateRMSD.py, RDKitCompareMoleculeShapes.py, RDKitConvertFileFormat.py,
 839     RDKitGenerateConformers.py, RDKitPerformMinimization.py
 840 
 841 Copyright:
 842     Copyright (C) 2019 Manish Sud. All rights reserved.
 843 
 844     The functionality available in this script is implemented using RDKit, an
 845     open source toolkit for cheminformatics developed by Greg Landrum.
 846 
 847     This file is part of MayaChemTools.
 848 
 849     MayaChemTools is free software; you can redistribute it and/or modify it under
 850     the terms of the GNU Lesser General Public License as published by the Free
 851     Software Foundation; either version 3 of the License, or (at your option) any
 852     later version.
 853 
 854 """
 855 
 856 if __name__ == "__main__":
 857     main()