MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitEnumerateStereoisomers.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem
  42     from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions
  43 except ImportError as ErrMsg:
  44     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  45     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  46     sys.exit(1)
  47 
  48 # MayaChemTools imports...
  49 try:
  50     from docopt import docopt
  51     import MiscUtil
  52     import RDKitUtil
  53 except ImportError as ErrMsg:
  54     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  55     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  56     sys.exit(1)
  57 
  58 ScriptName = os.path.basename(sys.argv[0])
  59 Options = {}
  60 OptionsInfo = {}
  61 
  62 def main():
  63     """Start execution of the script"""
  64     
  65     MiscUtil.PrintInfo("\n%s (RDK v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, time.asctime()))
  66     
  67     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  68     
  69     # Retrieve command line arguments and options...
  70     RetrieveOptions()
  71     
  72     # Process and validate command line arguments and options...
  73     ProcessOptions()
  74     
  75     # Perform actions required by the script...
  76     PerformEnumeration()
  77     
  78     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  79     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  80 
  81 def PerformEnumeration():
  82     """Enumerate stereoisomers."""
  83     
  84     Infile = OptionsInfo["Infile"]
  85     Outfile = OptionsInfo["Outfile"]
  86     
  87     # Setup a molecule reader...
  88     MiscUtil.PrintInfo("\nProcessing file %s..." % Infile)
  89     Mols  = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"])
  90     
  91     # Set up a molecule writer...
  92     Writer = None
  93     Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
  94     if Writer is None:
  95         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
  96     MiscUtil.PrintInfo("Generating file %s...\n" % Outfile)
  97 
  98     # Setup stereo enumeration options...
  99     StereoOptions = StereoEnumerationOptions(tryEmbedding = OptionsInfo["DiscardNonPhysical"], onlyUnassigned = OptionsInfo["UnassignedOnly"], maxIsomers = OptionsInfo["MaxIsomers"])
 100     
 101     # Process molecules...
 102     MolCount = 0
 103     ValidMolCount = 0
 104 
 105     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 106     
 107     for Mol in Mols:
 108         MolCount += 1
 109         
 110         if Mol is None:
 111             continue
 112         
 113         if RDKitUtil.IsMolEmpty(Mol):
 114             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 115             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 116             continue
 117         
 118         ValidMolCount += 1
 119         
 120         MolName = RDKitUtil.GetMolName(Mol, MolCount)
 121         
 122         # Generate and process stereoisomers...
 123         StereoisomersMols = EnumerateStereoisomers(Mol, options = StereoOptions)
 124         IsomerCount = 0
 125         for IsomerMol in StereoisomersMols:
 126             IsomerCount += 1
 127             
 128             # Set isomer mol name...
 129             IsomerMolName = "%s_Isomer%d" % (MolName, IsomerCount)
 130             IsomerMol.SetProp("_Name", IsomerMolName)
 131             
 132             if Compute2DCoords:
 133                 AllChem.Compute2DCoords(IsomerMol)
 134                 
 135             Writer.write(IsomerMol)
 136         
 137         MiscUtil.PrintInfo("Number of stereoisomers written for %s: %d" % (MolName, IsomerCount))
 138             
 139     if Writer is not None:
 140         Writer.close()
 141     
 142     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 143     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 144     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 145 
 146 def ProcessOptions():
 147     """Process and validate command line arguments and options"""
 148     
 149     MiscUtil.PrintInfo("Processing options...")
 150     
 151     # Validate options...
 152     ValidateOptions()
 153     
 154     OptionsInfo["DiscardNonPhysical"] = True
 155     if re.match("^no$", Options["--discardNonPhysical"], re.I):
 156         OptionsInfo["DiscardNonPhysical"] = False
 157     
 158     OptionsInfo["Mode"] = Options["--mode"]
 159     UnassignedOnly = True
 160     if re.match("^All$", Options["--mode"], re.I):
 161         UnassignedOnly = False
 162     OptionsInfo["UnassignedOnly"] = UnassignedOnly
 163     
 164     OptionsInfo["Infile"] = Options["--infile"]
 165     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 166     
 167     OptionsInfo["Outfile"] = Options["--outfile"]
 168     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 169     
 170     OptionsInfo["Overwrite"] = Options["--overwrite"]
 171 
 172     OptionsInfo["MaxIsomers"] = int(Options["--maxIsomers"])
 173 
 174 def RetrieveOptions():
 175     """Retrieve command line arguments and options"""
 176     
 177     # Get options...
 178     global Options
 179     Options = docopt(_docoptUsage_)
 180     
 181     # Set current working directory to the specified directory...
 182     WorkingDir = Options["--workingdir"]
 183     if WorkingDir:
 184         os.chdir(WorkingDir)
 185     
 186     # Handle examples option...
 187     if "--examples" in Options and Options["--examples"]:
 188         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 189         sys.exit(0)
 190 
 191 def ValidateOptions():
 192     """Validate option values"""
 193     
 194     MiscUtil.ValidateOptionTextValue("-d, --discardNonPhysical", Options["--discardNonPhysical"], "yes no")
 195     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "UnassignedOnly All")
 196     
 197     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 198     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv")
 199     
 200     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 201     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 202     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 203         
 204     MiscUtil.ValidateOptionIntegerValue("--maxIsomers", Options["--maxIsomers"], {">=": 0})
 205 
 206 # Setup a usage string for docopt...
 207 _docoptUsage_ = """
 208 RDKitEnumerateStereoisomers.py - Enumerate stereoisomers of molecules
 209 
 210 Usage:
 211     RDKitEnumerateStereoisomers.py [--discardNonPhysical <yes or no>]
 212                                 [--infileParams <Name,Value,...>] [--mode <UnassignedOnly or All>]
 213                                 [--maxIsomers <number>] [--outfileParams <Name,Value,...>] 
 214                                 [--overwrite] [-w <dir>] -i <infile> -o <outfile> 
 215     RDKitEnumerateStereoisomers.py -h | --help | -e | --examples
 216 
 217 Description:
 218     Perform a combinatorial enumeration of stereoisomers for molecules around all
 219     or unassigned chiral atoms and bonds.
 220 
 221     The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi,
 222     .csv, .tsv, .txt)
 223 
 224     The supported output file format are: SD (.sdf, .sd), SMILES (.smi)
 225 
 226 Options:
 227     -d, --discardNonPhysical <yes or no>  [default: yes]
 228         Discard stereoisomers with non-physical structures. Possible values: yes or no.
 229         The non-physical nature of a stereoisomer is determined by embedding the
 230         structure to generate a conformation for the stereoisomer using standard
 231         distance geometry methodology.
 232         
 233         A word to the wise from RDKit documentation: this is computationally expensive
 234         and uses a heuristic that could result in loss of stereoisomers.
 235     -e, --examples
 236         Print examples.
 237     -m, --mode <UnassignedOnly or All>  [default: UnassignedOnly]
 238         Enumerate unassigned or all chiral centers. The chiral atoms and bonds with
 239         defined stereochemistry are preserved.
 240     --maxIsomers <number>  [default: 50]
 241         Maximum number of stereoisomers to generate for each molecule. A  value of zero
 242         indicates generation of all possible steroisomers.
 243     -h, --help
 244         Print this help message.
 245     -i, --infile <infile>
 246         Input file name.
 247     --infileParams <Name,Value,...>  [default: auto]
 248         A comma delimited list of parameter name and value pairs for reading
 249         molecules from files. The supported parameter names for different file
 250         formats, along with their default values, are shown below:
 251             
 252             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 253             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 254                 smilesTitleLine,auto,sanitize,yes
 255             
 256         Possible values for smilesDelimiter: space, comma or tab.
 257     -o, --outfile <outfile>
 258         Output file name.
 259     --outfileParams <Name,Value,...>  [default: auto]
 260         A comma delimited list of parameter name and value pairs for writing
 261         molecules to files. The supported parameter names for different file
 262         formats, along with their default values, are shown below:
 263             
 264             SD: compute2DCoords,auto,kekulize,no
 265             SMILES: kekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 266                 smilesTitleLine,yes
 267             
 268         Default value for compute2DCoords: yes for SMILES input file; no for all other
 269         file types.
 270     --overwrite
 271         Overwrite existing files.
 272     -w, --workingdir <dir>
 273         Location of working directory which defaults to the current directory.
 274 
 275 Examples:
 276     To enumerate only unassigned atom and bond chiral centers along with discarding
 277     of non-physical structures, keeping a maximum of 50 stereoisomers for each molecule,
 278     and write out a SMILES file, type:
 279 
 280         % RDKitEnumerateStereoisomers.py  -i Sample.smi -o SampleOut.smi
 281 
 282     To enumerate only unassigned atom and bond chiral centers along with discarding
 283     any non-physical structures, keeping a maximum of 250 stereoisomers for a molecule,
 284     and write out a SD file, type:
 285 
 286         % RDKitEnumerateStereoisomers.py  --maxIsomers 0 -i Sample.smi
 287            --maxIsomers 250 -o SampleOut.sdf
 288 
 289     To enumerate all possible assigned and unassigned atom and bond chiral centers,
 290     without discarding any non-physical structures, keeping a maximum of 500 
 291     stereoisomers for a molecule, and write out a SD file, type:
 292 
 293         % RDKitEnumerateStereoisomers.py  -d no -m all --maxIsomers 500
 294           -i Sample.smi -o SampleOut.sdf
 295 
 296     To enumerate only unassigned atom and bond chiral centers along with discarding
 297     of non-physical structures, keeping a maximum of 50 stereoisomers for each molecule
 298     in a CSV SMILES file, SMILES strings in column 1, name in column 2, and write out a
 299     SD file with kekulization, type:
 300 
 301         % RDKitEnumerateStereoisomers.py  --infileParams 
 302           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 303           smilesNameColumn,2" --outfileParams "compute2DCoords,yes,
 304           kekulize,yes" -i SampleSMILES.csv -o SampleOut.sdf
 305 
 306 Author:
 307     Manish Sud(msud@san.rr.com)
 308 
 309 See also:
 310     RDKitConvertFileFormat.py, RDKitEnumerateCompoundLibrary.py, RDKitGenerateConformers.py,
 311     RDKitGenerateMolecularFrameworks.py
 312 
 313 Copyright:
 314     Copyright (C) 2019 Manish Sud. All rights reserved.
 315 
 316     The functionality available in this script is implemented using RDKit, an
 317     open source toolkit for cheminformatics developed by Greg Landrum.
 318 
 319     This file is part of MayaChemTools.
 320 
 321     MayaChemTools is free software; you can redistribute it and/or modify it under
 322     the terms of the GNU Lesser General Public License as published by the Free
 323     Software Foundation; either version 3 of the License, or (at your option) any
 324     later version.
 325 
 326 """
 327 
 328 if __name__ == "__main__":
 329     main()