1 #!/bin/env python 2 # 3 # File: RDKitEnumerateStereoisomers.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 37 # RDKit imports... 38 try: 39 from rdkit import rdBase 40 from rdkit import Chem 41 from rdkit.Chem import AllChem 42 from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions 43 except ImportError as ErrMsg: 44 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 45 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 46 sys.exit(1) 47 48 # MayaChemTools imports... 49 try: 50 from docopt import docopt 51 import MiscUtil 52 import RDKitUtil 53 except ImportError as ErrMsg: 54 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 55 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 56 sys.exit(1) 57 58 ScriptName = os.path.basename(sys.argv[0]) 59 Options = {} 60 OptionsInfo = {} 61 62 def main(): 63 """Start execution of the script.""" 64 65 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 66 67 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 68 69 # Retrieve command line arguments and options... 70 RetrieveOptions() 71 72 # Process and validate command line arguments and options... 73 ProcessOptions() 74 75 # Perform actions required by the script... 76 PerformEnumeration() 77 78 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 79 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 80 81 def PerformEnumeration(): 82 """Enumerate stereoisomers.""" 83 84 Infile = OptionsInfo["Infile"] 85 Outfile = OptionsInfo["Outfile"] 86 87 # Setup a molecule reader... 88 MiscUtil.PrintInfo("\nProcessing file %s..." % Infile) 89 Mols = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"]) 90 91 # Set up a molecule writer... 92 Writer = None 93 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"]) 94 if Writer is None: 95 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 96 MiscUtil.PrintInfo("Generating file %s...\n" % Outfile) 97 98 # Setup stereo enumeration options... 99 StereoOptions = StereoEnumerationOptions(tryEmbedding = OptionsInfo["DiscardNonPhysical"], onlyUnassigned = OptionsInfo["UnassignedOnly"], maxIsomers = OptionsInfo["MaxIsomers"]) 100 101 # Process molecules... 102 MolCount = 0 103 ValidMolCount = 0 104 105 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 106 107 for Mol in Mols: 108 MolCount += 1 109 110 if Mol is None: 111 continue 112 113 if RDKitUtil.IsMolEmpty(Mol): 114 MolName = RDKitUtil.GetMolName(Mol, MolCount) 115 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 116 continue 117 118 ValidMolCount += 1 119 120 MolName = RDKitUtil.GetMolName(Mol, MolCount) 121 122 # Generate and process stereoisomers... 123 StereoisomersMols = EnumerateStereoisomers(Mol, options = StereoOptions) 124 IsomerCount = 0 125 for IsomerMol in StereoisomersMols: 126 IsomerCount += 1 127 128 # Set isomer mol name... 129 IsomerMolName = "%s_Isomer%d" % (MolName, IsomerCount) 130 IsomerMol.SetProp("_Name", IsomerMolName) 131 132 if Compute2DCoords: 133 AllChem.Compute2DCoords(IsomerMol) 134 135 Writer.write(IsomerMol) 136 137 MiscUtil.PrintInfo("Number of stereoisomers written for %s: %d" % (MolName, IsomerCount)) 138 139 if Writer is not None: 140 Writer.close() 141 142 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 143 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 144 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount)) 145 146 def ProcessOptions(): 147 """Process and validate command line arguments and options.""" 148 149 MiscUtil.PrintInfo("Processing options...") 150 151 # Validate options... 152 ValidateOptions() 153 154 OptionsInfo["DiscardNonPhysical"] = True 155 if re.match("^no$", Options["--discardNonPhysical"], re.I): 156 OptionsInfo["DiscardNonPhysical"] = False 157 158 OptionsInfo["Mode"] = Options["--mode"] 159 UnassignedOnly = True 160 if re.match("^All$", Options["--mode"], re.I): 161 UnassignedOnly = False 162 OptionsInfo["UnassignedOnly"] = UnassignedOnly 163 164 OptionsInfo["Infile"] = Options["--infile"] 165 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 166 167 OptionsInfo["Outfile"] = Options["--outfile"] 168 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 169 170 OptionsInfo["Overwrite"] = Options["--overwrite"] 171 172 OptionsInfo["MaxIsomers"] = int(Options["--maxIsomers"]) 173 174 def RetrieveOptions(): 175 """Retrieve command line arguments and options.""" 176 177 # Get options... 178 global Options 179 Options = docopt(_docoptUsage_) 180 181 # Set current working directory to the specified directory... 182 WorkingDir = Options["--workingdir"] 183 if WorkingDir: 184 os.chdir(WorkingDir) 185 186 # Handle examples option... 187 if "--examples" in Options and Options["--examples"]: 188 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 189 sys.exit(0) 190 191 def ValidateOptions(): 192 """Validate option values.""" 193 194 MiscUtil.ValidateOptionTextValue("-d, --discardNonPhysical", Options["--discardNonPhysical"], "yes no") 195 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "UnassignedOnly All") 196 197 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 198 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv") 199 200 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi") 201 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 202 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 203 204 MiscUtil.ValidateOptionIntegerValue("--maxIsomers", Options["--maxIsomers"], {">=": 0}) 205 206 # Setup a usage string for docopt... 207 _docoptUsage_ = """ 208 RDKitEnumerateStereoisomers.py - Enumerate stereoisomers of molecules 209 210 Usage: 211 RDKitEnumerateStereoisomers.py [--discardNonPhysical <yes or no>] 212 [--infileParams <Name,Value,...>] [--mode <UnassignedOnly or All>] 213 [--maxIsomers <number>] [--outfileParams <Name,Value,...>] 214 [--overwrite] [-w <dir>] -i <infile> -o <outfile> 215 RDKitEnumerateStereoisomers.py -h | --help | -e | --examples 216 217 Description: 218 Perform a combinatorial enumeration of stereoisomers for molecules around all 219 or unassigned chiral atoms and bonds. 220 221 The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi, 222 .csv, .tsv, .txt) 223 224 The supported output file format are: SD (.sdf, .sd), SMILES (.smi) 225 226 Options: 227 -d, --discardNonPhysical <yes or no> [default: yes] 228 Discard stereoisomers with non-physical structures. Possible values: yes or no. 229 The non-physical nature of a stereoisomer is determined by embedding the 230 structure to generate a conformation for the stereoisomer using standard 231 distance geometry methodology. 232 233 A word to the wise from RDKit documentation: this is computationally expensive 234 and uses a heuristic that could result in loss of stereoisomers. 235 -e, --examples 236 Print examples. 237 -m, --mode <UnassignedOnly or All> [default: UnassignedOnly] 238 Enumerate unassigned or all chiral centers. The chiral atoms and bonds with 239 defined stereochemistry are preserved. 240 --maxIsomers <number> [default: 50] 241 Maximum number of stereoisomers to generate for each molecule. A value of zero 242 indicates generation of all possible steroisomers. 243 -h, --help 244 Print this help message. 245 -i, --infile <infile> 246 Input file name. 247 --infileParams <Name,Value,...> [default: auto] 248 A comma delimited list of parameter name and value pairs for reading 249 molecules from files. The supported parameter names for different file 250 formats, along with their default values, are shown below: 251 252 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 253 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 254 smilesTitleLine,auto,sanitize,yes 255 256 Possible values for smilesDelimiter: space, comma or tab. 257 -o, --outfile <outfile> 258 Output file name. 259 --outfileParams <Name,Value,...> [default: auto] 260 A comma delimited list of parameter name and value pairs for writing 261 molecules to files. The supported parameter names for different file 262 formats, along with their default values, are shown below: 263 264 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 265 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 266 smilesTitleLine,yes 267 268 Default value for compute2DCoords: yes for SMILES input file; no for all other 269 file types. 270 --overwrite 271 Overwrite existing files. 272 -w, --workingdir <dir> 273 Location of working directory which defaults to the current directory. 274 275 Examples: 276 To enumerate only unassigned atom and bond chiral centers along with discarding 277 of non-physical structures, keeping a maximum of 50 stereoisomers for each molecule, 278 and write out a SMILES file, type: 279 280 % RDKitEnumerateStereoisomers.py -i Sample.smi -o SampleOut.smi 281 282 To enumerate only unassigned atom and bond chiral centers along with discarding 283 any non-physical structures, keeping a maximum of 250 stereoisomers for a molecule, 284 and write out a SD file, type: 285 286 % RDKitEnumerateStereoisomers.py --maxIsomers 0 -i Sample.smi 287 --maxIsomers 250 -o SampleOut.sdf 288 289 To enumerate all possible assigned and unassigned atom and bond chiral centers, 290 without discarding any non-physical structures, keeping a maximum of 500 291 stereoisomers for a molecule, and write out a SD file, type: 292 293 % RDKitEnumerateStereoisomers.py -d no -m all --maxIsomers 500 294 -i Sample.smi -o SampleOut.sdf 295 296 To enumerate only unassigned atom and bond chiral centers along with discarding 297 of non-physical structures, keeping a maximum of 50 stereoisomers for each molecule 298 in a CSV SMILES file, SMILES strings in column 1, name in column 2, and write out a 299 SD file without kekulization, type: 300 301 % RDKitEnumerateStereoisomers.py --infileParams 302 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 303 smilesNameColumn,2" --outfileParams "compute2DCoords,yes, 304 kekulize,no" -i SampleSMILES.csv -o SampleOut.sdf 305 306 Author: 307 Manish Sud(msud@san.rr.com) 308 309 See also: 310 RDKitConvertFileFormat.py, RDKitEnumerateCompoundLibrary.py, RDKitGenerateConformers.py, 311 RDKitGenerateMolecularFrameworks.py 312 313 Copyright: 314 Copyright (C) 2024 Manish Sud. All rights reserved. 315 316 The functionality available in this script is implemented using RDKit, an 317 open source toolkit for cheminformatics developed by Greg Landrum. 318 319 This file is part of MayaChemTools. 320 321 MayaChemTools is free software; you can redistribute it and/or modify it under 322 the terms of the GNU Lesser General Public License as published by the Free 323 Software Foundation; either version 3 of the License, or (at your option) any 324 later version. 325 326 """ 327 328 if __name__ == "__main__": 329 main()