1 #!/bin/env python 2 # 3 # File: RDKitEnumerateTautomers.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem.MolStandardize import rdMolStandardize 43 from rdkit.Chem import AllChem 44 except ImportError as ErrMsg: 45 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 46 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 47 sys.exit(1) 48 49 # MayaChemTools imports... 50 try: 51 from docopt import docopt 52 import MiscUtil 53 import RDKitUtil 54 except ImportError as ErrMsg: 55 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 56 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 57 sys.exit(1) 58 59 ScriptName = os.path.basename(sys.argv[0]) 60 Options = {} 61 OptionsInfo = {} 62 63 def main(): 64 """Start execution of the script.""" 65 66 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 67 68 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 69 70 # Retrieve command line arguments and options... 71 RetrieveOptions() 72 73 # Process and validate command line arguments and options... 74 ProcessOptions() 75 76 # Perform actions required by the script... 77 EnumerateTautomers() 78 79 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 80 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 81 82 def EnumerateTautomers(): 83 """Enunmerate tautomers.""" 84 85 # Setup a molecule reader... 86 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"]) 87 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"]) 88 89 # Set up a molecule writer... 90 Writer = SetupMoleculeWriter() 91 92 MolCount, ValidMolCount, TautomerizationFailedCount, TautomersCount, MinTautomersCount, MaxTautomersCount = ProcessMolecules(Mols, Writer) 93 94 if Writer is not None: 95 Writer.close() 96 97 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 98 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 99 MiscUtil.PrintInfo("Number of molecules failed during tautomerization: %d" % TautomerizationFailedCount) 100 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + TautomerizationFailedCount)) 101 102 MiscUtil.PrintInfo("\nNumber of tautomerized molecules: %d" % (ValidMolCount - TautomerizationFailedCount)) 103 104 MiscUtil.PrintInfo("\nTotal number of tautomers for molecules: %d" % TautomersCount) 105 MiscUtil.PrintInfo("Minumum number of tautomers for a molecule: %d" % MinTautomersCount) 106 MiscUtil.PrintInfo("Maxiumum number of tautomers for a molecule: %d" % MaxTautomersCount) 107 MiscUtil.PrintInfo("Average number of tautomers for a molecule: %.1f" % (TautomersCount/(ValidMolCount - TautomerizationFailedCount))) 108 109 def ProcessMolecules(Mols, Writer): 110 """Process molecules.""" 111 112 if OptionsInfo["MPMode"]: 113 return ProcessMoleculesUsingMultipleProcesses(Mols, Writer) 114 else: 115 return ProcessMoleculesUsingSingleProcess(Mols, Writer) 116 117 def ProcessMoleculesUsingSingleProcess(Mols, Writer): 118 """Process and generate tautomers for molecules using a single process.""" 119 120 MiscUtil.PrintInfo("\nEnumerating tatutomers...") 121 122 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 123 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 124 125 # Set up tautomer enumerator... 126 TautomerEnumerator = SetupTautomerEnumerator() 127 128 (MolCount, ValidMolCount, TautomerizationFailedCount, TautomersCount) = [0] * 4 129 (MinTautomersCount, MaxTautomersCount) = [sys.maxsize, 0] 130 FirstTautomerMol = True 131 for Mol in Mols: 132 MolCount += 1 133 134 if Mol is None: 135 continue 136 137 if RDKitUtil.IsMolEmpty(Mol): 138 if not OptionsInfo["QuietMode"]: 139 MolName = RDKitUtil.GetMolName(Mol, MolCount) 140 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 141 continue 142 143 ValidMolCount += 1 144 145 TautomerMols, TautomerizationStatus = EnumerateMolTautomers(Mol, TautomerEnumerator, MolCount) 146 if not TautomerizationStatus: 147 if not OptionsInfo["QuietMode"]: 148 MolName = RDKitUtil.GetMolName(Mol, MolCount) 149 MiscUtil.PrintWarning("Failed to tautomerize molecule %s" % MolName) 150 151 TautomerizationFailedCount += 1 152 continue 153 154 if FirstTautomerMol: 155 FirstTautomerMol = False 156 if SetSMILESMolProps: 157 RDKitUtil.SetWriterMolProps(Writer, TautomerMols[0]) 158 159 # Track tautomer count... 160 TautomerMolsCount = len(TautomerMols) 161 TautomersCount += TautomerMolsCount 162 if TautomerMolsCount < MinTautomersCount: 163 MinTautomersCount = TautomerMolsCount 164 if TautomerMolsCount > MaxTautomersCount: 165 MaxTautomersCount = TautomerMolsCount 166 167 WriteMolTautomers(Writer, Mol, MolCount, Compute2DCoords, TautomerMols) 168 169 return (MolCount, ValidMolCount, TautomerizationFailedCount, TautomersCount, MinTautomersCount, MaxTautomersCount) 170 171 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer): 172 """Process and enumerate tautomer of molecules using multiprocessing.""" 173 174 MiscUtil.PrintInfo("\nEnumerating tatutomers using multiprocessing...") 175 176 MPParams = OptionsInfo["MPParams"] 177 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 178 179 # Setup data for initializing a worker process... 180 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo)) 181 182 # Setup a encoded mols data iterable for a worker process by pickling only public 183 # and private molecule properties... 184 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols) 185 186 # Setup process pool along with data initialization for each process... 187 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 188 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 189 190 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs) 191 192 # Start processing... 193 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 194 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 195 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 196 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 197 else: 198 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 199 200 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 201 202 (MolCount, ValidMolCount, TautomerizationFailedCount, TautomersCount) = [0] * 4 203 (MinTautomersCount, MaxTautomersCount) = [sys.maxsize, 0] 204 FirstTautomerMol = True 205 for Result in Results: 206 MolCount += 1 207 MolIndex, EncodedMol, TautomerizationStatus, EncodedTautomerMols = Result 208 209 if EncodedMol is None: 210 continue 211 ValidMolCount += 1 212 213 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 214 215 TautomerMols = [] 216 if EncodedTautomerMols is not None: 217 TautomerMols = [RDKitUtil.MolFromBase64EncodedMolString(EncodedTautomerMol) for EncodedTautomerMol in EncodedTautomerMols] 218 219 if not TautomerizationStatus: 220 if not OptionsInfo["QuietMode"]: 221 MolName = RDKitUtil.GetMolName(Mol, MolCount) 222 MiscUtil.PrintWarning("Failed to tautomerize molecule %s" % MolName) 223 224 TautomerizationFailedCount += 1 225 continue 226 227 if FirstTautomerMol: 228 FirstTautomerMol = False 229 if SetSMILESMolProps: 230 RDKitUtil.SetWriterMolProps(Writer, TautomerMols[0]) 231 232 # Track tautomer count... 233 TautomerMolsCount = len(TautomerMols) 234 TautomersCount += TautomerMolsCount 235 if TautomerMolsCount < MinTautomersCount: 236 MinTautomersCount = TautomerMolsCount 237 if TautomerMolsCount > MaxTautomersCount: 238 MaxTautomersCount = TautomerMolsCount 239 240 WriteMolTautomers(Writer, Mol, MolCount, Compute2DCoords, TautomerMols) 241 242 return (MolCount, ValidMolCount, TautomerizationFailedCount, TautomersCount, MinTautomersCount, MaxTautomersCount) 243 244 def InitializeWorkerProcess(*EncodedArgs): 245 """Initialize data for a worker process.""" 246 247 global Options, OptionsInfo 248 249 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 250 251 # Decode Options and OptionInfo... 252 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 253 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 254 255 # Set up tautomer enumerator... 256 OptionsInfo["TautomerEnumerator"] = SetupTautomerEnumerator() 257 258 def WorkerProcess(EncodedMolInfo): 259 """Process data for a worker process.""" 260 261 MolIndex, EncodedMol = EncodedMolInfo 262 263 if EncodedMol is None: 264 return [MolIndex, None, False, None] 265 266 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 267 if RDKitUtil.IsMolEmpty(Mol): 268 if not OptionsInfo["QuietMode"]: 269 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 270 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 271 return [MolIndex, None, False, None] 272 273 TautomerMols, TautomerizationStatus = EnumerateMolTautomers(Mol, OptionsInfo["TautomerEnumerator"], (MolIndex + 1)) 274 275 EncodedTautomerMols = None 276 if TautomerMols is not None: 277 EncodedTautomerMols = [RDKitUtil.MolToBase64EncodedMolString(TautomerMol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.AtomProps | Chem.PropertyPickleOptions.BondProps | Chem.PropertyPickleOptions.PrivateProps) for TautomerMol in TautomerMols] 278 279 return [MolIndex, EncodedMol, TautomerizationStatus, EncodedTautomerMols] 280 281 def EnumerateMolTautomers(Mol, TautomerEnumerator, MolNum): 282 """Enumerate tautomers of a molecule and return a list of tatutomers 283 along with the status of tautomerization.""" 284 285 TautomerMols, Status, TautomerScores = [None, False, None] 286 try: 287 TautomerMols = [TautomerMol for TautomerMol in TautomerEnumerator.Enumerate(Mol)] 288 289 if OptionsInfo["ScoreTautomers"]: 290 TautomerScores = [TautomerEnumerator.ScoreTautomer(TautomerMol) for TautomerMol in TautomerMols] 291 292 if OptionsInfo["SortTautomers"]: 293 TautomerMols, TautomerScores = SortMolTautomers(Mol, TautomerEnumerator, TautomerMols, TautomerScores) 294 295 # Set tautomer score... 296 if TautomerScores is not None: 297 for Index, TautomerMol in enumerate(TautomerMols): 298 TautomerMol.SetProp("Tautomer_Score", "%.1f" % TautomerScores[Index]) 299 300 Status = True 301 except Exception as ErrMsg: 302 if not OptionsInfo["QuietMode"]: 303 MiscUtil.PrintWarning("Failed to tautomerize molecule %s: %s" % (RDKitUtil.GetMolName(Mol, MolNum), ErrMsg)) 304 TautomerMols, Status = [None, False] 305 306 return (TautomerMols, Status) 307 308 def SortMolTautomers(Mol, TautomerEnumerator, TautomerMols, TautomerScores = None): 309 """Sort tatutomers by SMILES string and place canonical tautomer at the top 310 of the list.""" 311 312 CanonicalTautomer = TautomerEnumerator.Canonicalize(Mol) 313 CanonicalTautomerSmiles = Chem.MolToSmiles(CanonicalTautomer) 314 if TautomerScores is None: 315 CanonicalTautomerScore = None 316 else: 317 CanonicalTautomerScore = TautomerEnumerator.ScoreTautomer(CanonicalTautomer) 318 319 TautomerSmiles = [Chem.MolToSmiles(TautomerMol) for TautomerMol in TautomerMols] 320 if TautomerScores is None: 321 SortedResults = sorted((Smiles, TautomerMol) for Smiles, TautomerMol in zip(TautomerSmiles, TautomerMols) if Smiles != CanonicalTautomerSmiles) 322 else: 323 SortedResults = sorted((Smiles, TautomerMol, TautomerScore) for Smiles, TautomerMol, TautomerScore in zip(TautomerSmiles, TautomerMols, TautomerScores) if Smiles != CanonicalTautomerSmiles) 324 325 SortedTautomerMols = [CanonicalTautomer] 326 if TautomerScores is None: 327 SortedTautomerMols += [TautomerMol for Smiles, TautomerMol in SortedResults] 328 else: 329 SortedTautomerMols += [TautomerMol for Smiles, TautomerMol, TautomerScore in SortedResults] 330 331 if TautomerScores is None: 332 SortedTautomerScores = None 333 else: 334 SortedTautomerScores = [CanonicalTautomerScore] 335 SortedTautomerScores += [TautomerScore for Smiles, TautomerMol, TautomerScore in SortedResults] 336 337 return (SortedTautomerMols, SortedTautomerScores) 338 339 def WriteMolTautomers(Writer, Mol, MolNum, Compute2DCoords, TautomerMols): 340 """Write out tautomers of a molecule.""" 341 342 if TautomerMols is None: 343 return 344 345 MolName = RDKitUtil.GetMolName(Mol, MolNum) 346 347 for Index, TautomerMol in enumerate(TautomerMols): 348 SetupTautomerMolName(TautomerMol, MolName, (Index + 1)) 349 350 if Compute2DCoords: 351 AllChem.Compute2DCoords(Mol) 352 353 Writer.write(TautomerMol) 354 355 def SetupTautomerMolName(Mol, MolName, TautomerCount): 356 """Set tautomer mol name.""" 357 358 TautomerName = "%s_Taut%d" % (MolName, TautomerCount) 359 Mol.SetProp("_Name", TautomerName) 360 361 def SetupTautomerEnumerator(): 362 """Setup tautomer enumerator. """ 363 364 TautomerParams = SetupTautomerizationParameters() 365 366 return rdMolStandardize.TautomerEnumerator(TautomerParams) 367 368 def SetupTautomerizationParameters(): 369 """Setup tautomerization parameters for RDKit using cleanup parameters.""" 370 371 Params = rdMolStandardize.CleanupParameters() 372 TautomerizationParams = OptionsInfo["TautomerizationParams"] 373 374 if TautomerizationParams["TautomerTransformsFile"] is not None: 375 Params.tautomerTransformsFile = TautomerizationParams["TautomerTransformsFile"] 376 377 Params.maxTautomers = TautomerizationParams["MaxTautomers"] 378 Params.maxTransforms = TautomerizationParams["MaxTransforms"] 379 Params.tautomerRemoveBondStereo = TautomerizationParams["TautomerRemoveBondStereo"] 380 Params.tautomerRemoveIsotopicHs = TautomerizationParams["TautomerRemoveIsotopicHs"] 381 Params.tautomerRemoveSp3Stereo = TautomerizationParams["TautomerRemoveSp3Stereo"] 382 Params.tautomerReassignStereo = TautomerizationParams["TautomerReassignStereo"] 383 384 return Params 385 386 def SetupMoleculeWriter(): 387 """Setup a molecule writer.""" 388 389 Writer = None 390 391 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 392 if Writer is None: 393 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 394 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"]) 395 396 return Writer 397 398 def ProcessTautomerizationParameters(): 399 """Process tautomerizationparameters. """ 400 401 ParamsDefaultInfo = {"TautomerTransformsFile": ["file", None], "MaxTautomers": ["int", 1000], "MaxTransforms": ["int", 1000], "TautomerRemoveBondStereo": ["bool", True], "TautomerRemoveIsotopicHs": ["bool", True], "TautomerRemoveSp3Stereo": ["bool", True], "TautomerReassignStereo": ["bool", True]} 402 403 OptionsInfo["TautomerizationParams"] = MiscUtil.ProcessOptionNameValuePairParameters("--tautomerizationParams", Options["--tautomerizationParams"], ParamsDefaultInfo) 404 405 # Validate numerical values... 406 for ParamName in ["MaxTautomers", "MaxTransforms"]: 407 ParamValue = OptionsInfo["TautomerizationParams"][ParamName] 408 if ParamValue <= 0: 409 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"-t, --tautomerizationParams\" option is not a valid value. Supported values: > 0" % (ParamValue, ParamName)) 410 411 def ProcessOptions(): 412 """Process and validate command line arguments and options.""" 413 414 MiscUtil.PrintInfo("Processing options...") 415 416 # Validate options... 417 ValidateOptions() 418 419 OptionsInfo["Infile"] = Options["--infile"] 420 ParamsDefaultInfoOverride = {'RemoveHydrogens': False} 421 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"], ParamsDefaultInfo = ParamsDefaultInfoOverride) 422 423 OptionsInfo["Outfile"] = Options["--outfile"] 424 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 425 426 OptionsInfo["Overwrite"] = Options["--overwrite"] 427 428 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 429 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 430 431 OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False 432 433 OptionsInfo["ScoreTautomers"] = True if re.match("^yes$", Options["--scoreTautomers"], re.I) else False 434 OptionsInfo["SortTautomers"] = True if re.match("^yes$", Options["--sortTautomers"], re.I) else False 435 436 ProcessTautomerizationParameters() 437 438 def RetrieveOptions(): 439 """Retrieve command line arguments and options.""" 440 441 # Get options... 442 global Options 443 Options = docopt(_docoptUsage_) 444 445 # Set current working directory to the specified directory... 446 WorkingDir = Options["--workingdir"] 447 if WorkingDir: 448 os.chdir(WorkingDir) 449 450 # Handle examples option... 451 if "--examples" in Options and Options["--examples"]: 452 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 453 sys.exit(0) 454 455 def ValidateOptions(): 456 """Validate option values.""" 457 458 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 459 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv") 460 461 if Options["--outfile"]: 462 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi") 463 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 464 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 465 466 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 467 MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no") 468 469 MiscUtil.ValidateOptionTextValue("--scoreTautomers", Options["--scoreTautomers"], "yes no") 470 MiscUtil.ValidateOptionTextValue("--sortTautomers", Options["--sortTautomers"], "yes no") 471 472 # Setup a usage string for docopt... 473 _docoptUsage_ = """ 474 RDKitEnumerateTautomers.py - Enumerate tautomers of molecules 475 476 Usage: 477 RDKitEnumerateTautomers.py [--infileParams <Name,Value,...>] [--mp <yes or no>] [--mpParams <Name,Value,...>] 478 [--outfileParams <Name,Value,...> ] [--overwrite] [--quiet <yes or no>] [--scoreTautomers <yes or no>] 479 [--sortTautomers <yes or no>] [--tautomerizationParams <Name,Value,...>] [-w <dir>] -i <infile> -o <outfile> 480 RDKitEnumerateTautomers.py -h | --help | -e | --examples 481 482 Description: 483 Enumerate tautomers for molecules and write them out to an output file. 484 The tautomer enumerator generates both protomers and valence tautomers. You 485 may optionally calculate tautomer scores and sort tautomers by SMILES string. The 486 canonical tautomer is placed at the top during sorting. 487 488 The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt) 489 490 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi) 491 492 Options: 493 -e, --examples 494 Print examples. 495 -h, --help 496 Print this help message. 497 -i, --infile <infile> 498 Input file name. 499 --infileParams <Name,Value,...> [default: auto] 500 A comma delimited list of parameter name and value pairs for reading 501 molecules from files. The supported parameter names for different file 502 formats, along with their default values, are shown below: 503 504 SD, MOL: removeHydrogens,no,sanitize,yes,strictParsing,yes 505 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 506 smilesTitleLine,auto,sanitize,yes 507 508 Possible values for smilesDelimiter: space, comma or tab. 509 --mp <yes or no> [default: no] 510 Use multiprocessing. 511 512 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 513 function employing lazy RDKit data iterable. This allows processing of 514 arbitrary large data sets without any additional requirements memory. 515 516 All input data may be optionally loaded into memory by mp.Pool.map() 517 before starting worker processes in a process pool by setting the value 518 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 519 520 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 521 data mode may adversely impact the performance. The '--mpParams' section 522 provides additional information to tune the value of 'chunkSize'. 523 --mpParams <Name,Value,...> [default: auto] 524 A comma delimited list of parameter name and value pairs to configure 525 multiprocessing. 526 527 The supported parameter names along with their default and possible 528 values are shown below: 529 530 chunkSize, auto 531 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 532 numProcesses, auto [ Default: mp.cpu_count() ] 533 534 These parameters are used by the following functions to configure and 535 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 536 mp.Pool.imap(). 537 538 The chunkSize determines chunks of input data passed to each worker 539 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 540 The default value of chunkSize is dependent on the value of 'inputDataMode'. 541 542 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 543 automatically converts RDKit data iterable into a list, loads all data into 544 memory, and calculates the default chunkSize using the following method 545 as shown in its code: 546 547 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 548 if extra: chunkSize += 1 549 550 For example, the default chunkSize will be 7 for a pool of 4 worker processes 551 and 100 data items. 552 553 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 554 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 555 data into memory. Consequently, the size of input data is not known a priori. 556 It's not possible to estimate an optimal value for the chunkSize. The default 557 chunkSize is set to 1. 558 559 The default value for the chunkSize during 'Lazy' data mode may adversely 560 impact the performance due to the overhead associated with exchanging 561 small chunks of data. It is generally a good idea to explicitly set chunkSize to 562 a larger value during 'Lazy' input data mode, based on the size of your input 563 data and number of processes in the process pool. 564 565 The mp.Pool.map() function waits for all worker processes to process all 566 the data and return the results. The mp.Pool.imap() function, however, 567 returns the the results obtained from worker processes as soon as the 568 results become available for specified chunks of data. 569 570 The order of data in the results returned by both mp.Pool.map() and 571 mp.Pool.imap() functions always corresponds to the input data. 572 -o, --outfile <outfile> 573 Output file name. 574 --outfileParams <Name,Value,...> [default: auto] 575 A comma delimited list of parameter name and value pairs for writing 576 molecules to files. The supported parameter names for different file 577 formats, along with their default values, are shown below: 578 579 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 580 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 581 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no 582 583 Default value for compute2DCoords: yes for SMILES input file; no for all other 584 file types. 585 --overwrite 586 Overwrite existing files. 587 -q, --quiet <yes or no> [default: no] 588 Use quiet mode. The warning and information messages will not be printed. 589 --scoreTautomers <yes or no> [default: no] 590 Calculate and write out tautomer scores [ Ref 159 ]. 591 --sortTautomers <yes or no> [default: no] 592 Sort tatutomers of a molecule by SMILES string and place canonical tautomer 593 at the top of the list. 594 -t, --tautomerizationParams <Name,Value,...> [default: auto] 595 A comma delimited list of parameter name and value pairs for enumerating 596 tautomers of molecules. The supported parameter names along with their 597 default values are shown below: 598 599 tautomerTransformsFile,none, 600 maxTautomers,1000,maxTransforms,1000, 601 tautomerRemoveBondStereo,yes,tautomerRemoveIsotopicHs,yes 602 tautomerRemoveSp3Stereo,yes,tautomerReassignStereo,yes 603 604 A brief description of the tatutomerization parameters, taken from RDKit 605 documentation, is as follows: 606 607 tautomerTransformsFile - File containing tautomer transformations 608 609 maxTautomers - Maximum number of tautomers to generate 610 maxTransforms - Maximum number of transforms to apply during 611 tautomer enumeration 612 tautomerRemoveBondStereo - Remove stereochemistry from double bonds 613 involved in tautomerism 614 tautomerRemoveIsotopicHs: Remove isotopic Hs from centers involved in tautomerism 615 tautomerRemoveSp3Stereo - Remove stereochemistry from sp3 centers 616 involved in tautomerism 617 tautomerReassignStereo - AssignStereochemistry on all generated tautomers 618 619 The default value is set to none for the 'tautomerTransformsFile' parameter. The 620 script relies on RDKit to automatically load appropriate tautomer transformations 621 from a set of internal catalog. 622 623 The contents of transformation file are described below: 624 625 tautomerTransformsFile - File containing tautomer transformations 626 627 // Name SMARTS Bonds Charges 628 1,3 (thio)keto/enol f [CX4!H0]-[C]=[O,S,Se,Te;X1] 629 1,3 (thio)keto/enol r [O,S,Se,Te;X2!H0]-[C]=[C] 630 1,5 (thio)keto/enol f [CX4,NX3;!H0]-[C]=[C][CH0]=[O,S,Se,Te;X1] 631 ... ... ... 632 633 -w, --workingdir <dir> 634 Location of working directory which defaults to the current directory. 635 636 Examples: 637 To enumerate tautomers of molecules in a SMILES file and write out a SMILES 638 file, type: 639 640 % RDKitEnumerateTautomers.py -i Sample.smi -o SampleOut.smi 641 642 To enumerate tautomers of molecules in a SD file, calculate tautomer scores, 643 sort tautomers, and write out a SD file, type: 644 645 % RDKitEnumerateTautomers.py --scoreTautomers yes --sortTautomers yes 646 -i Sample.sdf -o SampleOut.sdf 647 648 To enumerate tautomers of molecules in a SD fie , calculate tautomer 649 scores, sort tautomers, and write out a SMILES file, type: 650 651 % RDKitEnumerateTautomers.py --scoreTautomers yes --sortTautomers yes 652 --outfileParams "smilesMolProps,yes" -i Sample.smi -o SampleOut.smi 653 654 To enumerate tautomers of molecules in a SD file, performing enumeration in 655 multiprocessing mode on all available CPUs without loading all data into 656 memory, and write out a SD file, type: 657 658 % RDKitEnumerateTautomers.py --mp yes -i Sample.sdf -o SampleOut.sdf 659 660 To enumerate tautomers of molecules in a SD file, performing enumeration in 661 multiprocessing mode on specific number of CPUs and chunk size without loading 662 all data into memory, and write out a SD file, type: 663 664 % RDKitEnumerateTautomers.py --mp yes --mpParams "inputDataMode,Lazy, 665 numProcesses,4,chunkSize,8" -i Sample.sdf -o SampleOut.sdf 666 667 To enumerate tautomers of molecules in a SD file using specific values of 668 parameters to contol the enumeration behavior, and write out a SD file, type: 669 670 % RDKitEnumerateTautomers.py -t "maxTautomers,1000,maxTransforms,1000, 671 tautomerRemoveBondStereo,yes,tautomerRemoveIsotopicHs,yes, 672 tautomerRemoveSp3Stereo,yes,tautomerReassignStereo,yes" 673 --scoreTautomers yes --sortTautomers yes -i Sample.sdf -o SampleOut.sdf 674 675 To enumerate tautomers for molecules in a CSV SMILES file, SMILES strings in column 1, 676 name in column 2, and generate output SD file, type: 677 678 % RDKitEnumerateTautomers.py --infileParams 679 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 680 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 681 -i SampleSMILES.csv -o SampleOut.sdf 682 683 Author: 684 Manish Sud(msud@san.rr.com) 685 686 See also: 687 RDKitConvertFileFormat.py, RDKitRemoveDuplicateMolecules.py, 688 RDKitRemoveInvalidMolecules.py, RDKitRemoveSalts.py, 689 RDKitSearchFunctionalGroups.py, RDKitSearchSMARTS.py, 690 RDKitStandardizeMolecules.py 691 692 Copyright: 693 Copyright (C) 2024 Manish Sud. All rights reserved. 694 695 The functionality available in this script is implemented using RDKit, an 696 open source toolkit for cheminformatics developed by Greg Landrum. 697 698 This file is part of MayaChemTools. 699 700 MayaChemTools is free software; you can redistribute it and/or modify it under 701 the terms of the GNU Lesser General Public License as published by the Free 702 Software Foundation; either version 3 of the License, or (at your option) any 703 later version. 704 705 """ 706 707 if __name__ == "__main__": 708 main()