1 #!/bin/env python 2 # 3 # File: RDKitStandardizeMolecules.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem.MolStandardize import rdMolStandardize 43 from rdkit.Chem import AllChem 44 except ImportError as ErrMsg: 45 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 46 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 47 sys.exit(1) 48 49 # MayaChemTools imports... 
try:
    from docopt import docopt
    import MiscUtil
    import RDKitUtil
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
    sys.exit(1)

# Name of the script as invoked; used in the start/done messages...
ScriptName = os.path.basename(sys.argv[0])

# Raw docopt options, filled in by RetrieveOptions() and, in worker processes,
# by InitializeWorkerProcess()...
Options = {}

# Processed options along with RDKit standardize objects, filled in by
# ProcessOptions() and SetupStandardize()...
OptionsInfo = {}

def main():
    """Start execution of the script.

    Retrieves and processes command line options, standardizes molecules,
    and reports the elapsed time.

    """

    MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))

    (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()

    # Retrieve command line arguments and options...
    RetrieveOptions()

    # Process and validate command line arguments and options...
    ProcessOptions()

    # Perform actions required by the script...
    StandardizeMolecules()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
    MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))

def StandardizeMolecules():
    """Standardize molecules.

    Reads molecules from the input file, standardizes them using a single
    process or multiple processes, writes them out unless count mode is
    active, and prints summary counts.

    """

    # Setup a molecule reader...
    MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
    Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])

    # Set up a molecule writer. It is None during count mode...
    Writer = SetupMoleculeWriter()

    MolCount, ValidMolCount, StandardizationFailedCount = ProcessMolecules(Mols, Writer)

    if Writer is not None:
        Writer.close()

    MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
    MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
    MiscUtil.PrintInfo("Number of molecules failed during standardization: %d" % StandardizationFailedCount)
    # Ignored molecules: invalid/empty ones plus those that failed standardization...
    MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + StandardizationFailedCount))

    MiscUtil.PrintInfo("\nNumber of standardized molecules: %d" % (ValidMolCount - StandardizationFailedCount))

def ProcessMolecules(Mols, Writer):
    """Process and standardize molecules.

    Arguments:
        Mols (iterable): Molecules returned by RDKitUtil.ReadMolecules().
        Writer (object): Molecule writer or None during count mode.

    Returns:
        tuple: (MolCount, ValidMolCount, StandardizationFailedCount).

    """

    if OptionsInfo["MPMode"]:
        return ProcessMoleculesUsingMultipleProcesses(Mols, Writer)
    else:
        return ProcessMoleculesUsingSingleProcess(Mols, Writer)

def ProcessMoleculesUsingSingleProcess(Mols, Writer):
    """Process and standardize molecules using a single process.

    Arguments:
        Mols (iterable): Molecules to standardize. None entries are counted
            but skipped.
        Writer (object): Molecule writer or None during count mode.

    Returns:
        tuple: (MolCount, ValidMolCount, StandardizationFailedCount).

    """

    MiscUtil.PrintInfo("\nStandardizing molecules...")

    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
    SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]

    # Set up standardize...
    SetupStandardize()

    (MolCount, ValidMolCount, StandardizationFailedCount) = [0] * 3
    FirstMol = True
    for Mol in Mols:
        MolCount += 1

        if Mol is None:
            continue

        if RDKitUtil.IsMolEmpty(Mol):
            if not OptionsInfo["QuietMode"]:
                MolName = RDKitUtil.GetMolName(Mol, MolCount)
                MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
            continue

        ValidMolCount += 1
        if FirstMol:
            FirstMol = False
            # No writer exists during count mode, so there is nothing to configure...
            if SetSMILESMolProps and Writer is not None:
                RDKitUtil.SetWriterMolProps(Writer, Mol)

        StandardizedMol, StandardizationStatus = PerformStandardization(Mol, MolCount)
        if not StandardizationStatus:
            if not OptionsInfo["QuietMode"]:
                MolName = RDKitUtil.GetMolName(Mol, MolCount)
                MiscUtil.PrintWarning("Failed to standardize molecule %s" % MolName)

            StandardizationFailedCount += 1
            continue

        WriteMolecule(Writer, StandardizedMol, Compute2DCoords)

    return (MolCount, ValidMolCount, StandardizationFailedCount)

def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
    """Process and standardize molecules using multiprocessing.

    Molecules are encoded as base64 strings, standardized by WorkerProcess()
    in a process pool, and decoded and written out in this process.

    Arguments:
        Mols (iterable): Molecules to standardize.
        Writer (object): Molecule writer or None during count mode.

    Returns:
        tuple: (MolCount, ValidMolCount, StandardizationFailedCount).

    """

    MiscUtil.PrintInfo("\nStandardize molecules using multiprocessing...")

    MPParams = OptionsInfo["MPParams"]
    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]

    # Setup data for initializing a worker process...
    InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))

    # Setup a encoded mols data iterable for a worker process by pickling only public
    # and private molecule properties...
    WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)

    # Setup process pool along with data initialization for each process...
    MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
    MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))

    ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)

    # Start processing...
    if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
        Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
    elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
        Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
    else:
        # NOTE(review): presumably MiscUtil.PrintError terminates the script; otherwise
        # Results is unbound below - confirm against MiscUtil.
        MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))

    SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]

    (MolCount, ValidMolCount, StandardizationFailedCount) = [0] * 3
    FirstMol = True
    for Result in Results:
        MolCount += 1
        MolIndex, EncodedMol, EncodedStandardizedMol, StandardizationStatus = Result

        # Workers return None for missing or empty input molecules...
        if EncodedMol is None:
            continue
        ValidMolCount += 1

        Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
        StandardizedMol = RDKitUtil.MolFromBase64EncodedMolString(EncodedStandardizedMol)

        if FirstMol:
            FirstMol = False
            # No writer exists during count mode, so there is nothing to configure...
            if SetSMILESMolProps and Writer is not None:
                RDKitUtil.SetWriterMolProps(Writer, Mol)

        if not StandardizationStatus:
            if not OptionsInfo["QuietMode"]:
                MolName = RDKitUtil.GetMolName(Mol, MolCount)
                MiscUtil.PrintWarning("Failed to standardize molecule %s" % MolName)

            StandardizationFailedCount += 1
            continue

        WriteMolecule(Writer, StandardizedMol, Compute2DCoords)

    # All results have been consumed: release the worker processes explicitly
    # instead of relying on garbage collection of the pool...
    ProcessPool.close()
    ProcessPool.join()

    return (MolCount, ValidMolCount, StandardizationFailedCount)

def InitializeWorkerProcess(*EncodedArgs):
    """Initialize data for a worker process.

    Arguments:
        EncodedArgs (tuple): Base64 encoded Options and OptionsInfo, in that
            order, as set up by ProcessMoleculesUsingMultipleProcesses().

    """

    global Options, OptionsInfo

    MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())

    # Decode Options and OptionInfo...
    Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
    OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])

    # Set up standardize objects in this process...
    SetupStandardize()

def WorkerProcess(EncodedMolInfo):
    """Process data for a worker process.

    Arguments:
        EncodedMolInfo (tuple): (MolIndex, EncodedMol) where EncodedMol is a
            base64 encoded molecule string or None.

    Returns:
        list: [MolIndex, EncodedMol, EncodedStandardizedMol, StandardizationStatus].
            EncodedMol and EncodedStandardizedMol are None and the status is
            False for a missing or empty input molecule.

    """

    MolIndex, EncodedMol = EncodedMolInfo

    if EncodedMol is None:
        return [MolIndex, None, None, False]

    Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
    if RDKitUtil.IsMolEmpty(Mol):
        if not OptionsInfo["QuietMode"]:
            MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
            MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
        return [MolIndex, None, None, False]

    StandardizedMol, StandardizationStatus = PerformStandardization(Mol, (MolIndex + 1))
    EncodedStandardizedMol = RDKitUtil.MolToBase64EncodedMolString(StandardizedMol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.AtomProps | Chem.PropertyPickleOptions.BondProps | Chem.PropertyPickleOptions.PrivateProps)

    return [MolIndex, EncodedMol, EncodedStandardizedMol, StandardizationStatus]

def PerformStandardization(Mol, MolNum):
    """Perform standardization and return a standardized mol along with the status of
    the standardization.

    The steps enabled in OptionsInfo["MethodologyParams"] are executed in the
    following order: cleanup, remove fragments, neutralize, and canonicalize
    tautomer.

    Arguments:
        Mol (object): RDKit molecule object.
        MolNum (int): Molecule number used to name the molecule in warnings.

    Returns:
        tuple: (StandardizedMol, Status). StandardizedMol is None and Status
            is False when any step raises an exception.

    """

    # Track molname for its restoration after the standardization. RDKit standardization
    # functions might mangle molname for molecules containing disconnected components...
    MolName = Mol.GetProp("_Name") if Mol.HasProp("_Name") else None

    StandardizedMol = Mol
    try:
        # Step 1: Cleanup...
        if OptionsInfo["MethodologyParams"]["Cleanup"]:
            StandardizedMol = CleanupMolecule(StandardizedMol)

        # Step2: Get largest fragment...
        if OptionsInfo["MethodologyParams"]["RemoveFragments"]:
            StandardizedMol = ChooseLargestMoleculeFragment(StandardizedMol)

        # Step3: Neutralize...
        if OptionsInfo["MethodologyParams"]["Neutralize"]:
            StandardizedMol = NeutralizeMolecule(StandardizedMol)

        # Step4: Canonicalize tautomer...
        if OptionsInfo["MethodologyParams"]["CanonicalizeTautomer"]:
            StandardizedMol = CanonicalizeMoleculeTautomer(StandardizedMol)

        Status = True
    except Exception as ErrMsg:
        # A failure for one molecule is reported to the caller instead of
        # stopping the whole run...
        StandardizedMol = None
        if not OptionsInfo["QuietMode"]:
            MiscUtil.PrintWarning("Failed to standardize molecule %s: %s" % (RDKitUtil.GetMolName(Mol, MolNum), ErrMsg))
        Status = False

    # Restore molname...
    if MolName is not None:
        if StandardizedMol is not None:
            StandardizedMol.SetProp("_Name", MolName)

    return (StandardizedMol, Status)

def CleanupMolecule(Mol):
    """Clean up molecule.

    Applies the cleanup actions enabled in OptionsInfo["StandardizeParams"]:
    remove hydrogens, disconnect metals, normalize, reionize, and assign
    stereochemistry, in that order.

    Arguments:
        Mol (object): RDKit molecule object.

    Returns:
        object: Cleaned up RDKit molecule object.

    """

    if OptionsInfo["StandardizeParams"]["CleanupRemoveHydrogens"]:
        Mol = Chem.RemoveHs(Mol)

    if OptionsInfo["StandardizeParams"]["CleanupDisconnectMetals"]:
        # Disconnect metal atoms that are defined as covalently bonded to non-metals...
        Mol = OptionsInfo["StandardizeObjects"]["MetalDisconnector"].Disconnect(Mol)

    if OptionsInfo["StandardizeParams"]["CleanupNormalize"]:
        # Apply normalization transforms to correct functional groups and recombine charges...
        Mol = rdMolStandardize.Normalize(Mol, OptionsInfo["CleanupParams"])

    if OptionsInfo["StandardizeParams"]["CleanupReionize"]:
        # Ensure the strongest acid groups ionize first in partially ionized molecules...
        Mol = rdMolStandardize.Reionize(Mol, OptionsInfo["CleanupParams"])

    if OptionsInfo["StandardizeParams"]["CleanupAssignStereo"]:
        # Assign stereochemistry
        Chem.AssignStereochemistry(Mol, force=OptionsInfo["StandardizeParams"]["CleanupAssignStereoForce"], cleanIt=OptionsInfo["StandardizeParams"]["CleanupAssignStereoCleanIt"])

    Mol.UpdatePropertyCache()

    return Mol

def ChooseLargestMoleculeFragment(Mol):
    """Choose largest molecule fragment.

    Arguments:
        Mol (object): RDKit molecule object.

    Returns:
        object: Result of calling choose() on the LargestFragmentChooser object
            set up by SetupStandardize().

    """

    return OptionsInfo["StandardizeObjects"]["LargestFragmentChooser"].choose(Mol)

def NeutralizeMolecule(Mol):
    """Neutralize molecule.

    Arguments:
        Mol (object): RDKit molecule object.

    Returns:
        object: Result of calling uncharge() on the Uncharger object set up by
            SetupStandardize().

    """

    return OptionsInfo["StandardizeObjects"]["Uncharger"].uncharge(Mol)

def CanonicalizeMoleculeTautomer(Mol):
    """Canonicalize molecule tautomer.

    Arguments:
        Mol (object): RDKit molecule object.

    Returns:
        object: Result of calling Canonicalize() on the TautomerEnumerator
            object set up by SetupStandardize().

    """

    return OptionsInfo["StandardizeObjects"]["TautomerEnumerator"].Canonicalize(Mol)

def SetupStandardize():
    """Setup RDKit standardize objects to perform standardization.

    Stores clean up parameters in OptionsInfo["CleanupParams"] and creates
    only those RDKit objects in OptionsInfo["StandardizeObjects"] that are
    needed by the enabled methodology steps.

    """

    OptionsInfo["StandardizeObjects"] = {}

    OptionsInfo["CleanupParams"] = SetupStandardizeCleanupParameters()

    if OptionsInfo["MethodologyParams"]["Cleanup"]:
        if OptionsInfo["StandardizeParams"]["CleanupDisconnectMetals"]:
            OptionsInfo["StandardizeObjects"]["MetalDisconnector"] = rdMolStandardize.MetalDisconnector()

    if OptionsInfo["MethodologyParams"]["RemoveFragments"]:
        OptionsInfo["StandardizeObjects"]["LargestFragmentChooser"] = rdMolStandardize.LargestFragmentChooser(OptionsInfo["CleanupParams"])

    if OptionsInfo["MethodologyParams"]["Neutralize"]:
        OptionsInfo["StandardizeObjects"]["Uncharger"] = rdMolStandardize.Uncharger(OptionsInfo["CleanupParams"].doCanonical)

    if OptionsInfo["MethodologyParams"]["CanonicalizeTautomer"]:
        OptionsInfo["StandardizeObjects"]["TautomerEnumerator"] = rdMolStandardize.TautomerEnumerator(OptionsInfo["CleanupParams"])

def SetupStandardizeCleanupParameters():
    """Setup standardize clean up parameters for RDKit.

    Returns:
        object: rdMolStandardize.CleanupParameters object populated from
            OptionsInfo["StandardizeParams"].

    """

    CleanupParams = rdMolStandardize.CleanupParameters()
    StandardizeParams = OptionsInfo["StandardizeParams"]

    # File parameters are set only when specified; their default is None in
    # ProcessStandardizationParameters()...
    if StandardizeParams["AcidBaseFile"] is not None:
        CleanupParams.acidbaseFile = StandardizeParams["AcidBaseFile"]
    if StandardizeParams["FragmentFile"] is not None:
        # The fragment file must go to fragmentFile. Assigning it to acidbaseFile
        # would overwrite any acid base file and leave fragmentFile unset...
        CleanupParams.fragmentFile = StandardizeParams["FragmentFile"]
    if StandardizeParams["NormalizationsFile"] is not None:
        CleanupParams.normalizationsFile = StandardizeParams["NormalizationsFile"]
    if StandardizeParams["TautomerTransformsFile"] is not None:
        CleanupParams.tautomerTransformsFile = StandardizeParams["TautomerTransformsFile"]

    CleanupParams.maxRestarts = StandardizeParams["CleanupNormalizeMaxRestarts"]

    CleanupParams.doCanonical = StandardizeParams["DoCanonical"]

    CleanupParams.largestFragmentChooserUseAtomCount = StandardizeParams["LargestFragmentChooserUseAtomCount"]
    CleanupParams.largestFragmentChooserCountHeavyAtomsOnly = StandardizeParams["LargestFragmentChooserCountHeavyAtomsOnly"]

    CleanupParams.preferOrganic = StandardizeParams["PreferOrganic"]

    CleanupParams.maxTautomers = StandardizeParams["MaxTautomers"]
    CleanupParams.maxTransforms = StandardizeParams["MaxTransforms"]
    CleanupParams.tautomerRemoveBondStereo = StandardizeParams["TautomerRemoveBondStereo"]
    CleanupParams.tautomerRemoveIsotopicHs = StandardizeParams["TautomerRemoveIsotopicHs"]
    CleanupParams.tautomerRemoveSp3Stereo = StandardizeParams["TautomerRemoveSp3Stereo"]
    CleanupParams.tautomerReassignStereo = StandardizeParams["TautomerReassignStereo"]

    return CleanupParams

def WriteMolecule(Writer, Mol, Compute2DCoords):
    """Write out molecule.

    Nothing is written during count mode.

    Arguments:
        Writer (object): Molecule writer.
        Mol (object): RDKit molecule object.
        Compute2DCoords (bool): Compute 2D coordinates before writing.

    """

    if OptionsInfo["CountMode"]:
        return

    if Compute2DCoords:
        AllChem.Compute2DCoords(Mol)

    Writer.write(Mol)

def SetupMoleculeWriter():
    """Setup a molecule writer.

    Returns:
        object: Molecule writer for OptionsInfo["Outfile"] or None during
            count mode.

    """

    Writer = None
    if OptionsInfo["CountMode"]:
        return Writer

    Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
    MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])

    return Writer

def ProcessMethodologyParameters():
    """Process methodology parameters.

    Stores the processed "--methodologyParams" values, with all four steps
    enabled by default, in OptionsInfo["MethodologyParams"].

    """

    ParamsDefaultInfo = {"Cleanup": ["bool", True], "RemoveFragments": ["bool", True], "Neutralize": ["bool", True], "CanonicalizeTautomer": ["bool", True]}
    OptionsInfo["MethodologyParams"] = MiscUtil.ProcessOptionNameValuePairParameters("--methodologyParams", Options["--methodologyParams"], ParamsDefaultInfo)

def ProcessStandardizationParameters():
    """Process standardization parameters.

    Stores the processed "--standardizeParams" values in
    OptionsInfo["StandardizeParams"] and reports an error for any
    non-positive numerical value.

    """

    # Parameter name => [type, default value]...
    ParamsDefaultInfo = {
        "AcidBaseFile": ["file", None],
        "FragmentFile": ["file", None],
        "NormalizationsFile": ["file", None],
        "TautomerTransformsFile": ["file", None],
        "CleanupRemoveHydrogens": ["bool", True],
        "CleanupDisconnectMetals": ["bool", True],
        "CleanupNormalize": ["bool", True],
        "CleanupNormalizeMaxRestarts": ["int", 200],
        "CleanupReionize": ["bool", True],
        "CleanupAssignStereo": ["bool", True],
        "CleanupAssignStereoCleanIt": ["bool", True],
        "CleanupAssignStereoForce": ["bool", True],
        "DoCanonical": ["bool", True],
        "LargestFragmentChooserUseAtomCount": ["bool", True],
        "LargestFragmentChooserCountHeavyAtomsOnly": ["bool", False],
        "PreferOrganic": ["bool", False],
        "MaxTautomers": ["int", 1000],
        "MaxTransforms": ["int", 1000],
        "TautomerRemoveBondStereo": ["bool", True],
        "TautomerRemoveIsotopicHs": ["bool", True],
        "TautomerRemoveSp3Stereo": ["bool", True],
        "TautomerReassignStereo": ["bool", True],
    }

    OptionsInfo["StandardizeParams"] = MiscUtil.ProcessOptionNameValuePairParameters("--standardizeParams", Options["--standardizeParams"], ParamsDefaultInfo)

    # Validate numerical values...
    for ParamName in ["CleanupNormalizeMaxRestarts", "MaxTautomers", "MaxTransforms"]:
        ParamValue = OptionsInfo["StandardizeParams"][ParamName]
        if ParamValue <= 0:
            MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"-s, --standardizeParams\" option is not a valid value. Supported values: > 0" % (ParamValue, ParamName))

def ProcessOptions():
    """Process and validate command line arguments and options.

    Validates the docopt options and stores their processed values in
    OptionsInfo.

    """

    MiscUtil.PrintInfo("Processing options...")

    # Validate options...
    ValidateOptions()

    OptionsInfo["Infile"] = Options["--infile"]
    ParamsDefaultInfoOverride = {'RemoveHydrogens': False}
    OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"], ParamsDefaultInfo = ParamsDefaultInfoOverride)

    OptionsInfo["Outfile"] = Options["--outfile"]
    OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])

    OptionsInfo["Overwrite"] = Options["--overwrite"]

    OptionsInfo["CountMode"] = False
    if re.match("^count$", Options["--mode"], re.I):
        OptionsInfo["CountMode"] = True

    OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
    OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])

    OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False

    ProcessMethodologyParameters()
    ProcessStandardizationParameters()

def RetrieveOptions():
    """Retrieve command line arguments and options.

    Parses the command line using docopt, changes to the working directory
    when specified, and prints examples and exits for the examples option.

    """

    # Get options...
    global Options
    Options = docopt(_docoptUsage_)

    # Set current working directory to the specified directory...
    WorkingDir = Options["--workingdir"]
    if WorkingDir:
        os.chdir(WorkingDir)

    # Handle examples option...
    if "--examples" in Options and Options["--examples"]:
        MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
        sys.exit(0)

def ValidateOptions():
    """Validate option values.

    Checks the input and output file options along with the text values
    specified for the mode, mp, and quiet options.

    """

    MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
    MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv")

    if Options["--outfile"]:
        MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
        MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
        MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])

    MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "standardize count")
    # An output file is optional only during count mode...
    if re.match("^standardize$", Options["--mode"], re.I):
        if not Options["--outfile"]:
            MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"standardize\" value of \"--mode\" option")

    MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
    MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no")

# Setup a usage string for docopt...
495 _docoptUsage_ = """ 496 RDKitStandardizeMolecules.py - Standardize molecules 497 498 Usage: 499 RDKitStandardizeMolecules.py [--infileParams <Name,Value,...>] [--methodologyParams <Name,Value,...>] 500 [--mode <standardize or count>] [--mp <yes or no>] [--mpParams <Name,Value,...>] 501 [--outfileParams <Name,Value,...> ] [--overwrite] [--standardizeParams <Name,Value,...>] 502 [--quiet <yes or no>] [-w <dir>] [-o <outfile>] -i <infile> 503 RDKitStandardizeMolecules.py -h | --help | -e | --examples 504 505 Description: 506 Standardize molecules and write them out to an output file or simply count 507 the number of molecules to be standardized. The standardization methodology 508 consists of the following 4 steps executed in a sequential manner: 509 510 1. Cleanup molecules 511 2. Keep largest fragment 512 3. Neutralize molecules 513 4. Select canonical tautomer 514 515 The molecules are cleaned up by performing the following actions: 516 517 1. Remove hydrogens 518 2. Disconnect metal atoms - Disconnect metal atoms covalently bonded 519 to non-metals 520 3. Normalize - Normalize functional groups and recombine charges 521 4. Reionize - Ionize strongest acid groups first in partially 522 ionized molecules 523 5. Assign stereochemistry 524 525 You may optionally skip any cleanup action during standardization. 526 527 The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt) 528 529 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi) 530 531 Options: 532 -e, --examples 533 Print examples. 534 -h, --help 535 Print this help message. 536 -i, --infile <infile> 537 Input file name. 538 --infileParams <Name,Value,...> [default: auto] 539 A comma delimited list of parameter name and value pairs for reading 540 molecules from files. 
The supported parameter names for different file 541 formats, along with their default values, are shown below: 542 543 SD, MOL: removeHydrogens,no,sanitize,yes,strictParsing,yes 544 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 545 smilesTitleLine,auto,sanitize,yes 546 547 Possible values for smilesDelimiter: space, comma or tab. 548 -m, --mode <standardize or count> [default: standardize] 549 Specify whether to standardize molecules and write them out or simply 550 count the number of molecules being standardized. 551 --methodologyParams <Name,Value,...> [default: auto] 552 A comma delimited list of parameter name and value pairs to control 553 the execution of different steps in the standardization methodology. The 554 supported parameter names along with their default values are shown 555 below: 556 557 cleanup,yes,removeFragments,yes,neutralize,yes, 558 canonicalizeTautomer,yes 559 560 The standardization methodology consists of the following 4 steps executed 561 in a sequential manner starting from step 1: 562 563 1. cleanup 564 2. removeFragments 565 3. neutralize 566 4. canonicalizeTautomer 567 568 You may optionally skip the execution of any standardization step. 569 570 The step1, cleanup, performs the following actions: 571 572 1. Remove hydrogens 573 2. Disconnect metal atoms - Disconnect metal atoms covalently bonded 574 to non-metals 575 3. Normalize - Normalize functional groups and recombine charges 576 4. Reionize - Ionize strongest acid groups first in partially 577 ionized molecules 578 5. Assign stereochemistry 579 580 You may optionally skip any cleanup action using '-s, --standardize' option. 581 582 The step2, removeFragments, employs rdMolStandardize.FragmentParent() 583 function to keep the largest fragment. 584 585 The step3, neutralize, uses rdMolStandardize.Uncharger().uncharge() 586 function to neutralize molecules by adding/removing hydrogens. 
587 588 The step4, canonicalizeTautomer, relies on Canonicalize() function availabe via 589 rdMolStandardize.TautomerEnumerator() to select a canonical tautomer. 590 --mp <yes or no> [default: no] 591 Use multiprocessing. 592 593 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 594 function employing lazy RDKit data iterable. This allows processing of 595 arbitrary large data sets without any additional requirements memory. 596 597 All input data may be optionally loaded into memory by mp.Pool.map() 598 before starting worker processes in a process pool by setting the value 599 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 600 601 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 602 data mode may adversely impact the performance. The '--mpParams' section 603 provides additional information to tune the value of 'chunkSize'. 604 --mpParams <Name,Value,...> [default: auto] 605 A comma delimited list of parameter name and value pairs to configure 606 multiprocessing. 607 608 The supported parameter names along with their default and possible 609 values are shown below: 610 611 chunkSize, auto 612 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 613 numProcesses, auto [ Default: mp.cpu_count() ] 614 615 These parameters are used by the following functions to configure and 616 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 617 mp.Pool.imap(). 618 619 The chunkSize determines chunks of input data passed to each worker 620 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 621 The default value of chunkSize is dependent on the value of 'inputDataMode'. 
622 623 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 624 automatically converts RDKit data iterable into a list, loads all data into 625 memory, and calculates the default chunkSize using the following method 626 as shown in its code: 627 628 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 629 if extra: chunkSize += 1 630 631 For example, the default chunkSize will be 7 for a pool of 4 worker processes 632 and 100 data items. 633 634 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 635 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 636 data into memory. Consequently, the size of input data is not known a priori. 637 It's not possible to estimate an optimal value for the chunkSize. The default 638 chunkSize is set to 1. 639 640 The default value for the chunkSize during 'Lazy' data mode may adversely 641 impact the performance due to the overhead associated with exchanging 642 small chunks of data. It is generally a good idea to explicitly set chunkSize to 643 a larger value during 'Lazy' input data mode, based on the size of your input 644 data and number of processes in the process pool. 645 646 The mp.Pool.map() function waits for all worker processes to process all 647 the data and return the results. The mp.Pool.imap() function, however, 648 returns the the results obtained from worker processes as soon as the 649 results become available for specified chunks of data. 650 651 The order of data in the results returned by both mp.Pool.map() and 652 mp.Pool.imap() functions always corresponds to the input data. 653 -o, --outfile <outfile> 654 Output file name. 655 --outfileParams <Name,Value,...> [default: auto] 656 A comma delimited list of parameter name and value pairs for writing 657 molecules to files. 
The supported parameter names for different file 658 formats, along with their default values, are shown below: 659 660 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 661 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 662 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no 663 664 Default value for compute2DCoords: yes for SMILES input file; no for all other 665 file types. 666 --overwrite 667 Overwrite existing files. 668 -q, --quiet <yes or no> [default: no] 669 Use quiet mode. The warning and information messages will not be printed. 670 -s, --standardizeParams <Name,Value,...> [default: auto] 671 A comma delimited list of parameter name and value pairs for standardizing 672 molecules. The supported parameter names along with their default values 673 are shown below: 674 675 acidbaseFile,none,fragmentFile,none,normalizationsFile,none, 676 tautomerTransformsFile,none, 677 cleanupRemoveHydrogens,yes,cleanupDisconnectMetals,yes, 678 cleanupNormalize,yes,cleanupNormalizeMaxRestarts,200, 679 cleanupReionize,yes,cleanupAssignStereo,yes, 680 cleanupAssignStereoCleanIt,yes,cleanupAssignStereoForce,yes 681 largestFragmentChooserUseAtomCount,yes, 682 largestFragmentChooserCountHeavyAtomsOnly,no,preferOrganic,no, 683 doCanonical,yes, 684 maxTautomers,1000,maxTransforms,1000, 685 tautomerRemoveBondStereo,yes,tautomerRemoveIsotopicHs,yes 686 tautomerRemoveSp3Stereo,yes,tautomerReassignStereo,yes 687 688 A brief description of the standardization parameters, taken from RDKit 689 documentation, is as follows: 690 691 acidbaseFile - File containing acid and base definitions 692 fragmentFile - File containing fragment definitions 693 normalizationsFile - File conataining normalization transformations 694 tautomerTransformsFile - File containing tautomer transformations 695 696 cleanupRemoveHydrogens - Remove hydrogens druring cleanup 697 cleanupDisconnectMetals - Disconnect metal atoms covalently bonded 698 to non-metals during cleanup 699 
cleanupNormalize - Normalize functional groups and recombine 700 charges during cleanup 701 cleanupNormalizeMaxRestarts - Maximum number of restarts during 702 normalization step of cleanup 703 cleanupReionize -Ionize strongest acid groups first in partially 704 ionized molecules during cleanup 705 cleanupAssignStereo - Assign stererochemistry during cleanup 706 cleanupAssignStereoCleanIt - Clean property _CIPCode during 707 assign stereochemistry 708 cleanupAssignStereoForce - Always perform stereochemistry 709 calculation during assign stereochemistry 710 711 largestFragmentChooserUseAtomCount - Use atom count as main 712 criterion before molecular weight to determine largest fragment 713 in LargestFragmentChooser 714 largestFragmentChooserCountHeavyAtomsOnly - Count only heavy 715 atoms to determine largest fragment in LargestFragmentChooser 716 preferOrganic - Prefer organic fragments over inorganic ones when 717 choosing fragments 718 719 doCanonical - Apply atom-order dependent normalizations in a 720 canonical order during uncharging 721 722 maxTautomers - Maximum number of tautomers to generate 723 maxTransforms - Maximum number of transforms to apply during 724 tautomer enumeration 725 tautomerRemoveBondStereo - Remove stereochemistry from double bonds 726 involved in tautomerism 727 tautomerRemoveIsotopicHs: Remove isotopic Hs from centers involved in tautomerism 728 tautomerRemoveSp3Stereo - Remove stereochemistry from sp3 centers 729 involved in tautomerism 730 tautomerReassignStereo - AssignStereochemistry on all generated tautomers 731 732 The default value is set to none for the following file name parameters: 733 acidbaseFile, fragmentFile, normalizationsFile, and tautomerTransformsFile. 734 The script relies on RDKit to automatically load appropriate acid base and 735 fragment definitions along with normalization and tautomer transformations 736 from a set of internal catalogs. 
737 738 Note: The fragmentFile doesn't appear to be used by the RDKit method 739 rdMolStandardize.FragmentParent() to find the largest fragment. 740 741 The contents of various standardization definitions and transformations files 742 are described below: 743 744 acidbaseFile - File containing acid and base definitions 745 746 // Name Acid Base 747 -OSO3H OS(=O)(=O)[OH] OS(=O)(=O)[O-] 748 -SO3H [!O]S(=O)(=O)[OH] [!O]S(=O)(=O)[O-] 749 -OSO2H O[SD3](=O)[OH] O[SD3](=O)[O-] 750 ... ... ... 751 752 fragmentFile - File containing fragment definitions 753 754 // Name SMARTS 755 hydrogen [H] 756 fluorine [F] 757 chlorine [Cl] 758 ... ... ... 759 760 normalizationsFile - File containing normalization transformations 761 762 // Name SMIRKS 763 Sulfone to S(=O)(=O) [S+2:1]([O-:2])([O-:3])>> 764 [S+0:1](=[O-0:2])(=[O-0:3]) 765 Pyridine oxide to n+O- [n:1]=[O:2]>>[n+:1][O-:2] 766 ... ... ... 767 768 tautomerTransformsFile - File containing tautomer transformations 769 770 // Name SMARTS Bonds Charges 771 1,3 (thio)keto/enol f [CX4!H0]-[C]=[O,S,Se,Te;X1] 772 1,3 (thio)keto/enol r [O,S,Se,Te;X2!H0]-[C]=[C] 773 1,5 (thio)keto/enol f [CX4,NX3;!H0]-[C]=[C][CH0]=[O,S,Se,Te;X1] 774 ... ... ... 775 776 -w, --workingdir <dir> 777 Location of working directory which defaults to the current directory.
778 779 Examples: 780 To standardize molecules in a SMILES file by executing all standardization 781 steps and write out a SMILES file, type: 782 783 % RDKitStandardizeMolecules.py -i Sample.smi -o SampleOut.smi 784 785 To standardize molecules in a SD file by executing all standardization 786 steps, performing standardization in multiprocessing mode on all available 787 CPUs without loading all data into memory, and write out a 788 SD file, type: 789 790 % RDKitStandardizeMolecules.py --mp yes -i Sample.sdf -o SampleOut.sdf 791 792 To standardize molecules in a SMILES file by executing all standardization 793 steps, performing standardization in multiprocessing mode on all available 794 CPUs by loading all data into memory, and write out a 795 SMILES file, type: 796 797 % RDKitStandardizeMolecules.py --mp yes --mpParams "inputDataMode, 798 InMemory" -i Sample.smi -o SampleOut.smi 799 800 To standardize molecules in a SMILES file by executing all standardization 801 steps, performing standardization in multiprocessing mode on specific number 802 of CPUs and chunk size without loading all data into memory, and write out a 803 SMILES file, type: 804 805 % RDKitStandardizeMolecules.py --mp yes --mpParams "inputDataMode,Lazy, 806 numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi 807 808 To count number of molecules to be standardized without generating any 809 output file, type: 810 811 % RDKitStandardizeMolecules.py -m count -i Sample.sdf 812 813 To standardize molecules in a SD file by executing specific standardization 814 steps along with explicit values for various parameters to control the 815 standardization behavior, and write out a SD file, type: 816 817 % RDKitStandardizeMolecules.py --methodologyParams "cleanup,yes, 818 removeFragments,yes,neutralize,yes,canonicalizeTautomer,yes" 819 --standardizeParams "cleanupRemoveHydrogens,yes, 820 cleanupDisconnectMetals,yes,cleanupNormalize,yes, 821
cleanupNormalizeMaxRestarts,200,cleanupReionize,yes, 822 cleanupAssignStereo,yes,largestFragmentChooserUseAtomCount,yes, 823 doCanonical,yes,maxTautomers,1000" 824 -i Sample.sdf -o SampleOut.sdf 825 826 To standardize molecules in a CSV SMILES file, SMILES strings in column 1, 827 name in column 2, and generate output SD file, type: 828 829 % RDKitStandardizeMolecules.py --infileParams 830 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 831 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 832 -i SampleSMILES.csv -o SampleOut.sdf 833 834 Author: 835 Manish Sud(msud@san.rr.com) 836 837 See also: 838 RDKitConvertFileFormat.py, RDKitEnumerateTautomers.py, 839 RDKitRemoveDuplicateMolecules.py, RDKitRemoveInvalidMolecules.py, 840 RDKitRemoveSalts.py, RDKitSearchFunctionalGroups.py, RDKitSearchSMARTS.py 841 842 Copyright: 843 Copyright (C) 2024 Manish Sud. All rights reserved. 844 845 The functionality available in this script is implemented using RDKit, an 846 open source toolkit for cheminformatics developed by Greg Landrum. 847 848 This file is part of MayaChemTools. 849 850 MayaChemTools is free software; you can redistribute it and/or modify it under 851 the terms of the GNU Lesser General Public License as published by the Free 852 Software Foundation; either version 3 of the License, or (at your option) any 853 later version. 854 855 """ 856 857 if __name__ == "__main__": 858 main()