MayaChemTools:Code:RDKitStandardizeMolecules.py

   1 #!/bin/env python
   2 #
   3 # File: RDKitStandardizeMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2025 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem.MolStandardize import rdMolStandardize
  43     from rdkit.Chem import AllChem
  44 except ImportError as ErrMsg:
  45     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  46     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  47     sys.exit(1)
  48 
  49 # MayaChemTools imports...
  50 try:
  51     from docopt import docopt
  52     import MiscUtil
  53     import RDKitUtil
  54 except ImportError as ErrMsg:
  55     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  56     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  57     sys.exit(1)
  58 
# Base name of the running script; used in the start and done messages...
ScriptName = os.path.basename(sys.argv[0])
# Raw command line options; replaced by the docopt result in RetrieveOptions()...
Options = {}
# Processed option values; filled in by ProcessOptions() and SetupStandardize()...
OptionsInfo = {}
  62 
  63 def main():
  64     """Start execution of the script."""
  65     
  66     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  67     
  68     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  69     
  70     # Retrieve command line arguments and options...
  71     RetrieveOptions()
  72     
  73     # Process and validate command line arguments and options...
  74     ProcessOptions()
  75     
  76     # Perform actions required by the script...
  77     StandardizeMolecules()
  78     
  79     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  80     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  81 
  82 def StandardizeMolecules():
  83     """Stanardize molecules."""
  84     
  85     # Setup a molecule reader...
  86     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  87     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  88     
  89     # Set up a molecule writer...
  90     Writer = SetupMoleculeWriter()
  91 
  92     MolCount, ValidMolCount, StandardizationFailedCount = ProcessMolecules(Mols, Writer)
  93 
  94     if Writer is not None:
  95         Writer.close()
  96     
  97     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
  98     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
  99     MiscUtil.PrintInfo("Number of molecules failed during standardization: %d" % StandardizationFailedCount)
 100     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + StandardizationFailedCount))
 101     
 102     MiscUtil.PrintInfo("\nNumber of standardized molecules: %d" % (ValidMolCount - StandardizationFailedCount))
 103 
 104 def ProcessMolecules(Mols, Writer):
 105     """Process and standardize molecules."""
 106 
 107     if OptionsInfo["MPMode"]:
 108         return ProcessMoleculesUsingMultipleProcesses(Mols, Writer)
 109     else:
 110         return ProcessMoleculesUsingSingleProcess(Mols, Writer)
 111 
 112 def ProcessMoleculesUsingSingleProcess(Mols,  Writer):
 113     """Process and standardize molecules using a single process."""
 114 
 115     MiscUtil.PrintInfo("\nStandardizing molecules...")
 116     
 117     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 118     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 119 
 120     # Set up standardize...
 121     SetupStandardize()
 122 
 123     (MolCount, ValidMolCount, StandardizationFailedCount) = [0] * 3
 124     FirstMol = True
 125     for Mol in Mols:
 126         MolCount += 1
 127         
 128         if Mol is None:
 129             continue
 130         
 131         if RDKitUtil.IsMolEmpty(Mol):
 132             if not OptionsInfo["QuietMode"]:
 133                 MolName = RDKitUtil.GetMolName(Mol, MolCount)
 134                 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 135             continue
 136         
 137         ValidMolCount += 1
 138         if FirstMol:
 139             FirstMol = False
 140             if SetSMILESMolProps:
 141                 RDKitUtil.SetWriterMolProps(Writer, Mol)
 142         
 143         StandardizedMol,  StandardizationStatus = PerformStandardization(Mol, MolCount)
 144         if not StandardizationStatus:
 145             if not OptionsInfo["QuietMode"]:
 146                 MolName = RDKitUtil.GetMolName(Mol, MolCount)
 147                 MiscUtil.PrintWarning("Failed to standardize molecule %s" % MolName)
 148             
 149             StandardizationFailedCount += 1
 150             continue
 151         
 152         WriteMolecule(Writer, StandardizedMol, Compute2DCoords)
 153     
 154     return (MolCount, ValidMolCount, StandardizationFailedCount)
 155     
 156 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
 157     """Process and standardize molecules using  multiprocessing."""
 158     
 159     MiscUtil.PrintInfo("\nStandardize molecules using multiprocessing...")
 160     
 161     MPParams = OptionsInfo["MPParams"]
 162     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 163     
 164     # Setup data for initializing a worker process...
 165     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 166 
 167     # Setup a encoded mols data iterable for a worker process by pickling only public
 168     # and private molecule properties...
 169     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 170 
 171     # Setup process pool along with data initialization for each process...
 172     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 173     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 174     
 175     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 176     
 177     # Start processing...
 178     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 179         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 180     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 181         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 182     else:
 183         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 184     
 185     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 186     
 187     (MolCount, ValidMolCount, StandardizationFailedCount) = [0] * 3
 188     FirstMol = True
 189     for Result in Results:
 190         MolCount += 1
 191         MolIndex, EncodedMol, EncodedStandardizedMol, StandardizationStatus = Result
 192         
 193         if EncodedMol is None:
 194             continue
 195         ValidMolCount += 1
 196         
 197         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 198         StandardizedMol = RDKitUtil.MolFromBase64EncodedMolString(EncodedStandardizedMol)
 199         
 200         if FirstMol:
 201             FirstMol = False
 202             if SetSMILESMolProps:
 203                 RDKitUtil.SetWriterMolProps(Writer, Mol)
 204         
 205         if not StandardizationStatus:
 206             if not OptionsInfo["QuietMode"]:
 207                 MolName = RDKitUtil.GetMolName(Mol, MolCount)
 208                 MiscUtil.PrintWarning("Failed to standardize molecule %s" % MolName)
 209             
 210             StandardizationFailedCount += 1
 211             continue
 212         
 213         WriteMolecule(Writer, StandardizedMol, Compute2DCoords)
 214     
 215     return (MolCount, ValidMolCount, StandardizationFailedCount)
 216 
 217 def InitializeWorkerProcess(*EncodedArgs):
 218     """Initialize data for a worker process."""
 219 
 220     global Options, OptionsInfo
 221     
 222     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 223 
 224     # Decode Options and OptionInfo...
 225     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 226     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 227 
 228     # Set up standardize...
 229     SetupStandardize()
 230 
 231 def WorkerProcess(EncodedMolInfo):
 232     """Process data for a worker process."""
 233     
 234     MolIndex, EncodedMol = EncodedMolInfo
 235     
 236     if EncodedMol is None:
 237         return [MolIndex, None, None, False]
 238         
 239     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 240     if RDKitUtil.IsMolEmpty(Mol):
 241         if not OptionsInfo["QuietMode"]:
 242             MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 243             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 244         return [MolIndex, None, None, False]
 245     
 246     StandardizedMol, StandardizationStatus = PerformStandardization(Mol, (MolIndex + 1))
 247     EncodedStandardizedMol = RDKitUtil.MolToBase64EncodedMolString(StandardizedMol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.AtomProps | Chem.PropertyPickleOptions.BondProps | Chem.PropertyPickleOptions.PrivateProps)
 248     
 249     return [MolIndex, EncodedMol, EncodedStandardizedMol, StandardizationStatus]
 250     
 251 def PerformStandardization(Mol, MolNum):
 252     """Perform standardization and return a standardized mol along with the status of
 253     the standardization."""
 254 
 255     # Track molname for its restoration after the standardization. RDKit standardization
 256     # functions might mangle molname for molecules containing disconnected components...
 257     MolName = Mol.GetProp("_Name") if Mol.HasProp("_Name") else None
 258 
 259     StandardizedMol = Mol
 260     try:
 261         # Step 1: Cleanup...
 262         if OptionsInfo["MethodologyParams"]["Cleanup"]:
 263             StandardizedMol = CleanupMolecule(StandardizedMol)
 264         
 265         # Step2: Get largest fragment...
 266         if OptionsInfo["MethodologyParams"]["RemoveFragments"]:
 267             StandardizedMol = ChooseLargestMoleculeFragment(StandardizedMol)
 268         
 269         # Step3: Neutralize...
 270         if OptionsInfo["MethodologyParams"]["Neutralize"]:
 271             StandardizedMol = NeutralizeMolecule(StandardizedMol)
 272         
 273         # Step4: Canonicalize tautomer...
 274         if OptionsInfo["MethodologyParams"]["CanonicalizeTautomer"]:
 275             StandardizedMol = CanonicalizeMoleculeTautomer(StandardizedMol)
 276 
 277         Status = True
 278     except Exception as ErrMsg:
 279         StandardizedMol = None
 280         if not OptionsInfo["QuietMode"]:
 281             MiscUtil.PrintWarning("Failed to standardize molecule %s: %s" % (RDKitUtil.GetMolName(Mol, MolNum), ErrMsg))
 282         Status = False
 283     
 284     # Restore molname...
 285     if MolName is not None:
 286         if StandardizedMol is not None:
 287             StandardizedMol.SetProp("_Name", MolName)
 288     
 289     return (StandardizedMol, Status)
 290 
 291 def CleanupMolecule(Mol):
 292     """Clean up molecule."""
 293     
 294     if OptionsInfo["StandardizeParams"]["CleanupRemoveHydrogens"]:
 295         Mol = Chem.RemoveHs(Mol)
 296         
 297     if OptionsInfo["StandardizeParams"]["CleanupDisconnectMetals"]:
 298         # Disconnect metal atoms that are defined as covalently bonded to non-metals...
 299         Mol = OptionsInfo["StandardizeObjects"]["MetalDisconnector"].Disconnect(Mol)
 300     
 301     if OptionsInfo["StandardizeParams"]["CleanupNormalize"]:
 302         # Apply normalization transforms to correct functional groups and recombine charges...
 303         Mol = rdMolStandardize.Normalize(Mol, OptionsInfo["CleanupParams"])
 304     
 305     if OptionsInfo["StandardizeParams"]["CleanupReionize"]:
 306         # Ensure the strongest acid groups ionize first in partially ionized molecules...
 307         Mol = rdMolStandardize.Reionize(Mol, OptionsInfo["CleanupParams"])
 308     
 309     if OptionsInfo["StandardizeParams"]["CleanupAssignStereo"]:
 310         # Assign stereochemistry
 311         Chem.AssignStereochemistry(Mol, force=OptionsInfo["StandardizeParams"]["CleanupAssignStereoForce"], cleanIt=OptionsInfo["StandardizeParams"]["CleanupAssignStereoCleanIt"])
 312     
 313     Mol.UpdatePropertyCache()
 314     
 315     return Mol
 316 
 317 def ChooseLargestMoleculeFragment(Mol):
 318     """Choose largest molecule fragment. """
 319 
 320     return OptionsInfo["StandardizeObjects"]["LargestFragmentChooser"].choose(Mol)
 321 
 322 def NeutralizeMolecule(Mol):
 323     """Neutralize molecule."""
 324     
 325     return OptionsInfo["StandardizeObjects"]["Uncharger"].uncharge(Mol)
 326 
 327 def CanonicalizeMoleculeTautomer(Mol):
 328     """Canonicalize molecule tautomer."""
 329     
 330     return OptionsInfo["StandardizeObjects"]["TautomerEnumerator"].Canonicalize(Mol)
 331 
 332 def SetupStandardize():
 333     """Setup RDKit standardize objects to perform standardization."""
 334 
 335     OptionsInfo["StandardizeObjects"] = {}
 336     
 337     OptionsInfo["CleanupParams"] = SetupStandardizeCleanupParameters()
 338     
 339     if OptionsInfo["MethodologyParams"]["Cleanup"]:
 340         if OptionsInfo["StandardizeParams"]["CleanupDisconnectMetals"]:
 341             OptionsInfo["StandardizeObjects"]["MetalDisconnector"] = rdMolStandardize.MetalDisconnector()
 342     
 343     if OptionsInfo["MethodologyParams"]["RemoveFragments"]:
 344         OptionsInfo["StandardizeObjects"]["LargestFragmentChooser"] = rdMolStandardize.LargestFragmentChooser(OptionsInfo["CleanupParams"])
 345     
 346     if OptionsInfo["MethodologyParams"]["Neutralize"]:
 347         OptionsInfo["StandardizeObjects"]["Uncharger"] = rdMolStandardize.Uncharger(OptionsInfo["CleanupParams"].doCanonical)
 348 
 349     if OptionsInfo["MethodologyParams"]["CanonicalizeTautomer"]:
 350         OptionsInfo["StandardizeObjects"]["TautomerEnumerator"] = rdMolStandardize.TautomerEnumerator(OptionsInfo["CleanupParams"])
 351 
 352 def SetupStandardizeCleanupParameters():
 353     """Setup standardize clean up parameters for RDKit. """
 354 
 355     CleanupParams = rdMolStandardize.CleanupParameters()
 356     StandardizeParams = OptionsInfo["StandardizeParams"]
 357     
 358     if StandardizeParams["AcidBaseFile"] is not None:
 359         CleanupParams.acidbaseFile = StandardizeParams["AcidBaseFile"]
 360     if StandardizeParams["FragmentFile"] is not None:
 361         CleanupParams.acidbaseFile = StandardizeParams["FragmentFile"]
 362     if StandardizeParams["NormalizationsFile"] is not None:
 363         CleanupParams.normalizationsFile = StandardizeParams["NormalizationsFile"]
 364     if StandardizeParams["TautomerTransformsFile"] is not None:
 365         CleanupParams.tautomerTransformsFile = StandardizeParams["TautomerTransformsFile"]
 366     
 367     CleanupParams.maxRestarts = StandardizeParams["CleanupNormalizeMaxRestarts"]
 368     
 369     CleanupParams.doCanonical = StandardizeParams["DoCanonical"]
 370     
 371     CleanupParams.largestFragmentChooserUseAtomCount = StandardizeParams["LargestFragmentChooserUseAtomCount"]
 372     CleanupParams.largestFragmentChooserCountHeavyAtomsOnly = StandardizeParams["LargestFragmentChooserCountHeavyAtomsOnly"]
 373     
 374     CleanupParams.preferOrganic = StandardizeParams["PreferOrganic"]
 375     
 376     CleanupParams.maxTautomers = StandardizeParams["MaxTautomers"]
 377     CleanupParams.maxTransforms = StandardizeParams["MaxTransforms"]
 378     CleanupParams.tautomerRemoveBondStereo = StandardizeParams["TautomerRemoveBondStereo"]
 379     CleanupParams.tautomerRemoveIsotopicHs = StandardizeParams["TautomerRemoveIsotopicHs"]
 380     CleanupParams.tautomerRemoveSp3Stereo = StandardizeParams["TautomerRemoveSp3Stereo"]
 381     CleanupParams.tautomerReassignStereo = StandardizeParams["TautomerReassignStereo"]
 382     
 383     return CleanupParams
 384 
 385 def WriteMolecule(Writer, Mol, Compute2DCoords):
 386     """Write out molecule."""
 387     
 388     if OptionsInfo["CountMode"]:
 389         return
 390     
 391     if Compute2DCoords:
 392         AllChem.Compute2DCoords(Mol)
 393     
 394     Writer.write(Mol)
 395 
 396 def SetupMoleculeWriter():
 397     """Setup a molecule writer."""
 398     
 399     Writer = None
 400     if OptionsInfo["CountMode"]:
 401         return Writer
 402 
 403     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 404     if Writer is None:
 405         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 406     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 407     
 408     return Writer
 409 
 410 def ProcessMethodologyParameters():
 411     """Process methodology parameters. """
 412 
 413     ParamsDefaultInfo = {"Cleanup": ["bool", True], "RemoveFragments": ["bool", True], "Neutralize": ["bool", True], "CanonicalizeTautomer": ["bool", True]}
 414     OptionsInfo["MethodologyParams"] = MiscUtil.ProcessOptionNameValuePairParameters("--methodologyParams", Options["--methodologyParams"], ParamsDefaultInfo)
 415     
 416 def ProcessStandardizationParameters():
 417     """Process standardization parameters. """
 418 
 419     ParamsDefaultInfo = {"AcidBaseFile": ["file", None], "FragmentFile": ["file", None], "NormalizationsFile": ["file", None], "TautomerTransformsFile": ["file", None], "CleanupRemoveHydrogens": ["bool", True], "CleanupDisconnectMetals": ["bool", True], "CleanupNormalize": ["bool", True], "CleanupNormalizeMaxRestarts": ["int", 200], "CleanupReionize": ["bool", True], "CleanupAssignStereo": ["bool", True], "CleanupAssignStereoCleanIt": ["bool", True], "CleanupAssignStereoForce": ["bool", True], "DoCanonical": ["bool", True], "LargestFragmentChooserUseAtomCount": ["bool", True], "LargestFragmentChooserCountHeavyAtomsOnly": ["bool", False], "PreferOrganic": ["bool", False], "MaxTautomers": ["int", 1000], "MaxTransforms": ["int", 1000], "TautomerRemoveBondStereo": ["bool", True], "TautomerRemoveIsotopicHs": ["bool", True], "TautomerRemoveSp3Stereo": ["bool", True], "TautomerReassignStereo": ["bool", True]}
 420 
 421     OptionsInfo["StandardizeParams"] = MiscUtil.ProcessOptionNameValuePairParameters("--standardizeParams", Options["--standardizeParams"], ParamsDefaultInfo)
 422     
 423     #  Validate numerical values...
 424     for ParamName in ["CleanupNormalizeMaxRestarts", "MaxTautomers", "MaxTransforms"]:
 425         ParamValue = OptionsInfo["StandardizeParams"][ParamName]
 426         if  ParamValue <= 0:
 427             MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"-s, --standardizeParams\" option is not a valid value. Supported values: > 0" % (ParamValue, ParamName))
 428 
 429 def ProcessOptions():
 430     """Process and validate command line arguments and options."""
 431     
 432     MiscUtil.PrintInfo("Processing options...")
 433 
 434     # Validate options...
 435     ValidateOptions()
 436     
 437     OptionsInfo["Infile"] = Options["--infile"]
 438     ParamsDefaultInfoOverride = {'RemoveHydrogens': False}
 439     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"], ParamsDefaultInfo = ParamsDefaultInfoOverride)
 440     
 441     OptionsInfo["Outfile"] = Options["--outfile"]
 442     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 443 
 444     OptionsInfo["Overwrite"] = Options["--overwrite"]
 445 
 446     OptionsInfo["CountMode"] = False
 447     if re.match("^count$", Options["--mode"], re.I):
 448         OptionsInfo["CountMode"] = True
 449     
 450     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 451     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 452 
 453     OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False
 454 
 455     ProcessMethodologyParameters()
 456     ProcessStandardizationParameters()
 457 
 458 def RetrieveOptions():
 459     """Retrieve command line arguments and options."""
 460     
 461     # Get options...
 462     global Options
 463     Options = docopt(_docoptUsage_)
 464     
 465     # Set current working directory to the specified directory...
 466     WorkingDir = Options["--workingdir"]
 467     if WorkingDir:
 468         os.chdir(WorkingDir)
 469     
 470     # Handle examples option...
 471     if "--examples" in Options and Options["--examples"]:
 472         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 473         sys.exit(0)
 474 
 475 def ValidateOptions():
 476     """Validate option values."""
 477 
 478     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 479     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv")
 480     
 481     if Options["--outfile"]:
 482         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 483         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 484         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 485 
 486     MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "standardize count")
 487     if re.match("^standardize$", Options["--mode"], re.I):
 488         if not Options["--outfile"]:
 489             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"standardize\" value of \"--mode\" option")
 490     
 491     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 492     MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no")
 493     
 494 # Setup a usage string for docopt...
 495 _docoptUsage_ = """
 496 RDKitStandardizeMolecules.py - Standardize molecules
 497 
 498 Usage:
 499     RDKitStandardizeMolecules.py [--infileParams <Name,Value,...>] [--methodologyParams <Name,Value,...>]
 500                                  [--mode <standardize or count>] [--mp <yes or no>] [--mpParams <Name,Value,...>]
 501                                  [--outfileParams <Name,Value,...> ] [--overwrite] [--standardizeParams <Name,Value,...>]
 502                                  [--quiet <yes or no>] [-w <dir>] [-o <outfile>] -i <infile>
 503     RDKitStandardizeMolecules.py -h | --help | -e | --examples
 504 
 505 Description:
 506     Standardize molecules and write them out to an output file or simply count
 507     the number of molecules to be standardized. The standardization methodology
 508     consists of the following 4 steps executed in a sequential manner:
 509         
 510         1. Cleanup molecules
 511         2. Keep largest fragment
 512         3. Neutralize molecules
 513         4. Select canonical tautomer
 514         
 515     The molecules are cleaned up by performing the following actions:
 516         
 517         1. Remove hydrogens
 518         2. Disconnect metal atoms - Disconnect metal atoms covalently bonded
 519             to non-metals
 520         3. Normalize - Normalize functional groups and recombine charges
 521         4. Reionize - Ionize strongest acid groups first in partially
 522             ionized molecules
 523         5. Assign stereochemistry
 524         
 525     You may optionally skip any cleanup action during standardization.
 526 
 527     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt)
 528 
 529     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 530 
 531 Options:
 532     -e, --examples
 533         Print examples.
 534     -h, --help
 535         Print this help message.
 536     -i, --infile <infile>
 537         Input file name.
 538     --infileParams <Name,Value,...>  [default: auto]
 539         A comma delimited list of parameter name and value pairs for reading
 540         molecules from files. The supported parameter names for different file
 541         formats, along with their default values, are shown below:
 542             
 543             SD, MOL: removeHydrogens,no,sanitize,yes,strictParsing,yes
 544             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 545                 smilesTitleLine,auto,sanitize,yes
 546             
 547         Possible values for smilesDelimiter: space, comma or tab.
 548     -m, --mode <standardize or count>  [default: standardize]
 549         Specify whether to standardize molecules and write them out or simply
 550         count the number of molecules being standardized.
 551     --methodologyParams <Name,Value,...>  [default: auto]
 552         A comma delimited list of parameter name and value pairs to control
 553         the execution of different steps in the standardization methodology. The
 554         supported parameter names along with their default values are shown
 555         below:
 556             
 557             cleanup,yes,removeFragments,yes,neutralize,yes,
 558             canonicalizeTautomer,yes
 559             
 560         The standardization methodology consists of the following 4 steps executed
 561         in a sequential manner starting from step 1:
 562             
 563             1. cleanup
 564             2. removeFragments
 565             3. neutralize
 566             4. canonicalizeTautomer
 567             
 568         You may optionally skip the execution of any standardization step.
 569         
 570         The step1, cleanup, performs the following actions:
 571             
 572             1. Remove hydrogens
 573             2. Disconnect metal atoms - Disconnect metal atoms covalently bonded
 574                 to non-metals
 575             3. Normalize - Normalize functional groups and recombine charges
 576             4. Reionize - Ionize strongest acid groups first in partially
 577                 ionized molecules
 578             5. Assign stereochemistry
 579             
 580         You may optionally skip any cleanup action using '-s, --standardize' option.
 581         
 582         The step2, removeFragments, employs rdMolStandardize.FragmentParent()
 583         function to keep the largest fragment.
 584         
 585         The step3, neutralize, uses rdMolStandardize.Uncharger().uncharge()
 586         function to neutralize molecules by adding/removing hydrogens.
 587         
 588         The step4, canonicalizeTautomer, relies on Canonicalize() function availabe via
 589         rdMolStandardize.TautomerEnumerator() to select a canonical tautomer.
 590     --mp <yes or no>  [default: no]
 591         Use multiprocessing.
 592          
 593         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 594         function employing lazy RDKit data iterable. This allows processing of
 595         arbitrary large data sets without any additional requirements memory.
 596         
 597         All input data may be optionally loaded into memory by mp.Pool.map()
 598         before starting worker processes in a process pool by setting the value
 599         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 600         
 601         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 602         data mode may adversely impact the performance. The '--mpParams' section
 603         provides additional information to tune the value of 'chunkSize'.
 604     --mpParams <Name,Value,...>  [default: auto]
 605         A comma delimited list of parameter name and value pairs to configure
 606         multiprocessing.
 607         
 608         The supported parameter names along with their default and possible
 609         values are shown below:
 610         
 611             chunkSize, auto
 612             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 613             numProcesses, auto   [ Default: mp.cpu_count() ]
 614         
 615         These parameters are used by the following functions to configure and
 616         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 617         mp.Pool.imap().
 618         
 619         The chunkSize determines chunks of input data passed to each worker
 620         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 621         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 622         
 623         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 624         automatically converts RDKit data iterable into a list, loads all data into
 625         memory, and calculates the default chunkSize using the following method
 626         as shown in its code:
 627         
 628             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 629             if extra: chunkSize += 1
 630         
 631         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 632         and 100 data items.
 633         
 634         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 635         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 636         data into memory. Consequently, the size of input data is not known a priori.
 637         It's not possible to estimate an optimal value for the chunkSize. The default 
 638         chunkSize is set to 1.
 639         
 640         The default value for the chunkSize during 'Lazy' data mode may adversely
 641         impact the performance due to the overhead associated with exchanging
 642         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 643         a larger value during 'Lazy' input data mode, based on the size of your input
 644         data and number of processes in the process pool.
 645         
 646         The mp.Pool.map() function waits for all worker processes to process all
 647         the data and return the results. The mp.Pool.imap() function, however,
 648         returns the results obtained from worker processes as soon as the
 649         results become available for specified chunks of data.
 650         
 651         The order of data in the results returned by both mp.Pool.map() and 
 652         mp.Pool.imap() functions always corresponds to the input data.
 653     -o, --outfile <outfile>
 654         Output file name.
 655     --outfileParams <Name,Value,...>  [default: auto]
 656         A comma delimited list of parameter name and value pairs for writing
 657         molecules to files. The supported parameter names for different file
 658         formats, along with their default values, are shown below:
 659             
 660             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 661             SMILES: smilesKekulize,no,smilesDelimiter,space,smilesIsomeric,yes,
 662                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 663             
 664         Default value for compute2DCoords: yes for SMILES input file; no for all other
 665         file types.
 666     --overwrite
 667         Overwrite existing files.
 668     -q, --quiet <yes or no>  [default: no]
 669         Use quiet mode. The warning and information messages will not be printed.
 670     -s, --standardizeParams <Name,Value,...>  [default: auto]
 671         A comma delimited list of parameter name and value pairs for standardizing
 672         molecules. The supported parameter names along with their default values
 673         are shown below:
 674             
 675             acidbaseFile,none,fragmentFile,none,normalizationsFile,none,
 676             tautomerTransformsFile,none,
 677             cleanupRemoveHydrogens,yes,cleanupDisconnectMetals,yes,
 678             cleanupNormalize,yes,cleanupNormalizeMaxRestarts,200,
 679             cleanupReionize,yes,cleanupAssignStereo,yes,
 680             cleanupAssignStereoCleanIt,yes,cleanupAssignStereoForce,yes,
 681             largestFragmentChooserUseAtomCount,yes,
 682             largestFragmentChooserCountHeavyAtomsOnly,no,preferOrganic,no,
 683             doCanonical,yes,
 684             maxTautomers,1000,maxTransforms,1000,
 685             tautomerRemoveBondStereo,yes,tautomerRemoveIsotopicHs,yes,
 686             tautomerRemoveSp3Stereo,yes,tautomerReassignStereo,yes
 687             
 688         A brief description of the standardization parameters, taken from RDKit
 689         documentation, is as follows:
 690             
 691             acidbaseFile - File containing acid and base definitions
 692             fragmentFile - File containing fragment definitions
 693             normalizationsFile - File containing normalization transformations
 694             tautomerTransformsFile - File containing tautomer transformations
 695             
 696             cleanupRemoveHydrogens - Remove hydrogens during cleanup
 697             cleanupDisconnectMetals - Disconnect metal atoms covalently bonded
 698                 to non-metals during cleanup
 699             cleanupNormalize - Normalize functional groups and recombine
 700                 charges during cleanup
 701             cleanupNormalizeMaxRestarts - Maximum number of restarts during
 702                 normalization step of cleanup
 703             cleanupReionize - Ionize strongest acid groups first in partially
 704                 ionized molecules during cleanup
 705             cleanupAssignStereo - Assign stereochemistry during cleanup
 706             cleanupAssignStereoCleanIt - Clean property _CIPCode during
 707                 assign stereochemistry 
 708             cleanupAssignStereoForce - Always perform stereochemistry
 709                 calculation during assign stereochemistry
 710             
 711             largestFragmentChooserUseAtomCount - Use atom count as main
 712                 criterion before molecular weight to determine largest fragment
 713                 in LargestFragmentChooser
 714             largestFragmentChooserCountHeavyAtomsOnly - Count only heavy
 715                 atoms to determine largest fragment in LargestFragmentChooser
 716             preferOrganic - Prefer organic fragments over inorganic ones when
 717                 choosing fragments
 718             
 719             doCanonical - Apply atom-order dependent normalizations in a
 720                 canonical order during uncharging
 721             
 722             maxTautomers - Maximum number of tautomers to generate
 723             maxTransforms - Maximum number of transforms to apply during
 724                 tautomer enumeration
 725             tautomerRemoveBondStereo - Remove stereochemistry from double bonds
 726                 involved in tautomerism
 727             tautomerRemoveIsotopicHs - Remove isotopic Hs from centers involved in tautomerism
 728             tautomerRemoveSp3Stereo - Remove stereochemistry from sp3 centers
 729                 involved in tautomerism
 730             tautomerReassignStereo - AssignStereochemistry on all generated tautomers
 731             
 732         The default value is set to none for the following file name parameters:
 733         acidbaseFile, fragmentFile, normalizationsFile, and tautomerTransformsFile.
 734         The script relies on RDKit to automatically load appropriate acid base and
 735         fragment definitions along with normalization and tautomer transformations
 736         from a set of internal catalogs.
 737         
 738         Note: The fragmentFile doesn't appear to be used by the RDKit method
 739         rdMolStandardize.FragmentParent() to find largest fragment.
 740             
 741         The contents of various standardization definitions and transformations files
 742         are described below:
 743             
 744             acidbaseFile - File containing acid and base definitions
 745             
 746                 // Name     Acid                 Base
 747                 -OSO3H      OS(=O)(=O)[OH]       OS(=O)(=O)[O-]
 748                 -SO3H       [!O]S(=O)(=O)[OH]    [!O]S(=O)(=O)[O-]
 749                 -OSO2H      O[SD3](=O)[OH]       O[SD3](=O)[O-]
 750                 ... ... ...
 751         
 752             fragmentFile - File containing fragment definitions
 753             
 754                 // Name     SMARTS
 755                 hydrogen     [H]
 756                 fluorine     [F]
 757                 chlorine     [Cl]
 758                 ... ... ...
 759         
 760             normalizationsFile - File containing normalization transformations
 761             
 762                 // Name     SMIRKS
 763                 Sulfone to S(=O)(=O)        [S+2:1]([O-:2])([O-:3])>>
 764                     [S+0:1](=[O-0:2])(=[O-0:3])
 765                 Pyridine oxide to n+O-     [n:1]=[O:2]>>[n+:1][O-:2]
 766                 ... ... ...
 767         
 768             tautomerTransformsFile - File containing tautomer transformations
 769             
 770                 // Name                SMARTS   Bonds  Charges
 771                 1,3 (thio)keto/enol f  [CX4!H0]-[C]=[O,S,Se,Te;X1]
 772                 1,3 (thio)keto/enol r  [O,S,Se,Te;X2!H0]-[C]=[C]
 773                 1,5 (thio)keto/enol f  [CX4,NX3;!H0]-[C]=[C][CH0]=[O,S,Se,Te;X1]
 774                 ... ... ...
 775             
 776     -w, --workingdir <dir>
 777         Location of working directory which defaults to the current directory.
 778 
 779 Examples:
 780     To standardize molecules in a SMILES file by executing all standardization
 781     steps and write out a SMILES file, type:
 782 
 783         % RDKitStandardizeMolecules.py -i Sample.smi -o SampleOut.smi
 784 
 785     To standardize molecules in a SD file by executing all standardization
 786     steps, performing standardization in multiprocessing mode on all available
 787     CPUs without loading all data into memory, and write out a
 788     SD file, type:
 789 
 790         % RDKitStandardizeMolecules.py --mp yes -i Sample.sdf -o SampleOut.sdf
 791 
 792     To standardize molecules in a SMILES file by executing all standardization
 793     steps, performing standardization in multiprocessing mode on all available
 794     CPUs by loading all data into memory, and write out a
 795     SMILES file, type:
 796 
 797         % RDKitStandardizeMolecules.py --mp yes --mpParams "inputDataMode,
 798           InMemory" -i Sample.smi -o SampleOut.smi
 799     
 800     To standardize molecules in a SMILES file by executing all standardization
 801     steps, performing standardization in multiprocessing mode on specific number
 802     of CPUs and chunk size without loading all data into memory, and write out a
 803     SMILES file, type:
 804 
 805         % RDKitStandardizeMolecules.py --mp yes --mpParams "inputDataMode,Lazy,
 806           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi
 807 
 808     To count number of molecules to be standardized without generating any
 809     output file, type:
 810 
 811         % RDKitStandardizeMolecules.py -m count -i Sample.sdf
 812 
 813     To standardize molecules in a SD file by executing specific standardization
 814     steps along with explicit values for various parameters to control the
 815     standardization behavior, and write out a SD file, type:
 816 
 817         % RDKitStandardizeMolecules.py --methodologyParams "cleanup,yes,
 818           removeFragments,yes,neutralize,yes,canonicalizeTautomer,yes"
 819           --standardizeParams "cleanupRemoveHydrogens,yes,
 820           cleanupDisconnectMetals,yes,cleanupNormalize,yes,
 821           cleanupNormalizeMaxRestarts,200,cleanupReionize,yes,
 822           cleanupAssignStereo,yes,largestFragmentChooserUseAtomCount,yes,
 823           doCanonical,yes,maxTautomers,1000"
 824           -i Sample.sdf -o SampleOut.sdf
 825 
 826     To standardize molecules in a CSV SMILES file, SMILES strings in column 1,
 827     name in column 2, and generate output SD file, type:
 828 
 829         % RDKitStandardizeMolecules.py --infileParams 
 830           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 831           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 832           -i SampleSMILES.csv -o SampleOut.sdf
 833 
 834 Author:
 835     Manish Sud(msud@san.rr.com)
 836 
 837 See also:
 838     RDKitConvertFileFormat.py, RDKitEnumerateTautomers.py,
 839     RDKitRemoveDuplicateMolecules.py, RDKitRemoveInvalidMolecules.py,
 840     RDKitRemoveSalts.py, RDKitSearchFunctionalGroups.py, RDKitSearchSMARTS.py
 841 
 842 Copyright:
 843     Copyright (C) 2025 Manish Sud. All rights reserved.
 844 
 845     The functionality available in this script is implemented using RDKit, an
 846     open source toolkit for cheminformatics developed by Greg Landrum.
 847 
 848     This file is part of MayaChemTools.
 849 
 850     MayaChemTools is free software; you can redistribute it and/or modify it under
 851     the terms of the GNU Lesser General Public License as published by the Free
 852     Software Foundation; either version 3 of the License, or (at your option) any
 853     later version.
 854 
 855 """
 856 
 857 if __name__ == "__main__":  # Script entry point: true only when run directly, not when imported.
 858     main()  # Hand control to main(), which is defined earlier in this file.