MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitRemoveSalts.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem.SaltRemover import SaltRemover
  43     from rdkit.Chem.SaltRemover import InputFormat
  44     from rdkit.Chem import AllChem
  45 except ImportError as ErrMsg:
  46     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  47     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  48     sys.exit(1)
  49 
  50 # MayaChemTools imports...
  51 try:
  52     from docopt import docopt
  53     import MiscUtil
  54     import RDKitUtil
  55 except ImportError as ErrMsg:
  56     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  57     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  58     sys.exit(1)
  59 
# Base name of the script as invoked; used in the progress messages printed by main().
ScriptName = os.path.basename(sys.argv[0])
# Raw command line options; replaced by the docopt() result in RetrieveOptions() and
# by a decoded copy in InitializeWorkerProcess() inside each worker process.
Options = {}
# Processed option values; populated by ProcessOptions() and read by the other functions.
OptionsInfo = {}
  63 
  64 def main():
  65     """Start execution of the script."""
  66     
  67     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  68     
  69     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  70     
  71     # Retrieve command line arguments and options...
  72     RetrieveOptions()
  73     
  74     # Process and validate command line arguments and options...
  75     ProcessOptions()
  76     
  77     # Perform actions required by the script...
  78     RemoveSalts()
  79     
  80     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  81     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  82 
  83 def RemoveSalts():
  84     """Identify and remove salts from molecules."""
  85     
  86     # Setup a molecule reader...
  87     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  88     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  89     
  90     # Set up a molecule writer...
  91     Writer = SetupMoleculeWriter()
  92 
  93     MolCount, ValidMolCount, SaltsMolCount = ProcessMolecules(Mols, Writer)
  94 
  95     if Writer is not None:
  96         Writer.close()
  97     
  98     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
  99     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 100     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 101     
 102     MiscUtil.PrintInfo("\nNumber of molecules containing salts: %d" % (SaltsMolCount))
 103 
 104 def ProcessMolecules(Mols, Writer):
 105     """Process and remove salts from molecules."""
 106 
 107     if OptionsInfo["MPMode"]:
 108         return ProcessMoleculesUsingMultipleProcesses(Mols, Writer)
 109     else:
 110         return ProcessMoleculesUsingSingleProcess(Mols, Writer)
 111 
 112 def ProcessMoleculesUsingSingleProcess(Mols,  Writer):
 113     """Process and remove salts from molecules using a single process."""
 114     
 115     MiscUtil.PrintInfo("\nRemoving salts...")
 116     
 117     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 118     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 119     
 120     # Set up a salt remover...
 121     Remover = SetupSaltRemover()
 122     
 123     (MolCount, ValidMolCount, SaltsMolCount) = [0] * 3
 124     FirstMol = True
 125     for Mol in Mols:
 126         MolCount += 1
 127         
 128         if Mol is None:
 129             continue
 130         
 131         if RDKitUtil.IsMolEmpty(Mol):
 132             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 133             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 134             continue
 135         
 136         ValidMolCount += 1
 137         if FirstMol:
 138             FirstMol = False
 139             if SetSMILESMolProps:
 140                 RDKitUtil.SetWriterMolProps(Writer, Mol)
 141         
 142         UnsaltedMol, SaltyStatus = RemoveMolSalts(Mol, Remover, MolCount)
 143         
 144         if SaltyStatus:
 145             SaltsMolCount += 1
 146 
 147         WriteMolecule(Writer, UnsaltedMol, Compute2DCoords)
 148     
 149     return (MolCount, ValidMolCount, SaltsMolCount)
 150     
 151 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
 152     """Process and remove salts from molecules using  multiprocessing."""
 153     
 154     MiscUtil.PrintInfo("\nRemoving salts using multiprocessing...")
 155     
 156     MPParams = OptionsInfo["MPParams"]
 157     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 158     
 159     # Setup data for initializing a worker process...
 160     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 161 
 162     # Setup a encoded mols data iterable for a worker process by pickling only public
 163     # and private molecule properties...
 164     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 165 
 166     # Setup process pool along with data initialization for each process...
 167     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 168     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 169     
 170     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 171     
 172     # Start processing...
 173     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 174         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 175     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 176         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 177     else:
 178         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 179     
 180     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 181     
 182     (MolCount, ValidMolCount, SaltsMolCount) = [0] * 3
 183     FirstMol = True
 184     for Result in Results:
 185         MolCount += 1
 186         MolIndex, EncodedMol, SaltyStatus = Result
 187         
 188         if EncodedMol is None:
 189             continue
 190         ValidMolCount += 1
 191         
 192         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 193         
 194         if FirstMol:
 195             FirstMol = False
 196             if SetSMILESMolProps:
 197                 RDKitUtil.SetWriterMolProps(Writer, Mol)
 198         
 199         if SaltyStatus:
 200             SaltsMolCount += 1
 201 
 202         WriteMolecule(Writer, Mol, Compute2DCoords)
 203     
 204     return (MolCount, ValidMolCount, SaltsMolCount)
 205 
 206 def InitializeWorkerProcess(*EncodedArgs):
 207     """Initialize data for a worker process."""
 208 
 209     global Options, OptionsInfo
 210     
 211     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 212 
 213     # Decode Options and OptionInfo...
 214     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 215     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 216 
 217     # Set up salt remover...
 218     OptionsInfo["SaltRemover"] = SetupSaltRemover()
 219 
 220 def WorkerProcess(EncodedMolInfo):
 221     """Process data for a worker process."""
 222     
 223     MolIndex, EncodedMol = EncodedMolInfo
 224     
 225     if EncodedMol is None:
 226         return [MolIndex, None, False]
 227         
 228     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 229     if RDKitUtil.IsMolEmpty(Mol):
 230         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 231         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 232         return [MolIndex, None, False]
 233         
 234     Mol, SaltyStatus = RemoveMolSalts(Mol, OptionsInfo["SaltRemover"], (MolIndex + 1))
 235     EncodedMol = RDKitUtil.MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps)
 236 
 237     return [MolIndex, EncodedMol, SaltyStatus]
 238     
 239 def RemoveMolSalts(Mol, Remover, MolNum):
 240     """Remove salts from mol and return unsalted mol along with mol salty status."""
 241 
 242     UnsaltedMol = Mol
 243     SaltyStatus = False
 244     
 245     if Remover is not None:
 246         KeptMol, DeletedMols = Remover.StripMolWithDeleted(Mol, dontRemoveEverything = False)
 247         if len(DeletedMols) >= 1:
 248             SaltyStatus = True
 249         if RDKitUtil.IsMolEmpty(KeptMol):
 250             if len(DeletedMols) >= 1:
 251                 # Take the larged fragment from DeletedMols
 252                 UnsaltedMol = GetLargestMol(DeletedMols)
 253     else:
 254         # Use largest fragment as unsalted molecule...
 255         MolFrags = Chem.GetMolFrags(Mol, asMols = True)
 256         if len(MolFrags) > 1:
 257             # Keep the largest fragment as unsalted molecule...
 258             SaltyStatus = True
 259             UnsaltedMol = GetLargestMol(MolFrags)
 260 
 261     if SaltyStatus:
 262         Chem.SanitizeMol(UnsaltedMol)
 263         MolName = RDKitUtil.GetMolName(Mol, MolNum)
 264         if len(MolName):
 265             UnsaltedMol.SetProp("_Name", MolName)
 266 
 267         # Set mol properties...
 268         for DataLabel in Mol.GetPropNames(includePrivate = False, includeComputed = False):
 269             DataProp = Mol.GetProp(DataLabel)
 270             UnsaltedMol.SetProp(DataLabel, DataProp)
 271     
 272     return (UnsaltedMol, SaltyStatus)
 273 
 274 def GetLargestMol(Mols):
 275     """Get largest mol from list of mols."""
 276 
 277     LargestMol = None
 278     LargestMolSize = -1
 279     for Mol in Mols:
 280         Size = Mol.GetNumAtoms()
 281         if Size > LargestMolSize:
 282             LargestMol = Mol
 283             LargestMolSize = Size
 284 
 285     return LargestMol
 286 
 287 def SetupSaltRemover():
 288     """Setup a salt remover."""
 289     
 290     Remover = None
 291     if OptionsInfo["SaltsByComponentsMode"]:
 292         return Remover
 293 
 294     return SaltRemover(defnFilename = OptionsInfo["SaltsFile"], defnData = OptionsInfo["SaltsSMARTS"], defnFormat = InputFormat.SMARTS)
 295 
 296 def WriteMolecule(Writer, Mol, Compute2DCoords):
 297     """Write out molecule."""
 298     
 299     if OptionsInfo["CountMode"]:
 300         return
 301     
 302     if Compute2DCoords:
 303         AllChem.Compute2DCoords(Mol)
 304     
 305     Writer.write(Mol)
 306 
 307 def SetupMoleculeWriter():
 308     """Setup a molecule writer."""
 309     
 310     Writer = None
 311     if OptionsInfo["CountMode"]:
 312         return Writer
 313 
 314     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 315     if Writer is None:
 316         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 317     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 318     
 319     return Writer
 320 
 321 def ProcessOptions():
 322     """Process and validate command line arguments and options."""
 323     
 324     MiscUtil.PrintInfo("Processing options...")
 325     
 326     # Validate options...
 327     ValidateOptions()
 328     
 329     OptionsInfo["Infile"] = Options["--infile"]
 330     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 331     
 332     OptionsInfo["Outfile"] = Options["--outfile"]
 333     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 334 
 335     OptionsInfo["Overwrite"] = Options["--overwrite"]
 336 
 337     OptionsInfo["CountMode"] = False
 338     if re.match("^count$", Options["--mode"], re.I):
 339         OptionsInfo["CountMode"] = True
 340         
 341     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 342     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 343 
 344     SaltsByComponentsMode = False
 345     SaltsBySMARTSFileMode = False
 346     SaltsBySMARTSMode = False
 347     if re.match("^ByComponent$", Options["--saltsMode"], re.I):
 348         SaltsByComponentsMode = True
 349     elif re.match("^BySMARTSFile$", Options["--saltsMode"], re.I):
 350         SaltsBySMARTSFileMode = False
 351     elif re.match("^BySMARTS$", Options["--saltsMode"], re.I):
 352         SaltsBySMARTSMode = True
 353     else:
 354         MiscUtil.PrintError("The salts mode specified, %s, using \"--saltsMode\" option is not valid." % Options["--saltsMode"])
 355     OptionsInfo["SaltsByComponentsMode"]  = SaltsByComponentsMode
 356     OptionsInfo["SaltsBySMARTSFileMode"]  = SaltsBySMARTSFileMode
 357     OptionsInfo["SaltsBySMARTSMode"]  = SaltsBySMARTSMode
 358 
 359     SaltsFile = None
 360     if re.match("^BySMARTSFile$", Options["--saltsMode"], re.I):
 361         if not re.match("^auto$", Options["--saltsFile"], re.I):
 362             SaltsFile = Options["--saltsFile"]
 363     OptionsInfo["SaltsFile"] = SaltsFile
 364     
 365     SaltsSMARTS = None
 366     if re.match("^BySMARTS$", Options["--saltsMode"], re.I):
 367         if not Options["--saltsSMARTS"]:
 368             MiscUtil.PrintError("No salts SMARTS pattern specified using \"--saltsSMARTS\" option during \"BySMARTS\" value of \"-s, --saltsMode\" option")
 369         SaltsSMARTS = Options["--saltsSMARTS"].strip(" ")
 370         if not len(SaltsSMARTS):
 371             MiscUtil.PrintError("Empty SMARTS pattern specified using \"--saltsSMARTS\" option during \"BySMARTS\" value of \"-s, --saltsMode\" option")
 372         if re.search(" ", SaltsSMARTS):
 373             SaltsSMARTS = re.sub('[ ]+', '\n', SaltsSMARTS)
 374         
 375     OptionsInfo["SaltsSMARTS"] = SaltsSMARTS
 376     
 377 def RetrieveOptions():
 378     """Retrieve command line arguments and options."""
 379     
 380     # Get options...
 381     global Options
 382     Options = docopt(_docoptUsage_)
 383     
 384     # Set current working directory to the specified directory...
 385     WorkingDir = Options["--workingdir"]
 386     if WorkingDir:
 387         os.chdir(WorkingDir)
 388     
 389     # Handle examples option...
 390     if "--examples" in Options and Options["--examples"]:
 391         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 392         sys.exit(0)
 393 
 394 def ValidateOptions():
 395     """Validate option values."""
 396     
 397     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 398     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 399     
 400     if Options["--outfile"]:
 401         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 402         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 403         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 404 
 405     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count")
 406     if re.match("^remove$", Options["--mode"], re.I):
 407         if not Options["--outfile"]:
 408             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"remove\" value of \"-m, --mode\" option")
 409     
 410     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 411     
 412     MiscUtil.ValidateOptionTextValue("--saltsMode", Options["--saltsMode"], "ByComponent BySMARTSFile BySMARTS")
 413     
 414     if re.match("^BySMARTSFile$", Options["--saltsMode"], re.I):
 415         if not re.match("^auto$", Options["--saltsFile"], re.I):
 416             MiscUtil.ValidateOptionFilePath("--saltsFile", Options["--saltsFile"])
 417 
 418 # Setup a usage string for docopt...
 419 _docoptUsage_ = """
 420 RDKitRemoveSalts.py - Remove salts
 421 
 422 Usage:
 423     RDKitRemoveSalts.py  [--infileParams <Name,Value,...>] [--mode <remove or count>]
 424                          [--mp <yes or no>] [--mpParams <Name,Value,...>] [--outfileParams <Name,Value,...> ]
 425                          [--overwrite] [--saltsMode <ByComponent, BySMARTSFile, BySMARTS>]
 426                          [--saltsFile <FileName or auto>] [--saltsSMARTS <SMARTS>]
 427                          [-w <dir>] [-o <outfile>]  -i <infile>
 428     RDKitRemoveSalts.py -h | --help | -e | --examples
 429 
 430 Description:
 431     Remove salts from molecules or simply count the number of molecules containing
 432     salts. Salts are identified and removed based on either SMARTS strings or by selecting
 433     the largest disconnected components in molecules as non-salt portion of molecules.
 434 
    The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt)
 436 
 437     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 438 
 439 Options:
 440     -e, --examples
 441         Print examples.
 442     -h, --help
 443         Print this help message.
 444     -i, --infile <infile>
 445         Input file name.
 446     --infileParams <Name,Value,...>  [default: auto]
 447         A comma delimited list of parameter name and value pairs for reading
 448         molecules from files. The supported parameter names for different file
 449         formats, along with their default values, are shown below:
 450             
 451             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 452             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 453                 smilesTitleLine,auto,sanitize,yes
 454             
 455         Possible values for smilesDelimiter: space, comma or tab.
 456     -m, --mode <remove or count>  [default: remove]
 457         Specify whether to remove salts from molecules and write out molecules
        or simply count the number of molecules containing salts.
 459     --mp <yes or no>  [default: no]
 460         Use multiprocessing.
 461          
 462         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 463         function employing lazy RDKit data iterable. This allows processing of
        arbitrarily large data sets without any additional memory requirements.
 465         
 466         All input data may be optionally loaded into memory by mp.Pool.map()
 467         before starting worker processes in a process pool by setting the value
 468         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 469         
 470         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 471         data mode may adversely impact the performance. The '--mpParams' section
 472         provides additional information to tune the value of 'chunkSize'.
 473     --mpParams <Name,Value,...>  [default: auto]
 474         A comma delimited list of parameter name and value pairs to configure
 475         multiprocessing.
 476         
 477         The supported parameter names along with their default and possible
 478         values are shown below:
 479         
 480             chunkSize, auto
 481             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 482             numProcesses, auto   [ Default: mp.cpu_count() ]
 483         
 484         These parameters are used by the following functions to configure and
 485         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 486         mp.Pool.imap().
 487         
 488         The chunkSize determines chunks of input data passed to each worker
 489         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 490         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 491         
 492         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 493         automatically converts RDKit data iterable into a list, loads all data into
 494         memory, and calculates the default chunkSize using the following method
 495         as shown in its code:
 496         
 497             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 498             if extra: chunkSize += 1
 499         
 500         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 501         and 100 data items.
 502         
 503         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 504         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 505         data into memory. Consequently, the size of input data is not known a priori.
 506         It's not possible to estimate an optimal value for the chunkSize. The default 
 507         chunkSize is set to 1.
 508         
 509         The default value for the chunkSize during 'Lazy' data mode may adversely
 510         impact the performance due to the overhead associated with exchanging
 511         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 512         a larger value during 'Lazy' input data mode, based on the size of your input
 513         data and number of processes in the process pool.
 514         
 515         The mp.Pool.map() function waits for all worker processes to process all
 516         the data and return the results. The mp.Pool.imap() function, however,
        returns the results obtained from worker processes as soon as the
 518         results become available for specified chunks of data.
 519         
 520         The order of data in the results returned by both mp.Pool.map() and 
 521         mp.Pool.imap() functions always corresponds to the input data.
 522     -o, --outfile <outfile>
 523         Output file name.
 524     --outfileParams <Name,Value,...>  [default: auto]
 525         A comma delimited list of parameter name and value pairs for writing
 526         molecules to files. The supported parameter names for different file
 527         formats, along with their default values, are shown below:
 528             
 529             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 530             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 531                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 532             
 533         Default value for compute2DCoords: yes for SMILES input file; no for all other
 534         file types.
 535     --overwrite
 536         Overwrite existing files.
 537     -s, --saltsMode <ByComponent, BySMARTSFile, BySMARTS>  [default: ByComponent]
 538         Specify whether to identify and remove salts based on SMARTS strings or
 539         by selecting the largest disconnected component as non-salt portion of a
 540         molecule. Possible values: ByComponent, BySMARTSFile or BySMARTS.
 541     --saltsFile <FileName or auto>  [default: auto]
 542         Specify a file name containing specification for SMARTS corresponding to salts or
 543         use default salts file, Salts.txt, available in RDKit data directory. This option is only
 544         used during 'BySMARTSFile' value of '-s, --saltsMode' option.
 545         
 546         RDKit data format: Smarts<tab>Name(optional)
 547         
 548         For example:
 549             
 550             [Cl,Br,I]
 551             [N](=O)(O)O
 552             [CH3]C(=O)O	  Acetic acid
 553             
 554     --saltsSMARTS <SMARTS text>
        Space delimited SMARTS specifications to use for salts identification instead
        of their specifications in '--saltsFile'. This option is only used during 'BySMARTS'
 557         value of '-s, --saltsMode' option.
 558     -w, --workingdir <dir>
 559         Location of working directory which defaults to the current directory.
 560 
 561 Examples:
 562     To remove salts from molecules in a SMILES file by keeping largest disconnected
 563     components as non-salt portion of molecules and write out a SMILES file, type:
 564 
 565         % RDKitRemoveSalts.py -i Sample.smi -o SampleOut.smi
 566 
 567     To remove salts from molecules in a SMILES file by keeping largest disconnected
 568     components as non-salt portion of molecules, perform salt removal in multiprocessing
 569     mode on all available CPUs without loading all data into memory, and write out a
 570     SMILES file, type:
 571 
 572         % RDKitRemoveSalts.py --mp yes -i Sample.smi -o SampleOut.smi
 573 
 574     To remove salts from molecules in a SMILES file by keeping largest disconnected
 575     components as non-salt portion of molecules, perform salt removal in multiprocessing
 576     mode on all available CPUs by loading all data into memory, and write out a
 577     SMILES file, type:
 578 
 579         % RDKitRemoveSalts.py --mp yes --mpParams "inputDataMode,InMemory"
 580           -i Sample.smi -o SampleOut.smi
 581 
 582     To remove salts from molecules in a SMILES file by keeping largest disconnected
 583     components as non-salt portion of molecules, perform salt removal in multiprocessing
 584     mode on specific number of CPUs and chunk size without loading all data into memory,
 585     and write out a SMILES file, type:
 586 
 587         % RDKitRemoveSalts.py --mp yes --mpParams "inputDataMode,Lazy,
 588           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi
 589 
    To count the number of molecules containing salts in a SD file, using largest
 591     components as non-salt portion of molecules, without generating any output
 592     file, type:
 593 
 594         % RDKitRemoveSalts.py -m count -i Sample.sdf
 595 
 596     To remove salts from molecules in a SMILES file using SMARTS strings in default
 597     Salts.txt distributed with RDKit to identify salts and write out a SMILES file, type:
 598 
 599         % RDKitRemoveSalts.py -m remove -s BySMARTSFile -i Sample.smi
 600           -o SampleOut.smi
 601 
 602     To remove salts from molecules in a SD file using SMARTS strings in a local
 603     CustomSalts.txt to identify salts and write out a SMILES file, type:
 604 
 605         % RDKitRemoveSalts.py -m remove -s BySMARTSFile --saltsFile
 606           CustomSalts.txt -i Sample.sdf -o SampleOut.smi
 607 
 608     To remove salts from molecules in a SD file using specified SMARTS to identify
 609     salts and write out a SD file, type:
 610 
 611         % RDKitRemoveSalts.py -m remove -s BySMARTS  --saltsSMARTS
 612           '[Cl,Br,I]  [N](=O)(O)O [N](=O)(O)O'
 613           -i Sample.sdf -o SampleOut.smi
 614 
    To remove salts from molecules in a CSV SMILES file, SMILES strings in column 1,
 616     name in column 2, and generate output SD file, type:
 617 
 618         % RDKitRemoveSalts.py --infileParams 
 619           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 620           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 621           -i SampleSMILES.csv -o SampleOut.sdf
 622 
 623 Author:
 624     Manish Sud(msud@san.rr.com)
 625 
 626 See also:
 627     RDKitConvertFileFormat.py, RDKitRemoveDuplicateMolecules.py,
 628     RDKitRemoveInvalidMolecules.py, RDKitSearchFunctionalGroups.py,
 629     RDKitSearchSMARTS.py, RDKitStandardizeMolecules.py
 630 
 631 Copyright:
 632     Copyright (C) 2024 Manish Sud. All rights reserved.
 633 
 634     The functionality available in this script is implemented using RDKit, an
 635     open source toolkit for cheminformatics developed by Greg Landrum.
 636 
 637     This file is part of MayaChemTools.
 638 
 639     MayaChemTools is free software; you can redistribute it and/or modify it under
 640     the terms of the GNU Lesser General Public License as published by the Free
 641     Software Foundation; either version 3 of the License, or (at your option) any
 642     later version.
 643 
 644 """
 645 
# Start the script only when this file is executed directly rather than imported.
if __name__ == "__main__":
    main()