MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitRemoveSalts.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2023 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem.SaltRemover import SaltRemover
  43     from rdkit.Chem.SaltRemover import InputFormat
  44     from rdkit.Chem import AllChem
  45 except ImportError as ErrMsg:
  46     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  47     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  48     sys.exit(1)
  49 
  50 # MayaChemTools imports...
  51 try:
  52     from docopt import docopt
  53     import MiscUtil
  54     import RDKitUtil
  55 except ImportError as ErrMsg:
  56     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  57     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  58     sys.exit(1)
  59 
# Base name of this script as invoked; used in the start and done messages
# printed by main().
ScriptName = os.path.basename(sys.argv[0])
# Raw command line options; replaced by the docopt() result in RetrieveOptions()
# and, in worker processes, by the decoded copy in InitializeWorkerProcess().
Options = {}
# Processed option values; filled in by ProcessOptions() and read by the rest
# of the script.
OptionsInfo = {}
  63 
  64 def main():
  65     """Start execution of the script."""
  66     
  67     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  68     
  69     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  70     
  71     # Retrieve command line arguments and options...
  72     RetrieveOptions()
  73     
  74     # Process and validate command line arguments and options...
  75     ProcessOptions()
  76     
  77     # Perform actions required by the script...
  78     RemoveSalts()
  79     
  80     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  81     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  82 
  83 def RemoveSalts():
  84     """Identify and remove salts from molecules."""
  85     
  86     # Setup a molecule reader...
  87     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  88     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  89     
  90     # Set up a molecule writer...
  91     Writer = SetupMoleculeWriter()
  92 
  93     MolCount, ValidMolCount, SaltsMolCount = ProcessMolecules(Mols, Writer)
  94 
  95     if Writer is not None:
  96         Writer.close()
  97     
  98     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
  99     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 100     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 101     
 102     MiscUtil.PrintInfo("\nNumber of molecules coontaining salts: %d" % (SaltsMolCount))
 103 
 104 def ProcessMolecules(Mols, Writer):
 105     """Process and remove salts from molecules."""
 106 
 107     if OptionsInfo["MPMode"]:
 108         return ProcessMoleculesUsingMultipleProcesses(Mols, Writer)
 109     else:
 110         return ProcessMoleculesUsingSingleProcess(Mols, Writer)
 111 
 112 def ProcessMoleculesUsingSingleProcess(Mols,  Writer):
 113     """Process and remove salts from molecules using a single process."""
 114     
 115     MiscUtil.PrintInfo("\nRemoving salts...")
 116     
 117     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 118     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 119     
 120     # Set up a salt remover...
 121     Remover = SetupSaltRemover()
 122     
 123     (MolCount, ValidMolCount, SaltsMolCount) = [0] * 3
 124     FirstMol = True
 125     for Mol in Mols:
 126         MolCount += 1
 127         
 128         if Mol is None:
 129             continue
 130         
 131         if RDKitUtil.IsMolEmpty(Mol):
 132             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 133             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 134             continue
 135         
 136         ValidMolCount += 1
 137         if FirstMol:
 138             FirstMol = False
 139             if SetSMILESMolProps:
 140                 RDKitUtil.SetWriterMolProps(Writer, Mol)
 141         
 142         UnsaltedMol, SaltyStatus = RemoveMolSalts(Mol, Remover, MolCount)
 143         
 144         if SaltyStatus:
 145             SaltsMolCount += 1
 146 
 147         WriteMolecule(Writer, UnsaltedMol, Compute2DCoords)
 148     
 149     return (MolCount, ValidMolCount, SaltsMolCount)
 150     
 151 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
 152     """Process and remove salts from molecules using  multiprocessing."""
 153     
 154     MiscUtil.PrintInfo("\nRemoving salts using multiprocessing...")
 155     
 156     MPParams = OptionsInfo["MPParams"]
 157     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 158     
 159     # Setup data for initializing a worker process...
 160     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 161 
 162     # Setup a encoded mols data iterable for a worker process by pickling only public
 163     # and private molecule properties...
 164     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 165 
 166     # Setup process pool along with data initialization for each process...
 167     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 168     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 169     
 170     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 171     
 172     # Start processing...
 173     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 174         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 175     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 176         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 177     else:
 178         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 179     
 180     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 181     
 182     (MolCount, ValidMolCount, SaltsMolCount) = [0] * 3
 183     FirstMol = True
 184     for Result in Results:
 185         MolCount += 1
 186         MolIndex, EncodedMol, SaltyStatus = Result
 187         
 188         if EncodedMol is None:
 189             continue
 190         ValidMolCount += 1
 191         
 192         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 193         
 194         if FirstMol:
 195             FirstMol = False
 196             if SetSMILESMolProps:
 197                 RDKitUtil.SetWriterMolProps(Writer, Mol)
 198         
 199         if SaltyStatus:
 200             SaltsMolCount += 1
 201 
 202         WriteMolecule(Writer, Mol, Compute2DCoords)
 203     
 204     return (MolCount, ValidMolCount, SaltsMolCount)
 205 
 206 def InitializeWorkerProcess(*EncodedArgs):
 207     """Initialize data for a worker process."""
 208 
 209     global Options, OptionsInfo
 210     
 211     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 212 
 213     # Decode Options and OptionInfo...
 214     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 215     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 216 
 217     # Set up salt remover...
 218     OptionsInfo["SaltRemover"] = SetupSaltRemover()
 219 
 220 def WorkerProcess(EncodedMolInfo):
 221     """Process data for a worker process."""
 222     
 223     MolIndex, EncodedMol = EncodedMolInfo
 224     
 225     if EncodedMol is None:
 226         return [MolIndex, None, False]
 227         
 228     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 229     if RDKitUtil.IsMolEmpty(Mol):
 230         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 231         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 232         return [MolIndex, None, False]
 233         
 234     Mol, SaltyStatus = RemoveMolSalts(Mol, OptionsInfo["SaltRemover"], (MolIndex + 1))
 235     EncodedMol = RDKitUtil.MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps)
 236 
 237     return [MolIndex, EncodedMol, SaltyStatus]
 238     
 239 def RemoveMolSalts(Mol, Remover, MolNum):
 240     """Remove salts from mol and return unsalted mol along with mol salty status."""
 241 
 242     UnsaltedMol = Mol
 243     SaltyStatus = False
 244     
 245     if Remover is not None:
 246         KeptMol, DeletedMols = Remover.StripMolWithDeleted(Mol, dontRemoveEverything = False)
 247         if len(DeletedMols) >= 1:
 248             SaltyStatus = True
 249         if RDKitUtil.IsMolEmpty(KeptMol):
 250             if len(DeletedMols) >= 1:
 251                 # Take the larged fragment from DeletedMols
 252                 UnsaltedMol = GetLargestMol(DeletedMols)
 253     else:
 254         # Use largest fragment as unsalted molecule...
 255         MolFrags = Chem.GetMolFrags(Mol, asMols = True)
 256         if len(MolFrags) > 1:
 257             # Keep the largest fragment as unsalted molecule...
 258             SaltyStatus = True
 259             UnsaltedMol = GetLargestMol(MolFrags)
 260 
 261     if SaltyStatus:
 262         Chem.SanitizeMol(UnsaltedMol)
 263         MolName = RDKitUtil.GetMolName(Mol, MolNum)
 264         if len(MolName):
 265             UnsaltedMol.SetProp("_Name", MolName)
 266     
 267     return (UnsaltedMol, SaltyStatus)
 268 
 269 def GetLargestMol(Mols):
 270     """Get largest mol from list of mols."""
 271 
 272     LargestMol = None
 273     LargestMolSize = -1
 274     for Mol in Mols:
 275         Size = Mol.GetNumAtoms()
 276         if Size > LargestMolSize:
 277             LargestMol = Mol
 278             LargestMolSize = Size
 279 
 280     return LargestMol
 281 
 282 def SetupSaltRemover():
 283     """Setup a salt remover."""
 284     
 285     Remover = None
 286     if OptionsInfo["SaltsByComponentsMode"]:
 287         return Remover
 288 
 289     return SaltRemover(defnFilename = OptionsInfo["SaltsFile"], defnData = OptionsInfo["SaltsSMARTS"], defnFormat = InputFormat.SMARTS)
 290 
 291 def WriteMolecule(Writer, Mol, Compute2DCoords):
 292     """Write out molecule."""
 293     
 294     if OptionsInfo["CountMode"]:
 295         return
 296     
 297     if Compute2DCoords:
 298         AllChem.Compute2DCoords(Mol)
 299     
 300     Writer.write(Mol)
 301 
 302 def SetupMoleculeWriter():
 303     """Setup a molecule writer."""
 304     
 305     Writer = None
 306     if OptionsInfo["CountMode"]:
 307         return Writer
 308 
 309     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 310     if Writer is None:
 311         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 312     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 313     
 314     return Writer
 315 
 316 def ProcessOptions():
 317     """Process and validate command line arguments and options."""
 318     
 319     MiscUtil.PrintInfo("Processing options...")
 320     
 321     # Validate options...
 322     ValidateOptions()
 323     
 324     OptionsInfo["Infile"] = Options["--infile"]
 325     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 326     
 327     OptionsInfo["Outfile"] = Options["--outfile"]
 328     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 329 
 330     OptionsInfo["Overwrite"] = Options["--overwrite"]
 331 
 332     OptionsInfo["CountMode"] = False
 333     if re.match("^count$", Options["--mode"], re.I):
 334         OptionsInfo["CountMode"] = True
 335         
 336     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 337     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 338 
 339     SaltsByComponentsMode = False
 340     SaltsBySMARTSFileMode = False
 341     SaltsBySMARTSMode = False
 342     if re.match("^ByComponent$", Options["--saltsMode"], re.I):
 343         SaltsByComponentsMode = True
 344     elif re.match("^BySMARTSFile$", Options["--saltsMode"], re.I):
 345         SaltsBySMARTSFileMode = False
 346     elif re.match("^BySMARTS$", Options["--saltsMode"], re.I):
 347         SaltsBySMARTSMode = True
 348     else:
 349         MiscUtil.PrintError("The salts mode specified, %s, using \"--saltsMode\" option is not valid." % Options["--saltsMode"])
 350     OptionsInfo["SaltsByComponentsMode"]  = SaltsByComponentsMode
 351     OptionsInfo["SaltsBySMARTSFileMode"]  = SaltsBySMARTSFileMode
 352     OptionsInfo["SaltsBySMARTSMode"]  = SaltsBySMARTSMode
 353 
 354     SaltsFile = None
 355     if re.match("^BySMARTSFile$", Options["--saltsMode"], re.I):
 356         if not re.match("^auto$", Options["--saltsFile"], re.I):
 357             SaltsFile = Options["--saltsFile"]
 358     OptionsInfo["SaltsFile"] = SaltsFile
 359     
 360     SaltsSMARTS = None
 361     if re.match("^BySMARTS$", Options["--saltsMode"], re.I):
 362         if not Options["--saltsSMARTS"]:
 363             MiscUtil.PrintError("No salts SMARTS pattern specified using \"--saltsSMARTS\" option during \"BySMARTS\" value of \"-s, --saltsMode\" option")
 364         SaltsSMARTS = Options["--saltsSMARTS"].strip(" ")
 365         if not len(SaltsSMARTS):
 366             MiscUtil.PrintError("Empty SMARTS pattern specified using \"--saltsSMARTS\" option during \"BySMARTS\" value of \"-s, --saltsMode\" option")
 367         if re.search(" ", SaltsSMARTS):
 368             SaltsSMARTS = re.sub('[ ]+', '\n', SaltsSMARTS)
 369         
 370     OptionsInfo["SaltsSMARTS"] = SaltsSMARTS
 371     
 372 def RetrieveOptions():
 373     """Retrieve command line arguments and options."""
 374     
 375     # Get options...
 376     global Options
 377     Options = docopt(_docoptUsage_)
 378     
 379     # Set current working directory to the specified directory...
 380     WorkingDir = Options["--workingdir"]
 381     if WorkingDir:
 382         os.chdir(WorkingDir)
 383     
 384     # Handle examples option...
 385     if "--examples" in Options and Options["--examples"]:
 386         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 387         sys.exit(0)
 388 
 389 def ValidateOptions():
 390     """Validate option values."""
 391     
 392     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 393     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 394     
 395     if Options["--outfile"]:
 396         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 397         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 398         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 399 
 400     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count")
 401     if re.match("^remove$", Options["--mode"], re.I):
 402         if not Options["--outfile"]:
 403             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"remove\" value of \"-m, --mode\" option")
 404     
 405     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 406     
 407     MiscUtil.ValidateOptionTextValue("--saltsMode", Options["--saltsMode"], "ByComponent BySMARTSFile BySMARTS")
 408     
 409     if re.match("^BySMARTSFile$", Options["--saltsMode"], re.I):
 410         if not re.match("^auto$", Options["--saltsFile"], re.I):
 411             MiscUtil.ValidateOptionFilePath("--saltsFile", Options["--saltsFile"])
 412 
 413 # Setup a usage string for docopt...
 414 _docoptUsage_ = """
 415 RDKitRemoveSalts.py - Remove salts
 416 
 417 Usage:
 418     RDKitRemoveSalts.py  [--infileParams <Name,Value,...>] [--mode <remove or count>]
 419                          [--mp <yes or no>] [--mpParams <Name,Value,...>] [--outfileParams <Name,Value,...> ]
 420                          [--overwrite] [--saltsMode <ByComponent, BySMARTSFile, BySMARTS>]
 421                          [--saltsFile <FileName or auto>] [--saltsSMARTS <SMARTS>]
 422                          [-w <dir>] [-o <outfile>]  -i <infile>
 423     RDKitRemoveSalts.py -h | --help | -e | --examples
 424 
 425 Description:
 426     Remove salts from molecules or simply count the number of molecules containing
 427     salts. Salts are identified and removed based on either SMARTS strings or by selecting
 428     the largest disconnected components in molecules as non-salt portion of molecules.
 429 
 430     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt)
 431 
 432     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 433 
 434 Options:
 435     -e, --examples
 436         Print examples.
 437     -h, --help
 438         Print this help message.
 439     -i, --infile <infile>
 440         Input file name.
 441     --infileParams <Name,Value,...>  [default: auto]
 442         A comma delimited list of parameter name and value pairs for reading
 443         molecules from files. The supported parameter names for different file
 444         formats, along with their default values, are shown below:
 445             
 446             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 447             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 448                 smilesTitleLine,auto,sanitize,yes
 449             
 450         Possible values for smilesDelimiter: space, comma or tab.
 451     -m, --mode <remove or count>  [default: remove]
 452         Specify whether to remove salts from molecules and write out molecules
 453         or or simply count the number of molecules containing salts.
 454     --mp <yes or no>  [default: no]
 455         Use multiprocessing.
 456          
 457         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 458         function employing lazy RDKit data iterable. This allows processing of
 459         arbitrary large data sets without any additional requirements memory.
 460         
 461         All input data may be optionally loaded into memory by mp.Pool.map()
 462         before starting worker processes in a process pool by setting the value
 463         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 464         
 465         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 466         data mode may adversely impact the performance. The '--mpParams' section
 467         provides additional information to tune the value of 'chunkSize'.
 468     --mpParams <Name,Value,...>  [default: auto]
 469         A comma delimited list of parameter name and value pairs to configure
 470         multiprocessing.
 471         
 472         The supported parameter names along with their default and possible
 473         values are shown below:
 474         
 475             chunkSize, auto
 476             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 477             numProcesses, auto   [ Default: mp.cpu_count() ]
 478         
 479         These parameters are used by the following functions to configure and
 480         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 481         mp.Pool.imap().
 482         
 483         The chunkSize determines chunks of input data passed to each worker
 484         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 485         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 486         
 487         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 488         automatically converts RDKit data iterable into a list, loads all data into
 489         memory, and calculates the default chunkSize using the following method
 490         as shown in its code:
 491         
 492             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 493             if extra: chunkSize += 1
 494         
 495         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 496         and 100 data items.
 497         
 498         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 499         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 500         data into memory. Consequently, the size of input data is not known a priori.
 501         It's not possible to estimate an optimal value for the chunkSize. The default 
 502         chunkSize is set to 1.
 503         
 504         The default value for the chunkSize during 'Lazy' data mode may adversely
 505         impact the performance due to the overhead associated with exchanging
 506         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 507         a larger value during 'Lazy' input data mode, based on the size of your input
 508         data and number of processes in the process pool.
 509         
 510         The mp.Pool.map() function waits for all worker processes to process all
 511         the data and return the results. The mp.Pool.imap() function, however,
 512         returns the the results obtained from worker processes as soon as the
 513         results become available for specified chunks of data.
 514         
 515         The order of data in the results returned by both mp.Pool.map() and 
 516         mp.Pool.imap() functions always corresponds to the input data.
 517     -o, --outfile <outfile>
 518         Output file name.
 519     --outfileParams <Name,Value,...>  [default: auto]
 520         A comma delimited list of parameter name and value pairs for writing
 521         molecules to files. The supported parameter names for different file
 522         formats, along with their default values, are shown below:
 523             
 524             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 525             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 526                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 527             
 528         Default value for compute2DCoords: yes for SMILES input file; no for all other
 529         file types.
 530     --overwrite
 531         Overwrite existing files.
 532     -s, --saltsMode <ByComponent, BySMARTSFile, BySMARTS>  [default: ByComponent]
 533         Specify whether to identify and remove salts based on SMARTS strings or
 534         by selecting the largest disconnected component as non-salt portion of a
 535         molecule. Possible values: ByComponent, BySMARTSFile or BySMARTS.
 536     --saltsFile <FileName or auto>  [default: auto]
 537         Specify a file name containing specification for SMARTS corresponding to salts or
 538         use default salts file, Salts.txt, available in RDKit data directory. This option is only
 539         used during 'BySMARTSFile' value of '-s, --saltsMode' option.
 540         
 541         RDKit data format: Smarts<tab>Name(optional)
 542         
 543         For example:
 544             
 545             [Cl,Br,I]
 546             [N](=O)(O)O
 547             [CH3]C(=O)O	  Acetic acid
 548             
 549     --saltsSMARTS <SMARTS text>
 550         Space delimited SMARTS specifications to use for salts identification instead
 551         their specifications in '--saltsFile'. This option is only used during 'BySMARTS'
 552         value of '-s, --saltsMode' option.
 553     -w, --workingdir <dir>
 554         Location of working directory which defaults to the current directory.
 555 
 556 Examples:
 557     To remove salts from molecules in a SMILES file by keeping largest disconnected
 558     components as non-salt portion of molecules and write out a SMILES file, type:
 559 
 560         % RDKitRemoveSalts.py -i Sample.smi -o SampleOut.smi
 561 
 562     To remove salts from molecules in a SMILES file by keeping largest disconnected
 563     components as non-salt portion of molecules, perform salt removal in multiprocessing
 564     mode on all available CPUs without loading all data into memory, and write out a
 565     SMILES file, type:
 566 
 567         % RDKitRemoveSalts.py --mp yes -i Sample.smi -o SampleOut.smi
 568 
 569     To remove salts from molecules in a SMILES file by keeping largest disconnected
 570     components as non-salt portion of molecules, perform salt removal in multiprocessing
 571     mode on all available CPUs by loading all data into memory, and write out a
 572     SMILES file, type:
 573 
 574         % RDKitRemoveSalts.py --mp yes --mpParams "inputDataMode,InMemory"
 575           -i Sample.smi -o SampleOut.smi
 576 
 577     To remove salts from molecules in a SMILES file by keeping largest disconnected
 578     components as non-salt portion of molecules, perform salt removal in multiprocessing
 579     mode on specific number of CPUs and chunk size without loading all data into memory,
 580     and write out a SMILES file, type:
 581 
 582         % RDKitRemoveSalts.py --mp yes --mpParams "inputDataMode,Lazy,
 583           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi
 584 
 585     To count number of molecules containing salts from in a SD file, using largest
 586     components as non-salt portion of molecules, without generating any output
 587     file, type:
 588 
 589         % RDKitRemoveSalts.py -m count -i Sample.sdf
 590 
 591     To remove salts from molecules in a SMILES file using SMARTS strings in default
 592     Salts.txt distributed with RDKit to identify salts and write out a SMILES file, type:
 593 
 594         % RDKitRemoveSalts.py -m remove -s BySMARTSFile -i Sample.smi
 595           -o SampleOut.smi
 596 
 597     To remove salts from molecules in a SD file using SMARTS strings in a local
 598     CustomSalts.txt to identify salts and write out a SMILES file, type:
 599 
 600         % RDKitRemoveSalts.py -m remove -s BySMARTSFile --saltsFile
 601           CustomSalts.txt -i Sample.sdf -o SampleOut.smi
 602 
 603     To remove salts from molecules in a SD file using specified SMARTS to identify
 604     salts and write out a SD file, type:
 605 
 606         % RDKitRemoveSalts.py -m remove -s BySMARTS  --saltsSMARTS
 607           '[Cl,Br,I]  [N](=O)(O)O [N](=O)(O)O'
 608           -i Sample.sdf -o SampleOut.smi
 609 
 610     To remove salts form  molecules from a CSV SMILES file, SMILES strings in column 1,
 611     name in column 2, and generate output SD file, type:
 612 
 613         % RDKitRemoveSalts.py --infileParams 
 614           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 615           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 616           -i SampleSMILES.csv -o SampleOut.sdf
 617 
 618 Author:
 619     Manish Sud(msud@san.rr.com)
 620 
 621 See also:
 622     RDKitConvertFileFormat.py, RDKitRemoveDuplicateMolecules.py,
 623     RDKitRemoveInvalidMolecules.py, RDKitSearchFunctionalGroups.py,
 624     RDKitSearchSMARTS.py, RDKitStandardizeMolecules.py
 625 
 626 Copyright:
 627     Copyright (C) 2023 Manish Sud. All rights reserved.
 628 
 629     The functionality available in this script is implemented using RDKit, an
 630     open source toolkit for cheminformatics developed by Greg Landrum.
 631 
 632     This file is part of MayaChemTools.
 633 
 634     MayaChemTools is free software; you can redistribute it and/or modify it under
 635     the terms of the GNU Lesser General Public License as published by the Free
 636     Software Foundation; either version 3 of the License, or (at your option) any
 637     later version.
 638 
 639 """
 640 
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()