MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitRemoveSalts.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
   19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem.SaltRemover import SaltRemover
  43     from rdkit.Chem.SaltRemover import InputFormat
  44     from rdkit.Chem import AllChem
  45 except ImportError as ErrMsg:
  46     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  47     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  48     sys.exit(1)
  49 
  50 # MayaChemTools imports...
  51 try:
  52     from docopt import docopt
  53     import MiscUtil
  54     import RDKitUtil
  55 except ImportError as ErrMsg:
  56     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  57     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  58     sys.exit(1)
  59 
# Base name of the running script; used as a prefix in progress messages.
ScriptName = os.path.basename(sys.argv[0])
# Raw command line options as returned by docopt; assigned in RetrieveOptions().
Options = {}
# Processed option values; populated by ProcessOptions().
OptionsInfo = {}
  63 
  64 def main():
  65     """Start execution of the script"""
  66     
  67     MiscUtil.PrintInfo("\n%s (RDK v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, time.asctime()))
  68     
  69     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  70     
  71     # Retrieve command line arguments and options...
  72     RetrieveOptions()
  73     
  74     # Process and validate command line arguments and options...
  75     ProcessOptions()
  76     
  77     # Perform actions required by the script...
  78     RemoveSalts()
  79     
  80     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  81     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  82 
  83 def RemoveSalts():
  84     """Identify and remove salts from molecules"""
  85     
  86     # Setup a molecule reader...
  87     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  88     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  89     
  90     # Set up a molecule writer...
  91     Writer = SetupMoleculeWriter()
  92 
  93     MolCount, ValidMolCount, SaltsMolCount = ProcessMolecules(Mols, Writer)
  94 
  95     if Writer is not None:
  96         Writer.close()
  97     
  98     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
  99     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 100     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 101     
 102     MiscUtil.PrintInfo("\nNumber of molecules coontaining salts: %d" % (SaltsMolCount))
 103 
 104 def ProcessMolecules(Mols, Writer):
 105     """Process and remove salts from molecules. """
 106 
 107     if OptionsInfo["MPMode"]:
 108         return ProcessMoleculesUsingMultipleProcesses(Mols, Writer)
 109     else:
 110         return ProcessMoleculesUsingSingleProcess(Mols, Writer)
 111 
 112 def ProcessMoleculesUsingSingleProcess(Mols,  Writer):
 113     """Process and remove salts from molecules using a single process. """
 114     
 115     MiscUtil.PrintInfo("\nRemoving salts...")
 116     
 117     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 118     
 119     # Set up a salt remover...
 120     Remover = SetupSaltRemover()
 121     
 122     (MolCount, ValidMolCount, SaltsMolCount) = [0] * 3
 123     for Mol in Mols:
 124         MolCount += 1
 125         
 126         if Mol is None:
 127             continue
 128         
 129         if RDKitUtil.IsMolEmpty(Mol):
 130             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 131             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 132             continue
 133         
 134         ValidMolCount += 1
 135         
 136         UnsaltedMol, SaltyStatus = RemoveMolSalts(Mol, Remover, MolCount)
 137         
 138         if SaltyStatus:
 139             SaltsMolCount += 1
 140 
 141         WriteMolecule(Writer, UnsaltedMol, Compute2DCoords)
 142     
 143     return (MolCount, ValidMolCount, SaltsMolCount)
 144     
 145 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
 146     """Process and remove salts from molecules using  multiprocessing."""
 147     
 148     MiscUtil.PrintInfo("\nRemoving salts using multiprocessing...")
 149     
 150     MPParams = OptionsInfo["MPParams"]
 151     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 152     
 153     # Setup data for initializing a worker process...
 154     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 155 
 156     # Setup a encoded mols data iterable for a worker process by pickling only public
 157     # and private molecule properties...
 158     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 159 
 160     # Setup process pool along with data initialization for each process...
 161     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 162     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 163     
 164     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 165     
 166     # Start processing...
 167     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 168         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 169     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 170         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 171     else:
 172         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 173     
 174     (MolCount, ValidMolCount, SaltsMolCount) = [0] * 3
 175     for Result in Results:
 176         MolCount += 1
 177         MolIndex, EncodedMol, SaltyStatus = Result
 178         
 179         if EncodedMol is None:
 180             continue
 181         ValidMolCount += 1
 182         
 183         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 184         
 185         if SaltyStatus:
 186             SaltsMolCount += 1
 187 
 188         WriteMolecule(Writer, Mol, Compute2DCoords)
 189     
 190     return (MolCount, ValidMolCount, SaltsMolCount)
 191 
 192 def InitializeWorkerProcess(*EncodedArgs):
 193     """Initialize data for a worker process."""
 194 
 195     global Options, OptionsInfo
 196     
 197     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 198 
 199     # Decode Options and OptionInfo...
 200     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 201     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 202 
 203     # Set up salt remover...
 204     OptionsInfo["SaltRemover"] = SetupSaltRemover()
 205 
 206 def WorkerProcess(EncodedMolInfo):
 207     """Process data for a worker process."""
 208     
 209     MolIndex, EncodedMol = EncodedMolInfo
 210     
 211     if EncodedMol is None:
 212         return [MolIndex, None, False]
 213         
 214     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 215     if RDKitUtil.IsMolEmpty(Mol):
 216         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 217         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 218         return [MolIndex, None, False]
 219         
 220     Mol, SaltyStatus = RemoveMolSalts(Mol, OptionsInfo["SaltRemover"], (MolIndex + 1))
 221     EncodedMol = RDKitUtil.MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps)
 222 
 223     return [MolIndex, EncodedMol, SaltyStatus]
 224     
 225 def RemoveMolSalts(Mol, Remover, MolNum):
 226     """Remove salts from mol and return unsalted mol along with mol salty status."""
 227 
 228     UnsaltedMol = Mol
 229     SaltyStatus = False
 230     
 231     if Remover is not None:
 232         KeptMol, DeletedMols = Remover.StripMolWithDeleted(Mol, dontRemoveEverything = False)
 233         if len(DeletedMols) >= 1:
 234             SaltyStatus = True
 235         if RDKitUtil.IsMolEmpty(KeptMol):
 236             if len(DeletedMols) >= 1:
 237                 # Take the larged fragment from DeletedMols
 238                 UnsaltedMol = GetLargestMol(DeletedMols)
 239     else:
 240         # Use largest fragment as unsalted molecule...
 241         MolFrags = Chem.GetMolFrags(Mol, asMols = True)
 242         if len(MolFrags) > 1:
 243             # Keep the largest fragment as unsalted molecule...
 244             SaltyStatus = True
 245             UnsaltedMol = GetLargestMol(MolFrags)
 246 
 247     if SaltyStatus:
 248         Chem.SanitizeMol(UnsaltedMol)
 249         MolName = RDKitUtil.GetMolName(Mol, MolNum)
 250         if len(MolName):
 251             UnsaltedMol.SetProp("_Name", MolName)
 252     
 253     return (UnsaltedMol, SaltyStatus)
 254 
 255 def GetLargestMol(Mols):
 256     """Get largest mol from list of mols"""
 257 
 258     LargestMol = None
 259     LargestMolSize = -1
 260     for Mol in Mols:
 261         Size = Mol.GetNumAtoms()
 262         if Size > LargestMolSize:
 263             LargestMol = Mol
 264             LargestMolSize = Size
 265 
 266     return LargestMol
 267 
 268 def SetupSaltRemover():
 269     """Setup a salt removerr."""
 270     
 271     Remover = None
 272     if OptionsInfo["SaltsByComponentsMode"]:
 273         return Remover
 274 
 275     return SaltRemover(defnFilename = OptionsInfo["SaltsFile"], defnData = OptionsInfo["SaltsSMARTS"], defnFormat = InputFormat.SMARTS)
 276 
 277 def WriteMolecule(Writer, Mol, Compute2DCoords):
 278     """Write out molecule."""
 279     
 280     if OptionsInfo["CountMode"]:
 281         return
 282     
 283     if Compute2DCoords:
 284         AllChem.Compute2DCoords(Mol)
 285     
 286     Writer.write(Mol)
 287 
 288 def SetupMoleculeWriter():
 289     """Setup a molecule writer."""
 290     
 291     Writer = None
 292     if OptionsInfo["CountMode"]:
 293         return Writer
 294 
 295     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 296     if Writer is None:
 297         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 298     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 299     
 300     return Writer
 301 
 302 def ProcessOptions():
 303     """Process and validate command line arguments and options"""
 304     
 305     MiscUtil.PrintInfo("Processing options...")
 306     
 307     # Validate options...
 308     ValidateOptions()
 309     
 310     OptionsInfo["Infile"] = Options["--infile"]
 311     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 312     
 313     OptionsInfo["Outfile"] = Options["--outfile"]
 314     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 315 
 316     OptionsInfo["Overwrite"] = Options["--overwrite"]
 317 
 318     OptionsInfo["CountMode"] = False
 319     if re.match("^count$", Options["--mode"], re.I):
 320         OptionsInfo["CountMode"] = True
 321         
 322     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 323     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 324 
 325     SaltsByComponentsMode = False
 326     SaltsBySMARTSFileMode = False
 327     SaltsBySMARTSMode = False
 328     if re.match("^ByComponent$", Options["--saltsMode"], re.I):
 329         SaltsByComponentsMode = True
 330     elif re.match("^BySMARTSFile$", Options["--saltsMode"], re.I):
 331         SaltsBySMARTSFileMode = False
 332     elif re.match("^BySMARTS$", Options["--saltsMode"], re.I):
 333         SaltsBySMARTSMode = True
 334     else:
 335         MiscUtil.PrintError("The salts mode specified, %s, using \"--saltsMode\" option is not valid." % Options["--saltsMode"])
 336     OptionsInfo["SaltsByComponentsMode"]  = SaltsByComponentsMode
 337     OptionsInfo["SaltsBySMARTSFileMode"]  = SaltsBySMARTSFileMode
 338     OptionsInfo["SaltsBySMARTSMode"]  = SaltsBySMARTSMode
 339 
 340     SaltsFile = None
 341     if re.match("^BySMARTSFile$", Options["--saltsMode"], re.I):
 342         if not re.match("^auto$", Options["--saltsFile"], re.I):
 343             SaltsFile = Options["--saltsFile"]
 344     OptionsInfo["SaltsFile"] = SaltsFile
 345     
 346     SaltsSMARTS = None
 347     if re.match("^BySMARTS$", Options["--saltsMode"], re.I):
 348         if not Options["--saltsSMARTS"]:
 349             MiscUtil.PrintError("No salts SMARTS pattern specified using \"--saltsSMARTS\" option during \"BySMARTS\" value of \"-s, --saltsMode\" option")
 350         SaltsSMARTS = Options["--saltsSMARTS"].strip(" ")
 351         if not len(SaltsSMARTS):
 352             MiscUtil.PrintError("Empty SMARTS pattern specified using \"--saltsSMARTS\" option during \"BySMARTS\" value of \"-s, --saltsMode\" option")
 353         if re.search(" ", SaltsSMARTS):
 354             SaltsSMARTS = re.sub('[ ]+', '\n', SaltsSMARTS)
 355         
 356     OptionsInfo["SaltsSMARTS"] = SaltsSMARTS
 357     
 358 def RetrieveOptions():
 359     """Retrieve command line arguments and options"""
 360     
 361     # Get options...
 362     global Options
 363     Options = docopt(_docoptUsage_)
 364     
 365     # Set current working directory to the specified directory...
 366     WorkingDir = Options["--workingdir"]
 367     if WorkingDir:
 368         os.chdir(WorkingDir)
 369     
 370     # Handle examples option...
 371     if "--examples" in Options and Options["--examples"]:
 372         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 373         sys.exit(0)
 374 
 375 def ValidateOptions():
 376     """Validate option values"""
 377     
 378     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 379     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 380     
 381     if Options["--outfile"]:
 382         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 383         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 384         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 385 
 386     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count")
 387     if re.match("^remove$", Options["--mode"], re.I):
 388         if not Options["--outfile"]:
 389             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"remove\" value of \"-m, --mode\" option")
 390     
 391     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 392     
 393     MiscUtil.ValidateOptionTextValue("--saltsMode", Options["--saltsMode"], "ByComponent BySMARTSFile BySMARTS")
 394     
 395     if re.match("^BySMARTSFile$", Options["--saltsMode"], re.I):
 396         if not re.match("^auto$", Options["--saltsFile"], re.I):
 397             MiscUtil.ValidateOptionFilePath("--saltsFile", Options["--saltsFile"])
 398 
 399 # Setup a usage string for docopt...
 400 _docoptUsage_ = """
 401 RDKitRemoveSalts.py - Remove salts
 402 
 403 Usage:
 404     RDKitRemoveSalts.py  [--infileParams <Name,Value,...>] [--mode <remove or count>]
  405                          [--mp <yes or no>] [--mpParams <Name,Value,...>] [--outfileParams <Name,Value,...> ]
 406                          [--overwrite] [--saltsMode <ByComponent, BySMARTSFile, BySMARTS>]
 407                          [--saltsFile <FileName or auto>] [--saltsSMARTS <SMARTS>]
 408                          [-w <dir>] [-o <outfile>]  -i <infile>
 409     RDKitRemoveSalts.py -h | --help | -e | --examples
 410 
 411 Description:
 412     Remove salts from molecules or simply count the number of molecules containing
 413     salts. Salts are identified and removed based on either SMARTS strings or by selecting
 414     the largest disconnected components in molecules as non-salt portion of molecules.
 415 
  416     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt)
 417 
 418     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 419 
 420 Options:
 421     -e, --examples
 422         Print examples.
 423     -h, --help
 424         Print this help message.
 425     -i, --infile <infile>
 426         Input file name.
 427     --infileParams <Name,Value,...>  [default: auto]
 428         A comma delimited list of parameter name and value pairs for reading
 429         molecules from files. The supported parameter names for different file
 430         formats, along with their default values, are shown below:
 431             
 432             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 433             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 434                 smilesTitleLine,auto,sanitize,yes
 435             
 436         Possible values for smilesDelimiter: space, comma or tab.
 437     -m, --mode <remove or count>  [default: remove]
 438         Specify whether to remove salts from molecules and write out molecules
  439         or simply count the number of molecules containing salts.
 440     --mp <yes or no>  [default: no]
 441         Use multiprocessing.
 442          
 443         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 444         function employing lazy RDKit data iterable. This allows processing of
  445         arbitrarily large data sets without any additional memory requirements.
 446         
 447         All input data may be optionally loaded into memory by mp.Pool.map()
 448         before starting worker processes in a process pool by setting the value
 449         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 450         
 451         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 452         data mode may adversely impact the performance. The '--mpParams' section
 453         provides additional information to tune the value of 'chunkSize'.
 454     --mpParams <Name,Value,...>  [default: auto]
  455         A comma delimited list of parameter name and value pairs to
 456         configure multiprocessing.
 457         
 458         The supported parameter names along with their default and possible
 459         values are shown below:
 460         
 461             chunkSize, auto
 462             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 463             numProcesses, auto   [ Default: mp.cpu_count() ]
 464         
 465         These parameters are used by the following functions to configure and
 466         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 467         mp.Pool.imap().
 468         
 469         The chunkSize determines chunks of input data passed to each worker
 470         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 471         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 472         
 473         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 474         automatically converts RDKit data iterable into a list, loads all data into
 475         memory, and calculates the default chunkSize using the following method
 476         as shown in its code:
 477         
 478             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 479             if extra: chunkSize += 1
 480         
 481         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 482         and 100 data items.
 483         
 484         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 485         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 486         data into memory. Consequently, the size of input data is not known a priori.
 487         It's not possible to estimate an optimal value for the chunkSize. The default 
 488         chunkSize is set to 1.
 489         
 490         The default value for the chunkSize during 'Lazy' data mode may adversely
 491         impact the performance due to the overhead associated with exchanging
 492         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 493         a larger value during 'Lazy' input data mode, based on the size of your input
 494         data and number of processes in the process pool.
 495         
 496         The mp.Pool.map() function waits for all worker processes to process all
 497         the data and return the results. The mp.Pool.imap() function, however,
  498         returns the results obtained from worker processes as soon as the
 499         results become available for specified chunks of data.
 500         
 501         The order of data in the results returned by both mp.Pool.map() and 
 502         mp.Pool.imap() functions always corresponds to the input data.
 503     -o, --outfile <outfile>
 504         Output file name.
 505     --outfileParams <Name,Value,...>  [default: auto]
 506         A comma delimited list of parameter name and value pairs for writing
 507         molecules to files. The supported parameter names for different file
 508         formats, along with their default values, are shown below:
 509             
 510             SD: compute2DCoords,auto,kekulize,no
 511             SMILES: kekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 512                 smilesTitleLine,yes
 513             
 514         Default value for compute2DCoords: yes for SMILES input file; no for all other
 515         file types.
 516     --overwrite
 517         Overwrite existing files.
 518     -s, --saltsMode <ByComponent, BySMARTSFile, BySMARTS>  [default: ByComponent]
 519         Specify whether to identify and remove salts based on SMARTS strings or
 520         by selecting the largest disconnected component as non-salt portion of a
 521         molecule. Possible values: ByComponent, BySMARTSFile or BySMARTS.
 522     --saltsFile <FileName or auto>  [default: auto]
 523         Specify a file name containing specification for SMARTS corresponding to salts or
 524         use default salts file, Salts.txt, available in RDKit data directory. This option is only
 525         used during 'BySMARTSFile' value of '-s, --saltsMode' option.
 526         
 527         RDKit data format: Smarts<tab>Name(optional)
 528         
 529         For example:
 530             
 531             [Cl,Br,I]
 532             [N](=O)(O)O
 533             [CH3]C(=O)O	  Acetic acid
 534             
 535     --saltsSMARTS <SMARTS text>
 536         Space delimited SMARTS specifications to use for salts identification instead
  537         of their specifications in '--saltsFile'. This option is only used during 'BySMARTS'
 538         value of '-s, --saltsMode' option.
 539     -w, --workingdir <dir>
 540         Location of working directory which defaults to the current directory.
 541 
 542 Examples:
 543     To remove salts from molecules in a SMILES file by keeping largest disconnected
 544     components as non-salt portion of molecules and write out a SMILES file, type:
 545 
 546         % RDKitRemoveSalts.py -i Sample.smi -o SampleOut.smi
 547 
 548     To remove salts from molecules in a SMILES file by keeping largest disconnected
 549     components as non-salt portion of molecules, perform salt removal in multiprocessing
 550     mode on all available CPUs without loading all data into memory, and write out a
 551     SMILES file, type:
 552 
 553         % RDKitRemoveSalts.py --mp yes -i Sample.smi -o SampleOut.smi
 554 
 555     To remove salts from molecules in a SMILES file by keeping largest disconnected
 556     components as non-salt portion of molecules, perform salt removal in multiprocessing
 557     mode on all available CPUs by loading all data into memory, and write out a
 558     SMILES file, type:
 559 
 560         % RDKitRemoveSalts.py --mp yes --mpParams "inputDataMode,InMemory"
 561           -i Sample.smi -o SampleOut.smi
 562 
 563     To remove salts from molecules in a SMILES file by keeping largest disconnected
 564     components as non-salt portion of molecules, perform salt removal in multiprocessing
 565     mode on specific number of CPUs and chunk size without loading all data into memory,
 566     and write out a SMILES file, type:
 567 
 568         % RDKitRemoveSalts.py --mp yes --mpParams "inputDataMode,Lazy,
 569           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi
 570 
  571     To count the number of molecules containing salts in a SD file, using largest
 572     components as non-salt portion of molecules, without generating any output
 573     file, type:
 574 
 575         % RDKitRemoveSalts.py -m count -i Sample.sdf
 576 
 577     To remove salts from molecules in a SMILES file using SMARTS strings in default
 578     Salts.txt distributed with RDKit to identify salts and write out a SMILES file, type:
 579 
 580         % RDKitRemoveSalts.py -m remove -s BySMARTSFile -i Sample.smi
 581           -o SampleOut.smi
 582 
 583     To remove salts from molecules in a SD file using SMARTS strings in a local
 584     CustomSalts.txt to identify salts and write out a SMILES file, type:
 585 
 586         % RDKitRemoveSalts.py -m remove -s BySMARTSFile --saltsFile
 587           CustomSalts.txt -i Sample.sdf -o SampleOut.smi
 588 
 589     To remove salts from molecules in a SD file using specified SMARTS to identify
  590     salts and write out a SMILES file, type:
 591 
 592         % RDKitRemoveSalts.py -m remove -s BySMARTS  --saltsSMARTS
 593           '[Cl,Br,I]  [N](=O)(O)O [N](=O)(O)O'
 594           -i Sample.sdf -o SampleOut.smi
 595 
  596     To remove salts from molecules in a CSV SMILES file, SMILES strings in column 1,
 597     name in column 2, and generate output SD file, type:
 598 
 599         % RDKitRemoveSalts.py --infileParams 
 600           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 601           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 602           -i SampleSMILES.csv -o SampleOut.sdf
 603 
 604 Author:
 605     Manish Sud(msud@san.rr.com)
 606 
 607 See also:
 608     RDKitConvertFileFormat.py, RDKitRemoveDuplicateMolecules.py,
 609     RDKitSearchFunctionalGroups.py, RDKitSearchSMARTS.py
 610 
 611 Copyright:
 612     Copyright (C) 2019 Manish Sud. All rights reserved.
 613 
 614     The functionality available in this script is implemented using RDKit, an
 615     open source toolkit for cheminformatics developed by Greg Landrum.
 616 
 617     This file is part of MayaChemTools.
 618 
 619     MayaChemTools is free software; you can redistribute it and/or modify it under
 620     the terms of the GNU Lesser General Public License as published by the Free
 621     Software Foundation; either version 3 of the License, or (at your option) any
 622     later version.
 623 
 624 """
 625 
# Run the script only when this file is executed directly rather than imported.
if __name__ == "__main__":
    main()