1 #!/bin/env python 2 # 3 # File: RDKitRemoveSalts.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2025 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem.SaltRemover import SaltRemover 43 from rdkit.Chem.SaltRemover import InputFormat 44 from rdkit.Chem import AllChem 45 except ImportError as ErrMsg: 46 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 47 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 48 sys.exit(1) 49 50 # MayaChemTools imports... 51 try: 52 from docopt import docopt 53 import MiscUtil 54 import RDKitUtil 55 except ImportError as ErrMsg: 56 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 57 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 58 sys.exit(1) 59 60 ScriptName = os.path.basename(sys.argv[0]) 61 Options = {} 62 OptionsInfo = {} 63 64 def main(): 65 """Start execution of the script.""" 66 67 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 68 69 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 70 71 # Retrieve command line arguments and options... 72 RetrieveOptions() 73 74 # Process and validate command line arguments and options... 75 ProcessOptions() 76 77 # Perform actions required by the script... 78 RemoveSalts() 79 80 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 81 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 82 83 def RemoveSalts(): 84 """Identify and remove salts from molecules.""" 85 86 # Setup a molecule reader... 87 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"]) 88 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"]) 89 90 # Set up a molecule writer... 91 Writer = SetupMoleculeWriter() 92 93 MolCount, ValidMolCount, SaltsMolCount = ProcessMolecules(Mols, Writer) 94 95 if Writer is not None: 96 Writer.close() 97 98 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 99 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 100 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount)) 101 102 MiscUtil.PrintInfo("\nNumber of molecules containing salts: %d" % (SaltsMolCount)) 103 104 def ProcessMolecules(Mols, Writer): 105 """Process and remove salts from molecules.""" 106 107 if OptionsInfo["MPMode"]: 108 return ProcessMoleculesUsingMultipleProcesses(Mols, Writer) 109 else: 110 return ProcessMoleculesUsingSingleProcess(Mols, Writer) 111 112 def ProcessMoleculesUsingSingleProcess(Mols, Writer): 113 """Process and remove salts from molecules using a single process.""" 114 115 MiscUtil.PrintInfo("\nRemoving salts...") 116 117 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 118 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 119 120 # Set up a salt remover... 121 Remover = SetupSaltRemover() 122 123 (MolCount, ValidMolCount, SaltsMolCount) = [0] * 3 124 FirstMol = True 125 for Mol in Mols: 126 MolCount += 1 127 128 if Mol is None: 129 continue 130 131 if RDKitUtil.IsMolEmpty(Mol): 132 MolName = RDKitUtil.GetMolName(Mol, MolCount) 133 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 134 continue 135 136 ValidMolCount += 1 137 if FirstMol: 138 FirstMol = False 139 if SetSMILESMolProps: 140 RDKitUtil.SetWriterMolProps(Writer, Mol) 141 142 UnsaltedMol, SaltyStatus = RemoveMolSalts(Mol, Remover, MolCount) 143 144 if SaltyStatus: 145 SaltsMolCount += 1 146 147 WriteMolecule(Writer, UnsaltedMol, Compute2DCoords) 148 149 return (MolCount, ValidMolCount, SaltsMolCount) 150 151 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer): 152 """Process and remove salts from molecules using multiprocessing.""" 153 154 MiscUtil.PrintInfo("\nRemoving salts using multiprocessing...") 155 156 MPParams = OptionsInfo["MPParams"] 157 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 158 159 # Setup data for initializing a worker process... 160 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo)) 161 162 # Setup a encoded mols data iterable for a worker process by pickling only public 163 # and private molecule properties... 164 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols) 165 166 # Setup process pool along with data initialization for each process... 167 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 168 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 169 170 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs) 171 172 # Start processing... 173 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 174 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 175 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 176 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 177 else: 178 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 179 180 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 181 182 (MolCount, ValidMolCount, SaltsMolCount) = [0] * 3 183 FirstMol = True 184 for Result in Results: 185 MolCount += 1 186 MolIndex, EncodedMol, SaltyStatus = Result 187 188 if EncodedMol is None: 189 continue 190 ValidMolCount += 1 191 192 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 193 194 if FirstMol: 195 FirstMol = False 196 if SetSMILESMolProps: 197 RDKitUtil.SetWriterMolProps(Writer, Mol) 198 199 if SaltyStatus: 200 SaltsMolCount += 1 201 202 WriteMolecule(Writer, Mol, Compute2DCoords) 203 204 return (MolCount, ValidMolCount, SaltsMolCount) 205 206 def InitializeWorkerProcess(*EncodedArgs): 207 """Initialize data for a worker process.""" 208 209 global Options, OptionsInfo 210 211 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 212 213 # Decode Options and OptionInfo... 214 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 215 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 216 217 # Set up salt remover... 218 OptionsInfo["SaltRemover"] = SetupSaltRemover() 219 220 def WorkerProcess(EncodedMolInfo): 221 """Process data for a worker process.""" 222 223 MolIndex, EncodedMol = EncodedMolInfo 224 225 if EncodedMol is None: 226 return [MolIndex, None, False] 227 228 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 229 if RDKitUtil.IsMolEmpty(Mol): 230 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 231 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 232 return [MolIndex, None, False] 233 234 Mol, SaltyStatus = RemoveMolSalts(Mol, OptionsInfo["SaltRemover"], (MolIndex + 1)) 235 EncodedMol = RDKitUtil.MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps) 236 237 return [MolIndex, EncodedMol, SaltyStatus] 238 239 def RemoveMolSalts(Mol, Remover, MolNum): 240 """Remove salts from mol and return unsalted mol along with mol salty status.""" 241 242 UnsaltedMol = Mol 243 SaltyStatus = False 244 245 if Remover is not None: 246 KeptMol, DeletedMols = Remover.StripMolWithDeleted(Mol, dontRemoveEverything = False) 247 if len(DeletedMols) >= 1: 248 SaltyStatus = True 249 if RDKitUtil.IsMolEmpty(KeptMol): 250 if len(DeletedMols) >= 1: 251 # Take the larged fragment from DeletedMols 252 UnsaltedMol = GetLargestMol(DeletedMols) 253 else: 254 # Use largest fragment as unsalted molecule... 255 MolFrags = Chem.GetMolFrags(Mol, asMols = True) 256 if len(MolFrags) > 1: 257 # Keep the largest fragment as unsalted molecule... 258 SaltyStatus = True 259 UnsaltedMol = GetLargestMol(MolFrags) 260 261 if SaltyStatus: 262 Chem.SanitizeMol(UnsaltedMol) 263 MolName = RDKitUtil.GetMolName(Mol, MolNum) 264 if len(MolName): 265 UnsaltedMol.SetProp("_Name", MolName) 266 267 # Set mol properties... 268 for DataLabel in Mol.GetPropNames(includePrivate = False, includeComputed = False): 269 DataProp = Mol.GetProp(DataLabel) 270 UnsaltedMol.SetProp(DataLabel, DataProp) 271 272 return (UnsaltedMol, SaltyStatus) 273 274 def GetLargestMol(Mols): 275 """Get largest mol from list of mols.""" 276 277 LargestMol = None 278 LargestMolSize = -1 279 for Mol in Mols: 280 Size = Mol.GetNumAtoms() 281 if Size > LargestMolSize: 282 LargestMol = Mol 283 LargestMolSize = Size 284 285 return LargestMol 286 287 def SetupSaltRemover(): 288 """Setup a salt remover.""" 289 290 Remover = None 291 if OptionsInfo["SaltsByComponentsMode"]: 292 return Remover 293 294 return SaltRemover(defnFilename = OptionsInfo["SaltsFile"], defnData = OptionsInfo["SaltsSMARTS"], defnFormat = InputFormat.SMARTS) 295 296 def WriteMolecule(Writer, Mol, Compute2DCoords): 297 """Write out molecule.""" 298 299 if OptionsInfo["CountMode"]: 300 return 301 302 if Compute2DCoords: 303 AllChem.Compute2DCoords(Mol) 304 305 Writer.write(Mol) 306 307 def SetupMoleculeWriter(): 308 """Setup a molecule writer.""" 309 310 Writer = None 311 if OptionsInfo["CountMode"]: 312 return Writer 313 314 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 315 if Writer is None: 316 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 317 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"]) 318 319 return Writer 320 321 def ProcessOptions(): 322 """Process and validate command line arguments and options.""" 323 324 MiscUtil.PrintInfo("Processing options...") 325 326 # Validate options... 327 ValidateOptions() 328 329 OptionsInfo["Infile"] = Options["--infile"] 330 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 331 332 OptionsInfo["Outfile"] = Options["--outfile"] 333 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 334 335 OptionsInfo["Overwrite"] = Options["--overwrite"] 336 337 OptionsInfo["CountMode"] = False 338 if re.match("^count$", Options["--mode"], re.I): 339 OptionsInfo["CountMode"] = True 340 341 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 342 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 343 344 SaltsByComponentsMode = False 345 SaltsBySMARTSFileMode = False 346 SaltsBySMARTSMode = False 347 if re.match("^ByComponent$", Options["--saltsMode"], re.I): 348 SaltsByComponentsMode = True 349 elif re.match("^BySMARTSFile$", Options["--saltsMode"], re.I): 350 SaltsBySMARTSFileMode = False 351 elif re.match("^BySMARTS$", Options["--saltsMode"], re.I): 352 SaltsBySMARTSMode = True 353 else: 354 MiscUtil.PrintError("The salts mode specified, %s, using \"--saltsMode\" option is not valid." % Options["--saltsMode"]) 355 OptionsInfo["SaltsByComponentsMode"] = SaltsByComponentsMode 356 OptionsInfo["SaltsBySMARTSFileMode"] = SaltsBySMARTSFileMode 357 OptionsInfo["SaltsBySMARTSMode"] = SaltsBySMARTSMode 358 359 SaltsFile = None 360 if re.match("^BySMARTSFile$", Options["--saltsMode"], re.I): 361 if not re.match("^auto$", Options["--saltsFile"], re.I): 362 SaltsFile = Options["--saltsFile"] 363 OptionsInfo["SaltsFile"] = SaltsFile 364 365 SaltsSMARTS = None 366 if re.match("^BySMARTS$", Options["--saltsMode"], re.I): 367 if not Options["--saltsSMARTS"]: 368 MiscUtil.PrintError("No salts SMARTS pattern specified using \"--saltsSMARTS\" option during \"BySMARTS\" value of \"-s, --saltsMode\" option") 369 SaltsSMARTS = Options["--saltsSMARTS"].strip(" ") 370 if not len(SaltsSMARTS): 371 MiscUtil.PrintError("Empty SMARTS pattern specified using \"--saltsSMARTS\" option during \"BySMARTS\" value of \"-s, --saltsMode\" option") 372 if re.search(" ", SaltsSMARTS): 373 SaltsSMARTS = re.sub('[ ]+', '\n', SaltsSMARTS) 374 375 OptionsInfo["SaltsSMARTS"] = SaltsSMARTS 376 377 def RetrieveOptions(): 378 """Retrieve command line arguments and options.""" 379 380 # Get options... 381 global Options 382 Options = docopt(_docoptUsage_) 383 384 # Set current working directory to the specified directory... 385 WorkingDir = Options["--workingdir"] 386 if WorkingDir: 387 os.chdir(WorkingDir) 388 389 # Handle examples option... 390 if "--examples" in Options and Options["--examples"]: 391 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 392 sys.exit(0) 393 394 def ValidateOptions(): 395 """Validate option values.""" 396 397 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 398 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv") 399 400 if Options["--outfile"]: 401 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi") 402 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 403 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 404 405 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count") 406 if re.match("^remove$", Options["--mode"], re.I): 407 if not Options["--outfile"]: 408 MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"remove\" value of \"-m, --mode\" option") 409 410 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 411 412 MiscUtil.ValidateOptionTextValue("--saltsMode", Options["--saltsMode"], "ByComponent BySMARTSFile BySMARTS") 413 414 if re.match("^BySMARTSFile$", Options["--saltsMode"], re.I): 415 if not re.match("^auto$", Options["--saltsFile"], re.I): 416 MiscUtil.ValidateOptionFilePath("--saltsFile", Options["--saltsFile"]) 417 418 # Setup a usage string for docopt... 419 _docoptUsage_ = """ 420 RDKitRemoveSalts.py - Remove salts 421 422 Usage: 423 RDKitRemoveSalts.py [--infileParams <Name,Value,...>] [--mode <remove or count>] 424 [--mp <yes or no>] [--mpParams <Name,Value,...>] [--outfileParams <Name,Value,...> ] 425 [--overwrite] [--saltsMode <ByComponent, BySMARTSFile, BySMARTS>] 426 [--saltsFile <FileName or auto>] [--saltsSMARTS <SMARTS>] 427 [-w <dir>] [-o <outfile>] -i <infile> 428 RDKitRemoveSalts.py -h | --help | -e | --examples 429 430 Description: 431 Remove salts from molecules or simply count the number of molecules containing 432 salts. Salts are identified and removed based on either SMARTS strings or by selecting 433 the largest disconnected components in molecules as non-salt portion of molecules. 434 435 The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt) 436 437 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi) 438 439 Options: 440 -e, --examples 441 Print examples. 442 -h, --help 443 Print this help message. 444 -i, --infile <infile> 445 Input file name. 446 --infileParams <Name,Value,...> [default: auto] 447 A comma delimited list of parameter name and value pairs for reading 448 molecules from files. The supported parameter names for different file 449 formats, along with their default values, are shown below: 450 451 SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes 452 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 453 smilesTitleLine,auto,sanitize,yes 454 455 Possible values for smilesDelimiter: space, comma or tab. 456 -m, --mode <remove or count> [default: remove] 457 Specify whether to remove salts from molecules and write out molecules 458 or or simply count the number of molecules containing salts. 459 --mp <yes or no> [default: no] 460 Use multiprocessing. 461 462 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 463 function employing lazy RDKit data iterable. This allows processing of 464 arbitrary large data sets without any additional requirements memory. 465 466 All input data may be optionally loaded into memory by mp.Pool.map() 467 before starting worker processes in a process pool by setting the value 468 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 469 470 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 471 data mode may adversely impact the performance. The '--mpParams' section 472 provides additional information to tune the value of 'chunkSize'. 473 --mpParams <Name,Value,...> [default: auto] 474 A comma delimited list of parameter name and value pairs to configure 475 multiprocessing. 476 477 The supported parameter names along with their default and possible 478 values are shown below: 479 480 chunkSize, auto 481 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 482 numProcesses, auto [ Default: mp.cpu_count() ] 483 484 These parameters are used by the following functions to configure and 485 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 486 mp.Pool.imap(). 487 488 The chunkSize determines chunks of input data passed to each worker 489 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 490 The default value of chunkSize is dependent on the value of 'inputDataMode'. 491 492 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 493 automatically converts RDKit data iterable into a list, loads all data into 494 memory, and calculates the default chunkSize using the following method 495 as shown in its code: 496 497 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 498 if extra: chunkSize += 1 499 500 For example, the default chunkSize will be 7 for a pool of 4 worker processes 501 and 100 data items. 502 503 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 504 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 505 data into memory. Consequently, the size of input data is not known a priori. 506 It's not possible to estimate an optimal value for the chunkSize. The default 507 chunkSize is set to 1. 508 509 The default value for the chunkSize during 'Lazy' data mode may adversely 510 impact the performance due to the overhead associated with exchanging 511 small chunks of data. It is generally a good idea to explicitly set chunkSize to 512 a larger value during 'Lazy' input data mode, based on the size of your input 513 data and number of processes in the process pool. 514 515 The mp.Pool.map() function waits for all worker processes to process all 516 the data and return the results. The mp.Pool.imap() function, however, 517 returns the the results obtained from worker processes as soon as the 518 results become available for specified chunks of data. 519 520 The order of data in the results returned by both mp.Pool.map() and 521 mp.Pool.imap() functions always corresponds to the input data. 522 -o, --outfile <outfile> 523 Output file name. 524 --outfileParams <Name,Value,...> [default: auto] 525 A comma delimited list of parameter name and value pairs for writing 526 molecules to files. The supported parameter names for different file 527 formats, along with their default values, are shown below: 528 529 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 530 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 531 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no 532 533 Default value for compute2DCoords: yes for SMILES input file; no for all other 534 file types. 535 --overwrite 536 Overwrite existing files. 537 -s, --saltsMode <ByComponent, BySMARTSFile, BySMARTS> [default: ByComponent] 538 Specify whether to identify and remove salts based on SMARTS strings or 539 by selecting the largest disconnected component as non-salt portion of a 540 molecule. Possible values: ByComponent, BySMARTSFile or BySMARTS. 541 --saltsFile <FileName or auto> [default: auto] 542 Specify a file name containing specification for SMARTS corresponding to salts or 543 use default salts file, Salts.txt, available in RDKit data directory. This option is only 544 used during 'BySMARTSFile' value of '-s, --saltsMode' option. 545 546 RDKit data format: Smarts<tab>Name(optional) 547 548 For example: 549 550 [Cl,Br,I] 551 [N](=O)(O)O 552 [CH3]C(=O)O Acetic acid 553 554 --saltsSMARTS <SMARTS text> 555 Space delimited SMARTS specifications to use for salts identification instead 556 their specifications in '--saltsFile'. This option is only used during 'BySMARTS' 557 value of '-s, --saltsMode' option. 558 -w, --workingdir <dir> 559 Location of working directory which defaults to the current directory. 560 561 Examples: 562 To remove salts from molecules in a SMILES file by keeping largest disconnected 563 components as non-salt portion of molecules and write out a SMILES file, type: 564 565 % RDKitRemoveSalts.py -i Sample.smi -o SampleOut.smi 566 567 To remove salts from molecules in a SMILES file by keeping largest disconnected 568 components as non-salt portion of molecules, perform salt removal in multiprocessing 569 mode on all available CPUs without loading all data into memory, and write out a 570 SMILES file, type: 571 572 % RDKitRemoveSalts.py --mp yes -i Sample.smi -o SampleOut.smi 573 574 To remove salts from molecules in a SMILES file by keeping largest disconnected 575 components as non-salt portion of molecules, perform salt removal in multiprocessing 576 mode on all available CPUs by loading all data into memory, and write out a 577 SMILES file, type: 578 579 % RDKitRemoveSalts.py --mp yes --mpParams "inputDataMode,InMemory" 580 -i Sample.smi -o SampleOut.smi 581 582 To remove salts from molecules in a SMILES file by keeping largest disconnected 583 components as non-salt portion of molecules, perform salt removal in multiprocessing 584 mode on specific number of CPUs and chunk size without loading all data into memory, 585 and write out a SMILES file, type: 586 587 % RDKitRemoveSalts.py --mp yes --mpParams "inputDataMode,Lazy, 588 numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi 589 590 To count number of molecules containing salts from in a SD file, using largest 591 components as non-salt portion of molecules, without generating any output 592 file, type: 593 594 % RDKitRemoveSalts.py -m count -i Sample.sdf 595 596 To remove salts from molecules in a SMILES file using SMARTS strings in default 597 Salts.txt distributed with RDKit to identify salts and write out a SMILES file, type: 598 599 % RDKitRemoveSalts.py -m remove -s BySMARTSFile -i Sample.smi 600 -o SampleOut.smi 601 602 To remove salts from molecules in a SD file using SMARTS strings in a local 603 CustomSalts.txt to identify salts and write out a SMILES file, type: 604 605 % RDKitRemoveSalts.py -m remove -s BySMARTSFile --saltsFile 606 CustomSalts.txt -i Sample.sdf -o SampleOut.smi 607 608 To remove salts from molecules in a SD file using specified SMARTS to identify 609 salts and write out a SD file, type: 610 611 % RDKitRemoveSalts.py -m remove -s BySMARTS --saltsSMARTS 612 '[Cl,Br,I] [N](=O)(O)O [N](=O)(O)O' 613 -i Sample.sdf -o SampleOut.smi 614 615 To remove salts form molecules from a CSV SMILES file, SMILES strings in column 1, 616 name in column 2, and generate output SD file, type: 617 618 % RDKitRemoveSalts.py --infileParams 619 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 620 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 621 -i SampleSMILES.csv -o SampleOut.sdf 622 623 Author: 624 Manish Sud(msud@san.rr.com) 625 626 See also: 627 RDKitConvertFileFormat.py, RDKitRemoveDuplicateMolecules.py, 628 RDKitRemoveInvalidMolecules.py, RDKitSearchFunctionalGroups.py, 629 RDKitSearchSMARTS.py, RDKitStandardizeMolecules.py 630 631 Copyright: 632 Copyright (C) 2025 Manish Sud. All rights reserved. 633 634 The functionality available in this script is implemented using RDKit, an 635 open source toolkit for cheminformatics developed by Greg Landrum. 636 637 This file is part of MayaChemTools. 638 639 MayaChemTools is free software; you can redistribute it and/or modify it under 640 the terms of the GNU Lesser General Public License as published by the Free 641 Software Foundation; either version 3 of the License, or (at your option) any 642 later version. 643 644 """ 645 646 if __name__ == "__main__": 647 main()