1 #!/bin/env python 2 # 3 # File: RDKitRemoveSalts.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2023 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem.SaltRemover import SaltRemover 43 from rdkit.Chem.SaltRemover import InputFormat 44 from rdkit.Chem import AllChem 45 except ImportError as ErrMsg: 46 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 47 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 48 sys.exit(1) 49 50 # MayaChemTools imports... 
try:
    from docopt import docopt
    import MiscUtil
    import RDKitUtil
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
    sys.exit(1)

ScriptName = os.path.basename(sys.argv[0])

# Options holds the raw docopt result (set by RetrieveOptions()); OptionsInfo
# holds the processed values (filled by ProcessOptions()). Worker processes
# receive their own copies of both via InitializeWorkerProcess().
Options = {}
OptionsInfo = {}

def main():
    """Start execution of the script.

    Retrieves and processes command line options, removes (or counts) salts,
    and reports the elapsed time.
    """

    MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))

    (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()

    # Retrieve command line arguments and options...
    RetrieveOptions()

    # Process and validate command line arguments and options...
    ProcessOptions()

    # Perform actions required by the script...
    RemoveSalts()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
    MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))

def RemoveSalts():
    """Identify and remove salts from molecules in the input file.

    Reads molecules from OptionsInfo["Infile"], processes them in single or
    multiprocessing mode, writes the results unless running in count mode,
    and prints summary counts.
    """

    # Setup a molecule reader...
    MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
    Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])

    # Set up a molecule writer; it is None during count mode...
    Writer = SetupMoleculeWriter()

    # Close the writer even when processing fails so that the molecules
    # written so far are not left in an unclosed output file...
    try:
        MolCount, ValidMolCount, SaltsMolCount = ProcessMolecules(Mols, Writer)
    finally:
        if Writer is not None:
            Writer.close()

    MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
    MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
    MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))

    MiscUtil.PrintInfo("\nNumber of molecules containing salts: %d" % (SaltsMolCount))

def ProcessMolecules(Mols, Writer):
    """Process and remove salts from molecules.

    Dispatches to the multiprocessing or the single process implementation
    based on OptionsInfo["MPMode"].

    Returns:
        tuple: (MolCount, ValidMolCount, SaltsMolCount)
    """

    if OptionsInfo["MPMode"]:
        return ProcessMoleculesUsingMultipleProcesses(Mols, Writer)

    return ProcessMoleculesUsingSingleProcess(Mols, Writer)

def ProcessMoleculesUsingSingleProcess(Mols, Writer):
    """Process and remove salts from molecules using a single process.

    Arguments:
        Mols: Iterable of RDKit molecules; None entries are counted but skipped.
        Writer: Molecule writer or None during count mode.

    Returns:
        tuple: (MolCount, ValidMolCount, SaltsMolCount)
    """

    MiscUtil.PrintInfo("\nRemoving salts...")

    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
    SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]

    # Set up a salt remover...
    Remover = SetupSaltRemover()

    (MolCount, ValidMolCount, SaltsMolCount) = [0] * 3
    FirstMol = True
    for Mol in Mols:
        MolCount += 1

        if Mol is None:
            continue

        if RDKitUtil.IsMolEmpty(Mol):
            MolName = RDKitUtil.GetMolName(Mol, MolCount)
            MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
            continue

        ValidMolCount += 1
        if FirstMol:
            FirstMol = False
            # The first valid molecule is handed to the writer before any
            # write; there is nothing to configure when no writer exists...
            if SetSMILESMolProps and Writer is not None:
                RDKitUtil.SetWriterMolProps(Writer, Mol)

        UnsaltedMol, SaltyStatus = RemoveMolSalts(Mol, Remover, MolCount)

        if SaltyStatus:
            SaltsMolCount += 1

        WriteMolecule(Writer, UnsaltedMol, Compute2DCoords)

    return (MolCount, ValidMolCount, SaltsMolCount)

def ProcessMoleculesUsingMultipleProcesses(Mols, Writer):
    """Process and remove salts from molecules using multiprocessing.

    Molecules are shipped to the worker processes as base64 encoded strings
    and the results are collected and written out by the parent process.

    Arguments:
        Mols: Iterable of RDKit molecules.
        Writer: Molecule writer or None during count mode.

    Returns:
        tuple: (MolCount, ValidMolCount, SaltsMolCount)
    """

    MiscUtil.PrintInfo("\nRemoving salts using multiprocessing...")

    MPParams = OptionsInfo["MPParams"]
    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
    SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]

    # Validate the input data mode before any worker process is started...
    LazyMode = bool(re.match("^Lazy$", MPParams["InputDataMode"], re.I))
    if not LazyMode and not re.match("^InMemory$", MPParams["InputDataMode"], re.I):
        MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))

    # Setup data for initializing a worker process...
    InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))

    # Setup a encoded mols data iterable for a worker process by pickling only public
    # and private molecule properties...
    WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)

    # Setup process pool along with data initialization for each process...
    MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if LazyMode else "mp.Pool.map()"))
    MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))

    ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)

    (MolCount, ValidMolCount, SaltsMolCount) = [0] * 3
    try:
        # Start processing...
        if LazyMode:
            Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
        else:
            Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])

        FirstMol = True
        for Result in Results:
            MolCount += 1
            _, EncodedMol, SaltyStatus = Result

            # Workers return None for missing or empty molecules...
            if EncodedMol is None:
                continue
            ValidMolCount += 1

            Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)

            if FirstMol:
                FirstMol = False
                if SetSMILESMolProps and Writer is not None:
                    RDKitUtil.SetWriterMolProps(Writer, Mol)

            if SaltyStatus:
                SaltsMolCount += 1

            WriteMolecule(Writer, Mol, Compute2DCoords)
    finally:
        # Release the worker processes. On the normal path all results have
        # already been received; on failure, the pending work is abandoned...
        ProcessPool.terminate()
        ProcessPool.join()

    return (MolCount, ValidMolCount, SaltsMolCount)

def InitializeWorkerProcess(*EncodedArgs):
    """Initialize data for a worker process.

    Arguments:
        EncodedArgs: Base64 encoded Options and OptionsInfo, in that order.
    """

    global Options, OptionsInfo

    MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())

    # Decode Options and OptionInfo...
    Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
    OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])

    # Set up salt remover once per worker process...
    OptionsInfo["SaltRemover"] = SetupSaltRemover()

def WorkerProcess(EncodedMolInfo):
    """Process data for a worker process.

    Arguments:
        EncodedMolInfo: Pair of (MolIndex, EncodedMol); EncodedMol may be None.

    Returns:
        list: [MolIndex, EncodedMol, SaltyStatus], where EncodedMol is None
            for a missing or empty molecule.
    """

    MolIndex, EncodedMol = EncodedMolInfo

    if EncodedMol is None:
        return [MolIndex, None, False]

    Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
    if RDKitUtil.IsMolEmpty(Mol):
        MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
        MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
        return [MolIndex, None, False]

    Mol, SaltyStatus = RemoveMolSalts(Mol, OptionsInfo["SaltRemover"], (MolIndex + 1))
    EncodedMol = RDKitUtil.MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps)

    return [MolIndex, EncodedMol, SaltyStatus]

def RemoveMolSalts(Mol, Remover, MolNum):
    """Remove salts from mol and return unsalted mol along with mol salty status.

    Arguments:
        Mol: RDKit molecule to process.
        Remover: SaltRemover to use, or None to keep the largest disconnected
            fragment as the unsalted molecule.
        MolNum: Number of the molecule, passed to RDKitUtil.GetMolName().

    Returns:
        tuple: (UnsaltedMol, SaltyStatus). UnsaltedMol is Mol itself when no
            salts are found.
    """

    UnsaltedMol = Mol
    SaltyStatus = False

    if Remover is not None:
        KeptMol, DeletedMols = Remover.StripMolWithDeleted(Mol, dontRemoveEverything = False)
        if len(DeletedMols) >= 1:
            SaltyStatus = True
            if RDKitUtil.IsMolEmpty(KeptMol):
                # Everything matched a salt definition; take the largest
                # entry from DeletedMols instead of returning an empty mol.
                # NOTE(review): RDKit appears to report the matched salt
                # definitions, not the removed fragments, here - confirm.
                UnsaltedMol = GetLargestMol(DeletedMols)
            else:
                # Use the stripped molecule; without this assignment the
                # original molecule, salts included, would be returned...
                UnsaltedMol = KeptMol
    else:
        # Use largest fragment as unsalted molecule...
        MolFrags = Chem.GetMolFrags(Mol, asMols = True)
        if len(MolFrags) > 1:
            SaltyStatus = True
            UnsaltedMol = GetLargestMol(MolFrags)

    if SaltyStatus:
        Chem.SanitizeMol(UnsaltedMol)
        # Carry the name of the input molecule over to the new molecule...
        MolName = RDKitUtil.GetMolName(Mol, MolNum)
        if len(MolName):
            UnsaltedMol.SetProp("_Name", MolName)

    return (UnsaltedMol, SaltyStatus)

def GetLargestMol(Mols):
    """Get largest mol from list of mols.

    Size is the atom count reported by GetNumAtoms(). The first molecule wins
    a tie. Returns None for an empty list.
    """

    Mols = list(Mols)
    if not Mols:
        return None

    return max(Mols, key = lambda Mol: Mol.GetNumAtoms())

def SetupSaltRemover():
    """Setup a salt remover.

    Returns:
        SaltRemover built from OptionsInfo["SaltsFile"] and
        OptionsInfo["SaltsSMARTS"], or None during by component mode.
    """

    if OptionsInfo["SaltsByComponentsMode"]:
        return None

    return SaltRemover(defnFilename = OptionsInfo["SaltsFile"], defnData = OptionsInfo["SaltsSMARTS"], defnFormat = InputFormat.SMARTS)

def WriteMolecule(Writer, Mol, Compute2DCoords):
    """Write out molecule.

    Nothing is written during count mode. 2D coordinates are computed before
    writing when Compute2DCoords is set.
    """

    if OptionsInfo["CountMode"]:
        return

    if Compute2DCoords:
        AllChem.Compute2DCoords(Mol)

    Writer.write(Mol)

def SetupMoleculeWriter():
    """Setup a molecule writer.

    Returns:
        Writer for OptionsInfo["Outfile"], or None during count mode.
    """

    if OptionsInfo["CountMode"]:
        return None

    Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output file %s " % OptionsInfo["Outfile"])
    MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])

    return Writer

def ProcessOptions():
    """Process and validate command line arguments and options.

    Validates Options and stores the processed values in OptionsInfo.
    """

    MiscUtil.PrintInfo("Processing options...")

    # Validate options...
    ValidateOptions()

    OptionsInfo["Infile"] = Options["--infile"]
    OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])

    OptionsInfo["Outfile"] = Options["--outfile"]
    OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])

    OptionsInfo["Overwrite"] = Options["--overwrite"]

    OptionsInfo["CountMode"] = bool(re.match("^count$", Options["--mode"], re.I))

    OptionsInfo["MPMode"] = bool(re.match("^yes$", Options["--mp"], re.I))
    OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])

    SaltsMode = Options["--saltsMode"]
    SaltsByComponentsMode = bool(re.match("^ByComponent$", SaltsMode, re.I))
    SaltsBySMARTSFileMode = bool(re.match("^BySMARTSFile$", SaltsMode, re.I))
    SaltsBySMARTSMode = bool(re.match("^BySMARTS$", SaltsMode, re.I))
    if not (SaltsByComponentsMode or SaltsBySMARTSFileMode or SaltsBySMARTSMode):
        MiscUtil.PrintError("The salts mode specified, %s, using \"--saltsMode\" option is not valid." % Options["--saltsMode"])
    OptionsInfo["SaltsByComponentsMode"] = SaltsByComponentsMode
    OptionsInfo["SaltsBySMARTSFileMode"] = SaltsBySMARTSFileMode
    OptionsInfo["SaltsBySMARTSMode"] = SaltsBySMARTSMode

    # A salts file of None is passed on for the "auto" value...
    SaltsFile = None
    if SaltsBySMARTSFileMode:
        if not re.match("^auto$", Options["--saltsFile"], re.I):
            SaltsFile = Options["--saltsFile"]
    OptionsInfo["SaltsFile"] = SaltsFile

    SaltsSMARTS = None
    if SaltsBySMARTSMode:
        if not Options["--saltsSMARTS"]:
            MiscUtil.PrintError("No salts SMARTS pattern specified using \"--saltsSMARTS\" option during \"BySMARTS\" value of \"-s, --saltsMode\" option")
        SaltsSMARTS = Options["--saltsSMARTS"].strip(" ")
        if not len(SaltsSMARTS):
            MiscUtil.PrintError("Empty SMARTS pattern specified using \"--saltsSMARTS\" option during \"BySMARTS\" value of \"-s, --saltsMode\" option")
        # Convert space delimited SMARTS into one SMARTS per line before
        # handing them to the salt remover as definition data...
        if re.search(" ", SaltsSMARTS):
            SaltsSMARTS = re.sub('[ ]+', '\n', SaltsSMARTS)

    OptionsInfo["SaltsSMARTS"] = SaltsSMARTS

def RetrieveOptions():
    """Retrieve command line arguments and options.

    Parses the command line using docopt, changes to the working directory
    when one is specified, and handles the examples option.
    """

    # Get options...
    global Options
    Options = docopt(_docoptUsage_)

    # Set current working directory to the specified directory...
    WorkingDir = Options["--workingdir"]
    if WorkingDir:
        os.chdir(WorkingDir)

    # Handle examples option...
    if "--examples" in Options and Options["--examples"]:
        MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
        sys.exit(0)

def ValidateOptions():
    """Validate option values."""

    MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
    MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")

    if Options["--outfile"]:
        MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
        MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
        MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])

    MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count")
    # An output file is optional only during count mode...
    if re.match("^remove$", Options["--mode"], re.I):
        if not Options["--outfile"]:
            MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"remove\" value of \"-m, --mode\" option")

    MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")

    MiscUtil.ValidateOptionTextValue("--saltsMode", Options["--saltsMode"], "ByComponent BySMARTSFile BySMARTS")

    if re.match("^BySMARTSFile$", Options["--saltsMode"], re.I):
        if not re.match("^auto$", Options["--saltsFile"], re.I):
            MiscUtil.ValidateOptionFilePath("--saltsFile", Options["--saltsFile"])

# Setup a usage string for docopt...
# Usage text parsed by docopt and printed as help. Layout matters to docopt:
# the Usage section ends at the first blank line, and an option is separated
# from its "[default: ...]" by at least two spaces.
# NOTE(review): blank lines inside the Options section are padded with spaces
# because docopt 0.6.2 ends a section at the first truly empty line; keep the
# padding when editing - confirm against the bundled docopt version.
_docoptUsage_ = """
RDKitRemoveSalts.py - Remove salts

Usage:
    RDKitRemoveSalts.py  [--infileParams <Name,Value,...>] [--mode <remove or count>]
                         [--mp <yes or no>] [--mpParams <Name,Value,...>] [--outfileParams <Name,Value,...> ]
                         [--overwrite] [--saltsMode <ByComponent, BySMARTSFile, BySMARTS>]
                         [--saltsFile <FileName or auto>] [--saltsSMARTS <SMARTS>]
                         [-w <dir>] [-o <outfile>]  -i <infile>
    RDKitRemoveSalts.py -h | --help | -e | --examples

Description:
    Remove salts from molecules or simply count the number of molecules containing
    salts. Salts are identified and removed based on either SMARTS strings or by selecting
    the largest disconnected components in molecules as non-salt portion of molecules.

    The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt)

    The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)

Options:
    -e, --examples
        Print examples.
    -h, --help
        Print this help message.
    -i, --infile <infile>
        Input file name.
    --infileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for reading
        molecules from files. The supported parameter names for different file
        formats, along with their default values, are shown below:
    
            SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
            SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
                smilesTitleLine,auto,sanitize,yes
    
        Possible values for smilesDelimiter: space, comma or tab.
    -m, --mode <remove or count>  [default: remove]
        Specify whether to remove salts from molecules and write out molecules
        or simply count the number of molecules containing salts.
    --mp <yes or no>  [default: no]
        Use multiprocessing.
    
        By default, input data is retrieved in a lazy manner via mp.Pool.imap()
        function employing lazy RDKit data iterable. This allows processing of
        arbitrarily large data sets without any additional memory requirements.
    
        All input data may be optionally loaded into memory by mp.Pool.map()
        before starting worker processes in a process pool by setting the value
        of 'inputDataMode' to 'InMemory' in '--mpParams' option.
    
        A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
        data mode may adversely impact the performance. The '--mpParams' section
        provides additional information to tune the value of 'chunkSize'.
    --mpParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs to configure
        multiprocessing.
    
        The supported parameter names along with their default and possible
        values are shown below:
    
            chunkSize, auto
            inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
            numProcesses, auto   [ Default: mp.cpu_count() ]
    
        These parameters are used by the following functions to configure and
        control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
        mp.Pool.imap().
    
        The chunkSize determines chunks of input data passed to each worker
        process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
        The default value of chunkSize is dependent on the value of 'inputDataMode'.
    
        The mp.Pool.map() function, invoked during 'InMemory' input data mode,
        automatically converts RDKit data iterable into a list, loads all data into
        memory, and calculates the default chunkSize using the following method
        as shown in its code:
    
            chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
            if extra: chunkSize += 1
    
        For example, the default chunkSize will be 7 for a pool of 4 worker processes
        and 100 data items.
    
        The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
        'lazy' RDKit data iterable to retrieve data as needed, without loading all the
        data into memory. Consequently, the size of input data is not known a priori.
        It's not possible to estimate an optimal value for the chunkSize. The default
        chunkSize is set to 1.
    
        The default value for the chunkSize during 'Lazy' data mode may adversely
        impact the performance due to the overhead associated with exchanging
        small chunks of data. It is generally a good idea to explicitly set chunkSize to
        a larger value during 'Lazy' input data mode, based on the size of your input
        data and number of processes in the process pool.
    
        The mp.Pool.map() function waits for all worker processes to process all
        the data and return the results. The mp.Pool.imap() function, however,
        returns the results obtained from worker processes as soon as the
        results become available for specified chunks of data.
    
        The order of data in the results returned by both mp.Pool.map() and
        mp.Pool.imap() functions always corresponds to the input data.
    -o, --outfile <outfile>
        Output file name.
    --outfileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for writing
        molecules to files. The supported parameter names for different file
        formats, along with their default values, are shown below:
    
            SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
            SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
                smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
    
        Default value for compute2DCoords: yes for SMILES input file; no for all other
        file types.
    --overwrite
        Overwrite existing files.
    -s, --saltsMode <ByComponent, BySMARTSFile, BySMARTS>  [default: ByComponent]
        Specify whether to identify and remove salts based on SMARTS strings or
        by selecting the largest disconnected component as non-salt portion of a
        molecule. Possible values: ByComponent, BySMARTSFile or BySMARTS.
    --saltsFile <FileName or auto>  [default: auto]
        Specify a file name containing specification for SMARTS corresponding to salts or
        use default salts file, Salts.txt, available in RDKit data directory. This option is only
        used during 'BySMARTSFile' value of '-s, --saltsMode' option.
    
        RDKit data format: Smarts<tab>Name(optional)
    
        For example:
    
            [Cl,Br,I]
            [N](=O)(O)O
            [CH3]C(=O)O     Acetic acid
    
    --saltsSMARTS <SMARTS text>
        Space delimited SMARTS specifications to use for salts identification instead
        of their specifications in '--saltsFile'. This option is only used during 'BySMARTS'
        value of '-s, --saltsMode' option.
    -w, --workingdir <dir>
        Location of working directory which defaults to the current directory.

Examples:
    To remove salts from molecules in a SMILES file by keeping largest disconnected
    components as non-salt portion of molecules and write out a SMILES file, type:

        % RDKitRemoveSalts.py -i Sample.smi -o SampleOut.smi

    To remove salts from molecules in a SMILES file by keeping largest disconnected
    components as non-salt portion of molecules, perform salt removal in multiprocessing
    mode on all available CPUs without loading all data into memory, and write out a
    SMILES file, type:

        % RDKitRemoveSalts.py --mp yes -i Sample.smi -o SampleOut.smi

    To remove salts from molecules in a SMILES file by keeping largest disconnected
    components as non-salt portion of molecules, perform salt removal in multiprocessing
    mode on all available CPUs by loading all data into memory, and write out a
    SMILES file, type:

        % RDKitRemoveSalts.py --mp yes --mpParams "inputDataMode,InMemory"
          -i Sample.smi -o SampleOut.smi

    To remove salts from molecules in a SMILES file by keeping largest disconnected
    components as non-salt portion of molecules, perform salt removal in multiprocessing
    mode on specific number of CPUs and chunk size without loading all data into memory,
    and write out a SMILES file, type:

        % RDKitRemoveSalts.py --mp yes --mpParams "inputDataMode,Lazy,
          numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi

    To count number of molecules containing salts in a SD file, using largest
    components as non-salt portion of molecules, without generating any output
    file, type:

        % RDKitRemoveSalts.py -m count -i Sample.sdf

    To remove salts from molecules in a SMILES file using SMARTS strings in default
    Salts.txt distributed with RDKit to identify salts and write out a SMILES file, type:

        % RDKitRemoveSalts.py -m remove -s BySMARTSFile -i Sample.smi
          -o SampleOut.smi

    To remove salts from molecules in a SD file using SMARTS strings in a local
    CustomSalts.txt to identify salts and write out a SMILES file, type:

        % RDKitRemoveSalts.py -m remove -s BySMARTSFile --saltsFile
          CustomSalts.txt -i Sample.sdf -o SampleOut.smi

    To remove salts from molecules in a SD file using specified SMARTS to identify
    salts and write out a SD file, type:

        % RDKitRemoveSalts.py -m remove -s BySMARTS --saltsSMARTS
          '[Cl,Br,I] [N](=O)(O)O [N](=O)(O)O'
          -i Sample.sdf -o SampleOut.sdf

    To remove salts from molecules in a CSV SMILES file, SMILES strings in column 1,
    name in column 2, and generate output SD file, type:

        % RDKitRemoveSalts.py --infileParams
          "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
          smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
          -i SampleSMILES.csv -o SampleOut.sdf

Author:
    Manish Sud(msud@san.rr.com)

See also:
    RDKitConvertFileFormat.py, RDKitRemoveDuplicateMolecules.py,
    RDKitRemoveInvalidMolecules.py, RDKitSearchFunctionalGroups.py,
    RDKitSearchSMARTS.py, RDKitStandardizeMolecules.py

Copyright:
    Copyright (C) 2023 Manish Sud. All rights reserved.

    The functionality available in this script is implemented using RDKit, an
    open source toolkit for cheminformatics developed by Greg Landrum.

    This file is part of MayaChemTools.

    MayaChemTools is free software; you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option) any
    later version.

"""

if __name__ == "__main__":
    main()