1 #!/bin/env python 2 # 3 # File: RDKitSearchSMARTS.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem import AllChem 43 except ImportError as ErrMsg: 44 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 45 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 46 sys.exit(1) 47 48 # MayaChemTools imports... 
49 try: 50 from docopt import docopt 51 import MiscUtil 52 import RDKitUtil 53 except ImportError as ErrMsg: 54 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 55 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 56 sys.exit(1) 57 58 ScriptName = os.path.basename(sys.argv[0]) 59 Options = {} 60 OptionsInfo = {} 61 62 def main(): 63 """Start execution of the script.""" 64 65 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 66 67 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 68 69 # Retrieve command line arguments and options... 70 RetrieveOptions() 71 72 # Process and validate command line arguments and options... 73 ProcessOptions() 74 75 # Perform actions required by the script... 76 PerformSearch() 77 78 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 79 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 80 81 def PerformSearch(): 82 """Perform search using specified SMARTS pattern.""" 83 84 # Set up a pattern molecule... 85 PatternMol = Chem.MolFromSmarts(OptionsInfo["Pattern"]) 86 87 # Setup a molecule reader... 88 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"]) 89 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"]) 90 91 # Set up molecule writers... 
92 Writer, WriterFiltered = SetupMoleculeWriters() 93 94 MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, PatternMol, Writer, WriterFiltered) 95 96 if Writer is not None: 97 Writer.close() 98 if WriterFiltered is not None: 99 WriterFiltered.close() 100 101 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 102 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 103 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount)) 104 105 MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount) 106 MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount)) 107 108 def ProcessMolecules(Mols, PatternMol, Writer, WriterFiltered): 109 """Process and filter molecules.""" 110 111 if OptionsInfo["MPMode"]: 112 return ProcessMoleculesUsingMultipleProcesses(Mols, PatternMol, Writer, WriterFiltered) 113 else: 114 return ProcessMoleculesUsingSingleProcess(Mols, PatternMol, Writer, WriterFiltered) 115 116 def ProcessMoleculesUsingSingleProcess(Mols, PatternMol, Writer, WriterFiltered): 117 """Process and filter molecules using a single process.""" 118 119 NegateMatch = OptionsInfo["NegateMatch"] 120 OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"] 121 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 122 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 123 124 MiscUtil.PrintInfo("\nFiltering molecules...") 125 126 (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3 127 FirstMol = True 128 for Mol in Mols: 129 MolCount += 1 130 131 if Mol is None: 132 continue 133 134 if RDKitUtil.IsMolEmpty(Mol): 135 MolName = RDKitUtil.GetMolName(Mol, MolCount) 136 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 137 continue 138 139 ValidMolCount += 1 140 if FirstMol: 141 FirstMol = False 142 if SetSMILESMolProps: 143 if Writer is not None: 144 RDKitUtil.SetWriterMolProps(Writer, Mol) 145 if WriterFiltered is not 
None: 146 RDKitUtil.SetWriterMolProps(WriterFiltered, Mol) 147 148 MolMatched = DoesMoleculeContainsPattern(Mol, PatternMol) 149 if MolMatched != NegateMatch: 150 RemainingMolCount += 1 151 WriteMolecule(Writer, Mol, Compute2DCoords) 152 else: 153 if OutfileFilteredMode: 154 WriteMolecule(WriterFiltered, Mol, Compute2DCoords) 155 156 return (MolCount, ValidMolCount, RemainingMolCount) 157 158 def ProcessMoleculesUsingMultipleProcesses(Mols, PatternMol, Writer, WriterFiltered): 159 """Process and filter molecules using multiprocessing.""" 160 161 MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...") 162 163 MPParams = OptionsInfo["MPParams"] 164 NegateMatch = OptionsInfo["NegateMatch"] 165 OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"] 166 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 167 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 168 169 # Setup data for initializing a worker process... 170 MiscUtil.PrintInfo("Encoding options info and pattern molecule...") 171 OptionsInfo["EncodedPatternMol"] = RDKitUtil.MolToBase64EncodedMolString(PatternMol) 172 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo)) 173 174 # Setup a encoded mols data iterable for a worker process... 175 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols) 176 177 # Setup process pool along with data initialization for each process... 178 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." 
% ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 179 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 180 181 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs) 182 183 # Start processing... 184 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 185 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 186 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 187 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 188 else: 189 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 190 191 (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3 192 FirstMol = True 193 for Result in Results: 194 MolCount += 1 195 MolIndex, EncodedMol, MolMatched = Result 196 197 if EncodedMol is None: 198 continue 199 ValidMolCount += 1 200 201 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 202 203 if FirstMol: 204 FirstMol = False 205 if SetSMILESMolProps: 206 if Writer is not None: 207 RDKitUtil.SetWriterMolProps(Writer, Mol) 208 if WriterFiltered is not None: 209 RDKitUtil.SetWriterMolProps(WriterFiltered, Mol) 210 211 if MolMatched != NegateMatch: 212 RemainingMolCount += 1 213 WriteMolecule(Writer, Mol, Compute2DCoords) 214 else: 215 if OutfileFilteredMode: 216 WriteMolecule(WriterFiltered, Mol, Compute2DCoords) 217 218 return (MolCount, ValidMolCount, RemainingMolCount) 219 220 def InitializeWorkerProcess(*EncodedArgs): 221 """Initialize data for a worker process.""" 222 223 global Options, OptionsInfo 224 225 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 226 227 # Decode Options and OptionInfo... 
228 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 229 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 230 231 # Decode PatternMol... 232 OptionsInfo["PatternMol"] = RDKitUtil.MolFromBase64EncodedMolString(OptionsInfo["EncodedPatternMol"]) 233 234 def WorkerProcess(EncodedMolInfo): 235 """Process data for a worker process.""" 236 237 MolIndex, EncodedMol = EncodedMolInfo 238 239 if EncodedMol is None: 240 return [MolIndex, None, False] 241 242 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 243 if RDKitUtil.IsMolEmpty(Mol): 244 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 245 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 246 return [MolIndex, None, False] 247 248 MolMatched = DoesMoleculeContainsPattern(Mol, OptionsInfo["PatternMol"]) 249 250 return [MolIndex, EncodedMol, MolMatched] 251 252 def WriteMolecule(Writer, Mol, Compute2DCoords): 253 """Write out molecule.""" 254 255 if OptionsInfo["CountMode"]: 256 return 257 258 if Compute2DCoords: 259 AllChem.Compute2DCoords(Mol) 260 261 Writer.write(Mol) 262 263 def SetupMoleculeWriters(): 264 """Setup molecule writers.""" 265 266 Writer = None 267 WriterFiltered = None 268 269 if OptionsInfo["CountMode"]: 270 return (Writer, WriterFiltered) 271 272 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 273 if Writer is None: 274 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 275 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"]) 276 277 if OptionsInfo["OutfileFilteredMode"]: 278 WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"]) 279 if WriterFiltered is None: 280 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"]) 281 MiscUtil.PrintInfo("Generating file %s..." 
% OptionsInfo["OutfileFiltered"]) 282 283 return (Writer, WriterFiltered) 284 285 def DoesMoleculeContainsPattern(Mol, PatternMol): 286 """Check presence of pattern in the molecule.""" 287 288 return True if Mol.HasSubstructMatch(PatternMol, useChirality = OptionsInfo["UseChirality"]) else False 289 290 def ProcessOptions(): 291 """Process and validate command line arguments and options.""" 292 293 MiscUtil.PrintInfo("Processing options...") 294 295 # Validate options... 296 ValidateOptions() 297 298 OptionsInfo["Infile"] = Options["--infile"] 299 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 300 301 OptionsInfo["Outfile"] = Options["--outfile"] 302 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 303 304 OptionsInfo["OutfileFiltered"] = "" 305 if Options["--outfile"]: 306 FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"]) 307 OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt) 308 OptionsInfo["OutfileFiltered"] = OutfileFiltered 309 OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False 310 311 OptionsInfo["Overwrite"] = Options["--overwrite"] 312 313 OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False 314 OptionsInfo["NegateMatch"] = True if re.match("^yes$", Options["--negate"], re.I) else False 315 316 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 317 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 318 319 OptionsInfo["Pattern"] = Options["--pattern"] 320 OptionsInfo["UseChirality"] = True if re.match("^yes$", Options["--useChirality"], re.I) else False 321 322 def RetrieveOptions(): 323 """Retrieve command line arguments and options.""" 324 325 
# Get options... 326 global Options 327 Options = docopt(_docoptUsage_) 328 329 # Set current working directory to the specified directory... 330 WorkingDir = Options["--workingdir"] 331 if WorkingDir: 332 os.chdir(WorkingDir) 333 334 # Handle examples option... 335 if "--examples" in Options and Options["--examples"]: 336 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 337 sys.exit(0) 338 339 def ValidateOptions(): 340 """Validate option values.""" 341 342 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 343 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv") 344 if Options["--outfile"]: 345 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi") 346 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 347 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 348 349 MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no") 350 351 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "retrieve count") 352 if re.match("^retrieve$", Options["--mode"], re.I): 353 if not Options["--outfile"]: 354 MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"retrieve\" value of \"-m, --mode\" option") 355 356 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 357 MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no") 358 359 PatternMol = Chem.MolFromSmarts(Options["--pattern"]) 360 if PatternMol is None: 361 MiscUtil.PrintError("The value specified, %s, using option \"-p, --pattern\" is not a valid SMARTS: Failed to create pattern molecule" % Options["--pattern"]) 362 363 MiscUtil.ValidateOptionTextValue("--useChirality", Options["--useChirality"], "yes no") 364 365 # Setup a usage string for docopt... 
# Usage text parsed by docopt and shown as the help message. In the "Options:"
# section, an option and its "[default: ...]" value must be separated by at
# least two spaces for docopt to pick up the default.
_docoptUsage_ = """
RDKitSearchSMARTS.py - Perform a substructure search using SMARTS pattern

Usage:
    RDKitSearchSMARTS.py [--infileParams <Name,Value,...>] [--mode <retrieve or count>]
                         [--mp <yes or no>] [--mpParams <Name,Value,...>] [--negate <yes or no>]
                         [--outfileFiltered <yes or no>] [--outfileParams <Name,Value,...>] [--overwrite]
                         [--useChirality <yes or no>] [-w <dir>] [-o <outfile>] -p <SMARTS> -i <infile>
    RDKitSearchSMARTS.py -h | --help | -e | --examples

Description:
    Perform a substructure search in an input file using specified SMARTS pattern and
    write out the matched molecules to an output file or simply count the number
    of matches.

    The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt)

    The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)

Options:
    -e, --examples
        Print examples.
    -h, --help
        Print this help message.
    -i, --infile <infile>
        Input file name.
    --infileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for reading
        molecules from files. The supported parameter names for different file
        formats, along with their default values, are shown below:

            SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
            SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
                smilesTitleLine,auto,sanitize,yes

        Possible values for smilesDelimiter: space, comma or tab.
    -m, --mode <retrieve or count>  [default: retrieve]
        Specify whether to retrieve and write out matched molecules to an output
        file or simply count the number of matches.
    --mp <yes or no>  [default: no]
        Use multiprocessing.

        By default, input data is retrieved in a lazy manner via mp.Pool.imap()
        function employing lazy RDKit data iterable. This allows processing of
        arbitrary large data sets without any additional memory requirements.

        All input data may be optionally loaded into memory by mp.Pool.map()
        before starting worker processes in a process pool by setting the value
        of 'inputDataMode' to 'InMemory' in '--mpParams' option.

        A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
        data mode may adversely impact the performance. The '--mpParams' section
        provides additional information to tune the value of 'chunkSize'.
    --mpParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs to configure
        multiprocessing.

        The supported parameter names along with their default and possible
        values are shown below:

            chunkSize, auto
            inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
            numProcesses, auto   [ Default: mp.cpu_count() ]

        These parameters are used by the following functions to configure and
        control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
        mp.Pool.imap().

        The chunkSize determines chunks of input data passed to each worker
        process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
        The default value of chunkSize is dependent on the value of 'inputDataMode'.

        The mp.Pool.map() function, invoked during 'InMemory' input data mode,
        automatically converts RDKit data iterable into a list, loads all data into
        memory, and calculates the default chunkSize using the following method
        as shown in its code:

            chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
            if extra: chunkSize += 1

        For example, the default chunkSize will be 7 for a pool of 4 worker processes
        and 100 data items.

        The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
        'lazy' RDKit data iterable to retrieve data as needed, without loading all the
        data into memory. Consequently, the size of input data is not known a priori.
        It's not possible to estimate an optimal value for the chunkSize. The default
        chunkSize is set to 1.

        The default value for the chunkSize during 'Lazy' data mode may adversely
        impact the performance due to the overhead associated with exchanging
        small chunks of data. It is generally a good idea to explicitly set chunkSize to
        a larger value during 'Lazy' input data mode, based on the size of your input
        data and number of processes in the process pool.

        The mp.Pool.map() function waits for all worker processes to process all
        the data and return the results. The mp.Pool.imap() function, however,
        returns the results obtained from worker processes as soon as the
        results become available for specified chunks of data.

        The order of data in the results returned by both mp.Pool.map() and
        mp.Pool.imap() functions always corresponds to the input data.
    -n, --negate <yes or no>  [default: no]
        Specify whether to find molecules not matching the specified SMARTS pattern.
    -o, --outfile <outfile>
        Output file name.
    --outfileFiltered <yes or no>  [default: no]
        Write out a file containing filtered molecules. Its name is automatically
        generated from the specified output file. Default:
        <OutfileRoot>_Filtered.<OutfileExt>.
    --outfileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for writing
        molecules to files. The supported parameter names for different file
        formats, along with their default values, are shown below:

            SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
            SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
                smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no

        Default value for compute2DCoords: yes for SMILES input file; no for all other
        file types.
    --overwrite
        Overwrite existing files.
    -p, --pattern <SMARTS>  [default: none]
        SMARTS pattern for performing search.
    -u, --useChirality <yes or no>  [default: no]
        Use stereochemistry information for SMARTS search.
    -w, --workingdir <dir>
        Location of working directory which defaults to the current directory.

Examples:
    To retrieve molecules containing the substructure corresponding to a specified
    SMARTS pattern and write out a SMILES file, type:

        % RDKitSearchSMARTS.py -p 'c1ccccc1' -i Sample.smi -o SampleOut.smi

    To retrieve molecules containing the substructure corresponding to a specified
    SMARTS pattern, perform filtering in multiprocessing mode on all available
    CPUs without loading all data into memory, and write out a SMILES file, type:

        % RDKitSearchSMARTS.py --mp yes -p 'c1ccccc1' -i Sample.smi -o SampleOut.smi

    To retrieve molecules containing the substructure corresponding to a specified
    SMARTS pattern, perform filtering in multiprocessing mode on all available
    CPUs by loading all data into memory, and write out a SMILES file, type:

        % RDKitSearchSMARTS.py --mp yes --mpParams "inputDataMode,InMemory"
          -p 'c1ccccc1' -i Sample.smi -o SampleOut.smi

    To retrieve molecules containing the substructure corresponding to a specified
    SMARTS pattern, perform filtering in multiprocessing mode on specific number
    of CPUs and chunk size without loading all data into memory, and write out
    a SMILES file, type:

        % RDKitSearchSMARTS.py --mp yes --mpParams "inputDataMode,Lazy,
          numProcesses,4,chunkSize,8" -p 'c1ccccc1' -i Sample.smi -o SampleOut.smi

    To only count the number of molecules containing the substructure corresponding
    to a specified SMARTS pattern without writing out any file, type:

        % RDKitSearchSMARTS.py -m count -p 'c1ccccc1' -i Sample.smi

    To retrieve molecules from a SD file not containing the substructure
    corresponding to a specified SMARTS pattern and write out a SD file, type:

        % RDKitSearchSMARTS.py -n yes -p 'c1ccccc1' -i Sample.sdf -o SampleOut.sdf

    To retrieve molecules containing the substructure corresponding to a specified
    SMARTS pattern from a CSV SMILES file, SMILES strings in column 1, name in
    column 2, and write out a SD file, type:

        % RDKitSearchSMARTS.py -p 'c1ccccc1' --infileParams
          "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
          smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
          -i SampleSMILES.csv -o SampleOut.sdf

Author:
    Manish Sud(msud@san.rr.com)

See also:
    RDKitConvertFileFormat.py, RDKitFilterPAINS.py, RDKitSearchFunctionalGroups.py

Copyright:
    Copyright (C) 2024 Manish Sud. All rights reserved.

    The functionality available in this script is implemented using RDKit, an
    open source toolkit for cheminformatics developed by Greg Landrum.

    This file is part of MayaChemTools.

    MayaChemTools is free software; you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option) any
    later version.

"""

# docopt reads a section such as "Options:" only while the following lines
# start with whitespace. Indent the empty lines located inside a section body,
# i.e. those followed by an indented line, so that an empty line can't cut the
# section short and drop the defaults of the options listed after it...
_docoptUsage_ = re.sub(r"(?m)^$(?=\n[ \t])", "    ", _docoptUsage_)

if __name__ == "__main__":
    main()