1 #!/bin/env python 2 # 3 # File: RDKitFilterPAINS.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2022 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem import AllChem 43 except ImportError as ErrMsg: 44 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 45 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 46 sys.exit(1) 47 48 # MayaChemTools imports... 
try:
    from docopt import docopt
    import MiscUtil
    import RDKitUtil
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
    sys.exit(1)

ScriptName = os.path.basename(sys.argv[0])

# Raw command line options as returned by docopt (set in RetrieveOptions)...
Options = {}

# Processed option values used by the rest of the script (set in ProcessOptions)...
OptionsInfo = {}

def main():
    """Start execution of the script.

    Print a start-up banner, retrieve and process the command line options,
    perform the filtering, and report the elapsed time.
    """

    MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))

    (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()

    # Retrieve command line arguments and options...
    RetrieveOptions()

    # Process and validate command line arguments and options...
    ProcessOptions()

    # Perform actions required by the script...
    PerformFiltering()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
    MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))

def PerformFiltering():
    """Filter molecules using SMARTS specified in PAINS filter file.

    Set up the PAINS pattern molecules, the molecule reader and the molecule
    writers, process all molecules, and print summary counts. The writers are
    closed even when processing of molecules fails.
    """

    # Setup PAINS patterns and pattern mols...
    MiscUtil.PrintInfo("\nSetting up PAINS pattern molecules for performing substructure search...")
    PAINSPatterns = RetrievePAINSPatterns()
    PAINSPatternMols = SetupPAINSPatternMols(PAINSPatterns)

    # Setup a molecule reader...
    MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
    Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])

    # Set up molecule writers...
    Writer, WriterFiltered = SetupMoleculeWriters()

    try:
        MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered)
    finally:
        # Always release the output files, including after a failure...
        if Writer is not None:
            Writer.close()
        if WriterFiltered is not None:
            WriterFiltered.close()

    MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
    MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
    MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))

    MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
    MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))

def ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered):
    """Process and filter molecules.

    Dispatch to the multiprocessing or the single process implementation
    based on OptionsInfo["MPMode"].

    Returns:
        tuple: (MolCount, ValidMolCount, RemainingMolCount)
    """

    if OptionsInfo["MPMode"]:
        return ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered)
    else:
        return ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered)

def ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered):
    """Process and filter molecules using a single process.

    Arguments:
        Mols: Iterable of molecules; None entries are counted but ignored.
        PAINSPatternMols (list): PAINS pattern molecules to match against.
        Writer: Writer for remaining molecules or None.
        WriterFiltered: Writer for filtered molecules or None.

    Returns:
        tuple: (MolCount, ValidMolCount, RemainingMolCount)
    """

    NegateMatch = OptionsInfo["NegateMatch"]
    OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
    SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]

    MiscUtil.PrintInfo("\nFiltering molecules...")

    (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
    FirstMol = True
    for Mol in Mols:
        MolCount += 1

        if Mol is None:
            continue

        if RDKitUtil.IsMolEmpty(Mol):
            MolName = RDKitUtil.GetMolName(Mol, MolCount)
            MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
            continue

        ValidMolCount += 1
        if FirstMol:
            # The first valid molecule is used to set up writer properties...
            FirstMol = False
            if SetSMILESMolProps:
                if Writer is not None:
                    RDKitUtil.SetWriterMolProps(Writer, Mol)
                if WriterFiltered is not None:
                    RDKitUtil.SetWriterMolProps(WriterFiltered, Mol)

        MolMatched = DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols)
        if MolMatched == NegateMatch:
            # No match (or a match during negate mode): the molecule remains...
            RemainingMolCount += 1
            WriteMolecule(Writer, Mol, Compute2DCoords)
        else:
            if OutfileFilteredMode:
                WriteMolecule(WriterFiltered, Mol, Compute2DCoords)

    return (MolCount, ValidMolCount, RemainingMolCount)

def ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered):
    """Process and filter molecules using multiprocessing.

    Substructure matching is performed by worker processes in a process
    pool; the results are collected and written out by this process. The
    process pool is terminated on failure, closed on success, and always
    joined before returning.

    Arguments:
        Mols: Iterable of molecules.
        PAINSPatternMols (list): PAINS pattern molecules to match against.
        Writer: Writer for remaining molecules or None.
        WriterFiltered: Writer for filtered molecules or None.

    Returns:
        tuple: (MolCount, ValidMolCount, RemainingMolCount)
    """

    MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")

    MPParams = OptionsInfo["MPParams"]
    NegateMatch = OptionsInfo["NegateMatch"]
    OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
    SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]

    # Setup data for initializing a worker process...
    MiscUtil.PrintInfo("Encoding options info and PAINS pattern molecules...")
    OptionsInfo["EncodedPAINSPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in PAINSPatternMols]
    InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))

    # Setup a encoded mols data iterable for a worker process...
    WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)

    LazyMode = bool(re.match("^Lazy$", MPParams["InputDataMode"], re.I))
    InMemoryMode = bool(re.match("^InMemory$", MPParams["InputDataMode"], re.I))

    # Setup process pool along with data initialization for each process...
    MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if LazyMode else "mp.Pool.map()"))
    MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))

    ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)

    (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
    FirstMol = True
    try:
        # Start processing...
        if LazyMode:
            Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
        elif InMemoryMode:
            Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
        else:
            # NOTE(review): PrintError is presumed to terminate the script - confirm in MiscUtil.
            MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))

        for Result in Results:
            MolCount += 1
            _, EncodedMol, MolMatched = Result

            # Workers return None in place of missing or empty molecules...
            if EncodedMol is None:
                continue
            ValidMolCount += 1

            Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)

            if FirstMol:
                FirstMol = False
                if SetSMILESMolProps:
                    if Writer is not None:
                        RDKitUtil.SetWriterMolProps(Writer, Mol)
                    if WriterFiltered is not None:
                        RDKitUtil.SetWriterMolProps(WriterFiltered, Mol)

            if MolMatched == NegateMatch:
                RemainingMolCount += 1
                WriteMolecule(Writer, Mol, Compute2DCoords)
            else:
                if OutfileFilteredMode:
                    WriteMolecule(WriterFiltered, Mol, Compute2DCoords)
    except BaseException:
        # Stop outstanding work right away; includes SystemExit and KeyboardInterrupt...
        ProcessPool.terminate()
        raise
    else:
        ProcessPool.close()
    finally:
        # Wait for worker processes to exit so that none are left behind...
        ProcessPool.join()

    return (MolCount, ValidMolCount, RemainingMolCount)

def InitializeWorkerProcess(*EncodedArgs):
    """Initialize data for a worker process.

    Arguments:
        EncodedArgs: Base64 encoded Options and OptionsInfo objects, in that
            order, as set up by ProcessMoleculesUsingMultipleProcesses.
    """

    global Options, OptionsInfo

    MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())

    # Decode Options and OptionInfo...
    Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
    OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])

    # Decode PAINSPatternMols...
    OptionsInfo["PAINSPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedPAINSPatternMols"]]

def WorkerProcess(EncodedMolInfo):
    """Process data for a worker process.

    Arguments:
        EncodedMolInfo: A (MolIndex, EncodedMol) pair; EncodedMol is None for
            a molecule which could not be encoded.

    Returns:
        list: [MolIndex, EncodedMol, MolMatched]. EncodedMol is None and
            MolMatched is False for a missing or empty molecule.
    """

    MolIndex, EncodedMol = EncodedMolInfo

    if EncodedMol is None:
        return [MolIndex, None, False]

    Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
    if RDKitUtil.IsMolEmpty(Mol):
        MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
        MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
        return [MolIndex, None, False]

    MolMatched = DoesMoleculeContainsPAINSPattern(Mol, OptionsInfo["PAINSPatternMols"])

    return [MolIndex, EncodedMol, MolMatched]

def WriteMolecule(Writer, Mol, Compute2DCoords):
    """Write out molecule.

    Nothing is written during count mode. 2D coordinates are computed
    before writing when Compute2DCoords is set.
    """

    if OptionsInfo["CountMode"]:
        return

    if Compute2DCoords:
        AllChem.Compute2DCoords(Mol)

    Writer.write(Mol)

def SetupMoleculeWriters():
    """Setup molecule writers.

    Returns:
        tuple: (Writer, WriterFiltered). Both are None during count mode;
            WriterFiltered is None unless OptionsInfo["OutfileFilteredMode"]
            is set.
    """

    Writer = None
    WriterFiltered = None

    if OptionsInfo["CountMode"]:
        return (Writer, WriterFiltered)

    Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output file %s " % OptionsInfo["Outfile"])
    MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])

    if OptionsInfo["OutfileFilteredMode"]:
        WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
        if WriterFiltered is None:
            MiscUtil.PrintError("Failed to setup a writer for output file %s " % OptionsInfo["OutfileFiltered"])
        MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])

    return (Writer, WriterFiltered)

def DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols):
    """Check presence of PAINS pattern in the molecule.

    Returns:
        bool: True as soon as any pattern molecule matches; otherwise, False.
    """

    # any() stops at the first matching pattern...
    return any(Mol.HasSubstructMatch(PatternMol, useChirality = True) for PatternMol in PAINSPatternMols)

def RetrievePAINSPatterns():
    """Retrieve PAINS patterns for specified PAINS mode.

    Returns:
        list: SMARTS strings for all filter types listed in
            OptionsInfo["SpecifiedFilterTypes"].
    """

    SMARTSPatterns = []
    for FilterType in OptionsInfo["SpecifiedFilterTypes"]:
        SMARTSPatterns.extend(OptionsInfo["PAINSFiltersMap"]["SMARTS"][FilterType])

    return SMARTSPatterns

def SetupPAINSPatternMols(PAINSPatterns):
    """Set up PAINS pattern mols for substructure search.

    Patterns which can't be converted into a molecule are skipped with a
    warning.

    Returns:
        list: Pattern molecules.
    """

    PatternMols = []
    for Pattern in PAINSPatterns:
        PatternMol = Chem.MolFromSmarts(Pattern)
        if PatternMol is None:
            MiscUtil.PrintWarning("Failed to convert PAINS pattern, %s, into a molecule..." % Pattern)
            continue
        PatternMols.append(PatternMol)

    return PatternMols

def ProcessPAINSMode():
    """Process specified PAINS mode.

    Set OptionsInfo["SpecifiedFilterTypes"] to all available filter types
    for 'All' mode or to the filter types named in the comma delimited
    OptionsInfo["PAINSMode"] value. Names are matched ignoring case and
    spaces.
    """

    # Retrieve filters information...
    RetrievePAINSFiltersInfo()

    # Process PAINS mode...
    OptionsInfo["SpecifiedFilterTypes"] = OptionsInfo["PAINSFiltersMap"]["FilterTypes"]
    if re.match("^All$", OptionsInfo["PAINSMode"], re.I):
        return

    PAINSMode = re.sub(" ", "", OptionsInfo["PAINSMode"])
    if not PAINSMode:
        MiscUtil.PrintError("The PAINSMode mode specified using \"-p, --painsMode\" option are empty.")

    # Map lower case names to the names used in the filters file...
    CanonicalFilterTypesMap = {}
    for FilterType in OptionsInfo["PAINSFiltersMap"]["FilterTypes"]:
        CanonicalFilterTypesMap[FilterType.lower()] = FilterType

    SpecifiedFilterTypes = []
    for FilterType in PAINSMode.split(","):
        CanonicalFilterType = FilterType.lower()
        if CanonicalFilterType not in CanonicalFilterTypesMap:
            MiscUtil.PrintError("The PAINS mode, %s, specified using \"-p, --PAINSMode\" is not valid. Supported PAINS modes: %s" % (FilterType, ", ".join(OptionsInfo["PAINSFiltersMap"]["FilterTypes"])))

        SpecifiedFilterTypes.append(CanonicalFilterTypesMap[CanonicalFilterType])

    OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes

def RetrievePAINSFiltersInfo():
    """Retrieve information for PAINS filters.

    Read PAINSFilters.csv from the MayaChemTools lib data directory and set
    OptionsInfo["PAINSFiltersMap"], which maps "FilterTypes" to a list of
    filter types in order of first appearance and "ID" and "SMARTS" to
    per filter type lists.
    """

    MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
    PAINSFiltersFilePath = os.path.join(MayaChemToolsDataDir, "PAINSFilters.csv")

    MiscUtil.PrintInfo("\nRetrieving PAINS SMARTS patterns from file %s" % (PAINSFiltersFilePath))

    Delimiter = ','
    QuoteChar = '"'
    IgnoreHeaderLine = True
    FilterLinesWords = MiscUtil.GetTextLinesWords(PAINSFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine)

    PAINSFiltersMap = {}
    PAINSFiltersMap["FilterTypes"] = []
    PAINSFiltersMap["ID"] = {}
    PAINSFiltersMap["SMARTS"] = {}

    for LineWords in FilterLinesWords:
        # Columns: filter type, ID, SMARTS...
        FilterType = LineWords[0]
        ID = LineWords[1]
        SMARTS = LineWords[2]

        if FilterType not in PAINSFiltersMap["FilterTypes"]:
            PAINSFiltersMap["FilterTypes"].append(FilterType)
            PAINSFiltersMap["ID"][FilterType] = []
            PAINSFiltersMap["SMARTS"][FilterType] = []

        PAINSFiltersMap["ID"][FilterType].append(ID)
        PAINSFiltersMap["SMARTS"][FilterType].append(SMARTS)

    OptionsInfo["PAINSFiltersMap"] = PAINSFiltersMap

    MiscUtil.PrintInfo("\nTotal number filters: %d" % len(FilterLinesWords))
    MiscUtil.PrintInfo("Number of filter family types: %d\nFilter familty types: %s\n" % (len(PAINSFiltersMap["FilterTypes"]), ", ".join(PAINSFiltersMap["FilterTypes"])))

    for FilterType in PAINSFiltersMap["FilterTypes"]:
        MiscUtil.PrintInfo("Filter family type: %s; Number of filters: %d" % (FilterType, len(PAINSFiltersMap["ID"][FilterType])))

def ProcessOptions():
    """Process and validate command line arguments and options.

    Validate the raw option values in Options and store the processed
    values in OptionsInfo.
    """

    MiscUtil.PrintInfo("Processing options...")

    # Validate options...
    ValidateOptions()

    OptionsInfo["Infile"] = Options["--infile"]
    OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])

    OptionsInfo["Outfile"] = Options["--outfile"]
    OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])

    # Name of the filtered output file is derived from the output file name...
    FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
    OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt)
    OptionsInfo["OutfileFiltered"] = OutfileFiltered
    OptionsInfo["OutfileFilteredMode"] = bool(re.match("^yes$", Options["--outfileFiltered"], re.I))

    OptionsInfo["Overwrite"] = Options["--overwrite"]

    OptionsInfo["CountMode"] = bool(re.match("^count$", Options["--mode"], re.I))
    OptionsInfo["NegateMatch"] = bool(re.match("^yes$", Options["--negate"], re.I))

    OptionsInfo["MPMode"] = bool(re.match("^yes$", Options["--mp"], re.I))
    OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])

    OptionsInfo["PAINSMode"] = Options["--painsMode"]
    ProcessPAINSMode()

def RetrieveOptions():
    """Retrieve command line arguments and options.

    Parse the command line using docopt, change to the specified working
    directory, and print examples and exit for the examples option.
    """

    # Get options...
    global Options
    Options = docopt(_docoptUsage_)

    # Set current working directory to the specified directory...
    WorkingDir = Options["--workingdir"]
    if WorkingDir:
        os.chdir(WorkingDir)

    # Handle examples option...
    if "--examples" in Options and Options["--examples"]:
        MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
        sys.exit(0)

def ValidateOptions():
    """Validate option values."""

    MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
    MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")

    MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
    MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
    MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])

    MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")

    MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
    if re.match("^filter$", Options["--mode"], re.I):
        if not Options["--outfile"]:
            MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, --mode\" option")

    MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
    MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no")

# Setup a usage string for docopt...
# This text is parsed at runtime by docopt() in RetrieveOptions() and is also
# used to print the examples, so its wording and layout are functional.
# NOTE(review): docopt is presumed to end the "Options:" section at the first
# line that does not start with whitespace - confirm that the blank lines
# inside that section keep their leading spaces and that two spaces separate
# each option from its "[default: ...]" text.
_docoptUsage_ = """
RDKitFilterPAINS.py - Filter PAINS molecules

Usage:
    RDKitFilterPAINS.py [--infileParams <Name,Value,...>] [--mode <filter or count>]
                        [--mp <yes or no>] [--mpParams <Name,Value,...>]
                        [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...> ]
                        [--painsMode <All or A, B, C>] [--negate <yes or no>]
                        [--overwrite] [-w <dir>] -i <infile> -o <outfile>
    RDKitFilterPAINS.py -h | --help | -e | --examples

Description:
    Filter Pan-assay Interference molecules (PAINS) [ Ref 130 - 131 ] from an input
    file by performing a substructure search using SMARTS pattern specified in
    MAYACHEMTOOLS/lib/data/PAINSFilters.csv file and write out appropriate
    molecules to an output file or simply count the number of filtered molecules.

    The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
    .tsv, .txt)

    The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)

Options:
    -e, --examples
        Print examples.
    -h, --help
        Print this help message.
    -i, --infile <infile>
        Input file name.
    --infileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for reading
        molecules from files. The supported parameter names for different file
        formats, along with their default values, are shown below:
    
            SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
            SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
                smilesTitleLine,auto,sanitize,yes
    
        Possible values for smilesDelimiter: space, comma or tab.
    -m, --mode <filter or count>  [default: filter]
        Specify whether to filter the matched molecules and write out the rest of the
        molecules to an outfile or simply count the number of matched molecules
        marked for filtering.
    --mp <yes or no>  [default: no]
        Use multiprocessing.
    
        By default, input data is retrieved in a lazy manner via mp.Pool.imap()
        function employing lazy RDKit data iterable. This allows processing of
        arbitrary large data sets without any additional requirements memory.
    
        All input data may be optionally loaded into memory by mp.Pool.map()
        before starting worker processes in a process pool by setting the value
        of 'inputDataMode' to 'InMemory' in '--mpParams' option.
    
        A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
        data mode may adversely impact the performance. The '--mpParams' section
        provides additional information to tune the value of 'chunkSize'.
    --mpParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs to configure
        multiprocessing.
    
        The supported parameter names along with their default and possible
        values are shown below:
    
            chunkSize, auto
            inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
            numProcesses, auto   [ Default: mp.cpu_count() ]
    
        These parameters are used by the following functions to configure and
        control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
        mp.Pool.imap().
    
        The chunkSize determines chunks of input data passed to each worker
        process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
        The default value of chunkSize is dependent on the value of 'inputDataMode'.
    
        The mp.Pool.map() function, invoked during 'InMemory' input data mode,
        automatically converts RDKit data iterable into a list, loads all data into
        memory, and calculates the default chunkSize using the following method
        as shown in its code:
    
            chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
            if extra: chunkSize += 1
    
        For example, the default chunkSize will be 7 for a pool of 4 worker processes
        and 100 data items.
    
        The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
        'lazy' RDKit data iterable to retrieve data as needed, without loading all the
        data into memory. Consequently, the size of input data is not known a priori.
        It's not possible to estimate an optimal value for the chunkSize. The default
        chunkSize is set to 1.
    
        The default value for the chunkSize during 'Lazy' data mode may adversely
        impact the performance due to the overhead associated with exchanging
        small chunks of data. It is generally a good idea to explicitly set chunkSize to
        a larger value during 'Lazy' input data mode, based on the size of your input
        data and number of processes in the process pool.
    
        The mp.Pool.map() function waits for all worker processes to process all
        the data and return the results. The mp.Pool.imap() function, however,
        returns the the results obtained from worker processes as soon as the
        results become available for specified chunks of data.
    
        The order of data in the results returned by both mp.Pool.map() and
        mp.Pool.imap() functions always corresponds to the input data.
    -n, --negate <yes or no>  [default: no]
        Specify whether to filter molecules not matching the PAINS filters specified by
        SMARTS patterns.
    -o, --outfile <outfile>
        Output file name.
    --outfileFiltered <yes or no>  [default: no]
        Write out a file containing filtered molecules. Its name is automatically
        generated from the specified output file. Default: <OutfileRoot>_
        Filtered.<OutfileExt>.
    --outfileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for writing
        molecules to files. The supported parameter names for different file
        formats, along with their default values, are shown below:
    
            SD: compute2DCoords,auto,kekulize,yes
            SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
                smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
    
        Default value for compute2DCoords: yes for SMILES input file; no for all other
        file types.
    --overwrite
        Overwrite existing files.
    -p, --painsMode <All or A, B, or C>  [default: All]
        All or a comma delimited list of PAINS filter family type to used for
        filtering molecules.
    -w, --workingdir <dir>
        Location of working directory which defaults to the current directory.

Examples:
    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns and write out a SMILES file, type:

        % RDKitFilterPAINS.py -i Sample.smi -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
    CPUs without loading all data into memory, and write out a SMILES file, type:

        % RDKitFilterPAINS.py --mp yes -i Sample.smi -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
    CPUs by loading all data into memory, and write out a SMILES file, type:

        % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,InMemory"
          -i Sample.smi -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns, perform filtering in multiprocessing mode on specific
    number of CPUs and chunk size without loading all data into memory, and
    write out a SMILES file, type:

        % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,Lazy,
          numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns and write out a SMILES file containing these and filtered
    molecules, type:

        % RDKitFilterPAINS.py --outfileFiltered yes -i Sample.smi
          -o SampleOut.smi

    To only count the number of molecules not containing any substructure corresponding
    to PAINS SMARTS patterns without writing out any file, type:

        % RDKitFilterPAINS.py -m count -i Sample.sdf -o SampleOut.smi

    To count the number of molecules containing any substructure corresponding to
    PAINS SMARTS patterns and write out a SD file with computed 2D coordinates,
    type:

        % RDKitFilterPAINS.py -n yes -i Sample.smi -o SampleOut.sdf

    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns family of Type A in a CSV SMILES file and write out a SD file, type:

        % RDKitFilterPAINS.py --painsMode A --infileParams
          "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
          smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
          -i SampleSMILES.csv -o SampleOut.sdf

Author:
    Manish Sud(msud@san.rr.com)

See also:
    RDKitFilterChEMBLAlerts.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py

Copyright:
    Copyright (C) 2022 Manish Sud. All rights reserved.

    The functionality available in this script is implemented using RDKit, an
    open source toolkit for cheminformatics developed by Greg Landrum.

    This file is part of MayaChemTools.

    MayaChemTools is free software; you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option) any
    later version.

"""

# Run main() only when this file is executed as a script...
if __name__ == "__main__":
    main()