1 #!/bin/env python 2 # 3 # File: RDKitFilterChEMBLAlerts.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2023 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem import AllChem 43 except ImportError as ErrMsg: 44 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 45 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 46 sys.exit(1) 47 48 # MayaChemTools imports... 
try:
    from docopt import docopt
    import MiscUtil
    import RDKitUtil
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
    sys.exit(1)

ScriptName = os.path.basename(sys.argv[0])

# Raw docopt results and the processed option values derived from them. Both
# are module-level so that worker processes can be re-seeded with them (see
# InitializeWorkerProcess).
Options = {}
OptionsInfo = {}

def main():
    """Start execution of the script.

    Prints a start banner, retrieves and processes the command line options,
    performs the filtering, and reports the elapsed time.
    """

    MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))

    (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()

    # Retrieve command line arguments and options...
    RetrieveOptions()

    # Process and validate command line arguments and options...
    ProcessOptions()

    # Perform actions required by the script...
    PerformFiltering()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
    MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))

def PerformFiltering():
    """Filter molecules using SMARTS specified in ChEMBL filters file.

    Sets up the ChEMBL alert pattern molecules, a molecule reader and the
    output writers, hands the molecules over to ProcessMolecules(), and prints
    the summary counts. The writers are closed even when processing raises.
    """

    # Setup ChEMBL patterns and pattern mols...
    MiscUtil.PrintInfo("\nSetting up ChEMBL pattern molecules for performing substructure search...")
    ChEMBLPatterns = RetrieveChEMBLPatterns()
    ChEMBLPatternMols = SetupChEMBLPatternMols(ChEMBLPatterns)

    # Setup a molecule reader...
    MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
    Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])

    # Set up molecule writers...
    Writer, WriterFiltered = SetupMoleculeWriters()

    try:
        MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, ChEMBLPatternMols, Writer, WriterFiltered)
    finally:
        # Release the output files no matter how processing ended...
        for MolWriter in (Writer, WriterFiltered):
            if MolWriter is not None:
                MolWriter.close()

    MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
    MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
    MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))

    MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
    MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))

def ProcessMolecules(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
    """Process and filter molecules.

    Dispatches to the multiprocessing or single process implementation based
    on OptionsInfo["MPMode"] and returns its result: a tuple containing
    (MolCount, ValidMolCount, RemainingMolCount).
    """

    if OptionsInfo["MPMode"]:
        return ProcessMoleculesUsingMultipleProcesses(Mols, ChEMBLPatternMols, Writer, WriterFiltered)
    else:
        return ProcessMoleculesUsingSingleProcess(Mols, ChEMBLPatternMols, Writer, WriterFiltered)

def SetupWritersMolProps(Writer, WriterFiltered, Mol):
    """Call RDKitUtil.SetWriterMolProps() on each available writer for a molecule.

    Nothing is done unless OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
    is set. Writers which are None (e.g. during count mode) are skipped.
    """

    if not OptionsInfo["OutfileParams"]["SetSMILESMolProps"]:
        return

    for MolWriter in (Writer, WriterFiltered):
        if MolWriter is not None:
            RDKitUtil.SetWriterMolProps(MolWriter, Mol)

def WriteProcessedMolecule(Mol, MolMatched, Writer, WriterFiltered):
    """Write out a processed molecule and indicate whether it remains.

    A molecule remains when its match status is equal to the value of
    OptionsInfo["NegateMatch"]; it is then written to Writer. Otherwise, it
    is written to WriterFiltered provided OptionsInfo["OutfileFilteredMode"]
    is set.

    Returns:
        bool: True for a remaining molecule; False for a filtered molecule.
    """

    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]

    if MolMatched == OptionsInfo["NegateMatch"]:
        WriteMolecule(Writer, Mol, Compute2DCoords)
        return True

    if OptionsInfo["OutfileFilteredMode"]:
        WriteMolecule(WriterFiltered, Mol, Compute2DCoords)

    return False

def ProcessMoleculesUsingSingleProcess(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
    """Process and filter molecules using a single process.

    Molecules which are None or empty are counted but otherwise ignored.

    Returns:
        tuple: (MolCount, ValidMolCount, RemainingMolCount)
    """

    MiscUtil.PrintInfo("\nFiltering molecules...")

    (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
    FirstMol = True
    for Mol in Mols:
        MolCount += 1

        if Mol is None:
            continue

        if RDKitUtil.IsMolEmpty(Mol):
            MolName = RDKitUtil.GetMolName(Mol, MolCount)
            MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
            continue

        ValidMolCount += 1
        if FirstMol:
            # Writers are configured once, using the first valid molecule...
            FirstMol = False
            SetupWritersMolProps(Writer, WriterFiltered, Mol)

        MolMatched = DoesMoleculeContainsChEMBLPattern(Mol, ChEMBLPatternMols)
        if WriteProcessedMolecule(Mol, MolMatched, Writer, WriterFiltered):
            RemainingMolCount += 1

    return (MolCount, ValidMolCount, RemainingMolCount)

def ProcessMoleculesUsingMultipleProcesses(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
    """Process and filter molecules using multiprocessing.

    The substructure search is performed by worker processes in a process
    pool; molecules are written out by this process as results arrive. The
    input data mode is validated before any worker process is started and the
    pool is always shut down before returning or propagating an exception.

    Returns:
        tuple: (MolCount, ValidMolCount, RemainingMolCount)
    """

    MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")

    MPParams = OptionsInfo["MPParams"]

    # Validate input data mode before spawning any worker process...
    InputDataMode = MPParams["InputDataMode"]
    LazyMode = bool(re.match("^Lazy$", InputDataMode, re.I))
    if not (LazyMode or re.match("^InMemory$", InputDataMode, re.I)):
        # NOTE(review): PrintError is presumably expected to terminate the
        # script - confirm against MiscUtil.
        MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (InputDataMode))

    # Setup data for initializing a worker process...
    MiscUtil.PrintInfo("Encoding options info and ChEMBL alert pattern molecules...")
    OptionsInfo["EncodedChEMBLPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in ChEMBLPatternMols]
    InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))

    # Setup a encoded mols data iterable for a worker process...
    WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)

    # Setup process pool along with data initialization for each process...
    MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if LazyMode else "mp.Pool.map()"))
    MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], InputDataMode, ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))

    ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)

    (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
    Completed = False
    try:
        # Start processing...
        if LazyMode:
            Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
        else:
            Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])

        FirstMol = True
        for Result in Results:
            MolCount += 1
            _, EncodedMol, MolMatched = Result

            # Workers return None in place of missing or empty molecules...
            if EncodedMol is None:
                continue
            ValidMolCount += 1

            Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)

            if FirstMol:
                FirstMol = False
                SetupWritersMolProps(Writer, WriterFiltered, Mol)

            if WriteProcessedMolecule(Mol, MolMatched, Writer, WriterFiltered):
                RemainingMolCount += 1

        Completed = True
    finally:
        # Shut the pool down gracefully after a normal run; stop the workers
        # right away when processing was interrupted by an exception...
        if Completed:
            ProcessPool.close()
        else:
            ProcessPool.terminate()
        ProcessPool.join()

    return (MolCount, ValidMolCount, RemainingMolCount)

def InitializeWorkerProcess(*EncodedArgs):
    """Initialize data for a worker process.

    Arguments:
        EncodedArgs: Base64 encoded strings for Options and OptionsInfo, in
            that order. The decoded objects replace the module level Options
            and OptionsInfo, and the ChEMBL pattern molecules are decoded into
            OptionsInfo["ChEMBLPatternMols"].
    """

    global Options, OptionsInfo

    MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())

    # Decode Options and OptionInfo...
    Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
    OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])

    # Decode ChEMBLPatternMols...
    OptionsInfo["ChEMBLPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedChEMBLPatternMols"]]

def WorkerProcess(EncodedMolInfo):
    """Process data for a worker process.

    Arguments:
        EncodedMolInfo: A pair containing molecule index and base64 encoded
            molecule string; the encoded string may be None.

    Returns:
        list: [MolIndex, EncodedMol, MolMatched]. EncodedMol is None and
            MolMatched is False for a missing or an empty molecule.
    """

    MolIndex, EncodedMol = EncodedMolInfo

    if EncodedMol is None:
        return [MolIndex, None, False]

    Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
    if RDKitUtil.IsMolEmpty(Mol):
        MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
        MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
        return [MolIndex, None, False]

    MolMatched = DoesMoleculeContainsChEMBLPattern(Mol, OptionsInfo["ChEMBLPatternMols"])

    return [MolIndex, EncodedMol, MolMatched]

def WriteMolecule(Writer, Mol, Compute2DCoords):
    """Write out molecule.

    Nothing is written during count mode. 2D coordinates are computed for the
    molecule before writing when Compute2DCoords is set.
    """

    if OptionsInfo["CountMode"]:
        return

    if Compute2DCoords:
        AllChem.Compute2DCoords(Mol)

    Writer.write(Mol)

def SetupMoleculeWriters():
    """Setup molecule writers.

    Returns:
        tuple: (Writer, WriterFiltered). Both are None during count mode;
            WriterFiltered is also None unless writing of filtered molecules
            was requested.
    """

    Writer = None
    WriterFiltered = None

    if OptionsInfo["CountMode"]:
        return (Writer, WriterFiltered)

    Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output file %s " % OptionsInfo["Outfile"])
    MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])

    if OptionsInfo["OutfileFilteredMode"]:
        WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
        if WriterFiltered is None:
            MiscUtil.PrintError("Failed to setup a writer for output file %s " % OptionsInfo["OutfileFiltered"])
        MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])

    return (Writer, WriterFiltered)

def DoesMoleculeContainsChEMBLPattern(Mol, ChEMBLPatternMols):
    """Check presence of ChEMBL alerts pattern in the molecule.

    Returns:
        bool: True as soon as any pattern molecule matches, taking chirality
            into account; False when nothing matches or no patterns are given.
    """

    return any(Mol.HasSubstructMatch(PatternMol, useChirality=True) for PatternMol in ChEMBLPatternMols)

def RetrieveChEMBLPatterns():
    """Retrieve ChEMBL patterns for specified ChEMBL alert mode.

    Returns:
        list: SMARTS strings for all specified filter types, in the order of
            OptionsInfo["SpecifiedFilterTypes"].
    """

    SMARTSPatterns = []
    for FilterType in OptionsInfo["SpecifiedFilterTypes"]:
        SMARTSPatterns.extend(OptionsInfo["ChEMBLFiltersMap"]["SMARTS"][FilterType])

    return SMARTSPatterns

def SetupChEMBLPatternMols(ChEMBLPatterns):
    """Set up ChEMBL pattern mols for substructure search.

    A SMARTS pattern which can't be converted into a pattern molecule is
    skipped with a warning; a None pattern molecule would otherwise make the
    substructure search, and the encoding for multiprocessing, fail later on.

    Returns:
        list: Pattern molecules for the valid SMARTS patterns.
    """

    PatternMols = []
    for Pattern in ChEMBLPatterns:
        PatternMol = Chem.MolFromSmarts(Pattern)
        if PatternMol is None:
            MiscUtil.PrintWarning("Ignoring invalid ChEMBL alert SMARTS pattern: %s" % Pattern)
            continue
        PatternMols.append(PatternMol)

    return PatternMols

def ProcessChEMBLAlertsMode():
    """Process specified alerts mode.

    Loads the ChEMBL filters information and sets up
    OptionsInfo["SpecifiedFilterTypes"]: all available filter types for the
    "All" mode; otherwise, the filter types named in the comma delimited alerts
    mode, matched case insensitively and listed once each.
    """

    # Retrieve filters information...
    RetrieveChEMBLFiltersInfo()

    SupportedFilterTypes = OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"]

    # Process alerts mode...
    OptionsInfo["SpecifiedFilterTypes"] = SupportedFilterTypes
    if re.match("^All$", OptionsInfo["AlertsMode"], re.I):
        return

    AlertsMode = re.sub(" ", "", OptionsInfo["AlertsMode"])
    if not AlertsMode:
        MiscUtil.PrintError("The alerts mode specified using \"-a, --alertsMode\" option are empty.")

    CanonicalFilterTypesMap = {}
    for FilterType in SupportedFilterTypes:
        CanonicalFilterTypesMap[FilterType.lower()] = FilterType

    SpecifiedFilterTypes = []
    for FilterType in AlertsMode.split(","):
        CanonicalFilterType = FilterType.lower()
        if CanonicalFilterType not in CanonicalFilterTypesMap:
            MiscUtil.PrintError("The alert mode, %s, specified using \"-a, --alertsMode\" is not valid. Supported alert modes: %s" % (FilterType, ", ".join(SupportedFilterTypes)))

        # A filter type listed more than once would only duplicate its patterns...
        SpecifiedFilterType = CanonicalFilterTypesMap[CanonicalFilterType]
        if SpecifiedFilterType not in SpecifiedFilterTypes:
            SpecifiedFilterTypes.append(SpecifiedFilterType)

    OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes

def RetrieveChEMBLFiltersInfo():
    """Retrieve information for ChEMBL filters.

    Reads ChEMBLFilters.csv from the MayaChemTools lib data directory and
    stores a map in OptionsInfo["ChEMBLFiltersMap"] with these keys:
    "FilterTypes", a list of filter types in order of first appearance; "ID"
    and "SMARTS", dictionaries mapping a filter type to the list of values
    taken from the second and third column of the file.
    """

    MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
    ChEMBLFiltersFilePath = os.path.join(MayaChemToolsDataDir, "ChEMBLFilters.csv")

    MiscUtil.PrintInfo("\nRetrieving ChEMBL alerts SMARTS patterns from file %s" % (ChEMBLFiltersFilePath))

    Delimiter = ','
    QuoteChar = '"'
    IgnoreHeaderLine = True
    FilterLinesWords = MiscUtil.GetTextLinesWords(ChEMBLFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine)

    ChEMBLFiltersMap = {}
    ChEMBLFiltersMap["FilterTypes"] = []
    ChEMBLFiltersMap["ID"] = {}
    ChEMBLFiltersMap["SMARTS"] = {}

    for LineWords in FilterLinesWords:
        FilterType = LineWords[0]
        ID = LineWords[1]
        SMARTS = LineWords[2]

        if FilterType not in ChEMBLFiltersMap["FilterTypes"]:
            ChEMBLFiltersMap["FilterTypes"].append(FilterType)
            ChEMBLFiltersMap["ID"][FilterType] = []
            ChEMBLFiltersMap["SMARTS"][FilterType] = []

        ChEMBLFiltersMap["ID"][FilterType].append(ID)
        ChEMBLFiltersMap["SMARTS"][FilterType].append(SMARTS)

    OptionsInfo["ChEMBLFiltersMap"] = ChEMBLFiltersMap

    MiscUtil.PrintInfo("\nTotal number alerts: %d" % len(FilterLinesWords))
    MiscUtil.PrintInfo("Number of filter family types: %d\nFilter family types: %s\n" % (len(ChEMBLFiltersMap["FilterTypes"]), ", ".join(ChEMBLFiltersMap["FilterTypes"])))

    for FilterType in ChEMBLFiltersMap["FilterTypes"]:
        MiscUtil.PrintInfo("Filter family type: %s; Number of alerts: %d" % (FilterType, len(ChEMBLFiltersMap["ID"][FilterType])))
    MiscUtil.PrintInfo("")

def ProcessOptions():
    """Process and validate command line arguments and options.

    Validates the values in Options and populates OptionsInfo with the
    processed values used by the rest of the script.
    """

    MiscUtil.PrintInfo("Processing options...")

    # Validate options...
    ValidateOptions()

    OptionsInfo["Infile"] = Options["--infile"]
    OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])

    OptionsInfo["Outfile"] = Options["--outfile"]
    OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])

    # NOTE(review): the first value returned by ParseFileName (the directory
    # part, judging by the original local name FileDir) is not used while
    # naming the filtered file - confirm that this is intended.
    _, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
    OptionsInfo["OutfileFiltered"] = "%s_Filtered.%s" % (FileName, FileExt)
    OptionsInfo["OutfileFilteredMode"] = bool(re.match("^yes$", Options["--outfileFiltered"], re.I))

    OptionsInfo["Overwrite"] = Options["--overwrite"]

    OptionsInfo["CountMode"] = bool(re.match("^count$", Options["--mode"], re.I))
    OptionsInfo["NegateMatch"] = bool(re.match("^yes$", Options["--negate"], re.I))

    OptionsInfo["MPMode"] = bool(re.match("^yes$", Options["--mp"], re.I))
    OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])

    OptionsInfo["AlertsMode"] = Options["--alertsMode"]
    ProcessChEMBLAlertsMode()

def RetrieveOptions():
    """Retrieve command line arguments and options.

    Parses the command line using docopt into the module level Options,
    changes to the working directory when one is specified, and prints the
    examples and exits when they are requested.
    """

    # Get options...
    global Options
    Options = docopt(_docoptUsage_)

    # Set current working directory to the specified directory...
    WorkingDir = Options["--workingdir"]
    if WorkingDir:
        os.chdir(WorkingDir)

    # Handle examples option...
    if Options.get("--examples"):
        MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
        sys.exit(0)

def ValidateOptions():
    """Validate option values.

    The mode is validated first as the output file checks depend on it: the
    overwrite and distinct file name checks are only performed during filter
    mode.
    """

    MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
    FilterMode = bool(re.match("^filter$", Options["--mode"], re.I))

    MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
    MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")

    if FilterMode and not Options["--outfile"]:
        MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, --mode\" option")

    MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
    if FilterMode:
        MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
        MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])

    MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")

    MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
    MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no")

# Setup a usage string for docopt...
# NOTE(review): docopt reads this text at runtime, so its layout is part of the
# program: an option is separated from its "[default: ...]" by two spaces, and
# the blank-looking lines inside the "Options:" section are kept indented
# rather than empty, as some docopt releases presumably end a section at the
# first empty line. Confirm against the bundled docopt before reflowing this
# text or stripping its trailing whitespace.
_docoptUsage_ = """
RDKitFilterChEMBLAlerts.py - Filter ChEMBL alerts

Usage:
    RDKitFilterChEMBLAlerts.py [--alertsMode <All or Type,Type,...>]
                               [--infileParams <Name,Value,...>] [--mode <filter or count>]
                               [--mp <yes or no>] [--mpParams <Name,Value,...>]
                               [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...>]
                               [--negate <yes or no>] [--overwrite] [-w <dir>] -i <infile> -o <outfile>
    RDKitFilterChEMBLAlerts.py -h | --help | -e | --examples

Description:
    Filter molecules from an input file for ChEMBL structural alerts by performing
    a substructure search using SMARTS patterns specified in MAYACHEMTOOLS/
    lib/data/ChEMBLFilters.csv file and write out appropriate molecules to an
    output file or simply count the number of filtered molecules.

    The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
    .tsv, .txt)

    The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)

Options:
    -a, --alertsMode <All or Type, Type,...>  [default: All]
        All or a comma delimited list of ChEMBL filter types to use for filtering
        molecules.
    
        The supported filter family types, along with a description, are show below:
    
            BMS: Bristol-Myers Squibb HTS Deck Filters
            Dundee: University of Dundee NTD Screening Library Filters
            Glaxo: Glaxo Wellcome Hard Filters
            Inpharmatica
            MLSMR: NIH MLSMR Excluded Functionality Filters
            PfizerLINT: Pfizer LINT filters
            SureChEMBL
    
    -e, --examples
        Print examples.
    -h, --help
        Print this help message.
    -i, --infile <infile>
        Input file name.
    --infileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for reading
        molecules from files. The supported parameter names for different file
        formats, along with their default values, are shown below:
    
            SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
            SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
                smilesTitleLine,auto,sanitize,yes
    
        Possible values for smilesDelimiter: space, comma or tab.
    -m, --mode <filter or count>  [default: filter]
        Specify whether to filter the matched molecules and write out the rest of the
        molecules to an outfile or simply count the number of matched molecules
        marked for filtering.
    --mp <yes or no>  [default: no]
        Use multiprocessing.
    
        By default, input data is retrieved in a lazy manner via mp.Pool.imap()
        function employing lazy RDKit data iterable. This allows processing of
        arbitrary large data sets without any additional requirements memory.
    
        All input data may be optionally loaded into memory by mp.Pool.map()
        before starting worker processes in a process pool by setting the value
        of 'inputDataMode' to 'InMemory' in '--mpParams' option.
    
        A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
        data mode may adversely impact the performance. The '--mpParams' section
        provides additional information to tune the value of 'chunkSize'.
    --mpParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs to configure
        multiprocessing.
    
        The supported parameter names along with their default and possible
        values are shown below:
    
            chunkSize, auto
            inputDataMode, Lazy  [ Possible values: InMemory or Lazy ]
            numProcesses, auto  [ Default: mp.cpu_count() ]
    
        These parameters are used by the following functions to configure and
        control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
        mp.Pool.imap().
    
        The chunkSize determines chunks of input data passed to each worker
        process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
        The default value of chunkSize is dependent on the value of 'inputDataMode'.
    
        The mp.Pool.map() function, invoked during 'InMemory' input data mode,
        automatically converts RDKit data iterable into a list, loads all data into
        memory, and calculates the default chunkSize using the following method
        as shown in its code:
    
            chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
            if extra: chunkSize += 1
    
        For example, the default chunkSize will be 7 for a pool of 4 worker processes
        and 100 data items.
    
        The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
        'lazy' RDKit data iterable to retrieve data as needed, without loading all the
        data into memory. Consequently, the size of input data is not known a priori.
        It's not possible to estimate an optimal value for the chunkSize. The default
        chunkSize is set to 1.
    
        The default value for the chunkSize during 'Lazy' data mode may adversely
        impact the performance due to the overhead associated with exchanging
        small chunks of data. It is generally a good idea to explicitly set chunkSize to
        a larger value during 'Lazy' input data mode, based on the size of your input
        data and number of processes in the process pool.
    
        The mp.Pool.map() function waits for all worker processes to process all
        the data and return the results. The mp.Pool.imap() function, however,
        returns the the results obtained from worker processes as soon as the
        results become available for specified chunks of data.
    
        The order of data in the results returned by both mp.Pool.map() and
        mp.Pool.imap() functions always corresponds to the input data.
    -n, --negate <yes or no>  [default: no]
        Specify whether to filter molecules not matching the ChEMBL filters specified by
        SMARTS patterns.
    -o, --outfile <outfile>
        Output file name.
    --outfileFiltered <yes or no>  [default: no]
        Write out a file containing filtered molecules. Its name is automatically
        generated from the specified output file. Default: <OutfileRoot>_
        Filtered.<OutfileExt>.
    --outfileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for writing
        molecules to files. The supported parameter names for different file
        formats, along with their default values, are shown below:
    
            SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
            SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
                smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
    
        Default value for compute2DCoords: yes for SMILES input file; no for all other
        file types.
    --overwrite
        Overwrite existing files.
    -w, --workingdir <dir>
        Location of working directory which defaults to the current directory.

Examples:
    To count the number of molecules not containing any substructure corresponding
    to any ChEMBL SMARTS patterns and write out SMILES files containing these molecules,
    type:

        % RDKitFilterChEMBLAlerts.py -i Sample.smi -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding to
    ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on all
    available CPUs without loading all data into memory, and write out a SMILES file, type:

        % RDKitFilterChEMBLAlerts.py --mp yes -i Sample.smi -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding to
    ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on all
    available CPUs by loading all data into memory, and write out a SMILES file, type:

        % RDKitFilterChEMBLAlerts.py --mp yes --mpParams "inputDataMode,
          InMemory" -i Sample.smi -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding to
    ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on specific
    number of CPUs and chunk size without loading all data into memory, and
    write out a SMILES file, type:

        % RDKitFilterChEMBLAlerts.py --mp yes --mpParams "inputDataMode,Lazy,
          numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding
    to any ChEMBL SMARTS patterns and write out SMILES files containing these and filtered
    molecules, type:

        % RDKitFilterChEMBLAlerts.py --outfileFiltered yes -i Sample.smi
          -o SampleOut.smi

    To only count the number of molecules not containing any substructure corresponding
    to BMS ChEMBL SMARTS patterns without writing out any files, type:

        % RDKitFilterChEMBLAlerts.py -m count -a BMS -i Sample.sdf
          -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding
    to Pfizer LINT ChEMBL SMARTS patterns in a CSV SMILES file and write out a SD file,
    type:

        % RDKitFilterChEMBLAlerts.py --alertsMode PfizerLINT --infileParams
          "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
          smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
          -i SampleSMILES.csv -o SampleOut.sdf

Author:
    Manish Sud(msud@san.rr.com)

See also:
    RDKitFilterPAINS.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py

Copyright:
    Copyright (C) 2023 Manish Sud. All rights reserved.

    The functionality available in this script is implemented using RDKit, an
    open source toolkit for cheminformatics developed by Greg Landrum.

    This file is part of MayaChemTools.

    MayaChemTools is free software; you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option) any
    later version.

"""

if __name__ == "__main__":
    main()