1 #!/bin/env python 2 # 3 # File: RDKitFilterPAINS.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem import AllChem 43 except ImportError as ErrMsg: 44 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 45 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 46 sys.exit(1) 47 48 # MayaChemTools imports... 
try:
    from docopt import docopt
    import MiscUtil
    import RDKitUtil
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
    sys.exit(1)

ScriptName = os.path.basename(sys.argv[0])

# Options holds the raw docopt result; OptionsInfo holds the processed values
# derived from it. Both are module globals shared by all functions below and
# re-created in each worker process by InitializeWorkerProcess().
Options = {}
OptionsInfo = {}

def main():
    """Start execution of the script.

    Retrieves and processes the command line options, performs filtering,
    and prints out the elapsed time.
    """

    MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))

    (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()

    # Retrieve command line arguments and options...
    RetrieveOptions()

    # Process and validate command line arguments and options...
    ProcessOptions()

    # Perform actions required by the script...
    PerformFiltering()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
    MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))

def PerformFiltering():
    """Filter molecules using SMARTS specified in PAINS filter file.

    Sets up the PAINS pattern molecules, the input molecule reader and the
    output writers, processes all molecules, and prints summary counts.
    """

    # Setup PAINS patterns and pattern mols...
    MiscUtil.PrintInfo("\nSetting up PAINS pattern molecules for performing substructure search...")
    PAINSPatternMols = SetupPAINSPatternMols()

    # Setup a molecule reader...
    MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
    Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])

    # Set up molecule writers...
    Writer, WriterFiltered = SetupMoleculeWriters()

    # Close the writers even when processing fails so that output files are
    # not left open...
    try:
        MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered)
    finally:
        if Writer is not None:
            Writer.close()
        if WriterFiltered is not None:
            WriterFiltered.close()

    MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
    MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
    MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))

    MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
    MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))

def ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered):
    """Process and filter molecules.

    Dispatches to the multiprocessing or the single process implementation
    based on OptionsInfo["MPMode"].

    Arguments:
        Mols: Iterable of input molecules; may contain None entries.
        PAINSPatternMols (list): PAINS pattern molecules.
        Writer: Writer for remaining molecules or None.
        WriterFiltered: Writer for filtered molecules or None.

    Returns:
        tuple: (MolCount, ValidMolCount, RemainingMolCount).
    """

    if OptionsInfo["MPMode"]:
        return ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered)
    else:
        return ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered)

def ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered):
    """Process and filter molecules using a single process.

    Arguments:
        Mols: Iterable of input molecules; may contain None entries.
        PAINSPatternMols (list): PAINS pattern molecules.
        Writer: Writer for remaining molecules or None.
        WriterFiltered: Writer for filtered molecules or None.

    Returns:
        tuple: (MolCount, ValidMolCount, RemainingMolCount).
    """

    NegateMatch = OptionsInfo["NegateMatch"]
    OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
    SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]

    MiscUtil.PrintInfo("\nFiltering molecules...")

    (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
    FirstMol = True
    for Mol in Mols:
        MolCount += 1

        # Count but otherwise ignore molecules which could not be read...
        if Mol is None:
            continue

        if RDKitUtil.IsMolEmpty(Mol):
            MolName = RDKitUtil.GetMolName(Mol, MolCount)
            MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
            continue

        ValidMolCount += 1

        # Writer properties are set up once using the first valid molecule...
        if FirstMol:
            FirstMol = False
            if SetSMILESMolProps:
                SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol)

        # A molecule remains when its match status equals NegateMatch: an
        # unmatched molecule by default or a matched one during negation...
        MolMatched, AlertsInfo = DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols)
        if MolMatched == NegateMatch:
            RemainingMolCount += 1
            WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords)
        else:
            if OutfileFilteredMode:
                WriteMolecule(WriterFiltered, Mol, AlertsInfo, Compute2DCoords)

    return (MolCount, ValidMolCount, RemainingMolCount)

def ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered):
    """Process and filter molecules using multiprocessing.

    The pattern matching is performed by worker processes in a process pool;
    the results are collected and written out by this process. The pool is
    always released before returning: closed after a successful run or
    terminated when an error occurs.

    Arguments:
        Mols: Iterable of input molecules; may contain None entries.
        PAINSPatternMols (list): PAINS pattern molecules.
        Writer: Writer for remaining molecules or None.
        WriterFiltered: Writer for filtered molecules or None.

    Returns:
        tuple: (MolCount, ValidMolCount, RemainingMolCount).
    """

    MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")

    MPParams = OptionsInfo["MPParams"]
    NegateMatch = OptionsInfo["NegateMatch"]
    OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
    SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]

    # Setup data for initializing a worker process...
    MiscUtil.PrintInfo("Encoding options info and PAINS pattern molecules...")
    OptionsInfo["EncodedPAINSPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in PAINSPatternMols]
    InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))

    # Setup a encoded mols data iterable for a worker process...
    WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)

    # Setup process pool along with data initialization for each process...
    MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
    MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))

    ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)

    (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
    FirstMol = True
    try:
        # Start processing...
        if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
            Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
        elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
            Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
        else:
            # NOTE(review): presumably PrintError terminates the script; otherwise
            # Results is undefined below - confirm against MiscUtil.
            MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))

        for Result in Results:
            MolCount += 1
            MolIndex, EncodedMol, MolMatched, AlertsInfo = Result

            # Workers return None in place of missing or empty molecules...
            if EncodedMol is None:
                continue
            ValidMolCount += 1

            Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)

            if FirstMol:
                FirstMol = False
                if SetSMILESMolProps:
                    SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol)

            if MolMatched == NegateMatch:
                RemainingMolCount += 1
                WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords)
            else:
                if OutfileFilteredMode:
                    WriteMolecule(WriterFiltered, Mol, AlertsInfo, Compute2DCoords)
    except BaseException:
        # Stop outstanding work right away on any failure, including a script
        # exit request, and let the original exception propagate...
        ProcessPool.terminate()
        raise
    else:
        # All results have been consumed; let the workers exit normally...
        ProcessPool.close()
    finally:
        ProcessPool.join()

    return (MolCount, ValidMolCount, RemainingMolCount)

def InitializeWorkerProcess(*EncodedArgs):
    """Initialize data for a worker process.

    Arguments:
        EncodedArgs: Two base64 encoded strings corresponding to Options and
            OptionsInfo, in that order.

    The decoded objects replace the module level Options and OptionsInfo in
    the worker process, and the encoded PAINS pattern molecules are decoded
    into OptionsInfo["PAINSPatternMols"].
    """

    global Options, OptionsInfo

    MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())

    # Decode Options and OptionInfo...
    Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
    OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])

    # Decode PAINSPatternMols...
    OptionsInfo["PAINSPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedPAINSPatternMols"]]

def WorkerProcess(EncodedMolInfo):
    """Process data for a worker process.

    Arguments:
        EncodedMolInfo: A pair containing molecule index and base64 encoded
            molecule string; the encoded string may be None.

    Returns:
        list: [MolIndex, EncodedMol, MolMatched, AlertsInfo]. EncodedMol is
            None, MolMatched is False and AlertsInfo is None for a missing or
            an empty molecule.
    """

    MolIndex, EncodedMol = EncodedMolInfo

    if EncodedMol is None:
        return [MolIndex, None, False, None]

    Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
    if RDKitUtil.IsMolEmpty(Mol):
        MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
        MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
        return [MolIndex, None, False, None]

    MolMatched, AlertsInfo = DoesMoleculeContainsPAINSPattern(Mol, OptionsInfo["PAINSPatternMols"])

    # The encoded input string is handed back as is; the parent process
    # decodes it again for writing...
    return [MolIndex, EncodedMol, MolMatched, AlertsInfo]

def WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords):
    """Write out molecule.

    Arguments:
        Writer: Molecule writer.
        Mol: Molecule to write.
        AlertsInfo (list or None): Matched alerts as "FilterType: FilterID"
            strings.
        Compute2DCoords (bool): Compute 2D coordinates before writing.

    Nothing is written in count mode. The alerts information, when available,
    is attached to the molecule as properties before writing.
    """

    if OptionsInfo["CountMode"]:
        return

    if Compute2DCoords:
        AllChem.Compute2DCoords(Mol)

    if AlertsInfo is not None and len(AlertsInfo):
        AlertsCount = "%s" % len(AlertsInfo)
        Alerts = "; ".join(AlertsInfo)
        if OptionsInfo["WriteAlertsCount"]:
            Mol.SetProp(OptionsInfo["AlertsCountLabel"], AlertsCount)
        Mol.SetProp(OptionsInfo["AlertsLabel"], Alerts)

    Writer.write(Mol)

def SetupMoleculeWriters():
    """Setup molecule writers.

    Returns:
        tuple: (Writer, WriterFiltered). Both are None in count mode;
            WriterFiltered is also None unless writing of filtered molecules
            has been requested.
    """

    Writer = None
    WriterFiltered = None

    if OptionsInfo["CountMode"]:
        return (Writer, WriterFiltered)

    Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
    MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])

    if OptionsInfo["OutfileFilteredMode"]:
        WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
        if WriterFiltered is None:
            MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"])
        MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])

    return (Writer, WriterFiltered)

def SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol):
    """Setup properties to write for SMILES molecule writers.

    Arguments:
        Writer: Writer for remaining molecules or None.
        WriterFiltered: Writer for filtered molecules or None.
        Mol: Molecule used to set up the writer properties.

    The alerts property names are set only for the writer which receives the
    matched molecules: Writer during negation; otherwise, WriterFiltered.
    """

    if not OptionsInfo["OutfileParams"]["SetSMILESMolProps"]:
        return

    NegateMatch = OptionsInfo["NegateMatch"]
    SetSMILESMolAlertsProp = OptionsInfo["SetSMILESMolAlertsProp"]
    SMILESMolAlertsPropList = OptionsInfo["SMILESMolAlertsPropList"]

    # NOTE(review): SetProps() is called after SetWriterMolProps(); whether it
    # replaces or extends the properties set by that call is not visible
    # here - confirm against RDKitUtil and the RDKit writer.
    if Writer is not None:
        RDKitUtil.SetWriterMolProps(Writer, Mol)
        if SetSMILESMolAlertsProp:
            if NegateMatch:
                Writer.SetProps(SMILESMolAlertsPropList)

    if WriterFiltered is not None:
        RDKitUtil.SetWriterMolProps(WriterFiltered, Mol)
        if SetSMILESMolAlertsProp:
            if not NegateMatch:
                WriterFiltered.SetProps(SMILESMolAlertsPropList)

def DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols):
    """Check presence of PAINS pattern in the molecule.

    Arguments:
        Mol: Molecule to check.
        PAINSPatternMols (list): Pattern molecules with FilterType and
            FilterID properties.

    Returns:
        tuple: (MolMatched, AlertsInfo). AlertsInfo is a list of
            "FilterType: FilterID" strings for matched patterns or None for
            no match. The search stops at the first matched pattern unless
            all alerts are being matched.
    """

    MatchAllAlerts = OptionsInfo["MatchAllAlerts"]
    AlertsInfo = []
    for PatternMol in PAINSPatternMols:
        if Mol.HasSubstructMatch(PatternMol, useChirality = True):
            AlertsInfo.append("%s: %s" % (PatternMol.GetProp("FilterType"), PatternMol.GetProp("FilterID")))
            if not MatchAllAlerts:
                break

    if len(AlertsInfo) == 0:
        MolMatched = False
        AlertsInfo = None
    else:
        MolMatched = True

    return (MolMatched, AlertsInfo)

def SetupPAINSPatternMols():
    """Set up PAINS pattern mols for substructure search corresponding to PAINS mode.

    Returns:
        list: Pattern molecules for all SMARTS of the specified filter types,
            each carrying FilterType and FilterID properties. SMARTS which
            fail to convert are skipped with a warning.
    """

    PatternMols = []
    for FilterType in OptionsInfo["SpecifiedFilterTypes"]:
        for Index, Pattern in enumerate(OptionsInfo["PAINSFiltersMap"]["SMARTS"][FilterType]):
            ID = OptionsInfo["PAINSFiltersMap"]["IDs"][FilterType][Index]

            PatternMol = Chem.MolFromSmarts(Pattern)
            if PatternMol is None:
                MiscUtil.PrintWarning("Failed to convert PAINS pattern, %s, into a molecule..." % Pattern)
                continue

            # Setup FilterType and FilterID as properties of PatternMol...
            PatternMol.SetProp("FilterType", FilterType)
            PatternMol.SetProp("FilterID", ID)

            PatternMols.append(PatternMol)

    return PatternMols

def ProcessPAINSMode():
    """Process specified PAINS mode.

    Retrieves the PAINS filters information and sets
    OptionsInfo["SpecifiedFilterTypes"] to all available filter types for
    'All' mode or to the types named in a comma delimited list. The type
    names are matched ignoring case and spaces.
    """

    OptionsInfo["PAINSMode"] = Options["--painsMode"]

    # Retrieve filters information...
    RetrievePAINSFiltersInfo()

    # Process PAINS mode...
    OptionsInfo["SpecifiedFilterTypes"] = OptionsInfo["PAINSFiltersMap"]["FilterTypes"]
    if re.match("^All$", OptionsInfo["PAINSMode"], re.I):
        return

    PAINSMode = re.sub(" ", "", OptionsInfo["PAINSMode"])
    if not len(PAINSMode):
        MiscUtil.PrintError("The PAINSMode mode specified using \"-p, --painsMode\" option are empty.")

    # Map lower case filter type names to the names used in the filters file...
    CanonicalFilterTypesMap = {}
    for FilterType in OptionsInfo["PAINSFiltersMap"]["FilterTypes"]:
        CanonicalFilterTypesMap[FilterType.lower()] = FilterType

    SpecifiedFilterTypes = []
    for FilterType in PAINSMode.split(","):
        CanonicalFilterType = FilterType.lower()
        if not CanonicalFilterType in CanonicalFilterTypesMap:
            MiscUtil.PrintError("The PAINS mode, %s, specified using \"-p, --painsMode\" is not valid. Supported PAINS modes: %s" % (FilterType, ", ".join(OptionsInfo["PAINSFiltersMap"]["FilterTypes"])))

        SpecifiedFilterTypes.append(CanonicalFilterTypesMap[CanonicalFilterType])

    OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes

def ProcessPAINSMatch():
    """Process specified PAINS match.

    Sets up the match flags along with the labels and property names used
    for writing out alerts information in OptionsInfo. It must be called
    after OptionsInfo["OutfileParams"] has been set up.
    """

    PAINSMatch = Options["--painsMatch"]

    MatchFirstAlert, MatchAllAlerts = [False] * 2
    if re.match("^First$", PAINSMatch, re.I):
        MatchFirstAlert = True
    elif re.match("^All$", PAINSMatch, re.I):
        MatchAllAlerts = True
    else:
        MiscUtil.PrintError("The value %s, specified using \"--painsMatch\" option is not valid. Supported values: First or All" % (PAINSMatch))

    OptionsInfo["PAINSMatch"] = PAINSMatch
    OptionsInfo["MatchFirstAlert"] = MatchFirstAlert
    OptionsInfo["MatchAllAlerts"] = MatchAllAlerts

    # Setup labels for writing out alerts match information...
    OptionsInfo["AlertsCountLabel"] = "PAINSAlertsCount"
    OptionsInfo["AlertsLabel"] = "FirstPAINSAlert" if MatchFirstAlert else "PAINSAlerts"

    # Write out alerts count only for match all alerts...
    OptionsInfo["WriteAlertsCount"] = True if MatchAllAlerts else False

    # Write out alerts match information to comma or tab delimited SMILES files...
    SMILESDelimiter = OptionsInfo["OutfileParams"]["SMILESDelimiter"]
    OptionsInfo["SetSMILESMolAlertsProp"] = True if re.match("^[\t,]", SMILESDelimiter, re.I) else False

    SMILESMolAlertsPropList = []
    if OptionsInfo["WriteAlertsCount"]:
        SMILESMolAlertsPropList.append(OptionsInfo["AlertsCountLabel"])
    SMILESMolAlertsPropList.append(OptionsInfo["AlertsLabel"])
    OptionsInfo["SMILESMolAlertsPropList"] = SMILESMolAlertsPropList

def RetrievePAINSFiltersInfo():
    """Retrieve information for PAINS filters.

    Reads PAINSFilters.csv from the MayaChemTools data directory and sets
    OptionsInfo["PAINSFiltersMap"] containing the filter types in order of
    their first appearance along with the IDs and SMARTS for each type.
    """

    MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
    PAINSFiltersFilePath = os.path.join(MayaChemToolsDataDir, "PAINSFilters.csv")

    MiscUtil.PrintInfo("\nRetrieving PAINS SMARTS patterns from file %s" % (PAINSFiltersFilePath))

    Delimiter = ','
    QuoteChar = '"'
    IgnoreHeaderLine = True
    FilterLinesWords = MiscUtil.GetTextLinesWords(PAINSFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine)

    PAINSFiltersMap = {}
    PAINSFiltersMap["FilterTypes"] = []
    PAINSFiltersMap["IDs"] = {}
    PAINSFiltersMap["SMARTS"] = {}

    # The first three columns of each line are used as filter type, ID and
    # SMARTS, in that order...
    for LineWords in FilterLinesWords:
        FilterType = LineWords[0]
        ID = LineWords[1]
        SMARTS = LineWords[2]

        if not FilterType in PAINSFiltersMap["FilterTypes"]:
            PAINSFiltersMap["FilterTypes"].append(FilterType)
            PAINSFiltersMap["IDs"][FilterType] = []
            PAINSFiltersMap["SMARTS"][FilterType] = []

        PAINSFiltersMap["IDs"][FilterType].append(ID)
        PAINSFiltersMap["SMARTS"][FilterType].append(SMARTS)

    OptionsInfo["PAINSFiltersMap"] = PAINSFiltersMap

    MiscUtil.PrintInfo("\nTotal number filters: %d" % len(FilterLinesWords))
    MiscUtil.PrintInfo("Number of filter family types: %d\nFilter familty types: %s\n" % (len(PAINSFiltersMap["FilterTypes"]), ", ".join(PAINSFiltersMap["FilterTypes"])))

    for FilterType in PAINSFiltersMap["FilterTypes"]:
        MiscUtil.PrintInfo("Filter family type: %s; Number of filters: %d" % (FilterType, len(PAINSFiltersMap["IDs"][FilterType])))

def ProcessOptions():
    """Process and validate command line arguments and options.

    Validates the values in Options and populates OptionsInfo with the
    processed values used by the rest of the script.
    """

    MiscUtil.PrintInfo("Processing options...")

    # Validate options...
    ValidateOptions()

    OptionsInfo["Infile"] = Options["--infile"]
    OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])

    OptionsInfo["Outfile"] = Options["--outfile"]
    ParamsDefaultInfoOverride = {"SMILESMolProps": True}
    OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"], ParamsDefaultInfo = ParamsDefaultInfoOverride)

    # The name of the filtered outfile is built from the outfile name and
    # extension only; the directory part is not used...
    FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
    OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt)
    OptionsInfo["OutfileFiltered"] = OutfileFiltered
    OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False

    OptionsInfo["Overwrite"] = Options["--overwrite"]

    OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False
    OptionsInfo["NegateMatch"] = True if re.match("^yes$", Options["--negate"], re.I) else False

    OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
    OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])

    # ProcessPAINSMatch() relies on OptionsInfo["OutfileParams"] set up above...
    ProcessPAINSMode()
    ProcessPAINSMatch()

def RetrieveOptions():
    """Retrieve command line arguments and options.

    Parses the command line using docopt into the module level Options,
    changes to the specified working directory, and prints examples and
    exits when the examples option is specified.
    """

    # Get options...
    global Options
    Options = docopt(_docoptUsage_)

    # Set current working directory to the specified directory...
    WorkingDir = Options["--workingdir"]
    if WorkingDir:
        os.chdir(WorkingDir)

    # Handle examples option...
    if "--examples" in Options and Options["--examples"]:
        MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
        sys.exit(0)

def ValidateOptions():
    """Validate option values.

    Checks file paths, file extensions and the text values of the options
    using the MiscUtil validation functions.
    """

    MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
    MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")

    MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
    MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
    MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])

    MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")

    MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
    if re.match("^filter$", Options["--mode"], re.I):
        if not Options["--outfile"]:
            MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, --mode\" option")

    MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
    MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no")

    MiscUtil.ValidateOptionTextValue("--painsMatch", Options["--painsMatch"], "First All")

# Setup a usage string for docopt...
# The usage string is both the help text and the option specification parsed
# by docopt.
#
# NOTE(review): the original whitespace of this string was lost in extraction
# and has been reconstructed. docopt relies on it: an option and its
# "[default: ...]" text must be separated by at least two spaces, and,
# depending on the docopt version, a completely empty line may end the
# "Options:" section. The separator lines inside the "Options:" section
# therefore contain spaces instead of being empty - confirm the layout
# against the upstream file.
_docoptUsage_ = """
RDKitFilterPAINS.py - Filter PAINS molecules

Usage:
    RDKitFilterPAINS.py [--infileParams <Name,Value,...>] [--mode <filter or count>]
                        [--mp <yes or no>] [--mpParams <Name,Value,...>]
                        [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...> ]
                        [--painsMode <All or A, B, C>] [--painsMatch <First or All>] [--negate <yes or no>]
                        [--overwrite] [-w <dir>] -i <infile> -o <outfile>
    RDKitFilterPAINS.py -h | --help | -e | --examples

Description:
    Filter Pan-assay Interference molecules (PAINS) [ Ref 130 - 131 ] from an input
    file by performing a substructure search using SMARTS pattern specified in
    MAYACHEMTOOLS/lib/data/PAINSFilters.csv file and write out appropriate
    molecules to an output file or simply count the number of filtered molecules.

    The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
    .tsv, .txt)

    The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)

Options:
    -e, --examples
        Print examples.
    -h, --help
        Print this help message.
    -i, --infile <infile>
        Input file name.
    --infileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for reading
        molecules from files. The supported parameter names for different file
        formats, along with their default values, are shown below:
        
            SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
            SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
                smilesTitleLine,auto,sanitize,yes
        
        Possible values for smilesDelimiter: space, comma or tab.
    -m, --mode <filter or count>  [default: filter]
        Specify whether to filter the matched molecules and write out the rest of the
        molecules to an outfile or simply count the number of matched molecules
        marked for filtering.
    --mp <yes or no>  [default: no]
        Use multiprocessing.
        
        By default, input data is retrieved in a lazy manner via mp.Pool.imap()
        function employing lazy RDKit data iterable. This allows processing of
        arbitrary large data sets without any additional requirements memory.
        
        All input data may be optionally loaded into memory by mp.Pool.map()
        before starting worker processes in a process pool by setting the value
        of 'inputDataMode' to 'InMemory' in '--mpParams' option.
        
        A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
        data mode may adversely impact the performance. The '--mpParams' section
        provides additional information to tune the value of 'chunkSize'.
    --mpParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs to configure
        multiprocessing.
        
        The supported parameter names along with their default and possible
        values are shown below:
        
            chunkSize, auto
            inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
            numProcesses, auto   [ Default: mp.cpu_count() ]
        
        These parameters are used by the following functions to configure and
        control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
        mp.Pool.imap().
        
        The chunkSize determines chunks of input data passed to each worker
        process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
        The default value of chunkSize is dependent on the value of 'inputDataMode'.
        
        The mp.Pool.map() function, invoked during 'InMemory' input data mode,
        automatically converts RDKit data iterable into a list, loads all data into
        memory, and calculates the default chunkSize using the following method
        as shown in its code:
        
            chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
            if extra: chunkSize += 1
        
        For example, the default chunkSize will be 7 for a pool of 4 worker processes
        and 100 data items.
        
        The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
        'lazy' RDKit data iterable to retrieve data as needed, without loading all the
        data into memory. Consequently, the size of input data is not known a priori.
        It's not possible to estimate an optimal value for the chunkSize. The default
        chunkSize is set to 1.
        
        The default value for the chunkSize during 'Lazy' data mode may adversely
        impact the performance due to the overhead associated with exchanging
        small chunks of data. It is generally a good idea to explicitly set chunkSize to
        a larger value during 'Lazy' input data mode, based on the size of your input
        data and number of processes in the process pool.
        
        The mp.Pool.map() function waits for all worker processes to process all
        the data and return the results. The mp.Pool.imap() function, however,
        returns the the results obtained from worker processes as soon as the
        results become available for specified chunks of data.
        
        The order of data in the results returned by both mp.Pool.map() and
        mp.Pool.imap() functions always corresponds to the input data.
    -n, --negate <yes or no>  [default: no]
        Specify whether to filter molecules not matching the PAINS filters specified by
        SMARTS patterns.
    -o, --outfile <outfile>
        Output file name.
    --outfileFiltered <yes or no>  [default: no]
        Write out a file containing filtered molecules. Its name is automatically
        generated from the specified output file. Default: <OutfileRoot>_
        Filtered.<OutfileExt>.
    --outfileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for writing
        molecules to files. The supported parameter names for different file
        formats, along with their default values, are shown below:
        
            SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
            SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
                smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,yes
        
        Default value for compute2DCoords: yes for SMILES input file; no for all other
        file types.
    --overwrite
        Overwrite existing files.
    -p, --painsMode <All or A, B, or C>  [default: All]
        All or a comma delimited list of PAINS filter family type to used for
        filtering molecules.
    --painsMatch <First or All>  [default: First]
        Stop after matching only first PAINS pattern or match all patterns for
        filtering molecules.
        
        The 'PAINSAlertsCount' and 'PAINSAlerts' data fields are added to
        SD file containing filtered molecules for 'All' value of '--painsMatch'. In
        addition, these data fields are only written to tab or comma delimited
        SMILES file.
        
        Format:
        
            > <PAINSAlertsCount>
            Number
        
            > <PAINSAlerts>
            FilterType: ID; FilterType: ID... ... ...
        
    -w, --workingdir <dir>
        Location of working directory which defaults to the current directory.

Examples:
    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns and write out a SMILES file, type:

        % RDKitFilterPAINS.py -i Sample.smi -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns and write out a SMILES file containing these and filtered
    molecules along with the alerts information for filtered molecules matching
    first pattern, type:

        % RDKitFilterPAINS.py --outfileFiltered yes --outfileParams
          "SMILESDelimiter,comma" -i Sample.smi -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding
    to PAINS SMARTS patterns and write out comma delmited SMILES files containing
    these and filtered molecules along with the alerts information for filtered
    molecules matching all patterns, type:

        % RDKitFilterPAINS.py --painsMatch All --outfileFiltered yes
          --outfileParams "SMILESDelimiter,comma" -i Sample.sdf
          -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding
    to PAINS SMARTS patterns and write out comma delmited SD files containing
    these and filtered molecules along with the alerts information for filtered
    molecules matching all patterns, type:

        % RDKitFilterPAINS.py --painsMatch All --outfileFiltered yes
          -i Sample.smi -o SampleOut.sdf

    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
    CPUs without loading all data into memory, and write out a SMILES file, type:

        % RDKitFilterPAINS.py --mp yes -i Sample.smi -o SampleOut.smi

    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
    CPUs by loading all data into memory, and write out a SD file, type:

        % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,InMemory"
          -i Sample.smi -o SampleOut.sdf

    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns, perform filtering in multiprocessing mode on specific
    number of CPUs and chunk size without loading all data into memory, and
    write out a SD file, type:

        % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,Lazy,
          numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.sdf

    To only count the number of molecules not containing any substructure corresponding
    to PAINS SMARTS patterns without writing out any file, type:

        % RDKitFilterPAINS.py -m count -i Sample.sdf -o SampleOut.smi

    To count the number of molecules containing any substructure corresponding to
    PAINS SMARTS patterns and write out a SD file with computed 2D coordinates,
    type:

        % RDKitFilterPAINS.py -n yes -i Sample.smi -o SampleOut.sdf

    To count the number of molecules not containing any substructure corresponding to
    PAINS SMARTS patterns family of Type A in a CSV SMILES file and write out a SD file, type:

        % RDKitFilterPAINS.py --painsMode A --infileParams
          "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
          smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
          -i SampleSMILES.csv -o SampleOut.sdf

Author:
    Manish Sud(msud@san.rr.com)

See also:
    RDKitFilterChEMBLAlerts.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py

Copyright:
    Copyright (C) 2024 Manish Sud. All rights reserved.

    The functionality available in this script is implemented using RDKit, an
    open source toolkit for cheminformatics developed by Greg Landrum.

    This file is part of MayaChemTools.

    MayaChemTools is free software; you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option) any
    later version.

"""

if __name__ == "__main__":
    main()