1 #!/bin/env python 2 # 3 # File: RDKitFilterPAINS.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2026 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 import os 32 import sys 33 import time 34 import re 35 import multiprocessing as mp 36 37 # RDKit imports... 38 try: 39 from rdkit import rdBase 40 from rdkit import Chem 41 from rdkit.Chem import AllChem 42 except ImportError as ErrMsg: 43 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 44 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 45 sys.exit(1) 46 47 # MayaChemTools imports... 
sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
try:
    from docopt import docopt
    import MiscUtil
    import RDKitUtil
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
    sys.exit(1)

ScriptName = os.path.basename(sys.argv[0])

# Raw command line options as returned by docopt (filled in by RetrieveOptions)...
Options = {}

# Processed and validated option values shared by all functions in the script
# (filled in by ProcessOptions and re-created in each worker process)...
OptionsInfo = {}


def main():
    """Start execution of the script.

    Retrieves and processes command line options, performs filtering, and
    reports the elapsed time.
    """

    MiscUtil.PrintInfo(
        "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
        % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
    )

    (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()

    # Retrieve command line arguments and options...
    RetrieveOptions()

    # Process and validate command line arguments and options...
    ProcessOptions()

    # Perform actions required by the script...
    PerformFiltering()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
    MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))


def PerformFiltering():
    """Filter molecules using SMARTS specified in PAINS filter file.

    Sets up PAINS pattern molecules, a molecule reader and the output
    writers, processes all molecules, and prints summary counts. The writers
    are closed even when processing fails.
    """

    # Setup PAINS patterns and pattern mols...
    MiscUtil.PrintInfo("\nSetting up PAINS pattern molecules for performing substructure search...")
    PAINSPatternMols = SetupPAINSPatternMols()

    # Setup a molecule reader...
    MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
    Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])

    # Set up molecule writers...
    Writer, WriterFiltered = SetupMoleculeWriters()

    try:
        MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered)
    finally:
        # Close the writers on both success and failure to flush any data
        # written so far. Writers are None during count mode...
        for MolWriter in (Writer, WriterFiltered):
            if MolWriter is not None:
                MolWriter.close()

    MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
    MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
    MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))

    MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
    MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))


def ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered):
    """Process and filter molecules.

    Dispatches to the multiprocessing or single process implementation based
    on OptionsInfo["MPMode"].

    Arguments:
        Mols: Iterable of RDKit molecules; entries may be None.
        PAINSPatternMols (list): PAINS pattern molecules.
        Writer: Writer for remaining molecules or None.
        WriterFiltered: Writer for filtered molecules or None.

    Returns:
        tuple: (MolCount, ValidMolCount, RemainingMolCount).
    """

    if OptionsInfo["MPMode"]:
        return ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered)

    return ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered)


def _WriteFilteringResult(Mol, MolMatched, AlertsInfo, Writer, WriterFiltered):
    """Write a processed molecule to the appropriate writer.

    A molecule is treated as remaining when its match status equals the
    value of OptionsInfo["NegateMatch"]; otherwise, it is a filtered molecule
    which is only written out when OptionsInfo["OutfileFilteredMode"] is set.

    Arguments:
        Mol: RDKit molecule.
        MolMatched (bool): Whether the molecule matched a PAINS pattern.
        AlertsInfo (list or None): Matched alerts for the molecule.
        Writer: Writer for remaining molecules or None.
        WriterFiltered: Writer for filtered molecules or None.

    Returns:
        bool: True for a remaining molecule; otherwise, False.
    """

    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]

    if MolMatched == OptionsInfo["NegateMatch"]:
        WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords)
        return True

    if OptionsInfo["OutfileFilteredMode"]:
        WriteMolecule(WriterFiltered, Mol, AlertsInfo, Compute2DCoords)

    return False


def ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered):
    """Process and filter molecules using a single process.

    Arguments:
        Mols: Iterable of RDKit molecules; entries may be None.
        PAINSPatternMols (list): PAINS pattern molecules.
        Writer: Writer for remaining molecules or None.
        WriterFiltered: Writer for filtered molecules or None.

    Returns:
        tuple: (MolCount, ValidMolCount, RemainingMolCount).
    """

    SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]

    MiscUtil.PrintInfo("\nFiltering molecules...")

    MolCount, ValidMolCount, RemainingMolCount = 0, 0, 0
    FirstMol = True
    for Mol in Mols:
        MolCount += 1

        # Molecules which could not be read are counted but ignored...
        if Mol is None:
            continue

        if RDKitUtil.IsMolEmpty(Mol):
            MolName = RDKitUtil.GetMolName(Mol, MolCount)
            MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
            continue

        ValidMolCount += 1

        # Writer properties for SMILES output are set up once using the first
        # valid molecule...
        if FirstMol:
            FirstMol = False
            if SetSMILESMolProps:
                SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol)

        MolMatched, AlertsInfo = DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols)
        if _WriteFilteringResult(Mol, MolMatched, AlertsInfo, Writer, WriterFiltered):
            RemainingMolCount += 1

    return (MolCount, ValidMolCount, RemainingMolCount)


def ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered):
    """Process and filter molecules using multiprocessing.

    Pattern matching is performed by worker processes in a process pool.
    Results are collected and written out by this process. The process pool
    is always shut down before returning: closed after all results have been
    consumed or terminated after a failure.

    Arguments:
        Mols: Iterable of RDKit molecules; entries may be None.
        PAINSPatternMols (list): PAINS pattern molecules.
        Writer: Writer for remaining molecules or None.
        WriterFiltered: Writer for filtered molecules or None.

    Returns:
        tuple: (MolCount, ValidMolCount, RemainingMolCount).
    """

    MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")

    MPParams = OptionsInfo["MPParams"]
    SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]

    # Validate input data mode before spawning any worker processes...
    # NOTE(review): the code after the PrintError call presumably relies on
    # MiscUtil.PrintError terminating the script - confirm.
    InputDataMode = MPParams["InputDataMode"]
    LazyMode = bool(re.match("^Lazy$", InputDataMode, re.I))
    if not LazyMode and not re.match("^InMemory$", InputDataMode, re.I):
        MiscUtil.PrintError('The value, %s, specified for "--inputDataMode" is not supported.' % (InputDataMode))

    # Setup data for initializing a worker process...
    MiscUtil.PrintInfo("Encoding options info and PAINS pattern molecules...")
    OptionsInfo["EncodedPAINSPatternMols"] = [
        RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in PAINSPatternMols
    ]
    InitializeWorkerProcessArgs = (
        MiscUtil.ObjectToBase64EncodedString(Options),
        MiscUtil.ObjectToBase64EncodedString(OptionsInfo),
    )

    # Setup a encoded mols data iterable for a worker process...
    WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)

    # Setup process pool along with data initialization for each process...
    MiscUtil.PrintInfo(
        "\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if LazyMode else "mp.Pool.map()")
    )
    MiscUtil.PrintInfo(
        "NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n"
        % (
            MPParams["NumProcesses"],
            InputDataMode,
            ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]),
        )
    )

    ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)

    MolCount, ValidMolCount, RemainingMolCount = 0, 0, 0
    Completed = False
    try:
        # Start processing...
        if LazyMode:
            Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
        else:
            Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])

        FirstMol = True
        for Result in Results:
            MolCount += 1
            _MolIndex, EncodedMol, MolMatched, AlertsInfo = Result

            # Worker processes return None for missing and empty molecules...
            if EncodedMol is None:
                continue
            ValidMolCount += 1

            Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)

            if FirstMol:
                FirstMol = False
                if SetSMILESMolProps:
                    SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol)

            if _WriteFilteringResult(Mol, MolMatched, AlertsInfo, Writer, WriterFiltered):
                RemainingMolCount += 1

        Completed = True
    finally:
        # Shut down worker processes: a clean close after all results have
        # been consumed; otherwise, terminate to avoid waiting on pending work...
        if Completed:
            ProcessPool.close()
        else:
            ProcessPool.terminate()
        ProcessPool.join()

    return (MolCount, ValidMolCount, RemainingMolCount)


def InitializeWorkerProcess(*EncodedArgs):
    """Initialize data for a worker process.

    Arguments:
        EncodedArgs: Base64 encoded Options and OptionsInfo objects, in that
            order, as set up by ProcessMoleculesUsingMultipleProcesses.
    """

    global Options, OptionsInfo

    MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())

    # Decode Options and OptionInfo...
    Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
    OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])

    # Decode PAINSPatternMols...
    OptionsInfo["PAINSPatternMols"] = [
        RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedPAINSPatternMols"]
    ]


def WorkerProcess(EncodedMolInfo):
    """Process data for a worker process.

    Arguments:
        EncodedMolInfo: A (MolIndex, EncodedMol) pair; EncodedMol may be None.

    Returns:
        list: [MolIndex, EncodedMol, MolMatched, AlertsInfo]. EncodedMol is
            None for missing and empty molecules.
    """

    MolIndex, EncodedMol = EncodedMolInfo

    if EncodedMol is None:
        return [MolIndex, None, False, None]

    Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
    if RDKitUtil.IsMolEmpty(Mol):
        MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
        MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
        return [MolIndex, None, False, None]

    MolMatched, AlertsInfo = DoesMoleculeContainsPAINSPattern(Mol, OptionsInfo["PAINSPatternMols"])

    return [MolIndex, EncodedMol, MolMatched, AlertsInfo]


def WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords):
    """Write out molecule.

    Nothing is written during count mode. Any alerts information is attached
    to the molecule as properties before it is written out.

    Arguments:
        Writer: Molecule writer.
        Mol: RDKit molecule.
        AlertsInfo (list or None): Matched alerts for the molecule.
        Compute2DCoords (bool): Compute 2D coordinates before writing.
    """

    if OptionsInfo["CountMode"]:
        return

    if Compute2DCoords:
        AllChem.Compute2DCoords(Mol)

    if AlertsInfo:
        if OptionsInfo["WriteAlertsCount"]:
            Mol.SetProp(OptionsInfo["AlertsCountLabel"], "%s" % len(AlertsInfo))
        Mol.SetProp(OptionsInfo["AlertsLabel"], "; ".join(AlertsInfo))

    Writer.write(Mol)


def SetupMoleculeWriters():
    """Setup molecule writers.

    Returns:
        tuple: (Writer, WriterFiltered). Both are None during count mode;
            WriterFiltered is also None unless writing of filtered molecules
            is requested.
    """

    Writer = None
    WriterFiltered = None

    if OptionsInfo["CountMode"]:
        return (Writer, WriterFiltered)

    Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output file %s " % OptionsInfo["Outfile"])
    MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])

    if OptionsInfo["OutfileFilteredMode"]:
        WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
        if WriterFiltered is None:
            MiscUtil.PrintError("Failed to setup a writer for output file %s " % OptionsInfo["OutfileFiltered"])
        MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])

    return (Writer, WriterFiltered)


def SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol):
    """Setup properties to write for SMILES molecule writers.

    The alerts properties are set for the writer which receives molecules
    matching PAINS patterns: Writer during negate match; otherwise,
    WriterFiltered.

    Arguments:
        Writer: Writer for remaining molecules or None.
        WriterFiltered: Writer for filtered molecules or None.
        Mol: RDKit molecule used to set up writer properties.
    """

    if not OptionsInfo["OutfileParams"]["SetSMILESMolProps"]:
        return

    NegateMatch = OptionsInfo["NegateMatch"]
    SetSMILESMolAlertsProp = OptionsInfo["SetSMILESMolAlertsProp"]
    SMILESMolAlertsPropList = OptionsInfo["SMILESMolAlertsPropList"]

    # NOTE(review): SetProps() below is called after SetWriterMolProps(); it may
    # replace rather than extend the properties set up from Mol - confirm.
    if Writer is not None:
        RDKitUtil.SetWriterMolProps(Writer, Mol)
        if SetSMILESMolAlertsProp and NegateMatch:
            Writer.SetProps(SMILESMolAlertsPropList)

    if WriterFiltered is not None:
        RDKitUtil.SetWriterMolProps(WriterFiltered, Mol)
        if SetSMILESMolAlertsProp and not NegateMatch:
            WriterFiltered.SetProps(SMILESMolAlertsPropList)


def DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols):
    """Check presence of PAINS pattern in the molecule.

    Matching stops at the first matched pattern unless
    OptionsInfo["MatchAllAlerts"] is set.

    Arguments:
        Mol: RDKit molecule.
        PAINSPatternMols (list): Pattern molecules containing FilterType and
            FilterID properties.

    Returns:
        tuple: (MolMatched, AlertsInfo). AlertsInfo is a list of
            "FilterType: FilterID" strings or None for no match.
    """

    MatchAllAlerts = OptionsInfo["MatchAllAlerts"]

    AlertsInfo = []
    for PatternMol in PAINSPatternMols:
        if not Mol.HasSubstructMatch(PatternMol, useChirality=True):
            continue

        AlertsInfo.append("%s: %s" % (PatternMol.GetProp("FilterType"), PatternMol.GetProp("FilterID")))
        if not MatchAllAlerts:
            break

    if not AlertsInfo:
        return (False, None)

    return (True, AlertsInfo)


def SetupPAINSPatternMols():
    """Set up PAINS pattern mols for substructure search corresponding to PAINS mode.

    Returns:
        list: Pattern molecules for the specified filter types. SMARTS
            patterns which can't be converted into molecules are skipped
            with a warning.
    """

    PAINSFiltersMap = OptionsInfo["PAINSFiltersMap"]

    PatternMols = []
    for FilterType in OptionsInfo["SpecifiedFilterTypes"]:
        # IDs and SMARTS lists are filled in lockstep by RetrievePAINSFiltersInfo...
        for ID, Pattern in zip(PAINSFiltersMap["IDs"][FilterType], PAINSFiltersMap["SMARTS"][FilterType]):
            PatternMol = Chem.MolFromSmarts(Pattern)
            if PatternMol is None:
                MiscUtil.PrintWarning("Failed to convert PAINS pattern, %s, into a molecule..." % Pattern)
                continue

            # Setup FilterType and FilterID as properties of PatternMol...
            PatternMol.SetProp("FilterType", FilterType)
            PatternMol.SetProp("FilterID", ID)

            PatternMols.append(PatternMol)

    return PatternMols


def ProcessPAINSMode():
    """Process specified PAINS mode.

    Sets OptionsInfo["SpecifiedFilterTypes"] to all available filter family
    types for 'All' or to the specified comma delimited family types. Family
    types are matched in a case insensitive manner; spaces are ignored and
    duplicates are dropped.
    """

    OptionsInfo["PAINSMode"] = Options["--painsMode"]

    # Retrieve filters information...
    RetrievePAINSFiltersInfo()

    # Process PAINS mode...
    FilterTypes = OptionsInfo["PAINSFiltersMap"]["FilterTypes"]
    OptionsInfo["SpecifiedFilterTypes"] = FilterTypes

    # Strip spaces before checking for 'All' to allow values such as " All "...
    PAINSMode = re.sub(" ", "", OptionsInfo["PAINSMode"])
    if re.match("^All$", PAINSMode, re.I):
        return

    if not len(PAINSMode):
        MiscUtil.PrintError('The value specified using "-p, --painsMode" option is empty.')

    CanonicalFilterTypesMap = {}
    for FilterType in FilterTypes:
        CanonicalFilterTypesMap[FilterType.lower()] = FilterType

    SpecifiedFilterTypes = []
    for FilterType in PAINSMode.split(","):
        CanonicalFilterType = FilterType.lower()
        if CanonicalFilterType not in CanonicalFilterTypesMap:
            MiscUtil.PrintError(
                'The PAINS mode, %s, specified using "-p, --painsMode" is not valid. Supported PAINS modes: %s'
                % (FilterType, ", ".join(FilterTypes))
            )

        # Skip duplicates to avoid matching the same patterns multiple times...
        SpecifiedFilterType = CanonicalFilterTypesMap[CanonicalFilterType]
        if SpecifiedFilterType not in SpecifiedFilterTypes:
            SpecifiedFilterTypes.append(SpecifiedFilterType)

    OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes


def ProcessPAINSMatch():
    """Process specified PAINS match.

    Sets up match mode flags along with the labels and properties used for
    writing out alerts information.
    """

    PAINSMatch = Options["--painsMatch"]

    MatchFirstAlert, MatchAllAlerts = False, False
    if re.match("^First$", PAINSMatch, re.I):
        MatchFirstAlert = True
    elif re.match("^All$", PAINSMatch, re.I):
        MatchAllAlerts = True
    else:
        MiscUtil.PrintError(
            'The value %s, specified using "--painsMatch" option is not valid. Supported values: First or All'
            % (PAINSMatch)
        )

    OptionsInfo["PAINSMatch"] = PAINSMatch
    OptionsInfo["MatchFirstAlert"] = MatchFirstAlert
    OptionsInfo["MatchAllAlerts"] = MatchAllAlerts

    # Setup labels for writing out alerts match information...
    OptionsInfo["AlertsCountLabel"] = "PAINSAlertsCount"
    OptionsInfo["AlertsLabel"] = "FirstPAINSAlert" if MatchFirstAlert else "PAINSAlerts"

    # Write out alerts count only for match all alerts...
    OptionsInfo["WriteAlertsCount"] = bool(MatchAllAlerts)

    # Write out alerts match information to comma or tab delimited SMILES files...
    SMILESDelimiter = OptionsInfo["OutfileParams"]["SMILESDelimiter"]
    OptionsInfo["SetSMILESMolAlertsProp"] = bool(re.match("^[\t,]", SMILESDelimiter, re.I))

    SMILESMolAlertsPropList = []
    if OptionsInfo["WriteAlertsCount"]:
        SMILESMolAlertsPropList.append(OptionsInfo["AlertsCountLabel"])
    SMILESMolAlertsPropList.append(OptionsInfo["AlertsLabel"])
    OptionsInfo["SMILESMolAlertsPropList"] = SMILESMolAlertsPropList


def RetrievePAINSFiltersInfo():
    """Retrieve information for PAINS filters.

    Reads PAINSFilters.csv from the MayaChemTools data directory and sets
    OptionsInfo["PAINSFiltersMap"] containing the following keys:

        FilterTypes: List of filter family types in order of appearance
        IDs: Filter IDs for each filter family type
        SMARTS: SMARTS patterns for each filter family type

    Lines containing fewer than three fields are skipped with a warning.
    """

    MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
    PAINSFiltersFilePath = os.path.join(MayaChemToolsDataDir, "PAINSFilters.csv")

    MiscUtil.PrintInfo("\nRetrieving PAINS SMARTS patterns from file %s" % (PAINSFiltersFilePath))

    Delimiter = ","
    QuoteChar = '"'
    IgnoreHeaderLine = True
    FilterLinesWords = MiscUtil.GetTextLinesWords(PAINSFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine)

    PAINSFiltersMap = {}
    PAINSFiltersMap["FilterTypes"] = []
    PAINSFiltersMap["IDs"] = {}
    PAINSFiltersMap["SMARTS"] = {}

    FiltersCount = 0
    for LineWords in FilterLinesWords:
        # Each line is expected to contain: FilterType, ID, SMARTS...
        if len(LineWords) < 3:
            MiscUtil.PrintWarning(
                "Ignoring line with missing values in file %s: %s" % (PAINSFiltersFilePath, LineWords)
            )
            continue

        FilterType, ID, SMARTS = LineWords[0], LineWords[1], LineWords[2]

        if FilterType not in PAINSFiltersMap["FilterTypes"]:
            PAINSFiltersMap["FilterTypes"].append(FilterType)
            PAINSFiltersMap["IDs"][FilterType] = []
            PAINSFiltersMap["SMARTS"][FilterType] = []

        PAINSFiltersMap["IDs"][FilterType].append(ID)
        PAINSFiltersMap["SMARTS"][FilterType].append(SMARTS)
        FiltersCount += 1

    OptionsInfo["PAINSFiltersMap"] = PAINSFiltersMap

    MiscUtil.PrintInfo("\nTotal number of filters: %d" % FiltersCount)
    MiscUtil.PrintInfo(
        "Number of filter family types: %d\nFilter family types: %s\n"
        % (len(PAINSFiltersMap["FilterTypes"]), ", ".join(PAINSFiltersMap["FilterTypes"]))
    )

    for FilterType in PAINSFiltersMap["FilterTypes"]:
        MiscUtil.PrintInfo(
            "Filter family type: %s; Number of filters: %d" % (FilterType, len(PAINSFiltersMap["IDs"][FilterType]))
        )


def ProcessOptions():
    """Process and validate command line arguments and options.

    Validates the raw values in Options and fills in OptionsInfo.
    """

    MiscUtil.PrintInfo("Processing options...")

    # Validate options...
    ValidateOptions()

    OptionsInfo["Infile"] = Options["--infile"]
    OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters(
        "--infileParams", Options["--infileParams"], Options["--infile"]
    )

    OptionsInfo["Outfile"] = Options["--outfile"]
    ParamsDefaultInfoOverride = {"SMILESMolProps": True}
    OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters(
        "--outfileParams",
        Options["--outfileParams"],
        Options["--infile"],
        Options["--outfile"],
        ParamsDefaultInfo=ParamsDefaultInfoOverride,
    )

    # NOTE(review): FileDir is not used for the filtered file name; the file is
    # presumably generated in the current directory - confirm this is intended.
    FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
    OptionsInfo["OutfileFiltered"] = "%s_Filtered.%s" % (FileName, FileExt)
    OptionsInfo["OutfileFilteredMode"] = bool(re.match("^yes$", Options["--outfileFiltered"], re.I))

    OptionsInfo["Overwrite"] = Options["--overwrite"]

    OptionsInfo["CountMode"] = bool(re.match("^count$", Options["--mode"], re.I))
    OptionsInfo["NegateMatch"] = bool(re.match("^yes$", Options["--negate"], re.I))

    OptionsInfo["MPMode"] = bool(re.match("^yes$", Options["--mp"], re.I))
    OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])

    # PAINS match processing relies on OutfileParams set up above...
    ProcessPAINSMode()
    ProcessPAINSMatch()


def RetrieveOptions():
    """Retrieve command line arguments and options.

    Parses the command line using docopt, changes to the specified working
    directory, and handles the examples option by printing examples and
    exiting.
    """

    # Get options...
    global Options
    Options = docopt(_docoptUsage_)

    # Set current working directory to the specified directory...
    WorkingDir = Options["--workingdir"]
    if WorkingDir:
        os.chdir(WorkingDir)

    # Handle examples option...
    if "--examples" in Options and Options["--examples"]:
        MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
        sys.exit(0)


def ValidateOptions():
    """Validate option values."""

    MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
    MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")

    MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
    MiscUtil.ValidateOptionsOutputFileOverwrite(
        "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
    )
    MiscUtil.ValidateOptionsDistinctFileNames(
        "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]
    )

    MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")

    MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
    if re.match("^filter$", Options["--mode"], re.I):
        if not Options["--outfile"]:
            MiscUtil.PrintError(
                'The outfile must be specified using "-o, --outfile" during "filter" value of "-m, --mode" option'
            )

    MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
    MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no")

    MiscUtil.ValidateOptionTextValue("--painsMatch", Options["--painsMatch"], "First All")


# Setup a usage string for docopt...
590 _docoptUsage_ = """ 591 RDKitFilterPAINS.py - Filter PAINS molecules 592 593 Usage: 594 RDKitFilterPAINS.py [--infileParams <Name,Value,...>] [--mode <filter or count>] 595 [--mp <yes or no>] [--mpParams <Name,Value,...>] 596 [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...> ] 597 [--painsMode <All or A, B, C>] [--painsMatch <First or All>] [--negate <yes or no>] 598 [--overwrite] [-w <dir>] -i <infile> -o <outfile> 599 RDKitFilterPAINS.py -h | --help | -e | --examples 600 601 Description: 602 Filter Pan-assay Interference molecules (PAINS) [ Ref 130 - 131 ] from an input 603 file by performing a substructure search using SMARTS pattern specified in 604 MAYACHEMTOOLS/lib/data/PAINSFilters.csv file and write out appropriate 605 molecules to an output file or simply count the number of filtered molecules. 606 607 The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, 608 .tsv, .txt) 609 610 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi) 611 612 Options: 613 -e, --examples 614 Print examples. 615 -h, --help 616 Print this help message. 617 -i, --infile <infile> 618 Input file name. 619 --infileParams <Name,Value,...> [default: auto] 620 A comma delimited list of parameter name and value pairs for reading 621 molecules from files. The supported parameter names for different file 622 formats, along with their default values, are shown below: 623 624 SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes 625 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 626 smilesTitleLine,auto,sanitize,yes 627 628 Possible values for smilesDelimiter: space, comma or tab. 629 -m, --mode <filter or count> [default: filter] 630 Specify whether to filter the matched molecules and write out the rest of the 631 molecules to an outfile or simply count the number of matched molecules 632 marked for filtering. 633 --mp <yes or no> [default: no] 634 Use multiprocessing. 
635 636 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 637 function employing lazy RDKit data iterable. This allows processing of 638 arbitrary large data sets without any additional requirements memory. 639 640 All input data may be optionally loaded into memory by mp.Pool.map() 641 before starting worker processes in a process pool by setting the value 642 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 643 644 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 645 data mode may adversely impact the performance. The '--mpParams' section 646 provides additional information to tune the value of 'chunkSize'. 647 --mpParams <Name,Value,...> [default: auto] 648 A comma delimited list of parameter name and value pairs to configure 649 multiprocessing. 650 651 The supported parameter names along with their default and possible 652 values are shown below: 653 654 chunkSize, auto 655 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 656 numProcesses, auto [ Default: mp.cpu_count() ] 657 658 These parameters are used by the following functions to configure and 659 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 660 mp.Pool.imap(). 661 662 The chunkSize determines chunks of input data passed to each worker 663 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 664 The default value of chunkSize is dependent on the value of 'inputDataMode'. 665 666 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 667 automatically converts RDKit data iterable into a list, loads all data into 668 memory, and calculates the default chunkSize using the following method 669 as shown in its code: 670 671 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 672 if extra: chunkSize += 1 673 674 For example, the default chunkSize will be 7 for a pool of 4 worker processes 675 and 100 data items. 
676 677 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 678 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 679 data into memory. Consequently, the size of input data is not known a priori. 680 It's not possible to estimate an optimal value for the chunkSize. The default 681 chunkSize is set to 1. 682 683 The default value for the chunkSize during 'Lazy' data mode may adversely 684 impact the performance due to the overhead associated with exchanging 685 small chunks of data. It is generally a good idea to explicitly set chunkSize to 686 a larger value during 'Lazy' input data mode, based on the size of your input 687 data and number of processes in the process pool. 688 689 The mp.Pool.map() function waits for all worker processes to process all 690 the data and return the results. The mp.Pool.imap() function, however, 691 returns the the results obtained from worker processes as soon as the 692 results become available for specified chunks of data. 693 694 The order of data in the results returned by both mp.Pool.map() and 695 mp.Pool.imap() functions always corresponds to the input data. 696 -n, --negate <yes or no> [default: no] 697 Specify whether to filter molecules not matching the PAINS filters specified by 698 SMARTS patterns. 699 -o, --outfile <outfile> 700 Output file name. 701 --outfileFiltered <yes or no> [default: no] 702 Write out a file containing filtered molecules. Its name is automatically 703 generated from the specified output file. Default: <OutfileRoot>_ 704 Filtered.<OutfileExt>. 705 --outfileParams <Name,Value,...> [default: auto] 706 A comma delimited list of parameter name and value pairs for writing 707 molecules to files. 
The supported parameter names for different file 708 formats, along with their default values, are shown below: 709 710 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 711 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 712 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,yes 713 714 Default value for compute2DCoords: yes for SMILES input file; no for all other 715 file types. 716 --overwrite 717 Overwrite existing files. 718 -p, --painsMode <All or A, B, or C> [default: All] 719 All or a comma delimited list of PAINS filter family type to used for 720 filtering molecules. 721 --painsMatch <First or All> [default: First] 722 Stop after matching only first PAINS pattern or match all patterns for 723 filtering molecules. 724 725 The 'PAINSAlertCount' and 'PAINSAlerts' data fields are added to 726 SD file containing filtered molecules for 'All' value of '-painsMatch'. In 727 addition, these data fields are only written to tab or comma delimited 728 SMILES file. 729 730 Format: 731 732 > <PAINSAlertsCount> 733 Number 734 735 > <PAINSAlerts> 736 FilterType: ID; FilterType: ID... ... ...`` 737 738 -w, --workingdir <dir> 739 Location of working directory which defaults to the current directory. 
740 741 Examples: 742 To count the number of molecules not containing any substructure corresponding to 743 PAINS SMARTS patterns and write out a SMILES file, type: 744 745 % RDKitFilterPAINS.py -i Sample.smi -o SampleOut.smi 746 747 To count the number of molecules not containing any substructure corresponding to 748 PAINS SMARTS patterns and write out a SMILES file containing these and filtered 749 molecules along with the alerts information for filtered molecules matching 750 first pattern, type: 751 752 % RDKitFilterPAINS.py --outfileFiltered yes --outfileParams 753 "SMILESDelimiter,comma" -i Sample.smi -o SampleOut.smi 754 755 To count the number of molecules not containing any substructure corresponding 756 to PAINS SMARTS patterns and write out comma delmited SMILES files containing 757 these and filtered molecules along with the alerts information for filtered 758 molecules matching all patterns, type: 759 760 % RDKitFilterPAINS.py --painsMatch All --outfileFiltered yes 761 --outfileParams "SMILESDelimiter,comma" -i Sample.sdf 762 -o SampleOut.smi 763 764 To count the number of molecules not containing any substructure corresponding 765 to PAINS SMARTS patterns and write out comma delmited SD files containing 766 these and filtered molecules along with the alerts information for filtered 767 molecules matching all patterns, type: 768 769 % RDKitFilterPAINS.py --painsMatch All --outfileFiltered yes 770 -i Sample.smi -o SampleOut.sdf 771 772 To count the number of molecules not containing any substructure corresponding to 773 PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available 774 CPUs without loading all data into memory, and write out a SMILES file, type: 775 776 % RDKitFilterPAINS.py --mp yes -i Sample.smi -o SampleOut.smi 777 778 To count the number of molecules not containing any substructure corresponding to 779 PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available 780 CPUs by loading all data 
into memory, and write out a SD file, type: 781 782 % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,InMemory" 783 -i Sample.smi -o SampleOut.sdf 784 785 To count the number of molecules not containing any substructure corresponding to 786 PAINS SMARTS patterns, perform filtering in multiprocessing mode on specific 787 number of CPUs and chunk size without loading all data into memory, and 788 write out a SD file, type: 789 790 % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,Lazy, 791 numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.sdf 792 793 To only count the number of molecules not containing any substructure corresponding 794 to PAINS SMARTS patterns without writing out any file, type: 795 796 % RDKitFilterPAINS.py -m count -i Sample.sdf -o SampleOut.smi 797 798 To count the number of molecules containing any substructure corresponding to 799 PAINS SMARTS patterns and write out a SD file with computed 2D coordinates, 800 type: 801 802 % RDKitFilterPAINS.py -n yes -i Sample.smi -o SampleOut.sdf 803 804 To count the number of molecules not containing any substructure corresponding to 805 PAINS SMARTS patterns family of Type A in a CSV SMILES file and write out a SD file, type: 806 807 % RDKitFilterPAINS.py --painsMode A --infileParams 808 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 809 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 810 -i SampleSMILES.csv -o SampleOut.sdf 811 812 Author: 813 Manish Sud(msud@san.rr.com) 814 815 See also: 816 RDKitFilterChEMBLAlerts.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py 817 818 Copyright: 819 Copyright (C) 2026 Manish Sud. All rights reserved. 820 821 The functionality available in this script is implemented using RDKit, an 822 open source toolkit for cheminformatics developed by Greg Landrum. 823 824 This file is part of MayaChemTools. 
825 826 MayaChemTools is free software; you can redistribute it and/or modify it under 827 the terms of the GNU Lesser General Public License as published by the Free 828 Software Foundation; either version 3 of the License, or (at your option) any 829 later version. 830 831 """ 832 833 if __name__ == "__main__": 834 main()