MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitFilterPAINS.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2026 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 import os
  32 import sys
  33 import time
  34 import re
  35 import multiprocessing as mp
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem
  42 except ImportError as ErrMsg:
  43     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  44     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  45     sys.exit(1)
  46 
  47 # MayaChemTools imports...
  48 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  49 try:
  50     from docopt import docopt
  51     import MiscUtil
  52     import RDKitUtil
  53 except ImportError as ErrMsg:
  54     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  55     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  56     sys.exit(1)
  57 
# Base name of the running script; used in the start and finish messages printed by main().
ScriptName = os.path.basename(sys.argv[0])
# Raw command line options; replaced by the docopt() result in RetrieveOptions().
Options = {}
# Processed option values and derived settings; filled in by ProcessOptions() and its helpers,
# and rebuilt inside each worker process by InitializeWorkerProcess().
OptionsInfo = {}
  61 
  62 
  63 def main():
  64     """Start execution of the script."""
  65 
  66     MiscUtil.PrintInfo(
  67         "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
  68         % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
  69     )
  70 
  71     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  72 
  73     # Retrieve command line arguments and options...
  74     RetrieveOptions()
  75 
  76     # Process and validate command line arguments and options...
  77     ProcessOptions()
  78 
  79     # Perform actions required by the script...
  80     PerformFiltering()
  81 
  82     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  83     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  84 
  85 
  86 def PerformFiltering():
  87     """Filter molecules using SMARTS specified in PAINS filter file."""
  88 
  89     # Setup PAINS patterns and pattern mols...
  90     MiscUtil.PrintInfo("\nSetting up PAINS pattern molecules for performing substructure search...")
  91     PAINSPatternMols = SetupPAINSPatternMols()
  92 
  93     # Setup a molecule reader...
  94     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  95     Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  96 
  97     # Set up molecule writers...
  98     Writer, WriterFiltered = SetupMoleculeWriters()
  99 
 100     MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered)
 101 
 102     if Writer is not None:
 103         Writer.close()
 104     if WriterFiltered is not None:
 105         WriterFiltered.close()
 106 
 107     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 108     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 109     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 110 
 111     MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
 112     MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))
 113 
 114 
 115 def ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered):
 116     """Process and filter molecules."""
 117 
 118     if OptionsInfo["MPMode"]:
 119         return ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered)
 120     else:
 121         return ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered)
 122 
 123 
 124 def ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered):
 125     """Process and filter molecules using a single process."""
 126 
 127     NegateMatch = OptionsInfo["NegateMatch"]
 128     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 129     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 130     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 131 
 132     MiscUtil.PrintInfo("\nFiltering molecules...")
 133 
 134     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 135     FirstMol = True
 136     for Mol in Mols:
 137         MolCount += 1
 138 
 139         if Mol is None:
 140             continue
 141 
 142         if RDKitUtil.IsMolEmpty(Mol):
 143             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 144             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 145             continue
 146 
 147         ValidMolCount += 1
 148         if FirstMol:
 149             FirstMol = False
 150             if SetSMILESMolProps:
 151                 SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol)
 152 
 153         MolMatched, AlertsInfo = DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols)
 154         if MolMatched == NegateMatch:
 155             RemainingMolCount += 1
 156             WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords)
 157         else:
 158             if OutfileFilteredMode:
 159                 WriteMolecule(WriterFiltered, Mol, AlertsInfo, Compute2DCoords)
 160 
 161     return (MolCount, ValidMolCount, RemainingMolCount)
 162 
 163 
 164 def ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered):
 165     """Process and filter molecules using multiprocessing."""
 166 
 167     MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")
 168 
 169     MPParams = OptionsInfo["MPParams"]
 170     NegateMatch = OptionsInfo["NegateMatch"]
 171     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 172     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 173     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 174 
 175     # Setup data for initializing a worker process...
 176     MiscUtil.PrintInfo("Encoding options info and PAINS pattern molecules...")
 177     OptionsInfo["EncodedPAINSPatternMols"] = [
 178         RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in PAINSPatternMols
 179     ]
 180     InitializeWorkerProcessArgs = (
 181         MiscUtil.ObjectToBase64EncodedString(Options),
 182         MiscUtil.ObjectToBase64EncodedString(OptionsInfo),
 183     )
 184 
 185     # Setup a encoded mols data iterable for a worker process...
 186     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 187 
 188     # Setup process pool along with data initialization for each process...
 189     MiscUtil.PrintInfo(
 190         "\nConfiguring multiprocessing using %s method..."
 191         % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")
 192     )
 193     MiscUtil.PrintInfo(
 194         "NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n"
 195         % (
 196             MPParams["NumProcesses"],
 197             MPParams["InputDataMode"],
 198             ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]),
 199         )
 200     )
 201 
 202     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 203 
 204     # Start processing...
 205     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 206         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 207     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 208         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 209     else:
 210         MiscUtil.PrintError(
 211             'The value, %s, specified for "--inputDataMode" is not supported.' % (MPParams["InputDataMode"])
 212         )
 213 
 214     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 215     FirstMol = True
 216     for Result in Results:
 217         MolCount += 1
 218         MolIndex, EncodedMol, MolMatched, AlertsInfo = Result
 219 
 220         if EncodedMol is None:
 221             continue
 222         ValidMolCount += 1
 223 
 224         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 225 
 226         if FirstMol:
 227             FirstMol = False
 228             if SetSMILESMolProps:
 229                 SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol)
 230 
 231         if MolMatched == NegateMatch:
 232             RemainingMolCount += 1
 233             WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords)
 234         else:
 235             if OutfileFilteredMode:
 236                 WriteMolecule(WriterFiltered, Mol, AlertsInfo, Compute2DCoords)
 237 
 238     return (MolCount, ValidMolCount, RemainingMolCount)
 239 
 240 
 241 def InitializeWorkerProcess(*EncodedArgs):
 242     """Initialize data for a worker process."""
 243 
 244     global Options, OptionsInfo
 245 
 246     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 247 
 248     # Decode Options and OptionInfo...
 249     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 250     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 251 
 252     # Decode PAINSPatternMols...
 253     OptionsInfo["PAINSPatternMols"] = [
 254         RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedPAINSPatternMols"]
 255     ]
 256 
 257 
 258 def WorkerProcess(EncodedMolInfo):
 259     """Process data for a worker process."""
 260 
 261     MolIndex, EncodedMol = EncodedMolInfo
 262 
 263     if EncodedMol is None:
 264         return [MolIndex, None, False, None]
 265 
 266     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 267     if RDKitUtil.IsMolEmpty(Mol):
 268         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 269         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 270         return [MolIndex, None, False, None]
 271 
 272     MolMatched, AlertsInfo = DoesMoleculeContainsPAINSPattern(Mol, OptionsInfo["PAINSPatternMols"])
 273 
 274     return [MolIndex, EncodedMol, MolMatched, AlertsInfo]
 275 
 276 
 277 def WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords):
 278     """Write out molecule."""
 279 
 280     if OptionsInfo["CountMode"]:
 281         return
 282 
 283     if Compute2DCoords:
 284         AllChem.Compute2DCoords(Mol)
 285 
 286     if AlertsInfo is not None and len(AlertsInfo):
 287         AlertsCount = "%s" % len(AlertsInfo)
 288         Alerts = "; ".join(AlertsInfo)
 289         if OptionsInfo["WriteAlertsCount"]:
 290             Mol.SetProp(OptionsInfo["AlertsCountLabel"], AlertsCount)
 291         Mol.SetProp(OptionsInfo["AlertsLabel"], Alerts)
 292 
 293     Writer.write(Mol)
 294 
 295 
 296 def SetupMoleculeWriters():
 297     """Setup molecule writers."""
 298 
 299     Writer = None
 300     WriterFiltered = None
 301 
 302     if OptionsInfo["CountMode"]:
 303         return (Writer, WriterFiltered)
 304 
 305     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 306     if Writer is None:
 307         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 308     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 309 
 310     if OptionsInfo["OutfileFilteredMode"]:
 311         WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
 312         if WriterFiltered is None:
 313             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"])
 314         MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])
 315 
 316     return (Writer, WriterFiltered)
 317 
 318 
 319 def SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol):
 320     """Setup properties to write for SMILES molecule writers."""
 321 
 322     if not OptionsInfo["OutfileParams"]["SetSMILESMolProps"]:
 323         return
 324 
 325     NegateMatch = OptionsInfo["NegateMatch"]
 326     SetSMILESMolAlertsProp = OptionsInfo["SetSMILESMolAlertsProp"]
 327     SMILESMolAlertsPropList = OptionsInfo["SMILESMolAlertsPropList"]
 328 
 329     if Writer is not None:
 330         RDKitUtil.SetWriterMolProps(Writer, Mol)
 331         if SetSMILESMolAlertsProp:
 332             if NegateMatch:
 333                 Writer.SetProps(SMILESMolAlertsPropList)
 334 
 335     if WriterFiltered is not None:
 336         RDKitUtil.SetWriterMolProps(WriterFiltered, Mol)
 337         if SetSMILESMolAlertsProp:
 338             if not NegateMatch:
 339                 WriterFiltered.SetProps(SMILESMolAlertsPropList)
 340 
 341 
 342 def DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols):
 343     """Check presence of PAINS pattern in the molecule."""
 344 
 345     MatchAllAlerts = OptionsInfo["MatchAllAlerts"]
 346     AlertsInfo = []
 347     for PatternMol in PAINSPatternMols:
 348         if Mol.HasSubstructMatch(PatternMol, useChirality=True):
 349             AlertsInfo.append("%s: %s" % (PatternMol.GetProp("FilterType"), PatternMol.GetProp("FilterID")))
 350             if not MatchAllAlerts:
 351                 break
 352 
 353     if len(AlertsInfo) == 0:
 354         MolMatched = False
 355         AlertsInfo = None
 356     else:
 357         MolMatched = True
 358 
 359     return (MolMatched, AlertsInfo)
 360 
 361 
 362 def SetupPAINSPatternMols():
 363     """Set up PAINS pattern mols for substructure search corresponding to PAINS mode."""
 364 
 365     PatternMols = []
 366     for FilterType in OptionsInfo["SpecifiedFilterTypes"]:
 367         for Index, Pattern in enumerate(OptionsInfo["PAINSFiltersMap"]["SMARTS"][FilterType]):
 368             ID = OptionsInfo["PAINSFiltersMap"]["IDs"][FilterType][Index]
 369 
 370             PatternMol = Chem.MolFromSmarts(Pattern)
 371             if PatternMol is None:
 372                 MiscUtil.PrintWarning("Failed to convert PAINS pattern, %s, into a molecule..." % Pattern)
 373                 continue
 374 
 375             # Setup FilterType and PattenMol as property of PatternMol
 376             PatternMol.SetProp("FilterType", FilterType)
 377             PatternMol.SetProp("FilterID", ID)
 378 
 379             PatternMols.append(PatternMol)
 380 
 381     return PatternMols
 382 
 383 
 384 def ProcessPAINSMode():
 385     """Process specified PAINS mode."""
 386 
 387     OptionsInfo["PAINSMode"] = Options["--painsMode"]
 388 
 389     # Retrieve filetrs information...
 390     RetrievePAINSFiltersInfo()
 391 
 392     # Process PAINS mode...
 393     OptionsInfo["SpecifiedFilterTypes"] = OptionsInfo["PAINSFiltersMap"]["FilterTypes"]
 394     if re.match("^All$", OptionsInfo["PAINSMode"], re.I):
 395         return
 396 
 397     PAINSMode = re.sub(" ", "", OptionsInfo["PAINSMode"])
 398     if not len(PAINSMode):
 399         MiscUtil.PrintError('The PAINSMode mode specified using "-p, --painsMode" option are empty.')
 400 
 401     CanonicalFilterTypesMap = {}
 402     for FilterType in OptionsInfo["PAINSFiltersMap"]["FilterTypes"]:
 403         CanonicalFilterTypesMap[FilterType.lower()] = FilterType
 404 
 405     SpecifiedFilterTypes = []
 406     for FilterType in PAINSMode.split(","):
 407         CanonicalFilterType = FilterType.lower()
 408         if CanonicalFilterType not in CanonicalFilterTypesMap:
 409             MiscUtil.PrintError(
 410                 'The PAINS mode, %s, specified using "-p, --PAINSMode" is not valid. Supported PAINS modes: %s'
 411                 % (FilterType, ", ".join(OptionsInfo["PAINSFiltersMap"]["FilterTypes"]))
 412             )
 413 
 414         SpecifiedFilterTypes.append(CanonicalFilterTypesMap[CanonicalFilterType])
 415 
 416     OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes
 417 
 418 
 419 def ProcessPAINSMatch():
 420     """Process specified PAINS match."""
 421 
 422     PAINSMatch = Options["--painsMatch"]
 423 
 424     MatchFirstAlert, MatchAllAlerts = [False] * 2
 425     if re.match("^First$", PAINSMatch, re.I):
 426         MatchFirstAlert = True
 427     elif re.match("^All$", PAINSMatch, re.I):
 428         MatchAllAlerts = True
 429     else:
 430         MiscUtil.PrintError(
 431             'The value %s, specified using "--painsMatch" option is not valid. Supported values: First or All'
 432             % (PAINSMatch)
 433         )
 434 
 435     OptionsInfo["PAINSMatch"] = PAINSMatch
 436     OptionsInfo["MatchFirstAlert"] = MatchFirstAlert
 437     OptionsInfo["MatchAllAlerts"] = MatchAllAlerts
 438 
 439     # Setup labels for writing out alerts match information...
 440     OptionsInfo["AlertsCountLabel"] = "PAINSAlertsCount"
 441     OptionsInfo["AlertsLabel"] = "FirstPAINSAlert" if MatchFirstAlert else "PAINSAlerts"
 442 
 443     # Write out alerts count only for match all alerts...
 444     OptionsInfo["WriteAlertsCount"] = True if MatchAllAlerts else False
 445 
 446     # Write out alerts match information to comma or tab delimited SMILES files...
 447     SMILESDelimiter = OptionsInfo["OutfileParams"]["SMILESDelimiter"]
 448     OptionsInfo["SetSMILESMolAlertsProp"] = True if re.match("^[\t,]", SMILESDelimiter, re.I) else False
 449 
 450     SMILESMolAlertsPropList = []
 451     if OptionsInfo["WriteAlertsCount"]:
 452         SMILESMolAlertsPropList.append(OptionsInfo["AlertsCountLabel"])
 453     SMILESMolAlertsPropList.append(OptionsInfo["AlertsLabel"])
 454     OptionsInfo["SMILESMolAlertsPropList"] = SMILESMolAlertsPropList
 455 
 456 
 457 def RetrievePAINSFiltersInfo():
 458     """Retrieve information for PAINS filters."""
 459 
 460     MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
 461     PAINSFiltersFilePath = os.path.join(MayaChemToolsDataDir, "PAINSFilters.csv")
 462 
 463     MiscUtil.PrintInfo("\nRetrieving PAINS SMARTS patterns from file %s" % (PAINSFiltersFilePath))
 464 
 465     Delimiter = ","
 466     QuoteChar = '"'
 467     IgnoreHeaderLine = True
 468     FilterLinesWords = MiscUtil.GetTextLinesWords(PAINSFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine)
 469 
 470     PAINSFiltersMap = {}
 471     PAINSFiltersMap["FilterTypes"] = []
 472     PAINSFiltersMap["IDs"] = {}
 473     PAINSFiltersMap["SMARTS"] = {}
 474 
 475     for LineWords in FilterLinesWords:
 476         FilterType = LineWords[0]
 477         ID = LineWords[1]
 478         SMARTS = LineWords[2]
 479 
 480         if FilterType not in PAINSFiltersMap["FilterTypes"]:
 481             PAINSFiltersMap["FilterTypes"].append(FilterType)
 482             PAINSFiltersMap["IDs"][FilterType] = []
 483             PAINSFiltersMap["SMARTS"][FilterType] = []
 484 
 485         PAINSFiltersMap["IDs"][FilterType].append(ID)
 486         PAINSFiltersMap["SMARTS"][FilterType].append(SMARTS)
 487 
 488     OptionsInfo["PAINSFiltersMap"] = PAINSFiltersMap
 489 
 490     MiscUtil.PrintInfo("\nTotal number filters: %d" % len(FilterLinesWords))
 491     MiscUtil.PrintInfo(
 492         "Number of filter family types: %d\nFilter familty types: %s\n"
 493         % (len(PAINSFiltersMap["FilterTypes"]), ", ".join(PAINSFiltersMap["FilterTypes"]))
 494     )
 495 
 496     for FilterType in PAINSFiltersMap["FilterTypes"]:
 497         MiscUtil.PrintInfo(
 498             "Filter family type: %s; Number of filters: %d" % (FilterType, len(PAINSFiltersMap["IDs"][FilterType]))
 499         )
 500 
 501 
 502 def ProcessOptions():
 503     """Process and validate command line arguments and options."""
 504 
 505     MiscUtil.PrintInfo("Processing options...")
 506 
 507     # Validate options...
 508     ValidateOptions()
 509 
 510     OptionsInfo["Infile"] = Options["--infile"]
 511     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters(
 512         "--infileParams", Options["--infileParams"], Options["--infile"]
 513     )
 514 
 515     OptionsInfo["Outfile"] = Options["--outfile"]
 516     ParamsDefaultInfoOverride = {"SMILESMolProps": True}
 517     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters(
 518         "--outfileParams",
 519         Options["--outfileParams"],
 520         Options["--infile"],
 521         Options["--outfile"],
 522         ParamsDefaultInfo=ParamsDefaultInfoOverride,
 523     )
 524 
 525     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 526     OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt)
 527     OptionsInfo["OutfileFiltered"] = OutfileFiltered
 528     OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False
 529 
 530     OptionsInfo["Overwrite"] = Options["--overwrite"]
 531 
 532     OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False
 533     OptionsInfo["NegateMatch"] = True if re.match("^yes$", Options["--negate"], re.I) else False
 534 
 535     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 536     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 537 
 538     ProcessPAINSMode()
 539     ProcessPAINSMatch()
 540 
 541 
 542 def RetrieveOptions():
 543     """Retrieve command line arguments and options."""
 544 
 545     # Get options...
 546     global Options
 547     Options = docopt(_docoptUsage_)
 548 
 549     # Set current working directory to the specified directory...
 550     WorkingDir = Options["--workingdir"]
 551     if WorkingDir:
 552         os.chdir(WorkingDir)
 553 
 554     # Handle examples option...
 555     if "--examples" in Options and Options["--examples"]:
 556         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 557         sys.exit(0)
 558 
 559 
 560 def ValidateOptions():
 561     """Validate option values."""
 562 
 563     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 564     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 565 
 566     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 567     MiscUtil.ValidateOptionsOutputFileOverwrite(
 568         "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
 569     )
 570     MiscUtil.ValidateOptionsDistinctFileNames(
 571         "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]
 572     )
 573 
 574     MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")
 575 
 576     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
 577     if re.match("^filter$", Options["--mode"], re.I):
 578         if not Options["--outfile"]:
 579             MiscUtil.PrintError(
 580                 'The outfile must be specified using "-o, --outfile" during "filter" value of "-m, --mode" option'
 581             )
 582 
 583     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 584     MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no")
 585 
 586     MiscUtil.ValidateOptionTextValue("--painsMatch", Options["--painsMatch"], "First All")
 587 
 588 
 589 # Setup a usage string for docopt...
 590 _docoptUsage_ = """
 591 RDKitFilterPAINS.py - Filter PAINS molecules
 592 
 593 Usage:
 594     RDKitFilterPAINS.py  [--infileParams <Name,Value,...>] [--mode <filter or count>]
 595                          [--mp <yes or no>] [--mpParams <Name,Value,...>]
 596                          [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...> ]
 597                          [--painsMode <All or A, B, C>] [--painsMatch <First or All>] [--negate <yes or no>]
 598                          [--overwrite] [-w <dir>] -i <infile> -o <outfile>
 599     RDKitFilterPAINS.py -h | --help | -e | --examples
 600 
 601 Description:
 602     Filter Pan-assay Interference molecules (PAINS) [ Ref 130 - 131 ] from an input
 603     file by performing a substructure search using SMARTS pattern specified in
 604     MAYACHEMTOOLS/lib/data/PAINSFilters.csv file and write out appropriate
 605     molecules to an output file or simply count the number of filtered molecules.
 606 
 607     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
 608     .tsv, .txt)
 609 
 610     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 611 
 612 Options:
 613     -e, --examples
 614         Print examples.
 615     -h, --help
 616         Print this help message.
 617     -i, --infile <infile>
 618         Input file name.
 619     --infileParams <Name,Value,...>  [default: auto]
 620         A comma delimited list of parameter name and value pairs for reading
 621         molecules from files. The supported parameter names for different file
 622         formats, along with their default values, are shown below:
 623             
 624             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 625             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 626                 smilesTitleLine,auto,sanitize,yes
 627             
 628         Possible values for smilesDelimiter: space, comma or tab.
 629     -m, --mode <filter or count>  [default: filter]
 630         Specify whether to filter the matched molecules and write out the rest of the 
 631         molecules to an outfile or simply count the number of matched molecules
 632         marked for filtering.
 633     --mp <yes or no>  [default: no]
 634         Use multiprocessing.
 635          
 636         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 637         function employing lazy RDKit data iterable. This allows processing of
 638         arbitrary large data sets without any additional requirements memory.
 639         
 640         All input data may be optionally loaded into memory by mp.Pool.map()
 641         before starting worker processes in a process pool by setting the value
 642         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 643         
 644         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 645         data mode may adversely impact the performance. The '--mpParams' section
 646         provides additional information to tune the value of 'chunkSize'.
 647     --mpParams <Name,Value,...>  [default: auto]
 648         A comma delimited list of parameter name and value pairs to configure
 649         multiprocessing.
 650         
 651         The supported parameter names along with their default and possible
 652         values are shown below:
 653         
 654             chunkSize, auto
 655             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 656             numProcesses, auto   [ Default: mp.cpu_count() ]
 657         
 658         These parameters are used by the following functions to configure and
 659         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 660         mp.Pool.imap().
 661         
 662         The chunkSize determines chunks of input data passed to each worker
 663         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 664         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 665         
 666         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 667         automatically converts RDKit data iterable into a list, loads all data into
 668         memory, and calculates the default chunkSize using the following method
 669         as shown in its code:
 670         
 671             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 672             if extra: chunkSize += 1
 673         
 674         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 675         and 100 data items.
 676         
 677         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 678         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 679         data into memory. Consequently, the size of input data is not known a priori.
 680         It's not possible to estimate an optimal value for the chunkSize. The default 
 681         chunkSize is set to 1.
 682         
 683         The default value for the chunkSize during 'Lazy' data mode may adversely
 684         impact the performance due to the overhead associated with exchanging
 685         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 686         a larger value during 'Lazy' input data mode, based on the size of your input
 687         data and number of processes in the process pool.
 688         
 689         The mp.Pool.map() function waits for all worker processes to process all
 690         the data and return the results. The mp.Pool.imap() function, however,
 691         returns the the results obtained from worker processes as soon as the
 692         results become available for specified chunks of data.
 693         
 694         The order of data in the results returned by both mp.Pool.map() and 
 695         mp.Pool.imap() functions always corresponds to the input data.
 696     -n, --negate <yes or no>  [default: no]
 697         Specify whether to filter molecules not matching the PAINS filters specified by
 698         SMARTS patterns.
 699     -o, --outfile <outfile>
 700         Output file name.
 701     --outfileFiltered <yes or no>  [default: no]
 702         Write out a file containing filtered molecules. Its name is automatically
 703         generated from the specified output file. Default: <OutfileRoot>_
 704         Filtered.<OutfileExt>.
 705     --outfileParams <Name,Value,...>  [default: auto]
 706         A comma delimited list of parameter name and value pairs for writing
 707         molecules to files. The supported parameter names for different file
 708         formats, along with their default values, are shown below:
 709             
 710             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 711             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 712                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,yes
 713             
 714         Default value for compute2DCoords: yes for SMILES input file; no for all other
 715         file types.
 716     --overwrite
 717         Overwrite existing files.
 718     -p, --painsMode <All or A, B, or C>  [default: All]
 719         All or a comma delimited list of PAINS filter family type to used for
 720         filtering molecules. 
 721     --painsMatch <First or All>  [default: First]
 722         Stop after matching only the first PAINS pattern or match all patterns for
 723         filtering molecules.
 724         
 725         The 'PAINSAlertsCount' and 'PAINSAlerts' data fields are added to the
 726         SD file containing filtered molecules for 'All' value of '--painsMatch'. In
 727         addition, these data fields are only written to tab or comma delimited
 728         SMILES files.
 729         
 730         Format:
 731             
 732             > <PAINSAlertsCount>
 733             Number
 734             
 735             > <PAINSAlerts>
 736             FilterType: ID; FilterType: ID... ... ...
 737             
 738     -w, --workingdir <dir>
 739         Location of working directory which defaults to the current directory.
 740 
 741 Examples:
 742     To count the number of molecules not containing any substructure corresponding to
 743     PAINS SMARTS patterns and write out a SMILES file, type: 
 744 
 745         % RDKitFilterPAINS.py -i Sample.smi -o SampleOut.smi
 746 
 747     To count the number of molecules not containing any substructure corresponding to
 748     PAINS SMARTS patterns and write out a SMILES file containing these and filtered
 749     molecules along with the alerts information for filtered molecules matching
 750     first pattern, type: 
 751 
 752         % RDKitFilterPAINS.py  --outfileFiltered yes --outfileParams
 753           "SMILESDelimiter,comma" -i Sample.smi -o SampleOut.smi
 754 
 755     To count the number of molecules not containing any substructure corresponding
 756     to PAINS SMARTS patterns and write out comma delimited SMILES files containing
 757     these and filtered molecules along with the alerts information for filtered
 758     molecules matching all patterns, type: 
 759 
 760         % RDKitFilterPAINS.py --painsMatch All --outfileFiltered yes
 761           --outfileParams "SMILESDelimiter,comma" -i Sample.sdf
 762           -o SampleOut.smi
 763 
 764     To count the number of molecules not containing any substructure corresponding
 765     to PAINS SMARTS patterns and write out SD files containing
 766     these and filtered molecules along with the alerts information for filtered
 767     molecules matching all patterns, type: 
 768 
 769         % RDKitFilterPAINS.py --painsMatch All --outfileFiltered yes
 770           -i Sample.smi -o SampleOut.sdf
 771 
 772     To count the number of molecules not containing any substructure corresponding to
 773     PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
 774     CPUs without loading all data into memory, and write out a SMILES file, type: 
 775 
 776         % RDKitFilterPAINS.py --mp yes -i Sample.smi -o SampleOut.smi
 777 
 778     To count the number of molecules not containing any substructure corresponding to
 779     PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
 780     CPUs by loading all data into memory, and write out a SD file, type: 
 781 
 782         % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,InMemory"
 783           -i Sample.smi -o SampleOut.sdf
 784 
 785     To count the number of molecules not containing any substructure corresponding to
 786     PAINS SMARTS patterns, perform filtering in multiprocessing mode on specific
 787     number of CPUs and chunk size without loading all data into memory, and
 788     write out a SD file, type: 
 789 
 790         % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,Lazy,
 791           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.sdf
 792 
 793     To only count the number of molecules not containing any substructure corresponding
 794     to PAINS SMARTS patterns without writing out any file, type: 
 795 
 796         % RDKitFilterPAINS.py -m count -i Sample.sdf -o SampleOut.smi
 797 
 798     To count the number of molecules containing any substructure corresponding to
 799     PAINS SMARTS patterns and write out a SD file with computed 2D coordinates,
 800     type: 
 801 
 802         % RDKitFilterPAINS.py -n yes -i Sample.smi -o SampleOut.sdf
 803 
 804     To count the number of molecules not containing any substructure corresponding to
 805     PAINS SMARTS patterns family of Type A in a CSV SMILES file and write out a SD file, type: 
 806 
 807         % RDKitFilterPAINS.py --painsMode A --infileParams
 808           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 809           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 810           -i SampleSMILES.csv -o SampleOut.sdf
 811 
 812 Author:
 813     Manish Sud(msud@san.rr.com)
 814 
 815 See also:
 816     RDKitFilterChEMBLAlerts.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py
 817 
 818 Copyright:
 819     Copyright (C) 2026 Manish Sud. All rights reserved.
 820 
 821     The functionality available in this script is implemented using RDKit, an
 822     open source toolkit for cheminformatics developed by Greg Landrum.
 823 
 824     This file is part of MayaChemTools.
 825 
 826     MayaChemTools is free software; you can redistribute it and/or modify it under
 827     the terms of the GNU Lesser General Public License as published by the Free
 828     Software Foundation; either version 3 of the License, or (at your option) any
 829     later version.
 830 
 831 """
 832 
 833 if __name__ == "__main__":  # Entry point: call main() only when run as a script, not on import.
 834     main()