MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitFilterPAINS.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem import AllChem
  43 except ImportError as ErrMsg:
  44     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  45     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  46     sys.exit(1)
  47 
  48 # MayaChemTools imports...
  49 try:
  50     from docopt import docopt
  51     import MiscUtil
  52     import RDKitUtil
  53 except ImportError as ErrMsg:
  54     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  55     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  56     sys.exit(1)
  57 
# Base name of the running script; used in the start and finish messages
# printed by main().
ScriptName = os.path.basename(sys.argv[0])

# Raw command line options; replaced by the docopt result in RetrieveOptions()
# and by the decoded copy in each worker process.
Options = {}

# Processed option values shared by the functions in this script; filled in by
# ProcessOptions() and replaced by the decoded copy in each worker process.
OptionsInfo = {}
  61 
  62 def main():
  63     """Start execution of the script."""
  64     
  65     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  66     
  67     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  68     
  69     # Retrieve command line arguments and options...
  70     RetrieveOptions()
  71     
  72     # Process and validate command line arguments and options...
  73     ProcessOptions()
  74     
  75     # Perform actions required by the script...
  76     PerformFiltering()
  77     
  78     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  79     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  80 
  81 def PerformFiltering():
  82     """Filter molecules using SMARTS specified in PAINS filter file."""
  83 
  84     # Setup PAINS patterns and pattern mols...
  85     MiscUtil.PrintInfo("\nSetting up PAINS pattern molecules for performing substructure search...")
  86     PAINSPatternMols = SetupPAINSPatternMols()
  87     
  88     # Setup a molecule reader...
  89     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  90     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  91     
  92     # Set up molecule writers...
  93     Writer, WriterFiltered = SetupMoleculeWriters()
  94     
  95     MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered)
  96     
  97     if Writer is not None:
  98         Writer.close()
  99     if WriterFiltered is not None:
 100         WriterFiltered.close()
 101     
 102     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 103     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 104     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 105 
 106     MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
 107     MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))
 108 
 109 def ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered):
 110     """Process and filter molecules."""
 111     
 112     if OptionsInfo["MPMode"]:
 113         return ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered)
 114     else:
 115         return ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered)
 116 
 117 def ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered):
 118     """Process and filter molecules using a single process."""
 119     
 120     NegateMatch = OptionsInfo["NegateMatch"]
 121     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 122     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 123     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 124     
 125     MiscUtil.PrintInfo("\nFiltering molecules...")
 126 
 127     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 128     FirstMol = True
 129     for Mol in Mols:
 130         MolCount += 1
 131 
 132         if Mol is None:
 133             continue
 134         
 135         if RDKitUtil.IsMolEmpty(Mol):
 136             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 137             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 138             continue
 139         
 140         ValidMolCount += 1
 141         if FirstMol:
 142             FirstMol = False
 143             if SetSMILESMolProps:
 144                 SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol)
 145         
 146         MolMatched, AlertsInfo = DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols)
 147         if MolMatched == NegateMatch:
 148             RemainingMolCount += 1
 149             WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords)
 150         else:
 151             if OutfileFilteredMode:
 152                 WriteMolecule(WriterFiltered, Mol, AlertsInfo, Compute2DCoords)
 153     
 154     return (MolCount, ValidMolCount, RemainingMolCount)
 155     
 156 def ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered):
 157     """Process and filter molecules using multiprocessing."""
 158     
 159     MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")
 160     
 161     MPParams = OptionsInfo["MPParams"]
 162     NegateMatch = OptionsInfo["NegateMatch"]
 163     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 164     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 165     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 166     
 167     # Setup data for initializing a worker process...
 168     MiscUtil.PrintInfo("Encoding options info and PAINS pattern molecules...")
 169     OptionsInfo["EncodedPAINSPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in PAINSPatternMols]
 170     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 171 
 172     # Setup a encoded mols data iterable for a worker process...
 173     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 174 
 175     # Setup process pool along with data initialization for each process...
 176     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 177     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 178     
 179     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 180     
 181     # Start processing...
 182     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 183         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 184     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 185         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 186     else:
 187         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 188     
 189     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 190     FirstMol = True
 191     for Result in Results:
 192         MolCount += 1
 193         MolIndex, EncodedMol, MolMatched, AlertsInfo = Result
 194         
 195         if EncodedMol is None:
 196             continue
 197         ValidMolCount += 1
 198         
 199         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 200         
 201         if FirstMol:
 202             FirstMol = False
 203             if SetSMILESMolProps:
 204                 SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol)
 205         
 206         if MolMatched == NegateMatch:
 207             RemainingMolCount += 1
 208             WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords)
 209         else:
 210             if OutfileFilteredMode:
 211                 WriteMolecule(WriterFiltered, Mol, AlertsInfo, Compute2DCoords)
 212     
 213     return (MolCount, ValidMolCount, RemainingMolCount)
 214 
 215 def InitializeWorkerProcess(*EncodedArgs):
 216     """Initialize data for a worker process."""
 217 
 218     global Options, OptionsInfo
 219     
 220     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 221 
 222     # Decode Options and OptionInfo...
 223     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 224     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 225 
 226     # Decode PAINSPatternMols...
 227     OptionsInfo["PAINSPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedPAINSPatternMols"]]
 228     
 229 def WorkerProcess(EncodedMolInfo):
 230     """Process data for a worker process."""
 231     
 232     MolIndex, EncodedMol = EncodedMolInfo
 233     
 234     if EncodedMol is None:
 235         return [MolIndex, None, False, None]
 236         
 237     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 238     if RDKitUtil.IsMolEmpty(Mol):
 239         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 240         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 241         return [MolIndex, None, False, None]
 242         
 243     MolMatched, AlertsInfo = DoesMoleculeContainsPAINSPattern(Mol, OptionsInfo["PAINSPatternMols"])
 244 
 245     return [MolIndex, EncodedMol, MolMatched, AlertsInfo]
 246     
 247 def WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords):
 248     """Write out molecule."""
 249     
 250     if OptionsInfo["CountMode"]:
 251         return
 252     
 253     if Compute2DCoords:
 254         AllChem.Compute2DCoords(Mol)
 255 
 256     if AlertsInfo is not None and len(AlertsInfo):
 257         AlertsCount = "%s" % len(AlertsInfo)
 258         Alerts = "; ".join(AlertsInfo)
 259         if OptionsInfo["WriteAlertsCount"]:
 260             Mol.SetProp(OptionsInfo["AlertsCountLabel"], AlertsCount)
 261         Mol.SetProp(OptionsInfo["AlertsLabel"], Alerts)
 262     
 263     Writer.write(Mol)
 264 
 265 def SetupMoleculeWriters():
 266     """Setup molecule writers."""
 267     
 268     Writer = None
 269     WriterFiltered = None
 270 
 271     if OptionsInfo["CountMode"]:
 272         return (Writer, WriterFiltered)
 273 
 274     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 275     if Writer is None:
 276         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 277     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 278     
 279     if OptionsInfo["OutfileFilteredMode"]:
 280         WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
 281         if WriterFiltered is None:
 282             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"])
 283         MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])
 284     
 285     return (Writer, WriterFiltered)
 286 
 287 def SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol):
 288     """Setup properties to write for SMILES molecule writers."""
 289 
 290     if not OptionsInfo["OutfileParams"]["SetSMILESMolProps"]:
 291         return
 292     
 293     NegateMatch = OptionsInfo["NegateMatch"]
 294     SetSMILESMolAlertsProp = OptionsInfo["SetSMILESMolAlertsProp"]
 295     SMILESMolAlertsPropList = OptionsInfo["SMILESMolAlertsPropList"]
 296     
 297     if Writer is not None:
 298         RDKitUtil.SetWriterMolProps(Writer, Mol)
 299         if SetSMILESMolAlertsProp:
 300             if NegateMatch:
 301                 Writer.SetProps(SMILESMolAlertsPropList)
 302     
 303     if WriterFiltered is not None:
 304         RDKitUtil.SetWriterMolProps(WriterFiltered, Mol)
 305         if SetSMILESMolAlertsProp:
 306             if not NegateMatch:
 307                 WriterFiltered.SetProps(SMILESMolAlertsPropList)
 308 
 309 def DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols):
 310     """Check presence of PAINS pattern in the molecule."""
 311 
 312     MatchAllAlerts = OptionsInfo["MatchAllAlerts"]
 313     AlertsInfo = []
 314     for PatternMol in PAINSPatternMols:
 315         if Mol.HasSubstructMatch(PatternMol, useChirality = True):
 316             AlertsInfo.append("%s: %s" % (PatternMol.GetProp("FilterType"), PatternMol.GetProp("FilterID")))
 317             if not MatchAllAlerts:
 318                 break
 319     
 320     if len(AlertsInfo) == 0:
 321         MolMatched = False
 322         AlertsInfo = None
 323     else:
 324         MolMatched = True
 325         
 326     return (MolMatched, AlertsInfo)
 327     
 328 def SetupPAINSPatternMols():
 329     """Set up PAINS pattern mols for substructure search corresponding to PAINS mode."""
 330 
 331     PatternMols = []
 332     for FilterType in OptionsInfo["SpecifiedFilterTypes"]:
 333         for Index, Pattern in enumerate(OptionsInfo["PAINSFiltersMap"]["SMARTS"][FilterType]):
 334             ID = OptionsInfo["PAINSFiltersMap"]["IDs"][FilterType][Index]
 335             
 336             PatternMol = Chem.MolFromSmarts(Pattern)
 337             if PatternMol is None:
 338                 MiscUtil.PrintWarning("Failed to convert PAINS pattern, %s, into a molecule..." % Pattern)
 339                 continue
 340             
 341             # Setup FilterType and PattenMol as property of PatternMol
 342             PatternMol.SetProp("FilterType", FilterType)
 343             PatternMol.SetProp("FilterID", ID)
 344             
 345             PatternMols.append(PatternMol)
 346 
 347     return PatternMols
 348 
 349 def ProcessPAINSMode():
 350     """Process specified PAINS mode."""
 351     
 352     OptionsInfo["PAINSMode"] = Options["--painsMode"]
 353     
 354     # Retrieve filetrs information...
 355     RetrievePAINSFiltersInfo()
 356     
 357     # Process PAINS mode...
 358     OptionsInfo["SpecifiedFilterTypes"] = OptionsInfo["PAINSFiltersMap"]["FilterTypes"]
 359     if re.match("^All$", OptionsInfo["PAINSMode"], re.I):
 360         return
 361     
 362     PAINSMode = re.sub(" ", "", OptionsInfo["PAINSMode"])
 363     if not len(PAINSMode):
 364         MiscUtil.PrintError("The PAINSMode mode specified using \"-p, --painsMode\" option are empty.")
 365 
 366     CanonicalFilterTypesMap = {}
 367     for FilterType in OptionsInfo["PAINSFiltersMap"]["FilterTypes"]:
 368         CanonicalFilterTypesMap[FilterType.lower()] = FilterType
 369 
 370     SpecifiedFilterTypes = []
 371     for FilterType in PAINSMode.split(","):
 372         CanonicalFilterType = FilterType.lower()
 373         if not CanonicalFilterType in CanonicalFilterTypesMap:
 374             MiscUtil.PrintError("The PAINS mode, %s, specified using \"-p, --PAINSMode\" is not valid. Supported PAINS modes: %s" % (FilterType, ", ".join(OptionsInfo["PAINSFiltersMap"]["FilterTypes"])))
 375 
 376         SpecifiedFilterTypes.append(CanonicalFilterTypesMap[CanonicalFilterType])
 377 
 378     OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes
 379 
 380 def ProcessPAINSMatch():
 381     """Process specified PAINS match."""
 382 
 383     PAINSMatch = Options["--painsMatch"]
 384     
 385     MatchFirstAlert, MatchAllAlerts = [False] * 2
 386     if re.match("^First$", PAINSMatch, re.I):
 387         MatchFirstAlert = True
 388     elif re.match("^All$", PAINSMatch, re.I):
 389         MatchAllAlerts = True
 390     else:
 391         MiscUtil.PrintError("The value %s, specified using \"--painsMatch\" option is not valid. Supported values: First or All" % (PAINSMatch))        
 392     
 393     OptionsInfo["PAINSMatch"] = PAINSMatch
 394     OptionsInfo["MatchFirstAlert"] = MatchFirstAlert
 395     OptionsInfo["MatchAllAlerts"] = MatchAllAlerts
 396     
 397     # Setup labels for writing out alerts match information...
 398     OptionsInfo["AlertsCountLabel"] = "PAINSAlertsCount"
 399     OptionsInfo["AlertsLabel"] = "FirstPAINSAlert" if MatchFirstAlert else "PAINSAlerts"
 400 
 401     # Write out alerts count only for match all alerts...
 402     OptionsInfo["WriteAlertsCount"] = True if MatchAllAlerts else False
 403 
 404     # Write out alerts match information to comma or tab delimited SMILES files...
 405     SMILESDelimiter = OptionsInfo["OutfileParams"]["SMILESDelimiter"]
 406     OptionsInfo["SetSMILESMolAlertsProp"] = True if re.match("^[\t,]", SMILESDelimiter, re.I) else False
 407 
 408     SMILESMolAlertsPropList = []
 409     if OptionsInfo["WriteAlertsCount"]:
 410         SMILESMolAlertsPropList.append(OptionsInfo["AlertsCountLabel"])
 411     SMILESMolAlertsPropList.append(OptionsInfo["AlertsLabel"])
 412     OptionsInfo["SMILESMolAlertsPropList"] = SMILESMolAlertsPropList
 413 
 414 def RetrievePAINSFiltersInfo():
 415     """Retrieve information for PAINS filters."""
 416     
 417     MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
 418     PAINSFiltersFilePath = os.path.join(MayaChemToolsDataDir, "PAINSFilters.csv")
 419     
 420     MiscUtil.PrintInfo("\nRetrieving PAINS SMARTS patterns from file %s" % (PAINSFiltersFilePath))
 421 
 422     Delimiter = ','
 423     QuoteChar = '"'
 424     IgnoreHeaderLine = True
 425     FilterLinesWords = MiscUtil.GetTextLinesWords(PAINSFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine)
 426 
 427     PAINSFiltersMap = {}
 428     PAINSFiltersMap["FilterTypes"] = []
 429     PAINSFiltersMap["IDs"] = {}
 430     PAINSFiltersMap["SMARTS"] = {}
 431 
 432     for LineWords in FilterLinesWords:
 433         FilterType = LineWords[0]
 434         ID = LineWords[1]
 435         SMARTS = LineWords[2]
 436 
 437         if not FilterType in PAINSFiltersMap["FilterTypes"]:
 438             PAINSFiltersMap["FilterTypes"].append(FilterType)
 439             PAINSFiltersMap["IDs"][FilterType] = []
 440             PAINSFiltersMap["SMARTS"][FilterType] = []
 441 
 442         PAINSFiltersMap["IDs"][FilterType].append(ID)
 443         PAINSFiltersMap["SMARTS"][FilterType].append(SMARTS)
 444 
 445     OptionsInfo["PAINSFiltersMap"] = PAINSFiltersMap
 446     
 447     MiscUtil.PrintInfo("\nTotal number filters: %d" % len(FilterLinesWords))
 448     MiscUtil.PrintInfo("Number of filter family types: %d\nFilter familty types: %s\n" % (len(PAINSFiltersMap["FilterTypes"]), ", ".join(PAINSFiltersMap["FilterTypes"])))
 449 
 450     for FilterType in PAINSFiltersMap["FilterTypes"]:
 451         MiscUtil.PrintInfo("Filter family type: %s; Number of filters: %d" % (FilterType, len(PAINSFiltersMap["IDs"][FilterType])))
 452 
 453 def ProcessOptions():
 454     """Process and validate command line arguments and options."""
 455     
 456     MiscUtil.PrintInfo("Processing options...")
 457     
 458     # Validate options...
 459     ValidateOptions()
 460     
 461     OptionsInfo["Infile"] = Options["--infile"]
 462     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 463     
 464     OptionsInfo["Outfile"] = Options["--outfile"]
 465     ParamsDefaultInfoOverride = {"SMILESMolProps": True}
 466     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"], ParamsDefaultInfo = ParamsDefaultInfoOverride)
 467     
 468     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 469     OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt)
 470     OptionsInfo["OutfileFiltered"] = OutfileFiltered
 471     OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False
 472     
 473     OptionsInfo["Overwrite"] = Options["--overwrite"]
 474 
 475     OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False
 476     OptionsInfo["NegateMatch"] = True if re.match("^yes$", Options["--negate"], re.I) else False
 477 
 478     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 479     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 480 
 481     ProcessPAINSMode()
 482     ProcessPAINSMatch()
 483 
 484 def RetrieveOptions():
 485     """Retrieve command line arguments and options."""
 486     
 487     # Get options...
 488     global Options
 489     Options = docopt(_docoptUsage_)
 490     
 491     # Set current working directory to the specified directory...
 492     WorkingDir = Options["--workingdir"]
 493     if WorkingDir:
 494         os.chdir(WorkingDir)
 495     
 496     # Handle examples option...
 497     if "--examples" in Options and Options["--examples"]:
 498         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 499         sys.exit(0)
 500 
 501 def ValidateOptions():
 502     """Validate option values."""
 503     
 504     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 505     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 506     
 507     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 508     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 509     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 510 
 511     MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")
 512     
 513     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
 514     if re.match("^filter$", Options["--mode"], re.I):
 515         if not Options["--outfile"]:
 516             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, --mode\" option")
 517         
 518     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 519     MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no")
 520     
 521     MiscUtil.ValidateOptionTextValue("--painsMatch", Options["--painsMatch"], "First All")
 522     
 523 # Setup a usage string for docopt...
 524 _docoptUsage_ = """
 525 RDKitFilterPAINS.py - Filter PAINS molecules
 526 
 527 Usage:
 528     RDKitFilterPAINS.py  [--infileParams <Name,Value,...>] [--mode <filter or count>]
 529                          [--mp <yes or no>] [--mpParams <Name,Value,...>]
 530                          [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...> ]
 531                          [--painsMode <All or A, B, C>] [--painsMatch <First or All>] [--negate <yes or no>]
 532                          [--overwrite] [-w <dir>] -i <infile> -o <outfile>
 533     RDKitFilterPAINS.py -h | --help | -e | --examples
 534 
 535 Description:
 536     Filter Pan-assay Interference molecules (PAINS) [ Ref 130 - 131 ] from an input
 537     file by performing a substructure search using SMARTS pattern specified in
 538     MAYACHEMTOOLS/lib/data/PAINSFilters.csv file and write out appropriate
 539     molecules to an output file or simply count the number of filtered molecules.
 540 
 541     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
 542     .tsv, .txt)
 543 
 544     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 545 
 546 Options:
 547     -e, --examples
 548         Print examples.
 549     -h, --help
 550         Print this help message.
 551     -i, --infile <infile>
 552         Input file name.
 553     --infileParams <Name,Value,...>  [default: auto]
 554         A comma delimited list of parameter name and value pairs for reading
 555         molecules from files. The supported parameter names for different file
 556         formats, along with their default values, are shown below:
 557             
 558             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 559             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 560                 smilesTitleLine,auto,sanitize,yes
 561             
 562         Possible values for smilesDelimiter: space, comma or tab.
 563     -m, --mode <filter or count>  [default: filter]
 564         Specify whether to filter the matched molecules and write out the rest of the 
 565         molecules to an outfile or simply count the number of matched molecules
 566         marked for filtering.
 567     --mp <yes or no>  [default: no]
 568         Use multiprocessing.
 569          
 570         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 571         function employing lazy RDKit data iterable. This allows processing of
 572         arbitrary large data sets without any additional requirements memory.
 573         
 574         All input data may be optionally loaded into memory by mp.Pool.map()
 575         before starting worker processes in a process pool by setting the value
 576         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 577         
 578         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 579         data mode may adversely impact the performance. The '--mpParams' section
 580         provides additional information to tune the value of 'chunkSize'.
 581     --mpParams <Name,Value,...>  [default: auto]
 582         A comma delimited list of parameter name and value pairs to configure
 583         multiprocessing.
 584         
 585         The supported parameter names along with their default and possible
 586         values are shown below:
 587         
 588             chunkSize, auto
 589             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 590             numProcesses, auto   [ Default: mp.cpu_count() ]
 591         
 592         These parameters are used by the following functions to configure and
 593         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 594         mp.Pool.imap().
 595         
 596         The chunkSize determines chunks of input data passed to each worker
 597         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 598         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 599         
 600         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 601         automatically converts RDKit data iterable into a list, loads all data into
 602         memory, and calculates the default chunkSize using the following method
 603         as shown in its code:
 604         
 605             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 606             if extra: chunkSize += 1
 607         
 608         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 609         and 100 data items.
 610         
 611         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 612         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 613         data into memory. Consequently, the size of input data is not known a priori.
 614         It's not possible to estimate an optimal value for the chunkSize. The default 
 615         chunkSize is set to 1.
 616         
 617         The default value for the chunkSize during 'Lazy' data mode may adversely
 618         impact the performance due to the overhead associated with exchanging
 619         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 620         a larger value during 'Lazy' input data mode, based on the size of your input
 621         data and number of processes in the process pool.
 622         
 623         The mp.Pool.map() function waits for all worker processes to process all
 624         the data and return the results. The mp.Pool.imap() function, however,
 625         returns the the results obtained from worker processes as soon as the
 626         results become available for specified chunks of data.
 627         
 628         The order of data in the results returned by both mp.Pool.map() and 
 629         mp.Pool.imap() functions always corresponds to the input data.
 630     -n, --negate <yes or no>  [default: no]
 631         Specify whether to filter molecules not matching the PAINS filters specified by
 632         SMARTS patterns.
 633     -o, --outfile <outfile>
 634         Output file name.
 635     --outfileFiltered <yes or no>  [default: no]
 636         Write out a file containing filtered molecules. Its name is automatically
 637         generated from the specified output file. Default: <OutfileRoot>_
 638         Filtered.<OutfileExt>.
 639     --outfileParams <Name,Value,...>  [default: auto]
 640         A comma delimited list of parameter name and value pairs for writing
 641         molecules to files. The supported parameter names for different file
 642         formats, along with their default values, are shown below:
 643             
 644             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 645             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 646                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,yes
 647             
 648         Default value for compute2DCoords: yes for SMILES input file; no for all other
 649         file types.
 650     --overwrite
 651         Overwrite existing files.
 652     -p, --painsMode <All or A, B, or C>  [default: All]
 653         All or a comma delimited list of PAINS filter family type to used for
 654         filtering molecules. 
 655     --painsMatch <First or All>  [default: First]
 656         Stop after matching  only first PAINS pattern or match all patterns for
 657         filtering molecules.
 658         
 659         The 'PAINSAlertCount' and 'PAINSAlerts' data fields are added to the
 660         SD file containing filtered molecules for 'All' value of '--painsMatch'. In
 661         addition, these data fields are only written to tab or comma delimited
 662         SMILES files.
 663         
 664         Format:
 665             
 666             > <PAINSAlertsCount>
 667             Number
 668             
 669             > <PAINSAlerts>
 670             FilterType: ID; FilterType: ID... ... ...
 671             
 672     -w, --workingdir <dir>
 673         Location of working directory which defaults to the current directory.
 674 
 675 Examples:
 676     To count the number of molecules not containing any substructure corresponding to
 677     PAINS SMARTS patterns and write out a SMILES file, type: 
 678 
 679         % RDKitFilterPAINS.py -i Sample.smi -o SampleOut.smi
 680 
 681     To count the number of molecules not containing any substructure corresponding to
 682     PAINS SMARTS patterns and write out a SMILES file containing these and filtered
 683     molecules along with the alerts information for filtered molecules matching
 684     first pattern, type: 
 685 
 686         % RDKitFilterPAINS.py  --outfileFiltered yes --outfileParams
 687           "SMILESDelimiter,comma" -i Sample.smi -o SampleOut.smi
 688 
 689     To count the number of molecules not containing any substructure corresponding
 690     to PAINS SMARTS patterns and write out comma delimited SMILES files containing
 691     these and filtered molecules along with the alerts information for filtered
 692     molecules matching all patterns, type: 
 693 
 694         % RDKitFilterPAINS.py --painsMatch All --outfileFiltered yes
 695           --outfileParams "SMILESDelimiter,comma" -i Sample.sdf
 696           -o SampleOut.smi
 697 
 698     To count the number of molecules not containing any substructure corresponding
 699     to PAINS SMARTS patterns and write out SD files containing
 700     these and filtered molecules along with the alerts information for filtered
 701     molecules matching all patterns, type: 
 702 
 703         % RDKitFilterPAINS.py --painsMatch All --outfileFiltered yes
 704           -i Sample.smi -o SampleOut.sdf
 705 
 706     To count the number of molecules not containing any substructure corresponding to
 707     PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
 708     CPUs without loading all data into memory, and write out a SMILES file, type: 
 709 
 710         % RDKitFilterPAINS.py --mp yes -i Sample.smi -o SampleOut.smi
 711 
 712     To count the number of molecules not containing any substructure corresponding to
 713     PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
 714     CPUs by loading all data into memory, and write out a SD file, type: 
 715 
 716         % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,InMemory"
 717           -i Sample.smi -o SampleOut.sdf
 718 
 719     To count the number of molecules not containing any substructure corresponding to
 720     PAINS SMARTS patterns, perform filtering in multiprocessing mode on specific
 721     number of CPUs and chunk size without loading all data into memory, and
 722     write out a SD file, type: 
 723 
 724         % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,Lazy,
 725           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.sdf
 726 
 727     To only count the number of molecules not containing any substructure corresponding
 728     to PAINS SMARTS patterns without writing out any file, type: 
 729 
 730         % RDKitFilterPAINS.py -m count -i Sample.sdf -o SampleOut.smi
 731 
 732     To count the number of molecules containing any substructure corresponding to
 733     PAINS SMARTS patterns and write out a SD file with computed 2D coordinates,
 734     type: 
 735 
 736         % RDKitFilterPAINS.py -n yes -i Sample.smi -o SampleOut.sdf
 737 
 738     To count the number of molecules not containing any substructure corresponding to
 739     PAINS SMARTS patterns family of Type A in a CSV SMILES file and write out a SD file, type: 
 740 
 741         % RDKitFilterPAINS.py --painsMode A --infileParams
 742           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 743           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 744           -i SampleSMILES.csv -o SampleOut.sdf
 745 
 746 Author:
 747     Manish Sud(msud@san.rr.com)
 748 
 749 See also:
 750     RDKitFilterChEMBLAlerts.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py
 751 
 752 Copyright:
 753     Copyright (C) 2024 Manish Sud. All rights reserved.
 754 
 755     The functionality available in this script is implemented using RDKit, an
 756     open source toolkit for cheminformatics developed by Greg Landrum.
 757 
 758     This file is part of MayaChemTools.
 759 
 760     MayaChemTools is free software; you can redistribute it and/or modify it under
 761     the terms of the GNU Lesser General Public License as published by the Free
 762     Software Foundation; either version 3 of the License, or (at your option) any
 763     later version.
 764 
 765 """
 766 
 767 if __name__ == "__main__":  # Run main() only when executed as a script, not when imported.
 768     main()