MayaChemTools:Code:RDKitFilterChEMBLAlerts.py

   1 #!/bin/env python
   2 #
   3 # File: RDKitFilterChEMBLAlerts.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2025 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem import AllChem
  43 except ImportError as ErrMsg:
  44     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  45     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  46     sys.exit(1)
  47 
  48 # MayaChemTools imports...
  49 try:
  50     from docopt import docopt
  51     import MiscUtil
  52     import RDKitUtil
  53 except ImportError as ErrMsg:
  54     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  55     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  56     sys.exit(1)
  57 
  58 ScriptName = os.path.basename(sys.argv[0])
  59 Options = {}
  60 OptionsInfo = {}
  61 
  62 def main():
  63     """Start execution of the script."""
  64     
  65     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  66     
  67     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  68     
  69     # Retrieve command line arguments and options...
  70     RetrieveOptions()
  71     
  72     # Process and validate command line arguments and options...
  73     ProcessOptions()
  74     
  75     # Perform actions required by the script...
  76     PerformFiltering()
  77     
  78     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  79     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  80 
  81 def PerformFiltering():
  82     """Filter molecules using SMARTS specified in ChEMBL filters file."""
  83     
  84     # Setup ChEMBL patterns and pattern mols...
  85     MiscUtil.PrintInfo("\nSetting up ChEMBL pattern molecules for performing substructure search...")
  86     ChEMBLPatternMols = SetupChEMBLPatternMols()
  87     
  88     # Setup a molecule reader...
  89     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  90     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  91     
  92     # Set up molecule writers...
  93     Writer, WriterFiltered = SetupMoleculeWriters()
  94     
  95     MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, ChEMBLPatternMols, Writer, WriterFiltered)
  96 
  97     if Writer is not None:
  98         Writer.close()
  99     if WriterFiltered is not None:
 100         WriterFiltered.close()
 101     
 102     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 103     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 104     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 105 
 106     MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
 107     MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))
 108 
 109 def ProcessMolecules(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
 110     """Process and filter molecules. """
 111     
 112     if OptionsInfo["MPMode"]:
 113         return ProcessMoleculesUsingMultipleProcesses(Mols, ChEMBLPatternMols, Writer, WriterFiltered)
 114     else:
 115         return ProcessMoleculesUsingSingleProcess(Mols, ChEMBLPatternMols, Writer, WriterFiltered)
 116 
 117 def ProcessMoleculesUsingSingleProcess(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
 118     """Process and filter molecules using a single process."""
 119     
 120     NegateMatch = OptionsInfo["NegateMatch"]
 121     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 122     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 123     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 124     
 125     MiscUtil.PrintInfo("\nFiltering molecules...")
 126     
 127     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 128     FirstMol = True
 129     for Mol in Mols:
 130         MolCount += 1
 131         
 132         if Mol is None:
 133             continue
 134         
 135         if RDKitUtil.IsMolEmpty(Mol):
 136             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 137             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 138             continue
 139         
 140         ValidMolCount += 1
 141         if FirstMol:
 142             FirstMol = False
 143             if SetSMILESMolProps:
 144                 SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol)
 145         
 146         MolMatched, AlertsInfo = DoesMoleculeContainsChEMBLPattern(Mol, ChEMBLPatternMols)
 147         if MolMatched == NegateMatch:
 148             RemainingMolCount += 1
 149             WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords)
 150         else:
 151             if OutfileFilteredMode:
 152                 WriteMolecule(WriterFiltered, Mol, AlertsInfo, Compute2DCoords)
 153     
 154     return (MolCount, ValidMolCount, RemainingMolCount)
 155 
 156 def ProcessMoleculesUsingMultipleProcesses(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
 157     """Process and filter molecules using multiprocessing."""
 158     
 159     MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")
 160     
 161     MPParams = OptionsInfo["MPParams"]
 162     NegateMatch = OptionsInfo["NegateMatch"]
 163     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 164     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 165     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 166     
 167     # Setup data for initializing a worker process...
 168     MiscUtil.PrintInfo("Encoding options info and ChEMBL alert pattern molecules...")
 169     OptionsInfo["EncodedChEMBLPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in ChEMBLPatternMols]
 170     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 171 
 172     # Setup a encoded mols data iterable for a worker process...
 173     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 174 
 175     # Setup process pool along with data initialization for each process...
 176     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 177     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 178     
 179     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 180     
 181     # Start processing...
 182     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 183         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 184     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 185         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 186     else:
 187         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 188     
 189     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 190     FirstMol = True
 191     for Result in Results:
 192         MolCount += 1
 193         MolIndex, EncodedMol, MolMatched, AlertsInfo = Result
 194         
 195         if EncodedMol is None:
 196             continue
 197         ValidMolCount += 1
 198         
 199         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 200         
 201         if FirstMol:
 202             FirstMol = False
 203             if SetSMILESMolProps:
 204                 SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol)
 205         
 206         if MolMatched == NegateMatch:
 207             RemainingMolCount += 1
 208             WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords)
 209         else:
 210             if OutfileFilteredMode:
 211                 WriteMolecule(WriterFiltered, Mol, AlertsInfo, Compute2DCoords)
 212     
 213     return (MolCount, ValidMolCount, RemainingMolCount)
 214 
 215 def InitializeWorkerProcess(*EncodedArgs):
 216     """Initialize data for a worker process."""
 217     
 218     global Options, OptionsInfo
 219 
 220     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 221     
 222     # Decode Options and OptionInfo...
 223     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 224     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 225 
 226     # Decode ChEMBLPatternMols...
 227     OptionsInfo["ChEMBLPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedChEMBLPatternMols"]]
 228     
 229 def WorkerProcess(EncodedMolInfo):
 230     """Process data for a worker process."""
 231 
 232     MolIndex, EncodedMol = EncodedMolInfo
 233     
 234     if EncodedMol is None:
 235         return [MolIndex, None, False, None]
 236     
 237     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 238     if RDKitUtil.IsMolEmpty(Mol):
 239         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 240         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 241         return [MolIndex, None, False, None]
 242         
 243     MolMatched, AlertsInfo = DoesMoleculeContainsChEMBLPattern(Mol, OptionsInfo["ChEMBLPatternMols"])
 244 
 245     return [MolIndex, EncodedMol, MolMatched, AlertsInfo]
 246     
 247 def WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords):
 248     """Write out molecule."""
 249     
 250     if OptionsInfo["CountMode"]:
 251         return
 252     
 253     if Compute2DCoords:
 254         AllChem.Compute2DCoords(Mol)
 255     
 256     if AlertsInfo is not None and len(AlertsInfo):
 257         AlertsCount = "%s" % len(AlertsInfo)
 258         Alerts = "; ".join(AlertsInfo)
 259         if OptionsInfo["WriteAlertsCount"]:
 260             Mol.SetProp(OptionsInfo["AlertsCountLabel"], AlertsCount)
 261         Mol.SetProp(OptionsInfo["AlertsLabel"], Alerts)
 262     
 263     Writer.write(Mol)
 264     
 265 def SetupMoleculeWriters():
 266     """Setup molecule writers."""
 267     
 268     Writer = None
 269     WriterFiltered = None
 270 
 271     if OptionsInfo["CountMode"]:
 272         return (Writer, WriterFiltered)
 273 
 274     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 275     if Writer is None:
 276         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 277     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 278     
 279     if OptionsInfo["OutfileFilteredMode"]:
 280         WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
 281         if WriterFiltered is None:
 282             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"])
 283         MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])
 284     
 285     return (Writer, WriterFiltered)
 286 
 287 def SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol):
 288     """Setup properties to write for SMILES molecule writers."""
 289 
 290     if not OptionsInfo["OutfileParams"]["SetSMILESMolProps"]:
 291         return
 292     
 293     NegateMatch = OptionsInfo["NegateMatch"]
 294     SetSMILESMolAlertsProp = OptionsInfo["SetSMILESMolAlertsProp"]
 295     SMILESMolAlertsPropList = OptionsInfo["SMILESMolAlertsPropList"]
 296     
 297     if Writer is not None:
 298         RDKitUtil.SetWriterMolProps(Writer, Mol)
 299         if SetSMILESMolAlertsProp:
 300             if NegateMatch:
 301                 Writer.SetProps(SMILESMolAlertsPropList)
 302     
 303     if WriterFiltered is not None:
 304         RDKitUtil.SetWriterMolProps(WriterFiltered, Mol)
 305         if SetSMILESMolAlertsProp:
 306             if not NegateMatch:
 307                 WriterFiltered.SetProps(SMILESMolAlertsPropList)
 308 
 309 def DoesMoleculeContainsChEMBLPattern(Mol, ChEMBLPatternMols):
 310     """Check presence of ChEMBL alerts pattern in the molecule."""
 311 
 312     MatchAllAlerts = OptionsInfo["MatchAllAlerts"]
 313     AlertsInfo = []
 314     for PatternMol in ChEMBLPatternMols:
 315         if Mol.HasSubstructMatch(PatternMol, useChirality = True):
 316             AlertsInfo.append("%s: %s" % (PatternMol.GetProp("FilterType"), PatternMol.GetProp("FilterID")))
 317             if not MatchAllAlerts:
 318                 break
 319 
 320     if len(AlertsInfo) == 0:
 321         MolMatched = False
 322         AlertsInfo = None
 323     else:
 324         MolMatched = True
 325     
 326     return (MolMatched, AlertsInfo)
 327     
 328 def SetupChEMBLPatternMols():
 329     """Set up ChEMBL pattern mols for substructure search corresponding to alert mode"""
 330 
 331     PatternMols = []
 332     for FilterType in OptionsInfo["SpecifiedFilterTypes"]:
 333         for Index, Pattern in enumerate(OptionsInfo["ChEMBLFiltersMap"]["SMARTS"][FilterType]):
 334             ID = OptionsInfo["ChEMBLFiltersMap"]["IDs"][FilterType][Index]
 335             
 336             PatternMol = Chem.MolFromSmarts(Pattern)
 337             if PatternMol is None:
 338                 MiscUtil.PrintWarning("Failed to convert ChEMBL pattern, %s, into a molecule..." % Pattern)
 339                 continue
 340             
 341             # Setup FilterType and PattenMol as property of PatternMol
 342             PatternMol.SetProp("FilterType", FilterType)
 343             PatternMol.SetProp("FilterID", ID)
 344             
 345             PatternMols.append(PatternMol)
 346 
 347     return PatternMols
 348 
 349 def ProcessChEMBLAlertsMode():
 350     """Process specified alerts mode."""
 351     
 352     OptionsInfo["AlertsMode"] = Options["--alertsMode"]
 353     
 354     # Retrieve filetrs information...
 355     RetrieveChEMBLFiltersInfo()
 356     
 357     # Process alerts mode...
 358     OptionsInfo["SpecifiedFilterTypes"] = OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"]
 359     if re.match("^All$", OptionsInfo["AlertsMode"], re.I):
 360         return
 361     
 362     AlertsMode = re.sub(" ", "", OptionsInfo["AlertsMode"])
 363     if not len(AlertsMode):
 364         MiscUtil.PrintError("The alerts mode specified using \"-a, --alertsMode\" option are empty.")
 365 
 366     CanonicalFilterTypesMap = {}
 367     for FilterType in OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"]:
 368         CanonicalFilterTypesMap[FilterType.lower()] = FilterType
 369 
 370     SpecifiedFilterTypes = []
 371     for FilterType in AlertsMode.split(","):
 372         CanonicalFilterType = FilterType.lower()
 373         if not CanonicalFilterType in CanonicalFilterTypesMap:
 374             MiscUtil.PrintError("The altert mode, %s, specified using \"-a, --alertsMode\" is not valid. Supported alert modes: %s" % (FilterType, ", ".join(OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"])))
 375 
 376         SpecifiedFilterTypes.append(CanonicalFilterTypesMap[CanonicalFilterType])
 377 
 378     OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes
 379 
 380 def ProcessChEMBLAlertsMatch():
 381     """Process specified alerts match."""
 382 
 383     AlertsMatch = Options["--alertsMatch"]
 384     
 385     MatchFirstAlert, MatchAllAlerts = [False] * 2
 386     if re.match("^First$", AlertsMatch, re.I):
 387         MatchFirstAlert = True
 388     elif re.match("^All$", AlertsMatch, re.I):
 389         MatchAllAlerts = True
 390     else:
 391         MiscUtil.PrintError("The value %s, specified using \"--alertsMatch\" option is not valid. Supported values: First or All" % (AlertsMatch))        
 392     
 393     OptionsInfo["AlertsMatch"] = AlertsMatch
 394     OptionsInfo["MatchFirstAlert"] = MatchFirstAlert
 395     OptionsInfo["MatchAllAlerts"] = MatchAllAlerts
 396     
 397     # Setup labels for writing out alerts match information...
 398     OptionsInfo["AlertsCountLabel"] = "ChEMBLAlertsCount"
 399     OptionsInfo["AlertsLabel"] = "FirstChEMBLAlert" if MatchFirstAlert else "ChEMBLAlerts"
 400 
 401     # Write out alerts count only for match all alerts...
 402     OptionsInfo["WriteAlertsCount"] = True if MatchAllAlerts else False
 403 
 404     # Write out alerts match information to comma or tab delimited SMILES files...
 405     SMILESDelimiter = OptionsInfo["OutfileParams"]["SMILESDelimiter"]
 406     OptionsInfo["SetSMILESMolAlertsProp"] = True if re.match("^[\t,]", SMILESDelimiter, re.I) else False
 407 
 408     SMILESMolAlertsPropList = []
 409     if OptionsInfo["WriteAlertsCount"]:
 410         SMILESMolAlertsPropList.append(OptionsInfo["AlertsCountLabel"])
 411     SMILESMolAlertsPropList.append(OptionsInfo["AlertsLabel"])
 412     OptionsInfo["SMILESMolAlertsPropList"] = SMILESMolAlertsPropList
 413 
 414 def RetrieveChEMBLFiltersInfo():
 415     """Retrieve information for ChEMBL filters."""
 416     
 417     MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
 418     ChEMBLFiltersFilePath = os.path.join(MayaChemToolsDataDir, "ChEMBLFilters.csv")
 419     
 420     MiscUtil.PrintInfo("\nRetrieving ChEMBL alerts SMARTS patterns from file %s" % (ChEMBLFiltersFilePath))
 421 
 422     Delimiter = ','
 423     QuoteChar = '"'
 424     IgnoreHeaderLine = True
 425     FilterLinesWords = MiscUtil.GetTextLinesWords(ChEMBLFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine)
 426 
 427     ChEMBLFiltersMap = {}
 428     ChEMBLFiltersMap["FilterTypes"] = []
 429     ChEMBLFiltersMap["IDs"] = {}
 430     ChEMBLFiltersMap["SMARTS"] = {}
 431 
 432     for LineWords in FilterLinesWords:
 433         FilterType = LineWords[0]
 434         ID = LineWords[1]
 435         SMARTS = LineWords[2]
 436 
 437         if not FilterType in ChEMBLFiltersMap["FilterTypes"]:
 438             ChEMBLFiltersMap["FilterTypes"].append(FilterType)
 439             ChEMBLFiltersMap["IDs"][FilterType] = []
 440             ChEMBLFiltersMap["SMARTS"][FilterType] = []
 441 
 442         ChEMBLFiltersMap["IDs"][FilterType].append(ID)
 443         ChEMBLFiltersMap["SMARTS"][FilterType].append(SMARTS)
 444 
 445     OptionsInfo["ChEMBLFiltersMap"] = ChEMBLFiltersMap
 446     
 447     MiscUtil.PrintInfo("\nTotal number alerts: %d" % len(FilterLinesWords))
 448     MiscUtil.PrintInfo("Number of filter family types: %d\nFilter familty types: %s\n" % (len(ChEMBLFiltersMap["FilterTypes"]), ", ".join(ChEMBLFiltersMap["FilterTypes"])))
 449 
 450     for FilterType in ChEMBLFiltersMap["FilterTypes"]:
 451         MiscUtil.PrintInfo("Filter family type: %s; Number of alerts: %d" % (FilterType, len(ChEMBLFiltersMap["IDs"][FilterType])))
 452     MiscUtil.PrintInfo("")
 453     
 454 def ProcessOptions():
 455     """Process and validate command line arguments and options."""
 456     
 457     MiscUtil.PrintInfo("Processing options...")
 458     
 459     # Validate options...
 460     ValidateOptions()
 461     
 462     OptionsInfo["Infile"] = Options["--infile"]
 463     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 464     
 465     OptionsInfo["Outfile"] = Options["--outfile"]
 466     ParamsDefaultInfoOverride = {"SMILESMolProps": True}
 467     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"], ParamsDefaultInfo = ParamsDefaultInfoOverride)
 468     
 469     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 470     OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt)
 471     OptionsInfo["OutfileFiltered"] = OutfileFiltered
 472     OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False
 473     
 474     OptionsInfo["Overwrite"] = Options["--overwrite"]
 475 
 476     OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False
 477     OptionsInfo["NegateMatch"] = True if re.match("^yes$", Options["--negate"], re.I) else False
 478 
 479     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 480     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 481     
 482     ProcessChEMBLAlertsMode()
 483     ProcessChEMBLAlertsMatch()
 484 
 485 def RetrieveOptions():
 486     """Retrieve command line arguments and options."""
 487     
 488     # Get options...
 489     global Options
 490     Options = docopt(_docoptUsage_)
 491     
 492     # Set current working directory to the specified directory...
 493     WorkingDir = Options["--workingdir"]
 494     if WorkingDir:
 495         os.chdir(WorkingDir)
 496     
 497     # Handle examples option...
 498     if "--examples" in Options and Options["--examples"]:
 499         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 500         sys.exit(0)
 501 
 502 def ValidateOptions():
 503     """Validate option values."""
 504     
 505     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 506     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 507     
 508     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 509     if re.match("^filter$", Options["--mode"], re.I):
 510         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 511         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 512 
 513     MiscUtil.ValidateOptionTextValue("--alertsMatch", Options["--alertsMatch"], "First All")
 514     
 515     MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")
 516     
 517     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
 518     if re.match("^filter$", Options["--mode"], re.I):
 519         if not Options["--outfile"]:
 520             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, --mode\" option")
 521         
 522     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 523     MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no")
 524     
 525 # Setup a usage string for docopt...
 526 _docoptUsage_ = """
 527 RDKitFilterChEMBLAlterts.py - Filter ChEMBL alerts
 528 
 529 Usage:
 530     RDKitFilterChEMBLAlerts.py  [--alertsMode <All or Type,Type,...>] [--alertsMatch <First or All>]
 531                                 [--infileParams <Name,Value,...>] [--mode <filter or count>]
 532                                 [--mp <yes or no>] [--mpParams <Name,Value,...>]
 533                                 [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...>]
 534                                 [--negate <yes or no>] [--overwrite] [-w <dir>] -i <infile> -o <outfile>
 535     RDKitFilterChEMBLAlerts.py -h | --help | -e | --examples
 536 
 537 Description:
 538     Filter molecules from an input file for ChEMBL structural alerts by performing
 539     a substructure search using SMARTS patterns specified in MAYACHEMTOOLS/
 540     lib/data/ChEMBLFilters.csv file and write out appropriate molecules to an
 541     output file or simply count the number of filtered molecules.
 542 
 543     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
 544     .tsv, .txt)
 545 
 546     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 547 
 548 Options:
 549     -a, --alertsMode <All or Type, Type,...>  [default: All]
 550         All or a comma delimited list of ChEMBL filter types to use for filtering
 551         molecules. 
 552         
 553         The supported filter family types, along with a description, are show below:
 554         
 555             BMS: Bristol-Myers Squibb HTS Deck Filters
 556             Dundee: University of Dundee NTD Screening Library Filters
 557             Glaxo: Bristol-Myers Squibb HTS Deck Filters
 558             Inpharmatica
 559             MLSMR: NIH MLSMR Excluded Functionality Filters
 560             PfizerLINT: Pfizer LINT filters
 561             SureChEMBL
 562         
 563     --alertsMatch <First or All>  [default: First]
 564         Stop after matching only first alert or match all ChEMBL alerts for
 565         filtering molecules.
 566         
 567         The 'ChEMBLAlertsCount' and 'ChEMBLAlerts' data fields are added to
 568         SD file containing filtered molecules for 'All' value of '-altersMatch'. In
 569         addition, these data fields are only written to tab or comma delimited
 570         SMILES file.
 571         
 572         Format:
 573             
 574             > <ChEMBLAlertsCount>
 575             Number
 576             
 577             > <ChEMBLAlerts>
 578             FilterType: ID; FilterType: ID... ... ...``
 579             
 580     -e, --examples
 581         Print examples.
 582     -h, --help
 583         Print this help message.
 584     -i, --infile <infile>
 585         Input file name.
 586     --infileParams <Name,Value,...>  [default: auto]
 587         A comma delimited list of parameter name and value pairs for reading
 588         molecules from files. The supported parameter names for different file
 589         formats, along with their default values, are shown below:
 590             
 591             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 592             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 593                 smilesTitleLine,auto,sanitize,yes
 594             
 595         Possible values for smilesDelimiter: space, comma or tab.
 596     -m, --mode <filter or count>  [default: filter]
 597         Specify whether to filter the matched molecules and write out the rest of the 
 598         molecules to an outfile or simply count the number of matched molecules
 599         marked for filtering.
 600     --mp <yes or no>  [default: no]
 601         Use multiprocessing.
 602          
 603         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 604         function employing lazy RDKit data iterable. This allows processing of
 605         arbitrary large data sets without any additional requirements memory.
 606         
 607         All input data may be optionally loaded into memory by mp.Pool.map()
 608         before starting worker processes in a process pool by setting the value
 609         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 610         
 611         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 612         data mode may adversely impact the performance. The '--mpParams' section
 613         provides additional information to tune the value of 'chunkSize'.
 614     --mpParams <Name,Value,...>  [default: auto]
 615         A comma delimited list of parameter name and value pairs to configure
 616         multiprocessing.
 617         
 618         The supported parameter names along with their default and possible
 619         values are shown below:
 620         
 621             chunkSize, auto
 622             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 623             numProcesses, auto   [ Default: mp.cpu_count() ]
 624         
 625         These parameters are used by the following functions to configure and
 626         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 627         mp.Pool.imap().
 628         
 629         The chunkSize determines chunks of input data passed to each worker
 630         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 631         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 632         
 633         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 634         automatically converts RDKit data iterable into a list, loads all data into
 635         memory, and calculates the default chunkSize using the following method
 636         as shown in its code:
 637         
 638             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 639             if extra: chunkSize += 1
 640         
 641         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 642         and 100 data items.
 643         
 644         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 645         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 646         data into memory. Consequently, the size of input data is not known a priori.
 647         It's not possible to estimate an optimal value for the chunkSize. The default 
 648         chunkSize is set to 1.
 649         
 650         The default value for the chunkSize during 'Lazy' data mode may adversely
 651         impact the performance due to the overhead associated with exchanging
 652         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 653         a larger value during 'Lazy' input data mode, based on the size of your input
 654         data and number of processes in the process pool.
 655         
 656         The mp.Pool.map() function waits for all worker processes to process all
 657         the data and return the results. The mp.Pool.imap() function, however,
 658         returns the the results obtained from worker processes as soon as the
 659         results become available for specified chunks of data.
 660         
 661         The order of data in the results returned by both mp.Pool.map() and 
 662         mp.Pool.imap() functions always corresponds to the input data.
 663     -n, --negate <yes or no>  [default: no]
 664         Specify whether to filter molecules not matching the ChEMBL filters specified by
 665         SMARTS patterns.
 666     -o, --outfile <outfile>
 667         Output file name.
 668     --outfileFiltered <yes or no>  [default: no]
 669         Write out a file containing filtered molecules. Its name is automatically
 670         generated from the specified output file. Default: <OutfileRoot>_
 671         Filtered.<OutfileExt>.
 672     --outfileParams <Name,Value,...>  [default: auto]
 673         A comma delimited list of parameter name and value pairs for writing
 674         molecules to files. The supported parameter names for different file
 675         formats, along with their default values, are shown below:
 676             
 677             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 678             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 679                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,yes
 680             
 681         Default value for compute2DCoords: yes for SMILES input file; no for all other
 682         file types.
 683     --overwrite
 684         Overwrite existing files.
 685     -w, --workingdir <dir>
 686         Location of working directory which defaults to the current directory.
 687 
 688 Examples:
 689     To count the number of molecules not containing any substructure corresponding
 690     to any ChEMBL SMARTS patterns and write out SMILES files containing these molecules,
 691     type: 
 692 
 693         % RDKitFilterChEMBLAlerts.py -i Sample.smi -o SampleOut.smi
 694 
 695     To count the number of molecules not containing any substructure corresponding
 696     to any ChEMBL SMARTS patterns and write out comma delmited SMILES files
 697     containing these and filtered molecules along with the alerts information for
 698     filtered molecules matching first pattern, type: 
 699 
 700         % RDKitFilterChEMBLAlerts.py --outfileFiltered yes --outfileParams
 701           "SMILESDelimiter,comma" -i Sample.smi -o SampleOut.smi
 702 
 703     To count the number of molecules not containing any substructure corresponding
 704     to any ChEMBL SMARTS patterns and write out comma delmited SMILES files
 705     containing these and filtered molecules along with the alerts information for
 706     filtered molecules matching all patterns, type: 
 707 
 708         % RDKitFilterChEMBLAlerts.py --alertsMatch All --outfileFiltered yes
 709           --outfileParams "SMILESDelimiter,comma" -i Sample.smi
 710           -o SampleOut.smi
 711 
 712     To count the number of molecules not containing any substructure corresponding
 713     to any ChEMBL SMARTS patterns and write out SD files containing these and filtered
 714     molecules along with the alerts information for filtered molecules matching all
 715     patterns, type: 
 716 
 717         % RDKitFilterChEMBLAlerts.py --alertsMatch All --outfileFiltered yes
 718           -i Sample.smi -o SampleOut.sdf
 719 
 720     To count the number of molecules not containing any substructure corresponding to
 721     ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on all
 722     available CPUs without loading all data into memory, and write out a SMILES file, type: 
 723 
 724         % RDKitFilterChEMBLAlerts.py --mp yes -i Sample.smi -o SampleOut.smi
 725 
 726     To count the number of molecules not containing any substructure corresponding to
 727     ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on all
 728     available CPUs by loading all data into memory, and write out a SD file, type: 
 729 
 730         % RDKitFilterChEMBLAlerts.py --mp yes --mpParams "inputDataMode,
 731           InMemory" -i Sample.smi -o SampleOut.sdf
 732 
 733     To count the number of molecules not containing any substructure corresponding to
 734     ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on specific
 735     number of CPUs and chunk size without loading all data into memory, and
 736     write out a SD file, type: 
 737 
 738         % RDKitFilterChEMBLAlerts.py --mp yes --mpParams "inputDataMode,Lazy,
 739           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.sdf
 740 
 741     To only count the number of molecules not containing any substructure corresponding
 742     to BMS ChEMBL SMARTS patterns without writing out any files, type: 
 743 
 744         % RDKitFilterChEMBLAlerts.py -m count -a BMS -i Sample.sdf
 745           -o SampleOut.smi
 746 
 747     To count the number of molecules not containing any substructure corresponding
 748     to Pfizer LINT ChEMBL SMARTS patterns in a  CSV SMILES file and write out a SD file,
 749     type:  
 750 
 751         % RDKitFilterChEMBLAlerts.py --altertsMode PfizerLINT --infileParams
 752           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 753           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 754           -i SampleSMILES.csv -o SampleOut.sdf
 755 
 756 Author:
 757     Manish Sud(msud@san.rr.com)
 758 
 759 See also:
 760     RDKitFilterPAINS.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py
 761 
 762 Copyright:
 763     Copyright (C) 2025 Manish Sud. All rights reserved.
 764 
 765     The functionality available in this script is implemented using RDKit, an
 766     open source toolkit for cheminformatics developed by Greg Landrum.
 767 
 768     This file is part of MayaChemTools.
 769 
 770     MayaChemTools is free software; you can redistribute it and/or modify it under
 771     the terms of the GNU Lesser General Public License as published by the Free
 772     Software Foundation; either version 3 of the License, or (at your option) any
 773     later version.
 774 
 775 """
 776 
 777 if __name__ == "__main__":
 778     main()