MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitFilterChEMBLAlerts.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2023 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem import AllChem
  43 except ImportError as ErrMsg:
  44     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  45     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  46     sys.exit(1)
  47 
  48 # MayaChemTools imports...
  49 try:
  50     from docopt import docopt
  51     import MiscUtil
  52     import RDKitUtil
  53 except ImportError as ErrMsg:
  54     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  55     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  56     sys.exit(1)
  57 
# Base name of the script as invoked; used in the start and done messages...
ScriptName = os.path.basename(sys.argv[0])
# Command line options as returned by docopt; assigned in RetrieveOptions()
# and re-assigned in each worker process by InitializeWorkerProcess()...
Options = {}
# Processed option values shared by the functions in this script; populated
# by ProcessOptions() and re-assigned in each worker process...
OptionsInfo = {}
  61 
  62 def main():
  63     """Start execution of the script."""
  64     
  65     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  66     
  67     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  68     
  69     # Retrieve command line arguments and options...
  70     RetrieveOptions()
  71     
  72     # Process and validate command line arguments and options...
  73     ProcessOptions()
  74     
  75     # Perform actions required by the script...
  76     PerformFiltering()
  77     
  78     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  79     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  80 
  81 def PerformFiltering():
  82     """Filter molecules using SMARTS specified in ChEMBL filters file."""
  83     
  84     # Setup ChEMBL patterns and pattern mols...
  85     MiscUtil.PrintInfo("\nSetting up ChEMBL pattern molecules for performing substructure search...")
  86     ChEMBLPatterns = RetrieveChEMBLPatterns()
  87     ChEMBLPatternMols = SetupChEMBLPatternMols(ChEMBLPatterns)
  88     
  89     # Setup a molecule reader...
  90     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  91     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  92     
  93     # Set up molecule writers...
  94     Writer, WriterFiltered = SetupMoleculeWriters()
  95     
  96     MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, ChEMBLPatternMols, Writer, WriterFiltered)
  97 
  98     if Writer is not None:
  99         Writer.close()
 100     if WriterFiltered is not None:
 101         WriterFiltered.close()
 102     
 103     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 104     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 105     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 106 
 107     MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
 108     MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))
 109 
 110 def ProcessMolecules(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
 111     """Process and filter molecules. """
 112     
 113     if OptionsInfo["MPMode"]:
 114         return ProcessMoleculesUsingMultipleProcesses(Mols, ChEMBLPatternMols, Writer, WriterFiltered)
 115     else:
 116         return ProcessMoleculesUsingSingleProcess(Mols, ChEMBLPatternMols, Writer, WriterFiltered)
 117 
 118 def ProcessMoleculesUsingSingleProcess(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
 119     """Process and filter molecules using a single process."""
 120     
 121     NegateMatch = OptionsInfo["NegateMatch"]
 122     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 123     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 124     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 125     
 126     MiscUtil.PrintInfo("\nFiltering molecules...")
 127     
 128     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 129     FirstMol = True
 130     for Mol in Mols:
 131         MolCount += 1
 132         
 133         if Mol is None:
 134             continue
 135         
 136         if RDKitUtil.IsMolEmpty(Mol):
 137             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 138             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 139             continue
 140         
 141         ValidMolCount += 1
 142         if FirstMol:
 143             FirstMol = False
 144             if SetSMILESMolProps:
 145                 if Writer is not None:
 146                     RDKitUtil.SetWriterMolProps(Writer, Mol)
 147                 if WriterFiltered is not None:
 148                     RDKitUtil.SetWriterMolProps(WriterFiltered, Mol)
 149         
 150         MolMatched = DoesMoleculeContainsChEMBLPattern(Mol, ChEMBLPatternMols)
 151         if MolMatched == NegateMatch:
 152             RemainingMolCount += 1
 153             WriteMolecule(Writer, Mol, Compute2DCoords)
 154         else:
 155             if OutfileFilteredMode:
 156                 WriteMolecule(WriterFiltered, Mol, Compute2DCoords)
 157     
 158     return (MolCount, ValidMolCount, RemainingMolCount)
 159 
 160 def ProcessMoleculesUsingMultipleProcesses(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
 161     """Process and filter molecules using multiprocessing."""
 162     
 163     MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")
 164     
 165     MPParams = OptionsInfo["MPParams"]
 166     NegateMatch = OptionsInfo["NegateMatch"]
 167     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 168     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 169     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 170     
 171     # Setup data for initializing a worker process...
 172     MiscUtil.PrintInfo("Encoding options info and ChEMBL alert pattern molecules...")
 173     OptionsInfo["EncodedChEMBLPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in ChEMBLPatternMols]
 174     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 175 
 176     # Setup a encoded mols data iterable for a worker process...
 177     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 178 
 179     # Setup process pool along with data initialization for each process...
 180     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 181     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 182     
 183     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 184     
 185     # Start processing...
 186     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 187         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 188     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 189         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 190     else:
 191         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 192     
 193     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 194     FirstMol = True
 195     for Result in Results:
 196         MolCount += 1
 197         MolIndex, EncodedMol, MolMatched = Result
 198         
 199         if EncodedMol is None:
 200             continue
 201         ValidMolCount += 1
 202         
 203         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 204         
 205         if FirstMol:
 206             FirstMol = False
 207             if SetSMILESMolProps:
 208                 if Writer is not None:
 209                     RDKitUtil.SetWriterMolProps(Writer, Mol)
 210                 if WriterFiltered is not None:
 211                     RDKitUtil.SetWriterMolProps(WriterFiltered, Mol)
 212         
 213         if MolMatched == NegateMatch:
 214             RemainingMolCount += 1
 215             WriteMolecule(Writer, Mol, Compute2DCoords)
 216         else:
 217             if OutfileFilteredMode:
 218                 WriteMolecule(WriterFiltered, Mol, Compute2DCoords)
 219     
 220     return (MolCount, ValidMolCount, RemainingMolCount)
 221 
 222 def InitializeWorkerProcess(*EncodedArgs):
 223     """Initialize data for a worker process."""
 224     
 225     global Options, OptionsInfo
 226 
 227     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 228     
 229     # Decode Options and OptionInfo...
 230     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 231     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 232 
 233     # Decode ChEMBLPatternMols...
 234     OptionsInfo["ChEMBLPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedChEMBLPatternMols"]]
 235     
 236 def WorkerProcess(EncodedMolInfo):
 237     """Process data for a worker process."""
 238 
 239     MolIndex, EncodedMol = EncodedMolInfo
 240     
 241     if EncodedMol is None:
 242         return [MolIndex, None, False]
 243     
 244     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 245     if RDKitUtil.IsMolEmpty(Mol):
 246         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 247         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 248         return [MolIndex, None, False]
 249         
 250     MolMatched = DoesMoleculeContainsChEMBLPattern(Mol, OptionsInfo["ChEMBLPatternMols"])
 251 
 252     return [MolIndex, EncodedMol, MolMatched]
 253     
 254 def WriteMolecule(Writer, Mol, Compute2DCoords):
 255     """Write out molecule."""
 256     
 257     if OptionsInfo["CountMode"]:
 258         return
 259     
 260     if Compute2DCoords:
 261         AllChem.Compute2DCoords(Mol)
 262     
 263     Writer.write(Mol)
 264     
 265 def SetupMoleculeWriters():
 266     """Setup molecule writers."""
 267     
 268     Writer = None
 269     WriterFiltered = None
 270 
 271     if OptionsInfo["CountMode"]:
 272         return (Writer, WriterFiltered)
 273 
 274     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 275     if Writer is None:
 276         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 277     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 278     
 279     if OptionsInfo["OutfileFilteredMode"]:
 280         WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
 281         if WriterFiltered is None:
 282             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"])
 283         MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])
 284     
 285     return (Writer, WriterFiltered)
 286 
 287 def DoesMoleculeContainsChEMBLPattern(Mol, ChEMBLPatternMols):
 288     """Check presence of ChEMBL alerts pattern in the molecule."""
 289 
 290     MolMatched = False
 291     
 292     for PatternMol in ChEMBLPatternMols:
 293         if Mol.HasSubstructMatch(PatternMol, useChirality = True):
 294             MolMatched = True
 295             break
 296         
 297     return MolMatched
 298     
 299 def RetrieveChEMBLPatterns():
 300     """Retrieve ChEMBL patterns for specified ChEMBL altert mode."""
 301 
 302     SMARTSPatterns = []
 303     for FilterType in OptionsInfo["SpecifiedFilterTypes"]:
 304         SMARTSPatterns.extend(OptionsInfo["ChEMBLFiltersMap"]["SMARTS"][FilterType])
 305 
 306     return SMARTSPatterns
 307 
 308 def SetupChEMBLPatternMols(ChEMBLPatterns):
 309     """Set up ChEMBL pattern mols for substructure search."""
 310 
 311     PatternMols = []
 312     for Pattern in ChEMBLPatterns:
 313         PatternMol = Chem.MolFromSmarts(Pattern)
 314         PatternMols.append(PatternMol)
 315         
 316     return PatternMols    
 317 
 318 def ProcessChEMBLAlertsMode():
 319     """Process specified alerts mode."""
 320     
 321     # Retrieve filetrs information...
 322     RetrieveChEMBLFiltersInfo()
 323     
 324     # Process alerts mode...
 325     OptionsInfo["SpecifiedFilterTypes"] = OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"]
 326     if re.match("^All$", OptionsInfo["AlertsMode"], re.I):
 327         return
 328     
 329     AlertsMode = re.sub(" ", "", OptionsInfo["AlertsMode"])
 330     if not len(AlertsMode):
 331         MiscUtil.PrintError("The alerts mode specified using \"-a, --alertsMode\" option are empty.")
 332 
 333     CanonicalFilterTypesMap = {}
 334     for FilterType in OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"]:
 335         CanonicalFilterTypesMap[FilterType.lower()] = FilterType
 336 
 337     SpecifiedFilterTypes = []
 338     for FilterType in AlertsMode.split(","):
 339         CanonicalFilterType = FilterType.lower()
 340         if not CanonicalFilterType in CanonicalFilterTypesMap:
 341             MiscUtil.PrintError("The altert mode, %s, specified using \"-a, --alertsMode\" is not valid. Supported alert modes: %s" % (FilterType, ", ".join(OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"])))
 342 
 343         SpecifiedFilterTypes.append(CanonicalFilterTypesMap[CanonicalFilterType])
 344 
 345     OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes
 346     
 347 def RetrieveChEMBLFiltersInfo():
 348     """Retrieve information for ChEMBL filters."""
 349     
 350     MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
 351     ChEMBLFiltersFilePath = os.path.join(MayaChemToolsDataDir, "ChEMBLFilters.csv")
 352     
 353     MiscUtil.PrintInfo("\nRetrieving ChEMBL alerts SMARTS patterns from file %s" % (ChEMBLFiltersFilePath))
 354 
 355     Delimiter = ','
 356     QuoteChar = '"'
 357     IgnoreHeaderLine = True
 358     FilterLinesWords = MiscUtil.GetTextLinesWords(ChEMBLFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine)
 359 
 360     ChEMBLFiltersMap = {}
 361     ChEMBLFiltersMap["FilterTypes"] = []
 362     ChEMBLFiltersMap["ID"] = {}
 363     ChEMBLFiltersMap["SMARTS"] = {}
 364 
 365     for LineWords in FilterLinesWords:
 366         FilterType = LineWords[0]
 367         ID = LineWords[1]
 368         SMARTS = LineWords[2]
 369 
 370         if not FilterType in ChEMBLFiltersMap["FilterTypes"]:
 371             ChEMBLFiltersMap["FilterTypes"].append(FilterType)
 372             ChEMBLFiltersMap["ID"][FilterType] = []
 373             ChEMBLFiltersMap["SMARTS"][FilterType] = []
 374 
 375         ChEMBLFiltersMap["ID"][FilterType].append(ID)
 376         ChEMBLFiltersMap["SMARTS"][FilterType].append(SMARTS)
 377 
 378     OptionsInfo["ChEMBLFiltersMap"] = ChEMBLFiltersMap
 379     
 380     MiscUtil.PrintInfo("\nTotal number alerts: %d" % len(FilterLinesWords))
 381     MiscUtil.PrintInfo("Number of filter family types: %d\nFilter familty types: %s\n" % (len(ChEMBLFiltersMap["FilterTypes"]), ", ".join(ChEMBLFiltersMap["FilterTypes"])))
 382 
 383     for FilterType in ChEMBLFiltersMap["FilterTypes"]:
 384         MiscUtil.PrintInfo("Filter family type: %s; Number of alerts: %d" % (FilterType, len(ChEMBLFiltersMap["ID"][FilterType])))
 385     MiscUtil.PrintInfo("")
 386     
 387 def ProcessOptions():
 388     """Process and validate command line arguments and options."""
 389     
 390     MiscUtil.PrintInfo("Processing options...")
 391     
 392     # Validate options...
 393     ValidateOptions()
 394     
 395     OptionsInfo["Infile"] = Options["--infile"]
 396     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 397     
 398     OptionsInfo["Outfile"] = Options["--outfile"]
 399     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 400     
 401     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 402     OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt)
 403     OptionsInfo["OutfileFiltered"] = OutfileFiltered
 404     OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False
 405     
 406     OptionsInfo["Overwrite"] = Options["--overwrite"]
 407 
 408     OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False
 409     OptionsInfo["NegateMatch"] = True if re.match("^yes$", Options["--negate"], re.I) else False
 410 
 411     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 412     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 413     
 414     OptionsInfo["AlertsMode"] = Options["--alertsMode"]
 415     ProcessChEMBLAlertsMode()
 416     
 417 def RetrieveOptions():
 418     """Retrieve command line arguments and options."""
 419     
 420     # Get options...
 421     global Options
 422     Options = docopt(_docoptUsage_)
 423     
 424     # Set current working directory to the specified directory...
 425     WorkingDir = Options["--workingdir"]
 426     if WorkingDir:
 427         os.chdir(WorkingDir)
 428     
 429     # Handle examples option...
 430     if "--examples" in Options and Options["--examples"]:
 431         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 432         sys.exit(0)
 433 
 434 def ValidateOptions():
 435     """Validate option values."""
 436     
 437     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 438     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 439     
 440     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 441     if re.match("^filter$", Options["--mode"], re.I):
 442         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 443         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 444 
 445     MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")
 446     
 447     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
 448     if re.match("^filter$", Options["--mode"], re.I):
 449         if not Options["--outfile"]:
 450             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, --mode\" option")
 451         
 452     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 453     MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no")
 454     
 455 # Setup a usage string for docopt...
 456 _docoptUsage_ = """
 457 RDKitFilterChEMBLAlerts.py - Filter ChEMBL alerts
 458 
 459 Usage:
 460     RDKitFilterChEMBLAlerts.py  [--alertsMode <All or Type,Type,...>]
 461                                 [--infileParams <Name,Value,...>] [--mode <filter or count>]
 462                                 [--mp <yes or no>] [--mpParams <Name,Value,...>]
 463                                 [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...>]
 464                                 [--negate <yes or no>] [--overwrite] [-w <dir>] -i <infile> -o <outfile>
 465     RDKitFilterChEMBLAlerts.py -h | --help | -e | --examples
 466 
 467 Description:
 468     Filter molecules from an input file for ChEMBL structural alerts by performing
 469     a substructure search using SMARTS patterns specified in MAYACHEMTOOLS/
 470     lib/data/ChEMBLFilters.csv file and write out appropriate molecules to an
 471     output file or simply count the number of filtered molecules.
 472 
 473     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
 474     .tsv, .txt)
 475 
 476     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 477 
 478 Options:
 479     -a, --alertsMode <All or Type, Type,...>  [default: All]
 480         All or a comma delimited list of ChEMBL filter types to use for filtering
 481         molecules. 
 482         
 483         The supported filter family types, along with a description, are shown below:
 484         
 485             BMS: Bristol-Myers Squibb HTS Deck Filters
 486             Dundee: University of Dundee NTD Screening Library Filters
 487             Glaxo: Glaxo Wellcome Hard Filters
 488             Inpharmatica
 489             MLSMR: NIH MLSMR Excluded Functionality Filters
 490             PfizerLINT: Pfizer LINT filters
 491             SureChEMBL
 492         
 493     -e, --examples
 494         Print examples.
 495     -h, --help
 496         Print this help message.
 497     -i, --infile <infile>
 498         Input file name.
 499     --infileParams <Name,Value,...>  [default: auto]
 500         A comma delimited list of parameter name and value pairs for reading
 501         molecules from files. The supported parameter names for different file
 502         formats, along with their default values, are shown below:
 503             
 504             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 505             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 506                 smilesTitleLine,auto,sanitize,yes
 507             
 508         Possible values for smilesDelimiter: space, comma or tab.
 509     -m, --mode <filter or count>  [default: filter]
 510         Specify whether to filter the matched molecules and write out the rest of the 
 511         molecules to an outfile or simply count the number of matched molecules
 512         marked for filtering.
 513     --mp <yes or no>  [default: no]
 514         Use multiprocessing.
 515          
 516         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 517         function employing lazy RDKit data iterable. This allows processing of
 518         arbitrarily large data sets without any additional memory requirements.
 519         
 520         All input data may be optionally loaded into memory by mp.Pool.map()
 521         before starting worker processes in a process pool by setting the value
 522         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 523         
 524         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 525         data mode may adversely impact the performance. The '--mpParams' section
 526         provides additional information to tune the value of 'chunkSize'.
 527     --mpParams <Name,Value,...>  [default: auto]
 528         A comma delimited list of parameter name and value pairs to configure
 529         multiprocessing.
 530         
 531         The supported parameter names along with their default and possible
 532         values are shown below:
 533         
 534             chunkSize, auto
 535             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 536             numProcesses, auto   [ Default: mp.cpu_count() ]
 537         
 538         These parameters are used by the following functions to configure and
 539         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 540         mp.Pool.imap().
 541         
 542         The chunkSize determines chunks of input data passed to each worker
 543         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 544         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 545         
 546         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 547         automatically converts RDKit data iterable into a list, loads all data into
 548         memory, and calculates the default chunkSize using the following method
 549         as shown in its code:
 550         
 551             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 552             if extra: chunkSize += 1
 553         
 554         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 555         and 100 data items.
 556         
 557         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 558         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 559         data into memory. Consequently, the size of input data is not known a priori.
 560         It's not possible to estimate an optimal value for the chunkSize. The default 
 561         chunkSize is set to 1.
 562         
 563         The default value for the chunkSize during 'Lazy' data mode may adversely
 564         impact the performance due to the overhead associated with exchanging
 565         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 566         a larger value during 'Lazy' input data mode, based on the size of your input
 567         data and number of processes in the process pool.
 568         
 569         The mp.Pool.map() function waits for all worker processes to process all
 570         the data and return the results. The mp.Pool.imap() function, however,
 571         returns the results obtained from worker processes as soon as the
 572         results become available for specified chunks of data.
 573         
 574         The order of data in the results returned by both mp.Pool.map() and 
 575         mp.Pool.imap() functions always corresponds to the input data.
 576     -n, --negate <yes or no>  [default: no]
 577         Specify whether to filter molecules not matching the ChEMBL filters specified by
 578         SMARTS patterns.
 579     -o, --outfile <outfile>
 580         Output file name.
 581     --outfileFiltered <yes or no>  [default: no]
 582         Write out a file containing filtered molecules. Its name is automatically
 583         generated from the specified output file. Default: <OutfileRoot>_
 584         Filtered.<OutfileExt>.
 585     --outfileParams <Name,Value,...>  [default: auto]
 586         A comma delimited list of parameter name and value pairs for writing
 587         molecules to files. The supported parameter names for different file
 588         formats, along with their default values, are shown below:
 589             
 590             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 591             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 592                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 593             
 594         Default value for compute2DCoords: yes for SMILES input file; no for all other
 595         file types.
 596     --overwrite
 597         Overwrite existing files.
 598     -w, --workingdir <dir>
 599         Location of working directory which defaults to the current directory.
 600 
 601 Examples:
 602     To count the number of molecules not containing any substructure corresponding
 603     to any ChEMBL SMARTS patterns and write out SMILES files containing these molecules,
 604     type: 
 605 
 606         % RDKitFilterChEMBLAlerts.py -i Sample.smi -o SampleOut.smi
 607 
 608     To count the number of molecules not containing any substructure corresponding to
 609     ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on all
 610     available CPUs without loading all data into memory, and write out a SMILES file, type: 
 611 
 612         % RDKitFilterChEMBLAlerts.py --mp yes -i Sample.smi -o SampleOut.smi
 613 
 614     To count the number of molecules not containing any substructure corresponding to
 615     ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on all
 616     available CPUs by loading all data into memory, and write out a SMILES file, type: 
 617 
 618         % RDKitFilterChEMBLAlerts.py --mp yes --mpParams "inputDataMode,
 619           InMemory" -i Sample.smi -o SampleOut.smi
 620 
 621     To count the number of molecules not containing any substructure corresponding to
 622     ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on specific
 623     number of CPUs and chunk size without loading all data into memory, and
 624     write out a SMILES file, type: 
 625 
 626         % RDKitFilterChEMBLAlerts.py --mp yes --mpParams "inputDataMode,Lazy,
 627           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi
 628 
 629     To count the number of molecules not containing any substructure corresponding
 630     to any ChEMBL SMARTS patterns and write out SMILES files containing these and filtered
 631     molecules, type: 
 632 
 633         % RDKitFilterChEMBLAlerts.py --outfileFiltered yes -i Sample.smi
 634           -o SampleOut.smi
 635 
 636     To only count the number of molecules not containing any substructure corresponding
 637     to BMS ChEMBL SMARTS patterns without writing out any files, type: 
 638 
 639         % RDKitFilterChEMBLAlerts.py -m count -a BMS -i Sample.sdf
 640           -o SampleOut.smi
 641 
 642     To count the number of molecules not containing any substructure corresponding
 643     to Pfizer LINT ChEMBL SMARTS patterns in a  CSV SMILES file and write out a SD file,
 644     type:  
 645 
 646         % RDKitFilterChEMBLAlerts.py --alertsMode PfizerLINT --infileParams
 647           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 648           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 649           -i SampleSMILES.csv -o SampleOut.sdf
 650 
 651 Author:
 652     Manish Sud(msud@san.rr.com)
 653 
 654 See also:
 655     RDKitFilterPAINS.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py
 656 
 657 Copyright:
 658     Copyright (C) 2023 Manish Sud. All rights reserved.
 659 
 660     The functionality available in this script is implemented using RDKit, an
 661     open source toolkit for cheminformatics developed by Greg Landrum.
 662 
 663     This file is part of MayaChemTools.
 664 
 665     MayaChemTools is free software; you can redistribute it and/or modify it under
 666     the terms of the GNU Lesser General Public License as published by the Free
 667     Software Foundation; either version 3 of the License, or (at your option) any
 668     later version.
 669 
 670 """
 671 
# Start execution only when this file is run as a script...
if __name__ == "__main__":
    main()