MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitFilterChEMBLAlerts.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem import AllChem
  43 except ImportError as ErrMsg:
  44     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  45     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  46     sys.exit(1)
  47 
  48 # MayaChemTools imports...
  49 try:
  50     from docopt import docopt
  51     import MiscUtil
  52     import RDKitUtil
  53 except ImportError as ErrMsg:
  54     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  55     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  56     sys.exit(1)
  57 
# Base name of the script as invoked; used in the start and finish messages.
ScriptName = os.path.basename(sys.argv[0])
# Raw command line options; rebound to the docopt result in RetrieveOptions()
# and to the decoded copy in InitializeWorkerProcess().
Options = {}
# Processed option values; populated by ProcessOptions() and its helpers.
OptionsInfo = {}
  61 
  62 def main():
  63     """Start execution of the script"""
  64     
  65     MiscUtil.PrintInfo("\n%s (RDK v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, time.asctime()))
  66     
  67     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  68     
  69     # Retrieve command line arguments and options...
  70     RetrieveOptions()
  71     
  72     # Process and validate command line arguments and options...
  73     ProcessOptions()
  74     
  75     # Perform actions required by the script...
  76     PerformFiltering()
  77     
  78     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  79     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  80 
  81 def PerformFiltering():
  82     """Filter molecules using SMARTS specified in ChEMBL filters file."""
  83     
  84     # Setup ChEMBL patterns and pattern mols...
  85     MiscUtil.PrintInfo("\nSetting up ChEMBL pattern molecules for performing substructure search...")
  86     ChEMBLPatterns = RetrieveChEMBLPatterns()
  87     ChEMBLPatternMols = SetupChEMBLPatternMols(ChEMBLPatterns)
  88     
  89     # Setup a molecule reader...
  90     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  91     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  92     
  93     # Set up molecule writers...
  94     Writer, WriterFiltered = SetupMoleculeWriters()
  95     
  96     MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, ChEMBLPatternMols, Writer, WriterFiltered)
  97 
  98     if Writer is not None:
  99         Writer.close()
 100     if WriterFiltered is not None:
 101         WriterFiltered.close()
 102     
 103     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 104     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 105     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 106 
 107     MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
 108     MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))
 109 
 110 def ProcessMolecules(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
 111     """Process and filter molecules. """
 112     
 113     if OptionsInfo["MPMode"]:
 114         return ProcessMoleculesUsingMultipleProcesses(Mols, ChEMBLPatternMols, Writer, WriterFiltered)
 115     else:
 116         return ProcessMoleculesUsingSingleProcess(Mols, ChEMBLPatternMols, Writer, WriterFiltered)
 117 
 118 def ProcessMoleculesUsingSingleProcess(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
 119     """Process and filter molecules using a single process."""
 120     
 121     NegateMatch = OptionsInfo["NegateMatch"]
 122     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 123     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 124     
 125     MiscUtil.PrintInfo("\nFiltering molecules...")
 126     
 127     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 128     for Mol in Mols:
 129         MolCount += 1
 130         
 131         if Mol is None:
 132             continue
 133         
 134         if RDKitUtil.IsMolEmpty(Mol):
 135             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 136             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 137             continue
 138         
 139         ValidMolCount += 1
 140         
 141         MolMatched = DoesMoleculeContainsChEMBLPattern(Mol, ChEMBLPatternMols)
 142         if MolMatched == NegateMatch:
 143             RemainingMolCount += 1
 144             WriteMolecule(Writer, Mol, Compute2DCoords)
 145         else:
 146             if OutfileFilteredMode:
 147                 WriteMolecule(WriterFiltered, Mol, Compute2DCoords)
 148     
 149     return (MolCount, ValidMolCount, RemainingMolCount)
 150 
 151 def ProcessMoleculesUsingMultipleProcesses(Mols, ChEMBLPatternMols, Writer, WriterFiltered):
 152     """Process and filter molecules using multiprocessing."""
 153     
 154     MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")
 155     
 156     MPParams = OptionsInfo["MPParams"]
 157     NegateMatch = OptionsInfo["NegateMatch"]
 158     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 159     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 160     
 161     # Setup data for initializing a worker process...
 162     MiscUtil.PrintInfo("Encoding options info and ChEMBL alert pattern molecules...")
 163     OptionsInfo["EncodedChEMBLPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in ChEMBLPatternMols]
 164     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 165 
 166     # Setup a encoded mols data iterable for a worker process...
 167     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 168 
 169     # Setup process pool along with data initialization for each process...
 170     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 171     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 172     
 173     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 174     
 175     # Start processing...
 176     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 177         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 178     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 179         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 180     else:
 181         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 182     
 183     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 184     for Result in Results:
 185         MolCount += 1
 186         MolIndex, EncodedMol, MolMatched = Result
 187         
 188         if EncodedMol is None:
 189             continue
 190         ValidMolCount += 1
 191         
 192         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 193         if MolMatched == NegateMatch:
 194             RemainingMolCount += 1
 195             WriteMolecule(Writer, Mol, Compute2DCoords)
 196         else:
 197             if OutfileFilteredMode:
 198                 WriteMolecule(WriterFiltered, Mol, Compute2DCoords)
 199     
 200     return (MolCount, ValidMolCount, RemainingMolCount)
 201 
 202 def InitializeWorkerProcess(*EncodedArgs):
 203     """Initialize data for a worker process."""
 204     
 205     global Options, OptionsInfo
 206 
 207     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 208     
 209     # Decode Options and OptionInfo...
 210     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 211     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 212 
 213     # Decode ChEMBLPatternMols...
 214     OptionsInfo["ChEMBLPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedChEMBLPatternMols"]]
 215     
 216 def WorkerProcess(EncodedMolInfo):
 217     """Process data for a worker process."""
 218 
 219     MolIndex, EncodedMol = EncodedMolInfo
 220     
 221     if EncodedMol is None:
 222         return [MolIndex, None, False]
 223     
 224     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 225     if RDKitUtil.IsMolEmpty(Mol):
 226         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 227         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 228         return [MolIndex, None, False]
 229         
 230     MolMatched = DoesMoleculeContainsChEMBLPattern(Mol, OptionsInfo["ChEMBLPatternMols"])
 231 
 232     return [MolIndex, EncodedMol, MolMatched]
 233     
 234 def WriteMolecule(Writer, Mol, Compute2DCoords):
 235     """Write out molecule."""
 236     
 237     if OptionsInfo["CountMode"]:
 238         return
 239     
 240     if Compute2DCoords:
 241         AllChem.Compute2DCoords(Mol)
 242     
 243     Writer.write(Mol)
 244     
 245 def SetupMoleculeWriters():
 246     """Setup molecule writers."""
 247     
 248     Writer = None
 249     WriterFiltered = None
 250 
 251     if OptionsInfo["CountMode"]:
 252         return (Writer, WriterFiltered)
 253 
 254     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 255     if Writer is None:
 256         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 257     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 258     
 259     if OptionsInfo["OutfileFilteredMode"]:
 260         WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
 261         if WriterFiltered is None:
 262             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"])
 263         MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])
 264     
 265     return (Writer, WriterFiltered)
 266 
 267 def DoesMoleculeContainsChEMBLPattern(Mol, ChEMBLPatternMols):
 268     """Check presence of ChEMBL alerts pattern in the molecule"""
 269 
 270     MolMatched = False
 271     
 272     for PatternMol in ChEMBLPatternMols:
 273         if Mol.HasSubstructMatch(PatternMol, useChirality = True):
 274             MolMatched = True
 275             break
 276         
 277     return MolMatched
 278     
 279 def RetrieveChEMBLPatterns():
 280     """Retrieve ChEMBL patterns for specified ChEMBL altert mode."""
 281 
 282     SMARTSPatterns = []
 283     for FilterType in OptionsInfo["SpecifiedFilterTypes"]:
 284         SMARTSPatterns.extend(OptionsInfo["ChEMBLFiltersMap"]["SMARTS"][FilterType])
 285 
 286     return SMARTSPatterns
 287 
 288 def SetupChEMBLPatternMols(ChEMBLPatterns):
 289     """Set up ChEMBL pattern mols for substructure search"""
 290 
 291     PatternMols = []
 292     for Pattern in ChEMBLPatterns:
 293         PatternMol = Chem.MolFromSmarts(Pattern)
 294         PatternMols.append(PatternMol)
 295         
 296     return PatternMols    
 297 
 298 def ProcessChEMBLAlertsMode():
 299     """Process specified alerts mode. """
 300     
 301     # Retrieve filetrs information...
 302     RetrieveChEMBLFiltersInfo()
 303     
 304     # Process alerts mode...
 305     OptionsInfo["SpecifiedFilterTypes"] = OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"]
 306     if re.match("^All$", OptionsInfo["AlertsMode"], re.I):
 307         return
 308     
 309     AlertsMode = re.sub(" ", "", OptionsInfo["AlertsMode"])
 310     if not len(AlertsMode):
 311         MiscUtil.PrintError("The alerts mode specified using \"-a, --alertsMode\" option are empty.")
 312 
 313     CanonicalFilterTypesMap = {}
 314     for FilterType in OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"]:
 315         CanonicalFilterTypesMap[FilterType.lower()] = FilterType
 316 
 317     SpecifiedFilterTypes = []
 318     for FilterType in AlertsMode.split(","):
 319         CanonicalFilterType = FilterType.lower()
 320         if not CanonicalFilterType in CanonicalFilterTypesMap:
 321             MiscUtil.PrintError("The altert mode, %s, specified using \"-a, --alertsMode\" is not valid. Supported alert modes: %s" % (FilterType, ", ".join(OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"])))
 322 
 323         SpecifiedFilterTypes.append(CanonicalFilterTypesMap[CanonicalFilterType])
 324 
 325     OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes
 326     
 327 def RetrieveChEMBLFiltersInfo():
 328     """Retrieve information for ChEMBL filters."""
 329     
 330     MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
 331     ChEMBLFiltersFilePath = os.path.join(MayaChemToolsDataDir, "ChEMBLFilters.csv")
 332     
 333     MiscUtil.PrintInfo("\nRetrieving ChEMBL alerts SMARTS patterns from file %s" % (ChEMBLFiltersFilePath))
 334 
 335     Delimiter = ','
 336     QuoteChar = '"'
 337     IgnoreHeaderLine = True
 338     FilterLinesWords = MiscUtil.GetTextLinesWords(ChEMBLFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine)
 339 
 340     ChEMBLFiltersMap = {}
 341     ChEMBLFiltersMap["FilterTypes"] = []
 342     ChEMBLFiltersMap["ID"] = {}
 343     ChEMBLFiltersMap["SMARTS"] = {}
 344 
 345     for LineWords in FilterLinesWords:
 346         FilterType = LineWords[0]
 347         ID = LineWords[1]
 348         SMARTS = LineWords[2]
 349 
 350         if not FilterType in ChEMBLFiltersMap["FilterTypes"]:
 351             ChEMBLFiltersMap["FilterTypes"].append(FilterType)
 352             ChEMBLFiltersMap["ID"][FilterType] = []
 353             ChEMBLFiltersMap["SMARTS"][FilterType] = []
 354 
 355         ChEMBLFiltersMap["ID"][FilterType].append(ID)
 356         ChEMBLFiltersMap["SMARTS"][FilterType].append(SMARTS)
 357 
 358     OptionsInfo["ChEMBLFiltersMap"] = ChEMBLFiltersMap
 359     
 360     MiscUtil.PrintInfo("\nTotal number alerts: %d" % len(FilterLinesWords))
 361     MiscUtil.PrintInfo("Number of filter family types: %d\nFilter familty types: %s\n" % (len(ChEMBLFiltersMap["FilterTypes"]), ", ".join(ChEMBLFiltersMap["FilterTypes"])))
 362 
 363     for FilterType in ChEMBLFiltersMap["FilterTypes"]:
 364         MiscUtil.PrintInfo("Filter family type: %s; Number of alerts: %d" % (FilterType, len(ChEMBLFiltersMap["ID"][FilterType])))
 365     MiscUtil.PrintInfo("")
 366     
 367 def ProcessOptions():
 368     """Process and validate command line arguments and options"""
 369     
 370     MiscUtil.PrintInfo("Processing options...")
 371     
 372     # Validate options...
 373     ValidateOptions()
 374     
 375     OptionsInfo["Infile"] = Options["--infile"]
 376     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 377     
 378     OptionsInfo["Outfile"] = Options["--outfile"]
 379     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 380     
 381     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 382     OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt)
 383     OptionsInfo["OutfileFiltered"] = OutfileFiltered
 384     OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False
 385     
 386     OptionsInfo["Overwrite"] = Options["--overwrite"]
 387 
 388     OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False
 389     OptionsInfo["NegateMatch"] = True if re.match("^yes$", Options["--negate"], re.I) else False
 390 
 391     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 392     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 393     
 394     OptionsInfo["AlertsMode"] = Options["--alertsMode"]
 395     ProcessChEMBLAlertsMode()
 396     
 397 def RetrieveOptions():
 398     """Retrieve command line arguments and options"""
 399     
 400     # Get options...
 401     global Options
 402     Options = docopt(_docoptUsage_)
 403     
 404     # Set current working directory to the specified directory...
 405     WorkingDir = Options["--workingdir"]
 406     if WorkingDir:
 407         os.chdir(WorkingDir)
 408     
 409     # Handle examples option...
 410     if "--examples" in Options and Options["--examples"]:
 411         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 412         sys.exit(0)
 413 
 414 def ValidateOptions():
 415     """Validate option values"""
 416     
 417     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 418     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 419     
 420     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 421     if re.match("^filter$", Options["--mode"], re.I):
 422         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 423         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 424 
 425     MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")
 426     
 427     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
 428     if re.match("^filter$", Options["--mode"], re.I):
 429         if not Options["--outfile"]:
 430             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, --mode\" option")
 431         
 432     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 433     MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no")
 434     
 435 # Setup a usage string for docopt...
 436 _docoptUsage_ = """
 437 RDKitFilterChEMBLAlterts.py - Filter ChEMBL alerts
 438 
 439 Usage:
 440     RDKitFilterChEMBLAlerts.py  [--alertsMode <All or Type,Type,...>]
 441                                 [--infileParams <Name,Value,...>] [--mode <filter or count>]
 442                                 [--mp <yes or no>] [--mpParams <Name.Value,...>]
 443                                 [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...>]
 444                                 [--negate <yes or no>] [--overwrite] [-w <dir>] -i <infile> -o <outfile>
 445     RDKitFilterChEMBLAlerts.py -h | --help | -e | --examples
 446 
 447 Description:
 448     Filter molecules from an input file for ChEMBL structural alerts by performing
 449     a substructure search using SMARTS patterns specified in MAYACHEMTOOLS/
 450     lib/data/ChEMBLFilters.csv file and write out appropriate molecules to an
 451     output file or simply count the number of filtered molecules.
 452 
 453     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
 454     .tsv, .txt)
 455 
 456     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 457 
 458 Options:
 459     -a, --alertsMode <All or Type, Type,...>  [default: All]
 460         All or a comma delimited list of ChEMBL filter types to use for filtering
 461         molecules. 
 462         
 463         The supported filter family types, along with a description, are show below:
 464         
 465             BMS: Bristol-Myers Squibb HTS Deck Filters
 466             Dundee: University of Dundee NTD Screening Library Filters
 467             Glaxo: Bristol-Myers Squibb HTS Deck Filters
 468             Inpharmatica
 469             MLSMR: NIH MLSMR Excluded Functionality Filters
 470             PfizerLINT: Pfizer LINT filters
 471             SureChEMBL
 472         
 473     -e, --examples
 474         Print examples.
 475     -h, --help
 476         Print this help message.
 477     -i, --infile <infile>
 478         Input file name.
 479     --infileParams <Name,Value,...>  [default: auto]
 480         A comma delimited list of parameter name and value pairs for reading
 481         molecules from files. The supported parameter names for different file
 482         formats, along with their default values, are shown below:
 483             
 484             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 485             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 486                 smilesTitleLine,auto,sanitize,yes
 487             
 488         Possible values for smilesDelimiter: space, comma or tab.
 489     -m, --mode <filter or count>  [default: filter]
 490         Specify whether to filter the matched molecules and write out the rest of the 
 491         molecules to an outfile or simply count the number of matched molecules
 492         marked for filtering.
 493     --mp <yes or no>  [default: no]
 494         Use multiprocessing.
 495          
 496         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 497         function employing lazy RDKit data iterable. This allows processing of
 498         arbitrary large data sets without any additional requirements memory.
 499         
 500         All input data may be optionally loaded into memory by mp.Pool.map()
 501         before starting worker processes in a process pool by setting the value
 502         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 503         
 504         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 505         data mode may adversely impact the performance. The '--mpParams' section
 506         provides additional information to tune the value of 'chunkSize'.
 507     --mpParams <Name,Value,...>  [default: auto]
 508         A comma delimited list of parameter name and value pairs for to
 509         configure multiprocessing.
 510         
 511         The supported parameter names along with their default and possible
 512         values are shown below:
 513         
 514             chunkSize, auto
 515             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 516             numProcesses, auto   [ Default: mp.cpu_count() ]
 517         
 518         These parameters are used by the following functions to configure and
 519         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 520         mp.Pool.imap().
 521         
 522         The chunkSize determines chunks of input data passed to each worker
 523         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 524         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 525         
 526         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 527         automatically converts RDKit data iterable into a list, loads all data into
 528         memory, and calculates the default chunkSize using the following method
 529         as shown in its code:
 530         
 531             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 532             if extra: chunkSize += 1
 533         
 534         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 535         and 100 data items.
 536         
 537         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 538         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 539         data into memory. Consequently, the size of input data is not known a priori.
 540         It's not possible to estimate an optimal value for the chunkSize. The default 
 541         chunkSize is set to 1.
 542         
 543         The default value for the chunkSize during 'Lazy' data mode may adversely
 544         impact the performance due to the overhead associated with exchanging
 545         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 546         a larger value during 'Lazy' input data mode, based on the size of your input
 547         data and number of processes in the process pool.
 548         
 549         The mp.Pool.map() function waits for all worker processes to process all
 550         the data and return the results. The mp.Pool.imap() function, however,
 551         returns the the results obtained from worker processes as soon as the
 552         results become available for specified chunks of data.
 553         
 554         The order of data in the results returned by both mp.Pool.map() and 
 555         mp.Pool.imap() functions always corresponds to the input data.
 556     -n, --negate <yes or no>  [default: no]
 557         Specify whether to filter molecules not matching the ChEMBL filters specified by
 558         SMARTS patterns.
 559     -o, --outfile <outfile>
 560         Output file name.
 561     --outfileFiltered <yes or no>  [default: no]
 562         Write out a file containing filtered molecules. Its name is automatically
 563         generated from the specified output file. Default: <OutfileRoot>_
 564         Filtered.<OutfileExt>.
 565     --outfileParams <Name,Value,...>  [default: auto]
 566         A comma delimited list of parameter name and value pairs for writing
 567         molecules to files. The supported parameter names for different file
 568         formats, along with their default values, are shown below:
 569             
 570             SD: compute2DCoords,auto,kekulize,no
 571             SMILES: kekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 572                 smilesTitleLine,yes
 573             
 574         Default value for compute2DCoords: yes for SMILES input file; no for all other
 575         file types.
 576     --overwrite
 577         Overwrite existing files.
 578     -w, --workingdir <dir>
 579         Location of working directory which defaults to the current directory.
 580 
 581 Examples:
 582     To count the number of molecules not containing any substructure corresponding
 583     to any ChEMBL SMARTS patterns and write out SMILES files containing these molecules,
 584     type: 
 585 
 586         % RDKitFilterChEMBLAlerts.py -i Sample.smi -o SampleOut.smi
 587 
 588     To count the number of molecules not containing any substructure corresponding to
 589     ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on all
 590     available CPUs without loading all data into memory, and write out a SMILES file, type: 
 591 
 592         % RDKitFilterChEMBLAlerts.py --mp yes -i Sample.smi -o SampleOut.smi
 593 
 594     To count the number of molecules not containing any substructure corresponding to
 595     ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on all
 596     available CPUs by loading all data into memory, and write out a SMILES file, type: 
 597 
 598         % RDKitFilterChEMBLAlerts.py --mp yes --mpParams "inputDataMode,
 599           InMemory" -i Sample.smi -o SampleOut.smi
 600 
 601     To count the number of molecules not containing any substructure corresponding to
 602     ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on specific
 603     number of CPUs and chunk size without loading all data into memory, and
 604     write out a SMILES file, type: 
 605 
 606         % RDKitFilterChEMBLAlerts.py --mp yes --mpParams "inputDataMode,Lazy,
 607           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi
 608 
 609     To count the number of molecules not containing any substructure corresponding
 610     to any ChEMBL SMARTS patterns and write out SMILES files containing these and filtered
 611     molecules, type: 
 612 
 613         % RDKitFilterChEMBLAlerts.py --outfileFiltered yes -i Sample.smi
 614           -o SampleOut.smi
 615 
 616     To only count the number of molecules not containing any substructure corresponding
 617     to BMS ChEMBL SMARTS patterns without writing out any files, type: 
 618 
 619         % RDKitFilterChEMBLAlerts.py -m count -a BMS -i Sample.sdf
 620           -o SampleOut.smi
 621 
 622     To count the number of molecules not containing any substructure corresponding
 623     to Pfizer LINT ChEMBL SMARTS patterns in a  CSV SMILES file and write out a SD file,
 624     type:  
 625 
 626         % RDKitFilterChEMBLAlerts.py --altertsMode PfizerLINT --infileParams
 627           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 628           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 629           -i SampleSMILES.csv -o SampleOut.sdf
 630 
 631 Author:
 632     Manish Sud(msud@san.rr.com)
 633 
 634 See also:
 635     RDKitFilterPAINS.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py
 636 
 637 Copyright:
 638     Copyright (C) 2019 Manish Sud. All rights reserved.
 639 
 640     The functionality available in this script is implemented using RDKit, an
 641     open source toolkit for cheminformatics developed by Greg Landrum.
 642 
 643     This file is part of MayaChemTools.
 644 
 645     MayaChemTools is free software; you can redistribute it and/or modify it under
 646     the terms of the GNU Lesser General Public License as published by the Free
 647     Software Foundation; either version 3 of the License, or (at your option) any
 648     later version.
 649 
 650 """
 651 
# Run the script only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()