MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitFilterPAINS.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem import AllChem
  43 except ImportError as ErrMsg:
  44     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  45     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  46     sys.exit(1)
  47 
  48 # MayaChemTools imports...
  49 try:
  50     from docopt import docopt
  51     import MiscUtil
  52     import RDKitUtil
  53 except ImportError as ErrMsg:
  54     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  55     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  56     sys.exit(1)
  57 
# Base name of the running script; used in the start and done messages...
ScriptName = os.path.basename(sys.argv[0])
# Raw command line options as returned by docopt; assigned in RetrieveOptions()...
Options = {}
# Processed option values shared by all functions; filled in by ProcessOptions()...
OptionsInfo = {}
  61 
  62 def main():
  63     """Start execution of the script"""
  64     
  65     MiscUtil.PrintInfo("\n%s (RDK v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, time.asctime()))
  66     
  67     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  68     
  69     # Retrieve command line arguments and options...
  70     RetrieveOptions()
  71     
  72     # Process and validate command line arguments and options...
  73     ProcessOptions()
  74     
  75     # Perform actions required by the script...
  76     PerformFiltering()
  77     
  78     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  79     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  80 
  81 def PerformFiltering():
  82     """Filter molecules using SMARTS specified in PAINS filter file."""
  83 
  84     # Setup PAINS patterns and pattern mols...
  85     MiscUtil.PrintInfo("\nSetting up PAINS pattern molecules for performing substructure search...")
  86     PAINSPatterns = RetrievePAINSPatterns()
  87     PAINSPatternMols = SetupPAINSPatternMols(PAINSPatterns)
  88     
  89     # Setup a molecule reader...
  90     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  91     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  92     
  93     # Set up molecule writers...
  94     Writer, WriterFiltered = SetupMoleculeWriters()
  95     
  96     MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered)
  97     
  98     if Writer is not None:
  99         Writer.close()
 100     if WriterFiltered is not None:
 101         WriterFiltered.close()
 102     
 103     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 104     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 105     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 106 
 107     MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
 108     MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))
 109 
 110 def ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered):
 111     """Process and filter molecules. """
 112     
 113     if OptionsInfo["MPMode"]:
 114         return ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered)
 115     else:
 116         return ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered)
 117 
 118 def ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered):
 119     """Process and filter molecules using a single process."""
 120     
 121     NegateMatch = OptionsInfo["NegateMatch"]
 122     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 123     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 124     
 125     MiscUtil.PrintInfo("\nFiltering molecules...")
 126     
 127     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 128     for Mol in Mols:
 129         MolCount += 1
 130 
 131         if Mol is None:
 132             continue
 133         
 134         if RDKitUtil.IsMolEmpty(Mol):
 135             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 136             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 137             continue
 138         
 139         ValidMolCount += 1
 140         
 141         MolMatched = DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols)
 142         if MolMatched == NegateMatch:
 143             RemainingMolCount += 1
 144             WriteMolecule(Writer, Mol, Compute2DCoords)
 145         else:
 146             if OutfileFilteredMode:
 147                 WriteMolecule(WriterFiltered, Mol, Compute2DCoords)
 148     
 149     return (MolCount, ValidMolCount, RemainingMolCount)
 150     
 151 def ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered):
 152     """Process and filter molecules using multiprocessing."""
 153     
 154     MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")
 155     
 156     MPParams = OptionsInfo["MPParams"]
 157     NegateMatch = OptionsInfo["NegateMatch"]
 158     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 159     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 160     
 161     # Setup data for initializing a worker process...
 162     MiscUtil.PrintInfo("Encoding options info and PAINS pattern molecules...")
 163     OptionsInfo["EncodedPAINSPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in PAINSPatternMols]
 164     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 165 
 166     # Setup a encoded mols data iterable for a worker process...
 167     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 168 
 169     # Setup process pool along with data initialization for each process...
 170     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 171     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 172     
 173     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 174     
 175     # Start processing...
 176     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 177         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 178     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 179         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 180     else:
 181         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 182     
 183     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 184     for Result in Results:
 185         MolCount += 1
 186         MolIndex, EncodedMol, MolMatched = Result
 187         
 188         if EncodedMol is None:
 189             continue
 190         ValidMolCount += 1
 191         
 192         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 193         
 194         if MolMatched == NegateMatch:
 195             RemainingMolCount += 1
 196             WriteMolecule(Writer, Mol, Compute2DCoords)
 197         else:
 198             if OutfileFilteredMode:
 199                 WriteMolecule(WriterFiltered, Mol, Compute2DCoords)
 200     
 201     return (MolCount, ValidMolCount, RemainingMolCount)
 202 
 203 def InitializeWorkerProcess(*EncodedArgs):
 204     """Initialize data for a worker process."""
 205 
 206     global Options, OptionsInfo
 207     
 208     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 209 
 210     # Decode Options and OptionInfo...
 211     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 212     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 213 
 214     # Decode PAINSPatternMols...
 215     OptionsInfo["PAINSPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedPAINSPatternMols"]]
 216     
 217 def WorkerProcess(EncodedMolInfo):
 218     """Process data for a worker process."""
 219     
 220     MolIndex, EncodedMol = EncodedMolInfo
 221     
 222     if EncodedMol is None:
 223         return [MolIndex, None, False]
 224         
 225     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 226     if RDKitUtil.IsMolEmpty(Mol):
 227         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 228         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 229         return [MolIndex, None, False]
 230         
 231     MolMatched = DoesMoleculeContainsPAINSPattern(Mol, OptionsInfo["PAINSPatternMols"])
 232 
 233     return [MolIndex, EncodedMol, MolMatched]
 234     
 235 def WriteMolecule(Writer, Mol, Compute2DCoords):
 236     """Write out molecule."""
 237     
 238     if OptionsInfo["CountMode"]:
 239         return
 240     
 241     if Compute2DCoords:
 242         AllChem.Compute2DCoords(Mol)
 243     
 244     Writer.write(Mol)
 245 
 246 def SetupMoleculeWriters():
 247     """Setup molecule writers."""
 248     
 249     Writer = None
 250     WriterFiltered = None
 251 
 252     if OptionsInfo["CountMode"]:
 253         return (Writer, WriterFiltered)
 254 
 255     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 256     if Writer is None:
 257         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 258     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 259     
 260     if OptionsInfo["OutfileFilteredMode"]:
 261         WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
 262         if WriterFiltered is None:
 263             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"])
 264         MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])
 265     
 266     return (Writer, WriterFiltered)
 267 
 268 def DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols):
 269     """Check presence of PAINS pattern in the molecule"""
 270 
 271     MolMatched = False
 272     
 273     for PatternMol in PAINSPatternMols:
 274         if Mol.HasSubstructMatch(PatternMol, useChirality = True):
 275             MolMatched = True
 276             break
 277         
 278     return MolMatched
 279     
 280 def RetrievePAINSPatterns():
 281     """Retrieve PAINS patterns for specified PAINS mode"""
 282 
 283     SMARTSPatterns = []
 284     for FilterType in OptionsInfo["SpecifiedFilterTypes"]:
 285         SMARTSPatterns.extend(OptionsInfo["PAINSFiltersMap"]["SMARTS"][FilterType])
 286 
 287     return SMARTSPatterns
 288 
 289 def SetupPAINSPatternMols(PAINSPatterns):
 290     """Set up PAINS pattern mols for substructure search"""
 291 
 292     PatternMols = []
 293     for Pattern in PAINSPatterns:
 294         PatternMol = Chem.MolFromSmarts(Pattern)
 295         if PatternMol is None:
 296             MiscUtil.PrintWarning("Failed to convert PAINS pattern, %s, into a molecule..." % Pattern)
 297             continue
 298         PatternMols.append(PatternMol)
 299         
 300     return PatternMols    
 301 
 302 def ProcessPAINSMode():
 303     """Process specified PAINS mode. """
 304     
 305     # Retrieve filetrs information...
 306     RetrievePAINSFiltersInfo()
 307     
 308     # Process PAINS mode...
 309     OptionsInfo["SpecifiedFilterTypes"] = OptionsInfo["PAINSFiltersMap"]["FilterTypes"]
 310     if re.match("^All$", OptionsInfo["PAINSMode"], re.I):
 311         return
 312     
 313     PAINSMode = re.sub(" ", "", OptionsInfo["PAINSMode"])
 314     if not len(PAINSMode):
 315         MiscUtil.PrintError("The PAINSMode mode specified using \"-p, --painsMode\" option are empty.")
 316 
 317     CanonicalFilterTypesMap = {}
 318     for FilterType in OptionsInfo["PAINSFiltersMap"]["FilterTypes"]:
 319         CanonicalFilterTypesMap[FilterType.lower()] = FilterType
 320 
 321     SpecifiedFilterTypes = []
 322     for FilterType in PAINSMode.split(","):
 323         CanonicalFilterType = FilterType.lower()
 324         if not CanonicalFilterType in CanonicalFilterTypesMap:
 325             MiscUtil.PrintError("The PAINS mode, %s, specified using \"-p, --PAINSMode\" is not valid. Supported PAINS modes: %s" % (FilterType, ", ".join(OptionsInfo["PAINSFiltersMap"]["FilterTypes"])))
 326 
 327         SpecifiedFilterTypes.append(CanonicalFilterTypesMap[CanonicalFilterType])
 328 
 329     OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes
 330 
 331 def RetrievePAINSFiltersInfo():
 332     """Retrieve information for PAINS filters."""
 333     
 334     MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
 335     PAINSFiltersFilePath = os.path.join(MayaChemToolsDataDir, "PAINSFilters.csv")
 336     
 337     MiscUtil.PrintInfo("\nRetrieving PAINS SMARTS patterns from file %s" % (PAINSFiltersFilePath))
 338 
 339     Delimiter = ','
 340     QuoteChar = '"'
 341     IgnoreHeaderLine = True
 342     FilterLinesWords = MiscUtil.GetTextLinesWords(PAINSFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine)
 343 
 344     PAINSFiltersMap = {}
 345     PAINSFiltersMap["FilterTypes"] = []
 346     PAINSFiltersMap["ID"] = {}
 347     PAINSFiltersMap["SMARTS"] = {}
 348 
 349     for LineWords in FilterLinesWords:
 350         FilterType = LineWords[0]
 351         ID = LineWords[1]
 352         SMARTS = LineWords[2]
 353 
 354         if not FilterType in PAINSFiltersMap["FilterTypes"]:
 355             PAINSFiltersMap["FilterTypes"].append(FilterType)
 356             PAINSFiltersMap["ID"][FilterType] = []
 357             PAINSFiltersMap["SMARTS"][FilterType] = []
 358 
 359         PAINSFiltersMap["ID"][FilterType].append(ID)
 360         PAINSFiltersMap["SMARTS"][FilterType].append(SMARTS)
 361 
 362     OptionsInfo["PAINSFiltersMap"] = PAINSFiltersMap
 363     
 364     MiscUtil.PrintInfo("\nTotal number filters: %d" % len(FilterLinesWords))
 365     MiscUtil.PrintInfo("Number of filter family types: %d\nFilter familty types: %s\n" % (len(PAINSFiltersMap["FilterTypes"]), ", ".join(PAINSFiltersMap["FilterTypes"])))
 366 
 367     for FilterType in PAINSFiltersMap["FilterTypes"]:
 368         MiscUtil.PrintInfo("Filter family type: %s; Number of filters: %d" % (FilterType, len(PAINSFiltersMap["ID"][FilterType])))
 369 
 370 def ProcessOptions():
 371     """Process and validate command line arguments and options"""
 372     
 373     MiscUtil.PrintInfo("Processing options...")
 374     
 375     # Validate options...
 376     ValidateOptions()
 377     
 378     OptionsInfo["Infile"] = Options["--infile"]
 379     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 380     
 381     OptionsInfo["Outfile"] = Options["--outfile"]
 382     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 383     
 384     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 385     OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt)
 386     OptionsInfo["OutfileFiltered"] = OutfileFiltered
 387     OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False
 388     
 389     OptionsInfo["Overwrite"] = Options["--overwrite"]
 390 
 391     OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False
 392     OptionsInfo["NegateMatch"] = True if re.match("^yes$", Options["--negate"], re.I) else False
 393 
 394     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 395     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 396 
 397     OptionsInfo["PAINSMode"] = Options["--painsMode"]
 398     ProcessPAINSMode()
 399     
 400 def RetrieveOptions():
 401     """Retrieve command line arguments and options"""
 402     
 403     # Get options...
 404     global Options
 405     Options = docopt(_docoptUsage_)
 406     
 407     # Set current working directory to the specified directory...
 408     WorkingDir = Options["--workingdir"]
 409     if WorkingDir:
 410         os.chdir(WorkingDir)
 411     
 412     # Handle examples option...
 413     if "--examples" in Options and Options["--examples"]:
 414         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 415         sys.exit(0)
 416 
 417 def ValidateOptions():
 418     """Validate option values"""
 419     
 420     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 421     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 422     
 423     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 424     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 425     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 426 
 427     MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")
 428     
 429     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
 430     if re.match("^filter$", Options["--mode"], re.I):
 431         if not Options["--outfile"]:
 432             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, --mode\" option")
 433         
 434     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 435     MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no")
 436     
 437 # Setup a usage string for docopt...
 438 _docoptUsage_ = """
 439 RDKitFilterPAINS.py - Filter PAINS molecules
 440 
 441 Usage:
 442     RDKitFilterPAINS.py  [--infileParams <Name,Value,...>] [--mode <filter or count>]
 443                          [--mp <yes or no>] [--mpParams <Name.Value,...>]
 444                          [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...> ]
 445                          [--painsMode <All or A, B, C>] [--negate <yes or no>]
 446                          [--overwrite] [-w <dir>] -i <infile> -o <outfile>
 447     RDKitFilterPAINS.py -h | --help | -e | --examples
 448 
 449 Description:
 450     Filter Pan-assay Interference molecules (PAINS) [ Ref 130 - 131 ] from an input
 451     file by performing a substructure search using SMARTS pattern specified in
 452     MAYACHEMTOOLS/lib/data/PAINSFilters.csv file and write out appropriate
 453     molecules to an output file or simply count the number of filtered molecules.
 454 
 455     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
 456     .tsv, .txt)
 457 
 458     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 459 
 460 Options:
 461     -e, --examples
 462         Print examples.
 463     -h, --help
 464         Print this help message.
 465     -i, --infile <infile>
 466         Input file name.
 467     --infileParams <Name,Value,...>  [default: auto]
 468         A comma delimited list of parameter name and value pairs for reading
 469         molecules from files. The supported parameter names for different file
 470         formats, along with their default values, are shown below:
 471             
 472             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 473             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 474                 smilesTitleLine,auto,sanitize,yes
 475             
 476         Possible values for smilesDelimiter: space, comma or tab.
 477     -m, --mode <filter or count>  [default: filter]
 478         Specify whether to filter the matched molecules and write out the rest of the 
 479         molecules to an outfile or simply count the number of matched molecules
 480         marked for filtering.
 481     --mp <yes or no>  [default: no]
 482         Use multiprocessing.
 483          
 484         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 485         function employing lazy RDKit data iterable. This allows processing of
 486         arbitrary large data sets without any additional requirements memory.
 487         
 488         All input data may be optionally loaded into memory by mp.Pool.map()
 489         before starting worker processes in a process pool by setting the value
 490         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 491         
 492         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 493         data mode may adversely impact the performance. The '--mpParams' section
 494         provides additional information to tune the value of 'chunkSize'.
 495     --mpParams <Name,Value,...>  [default: auto]
 496         A comma delimited list of parameter name and value pairs for to
 497         configure multiprocessing.
 498         
 499         The supported parameter names along with their default and possible
 500         values are shown below:
 501         
 502             chunkSize, auto
 503             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 504             numProcesses, auto   [ Default: mp.cpu_count() ]
 505         
 506         These parameters are used by the following functions to configure and
 507         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 508         mp.Pool.imap().
 509         
 510         The chunkSize determines chunks of input data passed to each worker
 511         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 512         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 513         
 514         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 515         automatically converts RDKit data iterable into a list, loads all data into
 516         memory, and calculates the default chunkSize using the following method
 517         as shown in its code:
 518         
 519             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 520             if extra: chunkSize += 1
 521         
 522         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 523         and 100 data items.
 524         
 525         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 526         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 527         data into memory. Consequently, the size of input data is not known a priori.
 528         It's not possible to estimate an optimal value for the chunkSize. The default 
 529         chunkSize is set to 1.
 530         
 531         The default value for the chunkSize during 'Lazy' data mode may adversely
 532         impact the performance due to the overhead associated with exchanging
 533         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 534         a larger value during 'Lazy' input data mode, based on the size of your input
 535         data and number of processes in the process pool.
 536         
 537         The mp.Pool.map() function waits for all worker processes to process all
 538         the data and return the results. The mp.Pool.imap() function, however,
 539         returns the the results obtained from worker processes as soon as the
 540         results become available for specified chunks of data.
 541         
 542         The order of data in the results returned by both mp.Pool.map() and 
 543         mp.Pool.imap() functions always corresponds to the input data.
 544     -n, --negate <yes or no>  [default: no]
 545         Specify whether to filter molecules not matching the PAINS filters specified by
 546         SMARTS patterns.
 547     -o, --outfile <outfile>
 548         Output file name.
 549     --outfileFiltered <yes or no>  [default: no]
 550         Write out a file containing filtered molecules. Its name is automatically
 551         generated from the specified output file. Default: <OutfileRoot>_
 552         Filtered.<OutfileExt>.
 553     --outfileParams <Name,Value,...>  [default: auto]
 554         A comma delimited list of parameter name and value pairs for writing
 555         molecules to files. The supported parameter names for different file
 556         formats, along with their default values, are shown below:
 557             
 558             SD: compute2DCoords,auto,kekulize,no
 559             SMILES: kekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 560                 smilesTitleLine,yes
 561             
 562         Default value for compute2DCoords: yes for SMILES input file; no for all other
 563         file types.
 564     --overwrite
 565         Overwrite existing files.
 566     -p, --painsMode <All or A, B, or C>  [default: All]
 567         All or a comma delimited list of PAINS filter family type to used for
 568         filtering molecules. 
 569     -w, --workingdir <dir>
 570         Location of working directory which defaults to the current directory.
 571 
 572 Examples:
 573     To count the number of molecules not containing any substructure corresponding to
 574     PAINS SMARTS patterns and write out a SMILES file, type: 
 575 
 576         % RDKitFilterPAINS.py -i Sample.smi -o SampleOut.smi
 577 
 578     To count the number of molecules not containing any substructure corresponding to
 579     PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
 580     CPUs without loading all data into memory, and write out a SMILES file, type: 
 581 
 582         % RDKitFilterPAINS.py --mp yes -i Sample.smi -o SampleOut.smi
 583 
 584     To count the number of molecules not containing any substructure corresponding to
 585     PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
 586     CPUs by loading all data into memory, and write out a SMILES file, type: 
 587 
 588         % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,InMemory"
 589           -i Sample.smi -o SampleOut.smi
 590 
 591     To count the number of molecules not containing any substructure corresponding to
 592     PAINS SMARTS patterns, perform filtering in multiprocessing mode on specific
 593     number of CPUs and chunk size without loading all data into memory, and
 594     write out a SMILES file, type: 
 595 
 596         % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,Lazy,
 597           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi
 598 
 599     To count the number of molecules not containing any substructure corresponding to
 600     PAINS SMARTS patterns and write out a SMILES file containing these and filtered
 601     molecules, type: 
 602 
 603         % RDKitFilterPAINS.py --outfileFiltered yes -i Sample.smi
 604           -o SampleOut.smi
 605 
 606     To only count the number of molecules not containing any substructure corresponding
 607     to PAINS SMARTS patterns without writing out any file, type: 
 608 
 609         % RDKitFilterPAINS.py -m count -i Sample.sdf -o SampleOut.smi
 610 
 611     To count the number of molecules containing any substructure corresponding to
 612     PAINS SMARTS patterns and write out a SD file with computed 2D coordinates,
 613     type: 
 614 
 615         % RDKitFilterPAINS.py -n yes -i Sample.smi -o SampleOut.sdf
 616 
 617     To count the number of molecules not containing any substructure corresponding to
 618     PAINS SMARTS patterns family of Type A in a CSV SMILES file and write out a SD file, type: 
 619 
 620         % RDKitFilterPAINS.py --painsMode A --infileParams
 621           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 622           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 623           -i SampleSMILES.csv -o SampleOut.sdf
 624 
 625 Author:
 626     Manish Sud(msud@san.rr.com)
 627 
 628 See also:
 629     RDKitFilterChEMBLAlerts.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py
 630 
 631 Copyright:
 632     Copyright (C) 2019 Manish Sud. All rights reserved.
 633 
 634     The functionality available in this script is implemented using RDKit, an
 635     open source toolkit for cheminformatics developed by Greg Landrum.
 636 
 637     This file is part of MayaChemTools.
 638 
 639     MayaChemTools is free software; you can redistribute it and/or modify it under
 640     the terms of the GNU Lesser General Public License as published by the Free
 641     Software Foundation; either version 3 of the License, or (at your option) any
 642     later version.
 643 
 644 """
 645 
# Start the script only when this file is executed directly, not when it is imported...
if __name__ == "__main__":
    main()