MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitFilterPAINS.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2022 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem import AllChem
  43 except ImportError as ErrMsg:
  44     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  45     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  46     sys.exit(1)
  47 
  48 # MayaChemTools imports...
  49 try:
  50     from docopt import docopt
  51     import MiscUtil
  52     import RDKitUtil
  53 except ImportError as ErrMsg:
  54     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  55     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  56     sys.exit(1)
  57 
  58 ScriptName = os.path.basename(sys.argv[0])
  59 Options = {}
  60 OptionsInfo = {}
  61 
  62 def main():
  63     """Start execution of the script"""
  64     
  65     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  66     
  67     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  68     
  69     # Retrieve command line arguments and options...
  70     RetrieveOptions()
  71     
  72     # Process and validate command line arguments and options...
  73     ProcessOptions()
  74     
  75     # Perform actions required by the script...
  76     PerformFiltering()
  77     
  78     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  79     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  80 
  81 def PerformFiltering():
  82     """Filter molecules using SMARTS specified in PAINS filter file."""
  83 
  84     # Setup PAINS patterns and pattern mols...
  85     MiscUtil.PrintInfo("\nSetting up PAINS pattern molecules for performing substructure search...")
  86     PAINSPatterns = RetrievePAINSPatterns()
  87     PAINSPatternMols = SetupPAINSPatternMols(PAINSPatterns)
  88     
  89     # Setup a molecule reader...
  90     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  91     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  92     
  93     # Set up molecule writers...
  94     Writer, WriterFiltered = SetupMoleculeWriters()
  95     
  96     MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered)
  97     
  98     if Writer is not None:
  99         Writer.close()
 100     if WriterFiltered is not None:
 101         WriterFiltered.close()
 102     
 103     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 104     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 105     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 106 
 107     MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
 108     MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))
 109 
 110 def ProcessMolecules(Mols, PAINSPatternMols, Writer, WriterFiltered):
 111     """Process and filter molecules. """
 112     
 113     if OptionsInfo["MPMode"]:
 114         return ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered)
 115     else:
 116         return ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered)
 117 
 118 def ProcessMoleculesUsingSingleProcess(Mols, PAINSPatternMols, Writer, WriterFiltered):
 119     """Process and filter molecules using a single process."""
 120     
 121     NegateMatch = OptionsInfo["NegateMatch"]
 122     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 123     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 124     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 125     
 126     MiscUtil.PrintInfo("\nFiltering molecules...")
 127     
 128     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 129     FirstMol = True
 130     for Mol in Mols:
 131         MolCount += 1
 132 
 133         if Mol is None:
 134             continue
 135         
 136         if RDKitUtil.IsMolEmpty(Mol):
 137             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 138             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 139             continue
 140         
 141         ValidMolCount += 1
 142         if FirstMol:
 143             FirstMol = False
 144             if SetSMILESMolProps:
 145                 if Writer is not None:
 146                     RDKitUtil.SetWriterMolProps(Writer, Mol)
 147                 if WriterFiltered is not None:
 148                     RDKitUtil.SetWriterMolProps(WriterFiltered, Mol)
 149         
 150         MolMatched = DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols)
 151         if MolMatched == NegateMatch:
 152             RemainingMolCount += 1
 153             WriteMolecule(Writer, Mol, Compute2DCoords)
 154         else:
 155             if OutfileFilteredMode:
 156                 WriteMolecule(WriterFiltered, Mol, Compute2DCoords)
 157     
 158     return (MolCount, ValidMolCount, RemainingMolCount)
 159     
 160 def ProcessMoleculesUsingMultipleProcesses(Mols, PAINSPatternMols, Writer, WriterFiltered):
 161     """Process and filter molecules using multiprocessing."""
 162     
 163     MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")
 164     
 165     MPParams = OptionsInfo["MPParams"]
 166     NegateMatch = OptionsInfo["NegateMatch"]
 167     OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
 168     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 169     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 170     
 171     # Setup data for initializing a worker process...
 172     MiscUtil.PrintInfo("Encoding options info and PAINS pattern molecules...")
 173     OptionsInfo["EncodedPAINSPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in PAINSPatternMols]
 174     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
 175 
 176     # Setup a encoded mols data iterable for a worker process...
 177     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 178 
 179     # Setup process pool along with data initialization for each process...
 180     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 181     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 182     
 183     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 184     
 185     # Start processing...
 186     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 187         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 188     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 189         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 190     else:
 191         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 192     
 193     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 194     FirstMol = True
 195     for Result in Results:
 196         MolCount += 1
 197         MolIndex, EncodedMol, MolMatched = Result
 198         
 199         if EncodedMol is None:
 200             continue
 201         ValidMolCount += 1
 202         
 203         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 204         
 205         if FirstMol:
 206             FirstMol = False
 207             if SetSMILESMolProps:
 208                 if Writer is not None:
 209                     RDKitUtil.SetWriterMolProps(Writer, Mol)
 210                 if WriterFiltered is not None:
 211                     RDKitUtil.SetWriterMolProps(WriterFiltered, Mol)
 212         
 213         if MolMatched == NegateMatch:
 214             RemainingMolCount += 1
 215             WriteMolecule(Writer, Mol, Compute2DCoords)
 216         else:
 217             if OutfileFilteredMode:
 218                 WriteMolecule(WriterFiltered, Mol, Compute2DCoords)
 219     
 220     return (MolCount, ValidMolCount, RemainingMolCount)
 221 
 222 def InitializeWorkerProcess(*EncodedArgs):
 223     """Initialize data for a worker process."""
 224 
 225     global Options, OptionsInfo
 226     
 227     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 228 
 229     # Decode Options and OptionInfo...
 230     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 231     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 232 
 233     # Decode PAINSPatternMols...
 234     OptionsInfo["PAINSPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedPAINSPatternMols"]]
 235     
 236 def WorkerProcess(EncodedMolInfo):
 237     """Process data for a worker process."""
 238     
 239     MolIndex, EncodedMol = EncodedMolInfo
 240     
 241     if EncodedMol is None:
 242         return [MolIndex, None, False]
 243         
 244     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 245     if RDKitUtil.IsMolEmpty(Mol):
 246         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 247         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 248         return [MolIndex, None, False]
 249         
 250     MolMatched = DoesMoleculeContainsPAINSPattern(Mol, OptionsInfo["PAINSPatternMols"])
 251 
 252     return [MolIndex, EncodedMol, MolMatched]
 253     
 254 def WriteMolecule(Writer, Mol, Compute2DCoords):
 255     """Write out molecule."""
 256     
 257     if OptionsInfo["CountMode"]:
 258         return
 259     
 260     if Compute2DCoords:
 261         AllChem.Compute2DCoords(Mol)
 262     
 263     Writer.write(Mol)
 264 
 265 def SetupMoleculeWriters():
 266     """Setup molecule writers."""
 267     
 268     Writer = None
 269     WriterFiltered = None
 270 
 271     if OptionsInfo["CountMode"]:
 272         return (Writer, WriterFiltered)
 273 
 274     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 275     if Writer is None:
 276         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 277     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 278     
 279     if OptionsInfo["OutfileFilteredMode"]:
 280         WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
 281         if WriterFiltered is None:
 282             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"])
 283         MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])
 284     
 285     return (Writer, WriterFiltered)
 286 
 287 def DoesMoleculeContainsPAINSPattern(Mol, PAINSPatternMols):
 288     """Check presence of PAINS pattern in the molecule"""
 289 
 290     MolMatched = False
 291     
 292     for PatternMol in PAINSPatternMols:
 293         if Mol.HasSubstructMatch(PatternMol, useChirality = True):
 294             MolMatched = True
 295             break
 296         
 297     return MolMatched
 298     
 299 def RetrievePAINSPatterns():
 300     """Retrieve PAINS patterns for specified PAINS mode"""
 301 
 302     SMARTSPatterns = []
 303     for FilterType in OptionsInfo["SpecifiedFilterTypes"]:
 304         SMARTSPatterns.extend(OptionsInfo["PAINSFiltersMap"]["SMARTS"][FilterType])
 305 
 306     return SMARTSPatterns
 307 
 308 def SetupPAINSPatternMols(PAINSPatterns):
 309     """Set up PAINS pattern mols for substructure search"""
 310 
 311     PatternMols = []
 312     for Pattern in PAINSPatterns:
 313         PatternMol = Chem.MolFromSmarts(Pattern)
 314         if PatternMol is None:
 315             MiscUtil.PrintWarning("Failed to convert PAINS pattern, %s, into a molecule..." % Pattern)
 316             continue
 317         PatternMols.append(PatternMol)
 318         
 319     return PatternMols    
 320 
 321 def ProcessPAINSMode():
 322     """Process specified PAINS mode. """
 323     
 324     # Retrieve filetrs information...
 325     RetrievePAINSFiltersInfo()
 326     
 327     # Process PAINS mode...
 328     OptionsInfo["SpecifiedFilterTypes"] = OptionsInfo["PAINSFiltersMap"]["FilterTypes"]
 329     if re.match("^All$", OptionsInfo["PAINSMode"], re.I):
 330         return
 331     
 332     PAINSMode = re.sub(" ", "", OptionsInfo["PAINSMode"])
 333     if not len(PAINSMode):
 334         MiscUtil.PrintError("The PAINSMode mode specified using \"-p, --painsMode\" option are empty.")
 335 
 336     CanonicalFilterTypesMap = {}
 337     for FilterType in OptionsInfo["PAINSFiltersMap"]["FilterTypes"]:
 338         CanonicalFilterTypesMap[FilterType.lower()] = FilterType
 339 
 340     SpecifiedFilterTypes = []
 341     for FilterType in PAINSMode.split(","):
 342         CanonicalFilterType = FilterType.lower()
 343         if not CanonicalFilterType in CanonicalFilterTypesMap:
 344             MiscUtil.PrintError("The PAINS mode, %s, specified using \"-p, --PAINSMode\" is not valid. Supported PAINS modes: %s" % (FilterType, ", ".join(OptionsInfo["PAINSFiltersMap"]["FilterTypes"])))
 345 
 346         SpecifiedFilterTypes.append(CanonicalFilterTypesMap[CanonicalFilterType])
 347 
 348     OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes
 349 
 350 def RetrievePAINSFiltersInfo():
 351     """Retrieve information for PAINS filters."""
 352     
 353     MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
 354     PAINSFiltersFilePath = os.path.join(MayaChemToolsDataDir, "PAINSFilters.csv")
 355     
 356     MiscUtil.PrintInfo("\nRetrieving PAINS SMARTS patterns from file %s" % (PAINSFiltersFilePath))
 357 
 358     Delimiter = ','
 359     QuoteChar = '"'
 360     IgnoreHeaderLine = True
 361     FilterLinesWords = MiscUtil.GetTextLinesWords(PAINSFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine)
 362 
 363     PAINSFiltersMap = {}
 364     PAINSFiltersMap["FilterTypes"] = []
 365     PAINSFiltersMap["ID"] = {}
 366     PAINSFiltersMap["SMARTS"] = {}
 367 
 368     for LineWords in FilterLinesWords:
 369         FilterType = LineWords[0]
 370         ID = LineWords[1]
 371         SMARTS = LineWords[2]
 372 
 373         if not FilterType in PAINSFiltersMap["FilterTypes"]:
 374             PAINSFiltersMap["FilterTypes"].append(FilterType)
 375             PAINSFiltersMap["ID"][FilterType] = []
 376             PAINSFiltersMap["SMARTS"][FilterType] = []
 377 
 378         PAINSFiltersMap["ID"][FilterType].append(ID)
 379         PAINSFiltersMap["SMARTS"][FilterType].append(SMARTS)
 380 
 381     OptionsInfo["PAINSFiltersMap"] = PAINSFiltersMap
 382     
 383     MiscUtil.PrintInfo("\nTotal number filters: %d" % len(FilterLinesWords))
 384     MiscUtil.PrintInfo("Number of filter family types: %d\nFilter familty types: %s\n" % (len(PAINSFiltersMap["FilterTypes"]), ", ".join(PAINSFiltersMap["FilterTypes"])))
 385 
 386     for FilterType in PAINSFiltersMap["FilterTypes"]:
 387         MiscUtil.PrintInfo("Filter family type: %s; Number of filters: %d" % (FilterType, len(PAINSFiltersMap["ID"][FilterType])))
 388 
 389 def ProcessOptions():
 390     """Process and validate command line arguments and options"""
 391     
 392     MiscUtil.PrintInfo("Processing options...")
 393     
 394     # Validate options...
 395     ValidateOptions()
 396     
 397     OptionsInfo["Infile"] = Options["--infile"]
 398     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 399     
 400     OptionsInfo["Outfile"] = Options["--outfile"]
 401     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 402     
 403     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 404     OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt)
 405     OptionsInfo["OutfileFiltered"] = OutfileFiltered
 406     OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False
 407     
 408     OptionsInfo["Overwrite"] = Options["--overwrite"]
 409 
 410     OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False
 411     OptionsInfo["NegateMatch"] = True if re.match("^yes$", Options["--negate"], re.I) else False
 412 
 413     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 414     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 415 
 416     OptionsInfo["PAINSMode"] = Options["--painsMode"]
 417     ProcessPAINSMode()
 418     
 419 def RetrieveOptions():
 420     """Retrieve command line arguments and options"""
 421     
 422     # Get options...
 423     global Options
 424     Options = docopt(_docoptUsage_)
 425     
 426     # Set current working directory to the specified directory...
 427     WorkingDir = Options["--workingdir"]
 428     if WorkingDir:
 429         os.chdir(WorkingDir)
 430     
 431     # Handle examples option...
 432     if "--examples" in Options and Options["--examples"]:
 433         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 434         sys.exit(0)
 435 
 436 def ValidateOptions():
 437     """Validate option values"""
 438     
 439     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 440     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 441     
 442     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 443     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 444     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 445 
 446     MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")
 447     
 448     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
 449     if re.match("^filter$", Options["--mode"], re.I):
 450         if not Options["--outfile"]:
 451             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, --mode\" option")
 452         
 453     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 454     MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no")
 455     
 456 # Setup a usage string for docopt...
 457 _docoptUsage_ = """
 458 RDKitFilterPAINS.py - Filter PAINS molecules
 459 
 460 Usage:
 461     RDKitFilterPAINS.py  [--infileParams <Name,Value,...>] [--mode <filter or count>]
 462                          [--mp <yes or no>] [--mpParams <Name,Value,...>]
 463                          [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...> ]
 464                          [--painsMode <All or A, B, C>] [--negate <yes or no>]
 465                          [--overwrite] [-w <dir>] -i <infile> -o <outfile>
 466     RDKitFilterPAINS.py -h | --help | -e | --examples
 467 
 468 Description:
 469     Filter Pan-assay Interference molecules (PAINS) [ Ref 130 - 131 ] from an input
 470     file by performing a substructure search using SMARTS pattern specified in
 471     MAYACHEMTOOLS/lib/data/PAINSFilters.csv file and write out appropriate
 472     molecules to an output file or simply count the number of filtered molecules.
 473 
 474     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv,
 475     .tsv, .txt)
 476 
 477     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 478 
 479 Options:
 480     -e, --examples
 481         Print examples.
 482     -h, --help
 483         Print this help message.
 484     -i, --infile <infile>
 485         Input file name.
 486     --infileParams <Name,Value,...>  [default: auto]
 487         A comma delimited list of parameter name and value pairs for reading
 488         molecules from files. The supported parameter names for different file
 489         formats, along with their default values, are shown below:
 490             
 491             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 492             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 493                 smilesTitleLine,auto,sanitize,yes
 494             
 495         Possible values for smilesDelimiter: space, comma or tab.
 496     -m, --mode <filter or count>  [default: filter]
 497         Specify whether to filter the matched molecules and write out the rest of the 
 498         molecules to an outfile or simply count the number of matched molecules
 499         marked for filtering.
 500     --mp <yes or no>  [default: no]
 501         Use multiprocessing.
 502          
 503         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 504         function employing lazy RDKit data iterable. This allows processing of
 505         arbitrary large data sets without any additional requirements memory.
 506         
 507         All input data may be optionally loaded into memory by mp.Pool.map()
 508         before starting worker processes in a process pool by setting the value
 509         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 510         
 511         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 512         data mode may adversely impact the performance. The '--mpParams' section
 513         provides additional information to tune the value of 'chunkSize'.
 514     --mpParams <Name,Value,...>  [default: auto]
 515         A comma delimited list of parameter name and value pairs to configure
 516         multiprocessing.
 517         
 518         The supported parameter names along with their default and possible
 519         values are shown below:
 520         
 521             chunkSize, auto
 522             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 523             numProcesses, auto   [ Default: mp.cpu_count() ]
 524         
 525         These parameters are used by the following functions to configure and
 526         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 527         mp.Pool.imap().
 528         
 529         The chunkSize determines chunks of input data passed to each worker
 530         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 531         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 532         
 533         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 534         automatically converts RDKit data iterable into a list, loads all data into
 535         memory, and calculates the default chunkSize using the following method
 536         as shown in its code:
 537         
 538             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 539             if extra: chunkSize += 1
 540         
 541         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 542         and 100 data items.
 543         
 544         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 545         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 546         data into memory. Consequently, the size of input data is not known a priori.
 547         It's not possible to estimate an optimal value for the chunkSize. The default 
 548         chunkSize is set to 1.
 549         
 550         The default value for the chunkSize during 'Lazy' data mode may adversely
 551         impact the performance due to the overhead associated with exchanging
 552         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 553         a larger value during 'Lazy' input data mode, based on the size of your input
 554         data and number of processes in the process pool.
 555         
 556         The mp.Pool.map() function waits for all worker processes to process all
 557         the data and return the results. The mp.Pool.imap() function, however,
 558         returns the the results obtained from worker processes as soon as the
 559         results become available for specified chunks of data.
 560         
 561         The order of data in the results returned by both mp.Pool.map() and 
 562         mp.Pool.imap() functions always corresponds to the input data.
 563     -n, --negate <yes or no>  [default: no]
 564         Specify whether to filter molecules not matching the PAINS filters specified by
 565         SMARTS patterns.
 566     -o, --outfile <outfile>
 567         Output file name.
 568     --outfileFiltered <yes or no>  [default: no]
 569         Write out a file containing filtered molecules. Its name is automatically
 570         generated from the specified output file. Default: <OutfileRoot>_
 571         Filtered.<OutfileExt>.
 572     --outfileParams <Name,Value,...>  [default: auto]
 573         A comma delimited list of parameter name and value pairs for writing
 574         molecules to files. The supported parameter names for different file
 575         formats, along with their default values, are shown below:
 576             
 577             SD: compute2DCoords,auto,kekulize,yes
 578             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 579                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 580             
 581         Default value for compute2DCoords: yes for SMILES input file; no for all other
 582         file types.
 583     --overwrite
 584         Overwrite existing files.
 585     -p, --painsMode <All or A, B, or C>  [default: All]
 586         All or a comma delimited list of PAINS filter family type to used for
 587         filtering molecules. 
 588     -w, --workingdir <dir>
 589         Location of working directory which defaults to the current directory.
 590 
 591 Examples:
 592     To count the number of molecules not containing any substructure corresponding to
 593     PAINS SMARTS patterns and write out a SMILES file, type: 
 594 
 595         % RDKitFilterPAINS.py -i Sample.smi -o SampleOut.smi
 596 
 597     To count the number of molecules not containing any substructure corresponding to
 598     PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
 599     CPUs without loading all data into memory, and write out a SMILES file, type: 
 600 
 601         % RDKitFilterPAINS.py --mp yes -i Sample.smi -o SampleOut.smi
 602 
 603     To count the number of molecules not containing any substructure corresponding to
 604     PAINS SMARTS patterns, perform filtering in multiprocessing mode on all available
 605     CPUs by loading all data into memory, and write out a SMILES file, type: 
 606 
 607         % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,InMemory"
 608           -i Sample.smi -o SampleOut.smi
 609 
 610     To count the number of molecules not containing any substructure corresponding to
 611     PAINS SMARTS patterns, perform filtering in multiprocessing mode on specific
 612     number of CPUs and chunk size without loading all data into memory, and
 613     write out a SMILES file, type: 
 614 
 615         % RDKitFilterPAINS.py --mp yes --mpParams "inputDataMode,Lazy,
 616           numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.smi
 617 
 618     To count the number of molecules not containing any substructure corresponding to
 619     PAINS SMARTS patterns and write out a SMILES file containing these and filtered
 620     molecules, type: 
 621 
 622         % RDKitFilterPAINS.py --outfileFiltered yes -i Sample.smi
 623           -o SampleOut.smi
 624 
 625     To only count the number of molecules not containing any substructure corresponding
 626     to PAINS SMARTS patterns without writing out any file, type: 
 627 
 628         % RDKitFilterPAINS.py -m count -i Sample.sdf -o SampleOut.smi
 629 
 630     To count the number of molecules containing any substructure corresponding to
 631     PAINS SMARTS patterns and write out a SD file with computed 2D coordinates,
 632     type: 
 633 
 634         % RDKitFilterPAINS.py -n yes -i Sample.smi -o SampleOut.sdf
 635 
 636     To count the number of molecules not containing any substructure corresponding to
 637     PAINS SMARTS patterns family of Type A in a CSV SMILES file and write out a SD file, type: 
 638 
 639         % RDKitFilterPAINS.py --painsMode A --infileParams
 640           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 641           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 642           -i SampleSMILES.csv -o SampleOut.sdf
 643 
 644 Author:
 645     Manish Sud(msud@san.rr.com)
 646 
 647 See also:
 648     RDKitFilterChEMBLAlerts.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py
 649 
 650 Copyright:
 651     Copyright (C) 2022 Manish Sud. All rights reserved.
 652 
 653     The functionality available in this script is implemented using RDKit, an
 654     open source toolkit for cheminformatics developed by Greg Landrum.
 655 
 656     This file is part of MayaChemTools.
 657 
 658     MayaChemTools is free software; you can redistribute it and/or modify it under
 659     the terms of the GNU Lesser General Public License as published by the Free
 660     Software Foundation; either version 3 of the License, or (at your option) any
 661     later version.
 662 
 663 """
 664 
# Run the script only when this file is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()