MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitSearchFunctionalGroups.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem import AllChem
  43     from rdkit.Chem import FunctionalGroups
  44 except ImportError as ErrMsg:
  45     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  46     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  47     sys.exit(1)
  48 
  49 # MayaChemTools imports...
  50 try:
  51     from docopt import docopt
  52     import MiscUtil
  53     import RDKitUtil
  54 except ImportError as ErrMsg:
  55     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  56     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  57     sys.exit(1)
  58 
  59 ScriptName = os.path.basename(sys.argv[0])
  60 Options = {}
  61 OptionsInfo = {}
  62 
  63 FunctionalGroupsMap = {}
  64 
  65 def main():
  66     """Start execution of the script"""
  67     
  68     MiscUtil.PrintInfo("\n%s (RDK v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, time.asctime()))
  69     
  70     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  71     
  72     # Retrieve command line arguments and options...
  73     RetrieveOptions()
  74     
  75     # Process and validate command line arguments and options...
  76     ProcessOptions()
  77     
  78     # Perform actions required by the script...
  79     PerformFunctionalGroupsSearch()
  80     
  81     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  82     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  83 
  84 def PerformFunctionalGroupsSearch():
  85     """Retrieve functional groups information and perform search."""
  86 
  87     # Process functional groups info...
  88     ProcessFunctionalGroupsInfo()
  89     
  90     # Setup pattern mols for functional group SMARTS...
  91     GroupsPatternMols = SetupFunctionalGroupsSMARTSPatterns()
  92     
  93     # Setup a molecule reader...
  94     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  95     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  96     
  97     # Set up  molecule writers...
  98     Writer,  GroupOutfilesWriters = SetupMoleculeWriters()
  99 
 100     MolCount, ValidMolCount, RemainingMolCount,  GroupsPatternMatchCountList = ProcessMolecules(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters)
 101     
 102     if Writer is not None:
 103         Writer.close()
 104     for GroupOutfileWriter in GroupOutfilesWriters:
 105         GroupOutfileWriter.close()
 106         
 107     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 108     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 109     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 110 
 111     MiscUtil.PrintInfo("\nTotal number of molecules matched against specified match criteria: %d" % RemainingMolCount)
 112     
 113     MiscUtil.PrintInfo("\nNumber of molecuels matched against individual functional groups:")
 114     MiscUtil.PrintInfo("FunctionalGroupName,MatchCount")
 115     
 116     for GroupIndex in range(0, len(OptionsInfo["SpecifiedFunctionalGroups"])):
 117         GroupName = OptionsInfo["SpecifiedFunctionalGroups"][GroupIndex]
 118         if OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"][GroupIndex]:
 119             GroupName = '!' + GroupName
 120         GroupMatchCount = GroupsPatternMatchCountList[GroupIndex]
 121         MiscUtil.PrintInfo("%s,%d" % (GroupName, GroupMatchCount))
 122 
 123 def ProcessMolecules(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters):
 124     """Process and search molecules. """
 125 
 126     if OptionsInfo["MPMode"]:
 127         return ProcessMoleculesUsingMultipleProcesses(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters)
 128     else:
 129         return ProcessMoleculesUsingSingleProcess(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters)
 130     
 131 def ProcessMoleculesUsingSingleProcess(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters):
 132     """Process and search molecules using a single process."""
 133 
 134     MiscUtil.PrintInfo("\nSearching functional groups...")
 135     
 136     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 137     CombineMatchResults = OptionsInfo["CombineMatchResults"]
 138     
 139     GroupsPatternsMatchCountList = [0] * len(OptionsInfo["SpecifiedFunctionalGroups"])
 140     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 141     
 142     for Mol in Mols:
 143         MolCount += 1
 144         
 145         if Mol is None:
 146             continue
 147         
 148         if RDKitUtil.IsMolEmpty(Mol):
 149             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 150             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 151             continue
 152         
 153         ValidMolCount += 1
 154 
 155         # Match molecule against functional group patterns...
 156         MolMatched, GroupsPatternMatchStatusList = MatchMolecule(Mol, GroupsPatternMols)
 157 
 158         # Update functional group match count...
 159         for GroupIndex, MatchStatus in enumerate(GroupsPatternMatchStatusList):
 160             if MatchStatus:
 161                 GroupsPatternsMatchCountList[GroupIndex] += 1
 162         
 163         if not MolMatched:
 164             continue
 165         
 166         RemainingMolCount += 1
 167         WriteMolecule(Writer, GroupOutfilesWriters, Mol,  Compute2DCoords, CombineMatchResults, GroupsPatternMatchStatusList)
 168     
 169     return (MolCount, ValidMolCount, RemainingMolCount,  GroupsPatternsMatchCountList)
 170 
 171 def ProcessMoleculesUsingMultipleProcesses(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters):
 172     """Process and search molecules using multiprocessing."""
 173 
 174     MiscUtil.PrintInfo("\nSearching functional groups  using multiprocessing...")
 175     
 176     MPParams = OptionsInfo["MPParams"]
 177     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 178     CombineMatchResults = OptionsInfo["CombineMatchResults"]
 179     
 180     # Setup data for initializing a worker process...
 181     MiscUtil.PrintInfo("Encoding options info and functional groups pattern molecules...")
 182     OptionsInfo["EncodedGroupPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in GroupsPatternMols]
 183     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo), MiscUtil.ObjectToBase64EncodedString(FunctionalGroupsMap))
 184 
 185     # Setup a encoded mols data iterable for a worker process...
 186     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 187 
 188     # Setup process pool along with data initialization for each process...
 189     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 190     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 191     
 192     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 193     
 194     # Start processing...
 195     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 196         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 197     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 198         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 199     else:
 200         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 201     
 202     GroupsPatternsMatchCountList = [0] * len(OptionsInfo["SpecifiedFunctionalGroups"])
 203     
 204     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 205     
 206     for Result in Results:
 207         MolCount += 1
 208         MolIndex, EncodedMol, MolMatched, GroupsPatternMatchStatusList = Result
 209         
 210         if EncodedMol is None:
 211             continue
 212         ValidMolCount += 1
 213         
 214         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 215         
 216         # Update functional group match count...
 217         for GroupIndex, MatchStatus in enumerate(GroupsPatternMatchStatusList):
 218             if MatchStatus:
 219                 GroupsPatternsMatchCountList[GroupIndex] += 1
 220         
 221         if not MolMatched:
 222             continue
 223         
 224         RemainingMolCount += 1
 225         WriteMolecule(Writer, GroupOutfilesWriters, Mol,  Compute2DCoords, CombineMatchResults, GroupsPatternMatchStatusList)
 226     
 227     return (MolCount, ValidMolCount, RemainingMolCount,  GroupsPatternsMatchCountList)
 228 
 229 def InitializeWorkerProcess(*EncodedArgs):
 230     """Initialize data for a worker process."""
 231     
 232     global Options, OptionsInfo, FunctionalGroupsMap
 233 
 234     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 235     
 236     # Decode Options and OptionInfo...
 237     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 238     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 239     FunctionalGroupsMap = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[2])
 240 
 241     # Decode ChEMBLPatternMols...
 242     OptionsInfo["GroupPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedGroupPatternMols"]]
 243     
 244 def WorkerProcess(EncodedMolInfo):
 245     """Process data for a worker process."""
 246 
 247     MolIndex, EncodedMol = EncodedMolInfo
 248 
 249     if EncodedMol is None:
 250         return [MolIndex, None, False, None]
 251     
 252     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 253     if RDKitUtil.IsMolEmpty(Mol):
 254         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 255         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 256         return [MolIndex, None, False, None]
 257         
 258     # Match molecule against functional group patterns...
 259     MolMatched, GroupsPatternMatchStatusList = MatchMolecule(Mol, OptionsInfo["GroupPatternMols"])
 260     
 261     return [MolIndex, EncodedMol, MolMatched, GroupsPatternMatchStatusList]
 262 
 263 def MatchMolecule(Mol, GroupsPatternMols):
 264     """Search for functional groups in a molecule."""
 265 
 266     GroupsPatternMatchStatusList = []
 267     
 268     # Match pattern mols...
 269     for GroupIndex in range(0, len(OptionsInfo["SpecifiedFunctionalGroups"])):
 270         Status = DoesPatternMolMatch(GroupsPatternMols[GroupIndex], Mol, OptionsInfo["UseChirality"], OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"][GroupIndex])
 271         GroupsPatternMatchStatusList.append(Status)
 272         
 273     # Match mol against all specified criteria...
 274     MolMatched = DoesMolMeetSpecifiedMatchCriteria(GroupsPatternMatchStatusList, OptionsInfo["CombineMatchResults"], OptionsInfo["AndCombineOperatorMode"])
 275 
 276     return (MolMatched, GroupsPatternMatchStatusList)
 277 
 278 def DoesMolMeetSpecifiedMatchCriteria(GroupsPatternMolsMatchStatus,  CombineMatchResults, AndCombineOperatorMode):
 279     """Match molecule using specified match criteia."""
 280 
 281     if CombineMatchResults and AndCombineOperatorMode:
 282         # Must match all specified SMARTS
 283         Status = True
 284         for MatchStatus in GroupsPatternMolsMatchStatus:
 285             if not MatchStatus:
 286                 Status = False
 287                 break
 288     else:
 289         # One match is enough...
 290         Status = False
 291         for MatchStatus in GroupsPatternMolsMatchStatus:
 292             if MatchStatus:
 293                 Status = True
 294                 break
 295     
 296     return Status
 297     
 298 def WriteMolecule(Writer, GroupOutfilesWriters, Mol, Compute2DCoords, CombineMatchResults, GroupsPatternMatchStatusList):
 299     """Write out molecule."""
 300     
 301     if OptionsInfo["CountMode"]:
 302         return
 303     
 304     if Compute2DCoords:
 305         AllChem.Compute2DCoords(Mol)
 306     
 307     if CombineMatchResults:
 308         Writer.write(Mol)
 309     else:
 310         for GroupIndex in range(0, len(GroupsPatternMatchStatusList)):
 311             if GroupsPatternMatchStatusList[GroupIndex]:
 312                 GroupOutfilesWriters[GroupIndex].write(Mol)
 313     
 314 def SetupMoleculeWriters():
 315     """Set up molecule writers for output files."""
 316 
 317     Writer = None
 318     GroupOutfilesWriters = []
 319     
 320     if OptionsInfo["CountMode"]:
 321         return (Writer, GroupOutfilesWriters)
 322     
 323     Outfile = OptionsInfo["Outfile"]
 324     CombineMatchResults = OptionsInfo["CombineMatchResults"]
 325     GroupsOutfiles = OptionsInfo["SpecifiedFunctionalGroupsOutfiles"]
 326     
 327     if CombineMatchResults:
 328         Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 329         if Writer is None:
 330             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 331         MiscUtil.PrintInfo("Generating file %s..." % Outfile)
 332     else:
 333         for GroupOutfile in GroupsOutfiles:
 334             GroupOutfileWriter = RDKitUtil.MoleculesWriter(GroupOutfile, **OptionsInfo["OutfileParams"])
 335             if GroupOutfileWriter is None:
 336                 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Writer)
 337             GroupOutfilesWriters.append(GroupOutfileWriter)
 338         
 339         GroupsCount = len(GroupsOutfiles)
 340         if GroupsCount > 4:
 341             MiscUtil.PrintInfo("Generating %d output files with the following file name format: %s<GroupName>.%s" % (GroupsCount, OptionsInfo["OutfileBasename"], OptionsInfo["OutfileExt"]))
 342         else:
 343             Delmiter = ', '
 344             OutfileNames = Delmiter.join(GroupsOutfiles)
 345             MiscUtil.PrintInfo("Generating %d output files: %s..." % (GroupsCount, OutfileNames))
 346 
 347     return (Writer, GroupOutfilesWriters)
 348     
 349 def DoesPatternMolMatch(PatternMol, Mol, UseChirality, NegateMatch):
 350     """Perform a substructure match for the presence of pattern molecule in a molecule."""
 351 
 352     MolMatched = Mol.HasSubstructMatch(PatternMol, useChirality = UseChirality)
 353     if NegateMatch:
 354         if MolMatched:
 355             MolMatched = False
 356         else:
 357             MolMatched = True
 358     
 359     return MolMatched
 360     
 361 def ProcessFunctionalGroupsInfo():
 362     """Process functional groups information."""
 363     
 364     RetrieveFunctionalGroupsInfo()
 365     ProcessSpecifiedFunctionalGroups()
 366     
 367     SetupFunctionalGroupsOutputFileNames()
 368 
 369 def ProcessSpecifiedFunctionalGroups():
 370     """Process and validate specified functional groups"""
 371     
 372     OptionsInfo["SpecifiedFunctionalGroups"] = []
 373     OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"] = []
 374     
 375     if re.match("^All$", OptionsInfo["FunctionalGroups"], re.I):
 376         OptionsInfo["SpecifiedFunctionalGroups"] = FunctionalGroupsMap['Names']
 377         OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"] = [False] * len(OptionsInfo["SpecifiedFunctionalGroups"])
 378         return
 379     
 380     # Set up a map of valid group names for checking specified group names...
 381     CanonicalGroupNameMap = {}
 382     for GroupName in FunctionalGroupsMap['Names']:
 383         CanonicalGroupNameMap[GroupName.lower()] = GroupName
 384         
 385     # Parse and validate specified names...
 386     GroupNames = re.sub(" ", "", OptionsInfo["FunctionalGroups"])
 387     if not GroupNames:
 388         MiscUtil.PrintError("No functional group name specified for \"-f, --functionalGroups\" option")
 389     
 390     SpecifiedFunctionalGroups = []
 391     SpecifiedNegateMatchStatus = []
 392     
 393     for GroupName in GroupNames.split(","):
 394         CanonicalGroupName = GroupName.lower()
 395         NegateMatchStatus = False
 396         if re.match("^!", CanonicalGroupName, re.I):
 397             NegateMatchStatus = True
 398             CanonicalGroupName = re.sub("^!", "", CanonicalGroupName)
 399         if CanonicalGroupName in CanonicalGroupNameMap:
 400             SpecifiedFunctionalGroups.append(CanonicalGroupNameMap[CanonicalGroupName])
 401             SpecifiedNegateMatchStatus.append(NegateMatchStatus)
 402         else:
 403             MiscUtil.PrintWarning("The functional group name, %s, specified using \"-f, --functionalGroups\" option is not a valid name." % (GroupName))
 404 
 405     if not len(SpecifiedFunctionalGroups):
 406         MiscUtil.PrintError("No valid functional group names specified for \"-f, --functionalGroups\" option")
 407         
 408     OptionsInfo["SpecifiedFunctionalGroups"] = SpecifiedFunctionalGroups
 409     OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"] = SpecifiedNegateMatchStatus
 410 
 411 def SetupFunctionalGroupsSMARTSPatterns():
 412     """Setup SMARTS patterns for specified functional groups."""
 413 
 414     OptionsInfo["SpecifiedFunctionalGroupsSMARTSPatterns"] = []
 415     FunctionalGroupsPatternMols = []
 416     
 417     for Name in OptionsInfo["SpecifiedFunctionalGroups"]:
 418         SMARTSPattern = FunctionalGroupsMap['SMARTSPattern'][Name]
 419         PatternMol = Chem.MolFromSmarts(SMARTSPattern)
 420         if PatternMol is None:
 421             MiscUtil.PrintError("Failed to parse SMARTS pattern, %s, for function group, %s" % (SMARTSPattern, Name))
 422         
 423         OptionsInfo["SpecifiedFunctionalGroupsSMARTSPatterns"].append(SMARTSPattern)
 424         FunctionalGroupsPatternMols.append(PatternMol)
 425     
 426     return FunctionalGroupsPatternMols
 427 
 428 def SetupFunctionalGroupsOutputFileNames():
 429     """Setup output file names for specified functional group names."""
 430 
 431     OptionsInfo["SpecifiedFunctionalGroupsOutfiles"] = []
 432     
 433     if OptionsInfo["CountMode"]:
 434         # No need of any output file...
 435         return
 436     
 437     if OptionsInfo["CombineMatchResults"]:
 438         # No need of output files for specified functional groups...
 439         return
 440     
 441     OutfileBasename = OptionsInfo["OutfileBasename"]
 442     OutfileExt = OptionsInfo["OutfileExt"]
 443     SpecifiedFunctionalGroupsOutfiles = []
 444 
 445     GroupsCount = len(OptionsInfo["SpecifiedFunctionalGroups"])
 446     for GroupIndex in range(0, GroupsCount):
 447         GroupName = OptionsInfo["SpecifiedFunctionalGroups"][GroupIndex]
 448         if OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"][GroupIndex]:
 449             GroupName = "Not" + GroupName
 450         GroupName = re.sub("\.", "", GroupName)
 451         
 452         GroupOutfile = "%s%s.%s" % (OutfileBasename, GroupName, OutfileExt)
 453         SpecifiedFunctionalGroupsOutfiles.append(GroupOutfile)
 454 
 455     OptionsInfo["SpecifiedFunctionalGroupsOutfiles"] = SpecifiedFunctionalGroupsOutfiles
 456     
 457 def RetrieveFunctionalGroupsInfo():
 458     """Retrieve functional groups information"""
 459 
 460     MiscUtil.PrintInfo("\nRetrieving data from default RDKit functional groups hierarchy file Functional_Group_Hierarchy.txt...")
 461     
 462     FunctionalGroupNamesFile = OptionsInfo["GroupNamesFile"]
 463     FunctionalGroupsNodes = FunctionalGroups.BuildFuncGroupHierarchy(FunctionalGroupNamesFile)
 464 
 465     FunctionalGroupsMap['Names'] = []
 466     FunctionalGroupsMap['SMARTSPattern'] = {}
 467     
 468     RetrieveDataFromFunctionalGroupsHierarchy(FunctionalGroupsNodes)
 469 
 470     if not len(FunctionalGroupsMap['Names']):
 471         MiscUtil.PrintError("Failed to retrieve any functional group names and SMARTS patterns...")
 472         
 473     MiscUtil.PrintInfo("Total number of functional groups present functional group hierarchy: %d" % (len(FunctionalGroupsMap['Names'])))
 474     
 475 def RetrieveDataFromFunctionalGroupsHierarchy(FGNodes):
 476     """Retrieve functional groups data by recursively visiting functional group nodes."""
 477     
 478     for FGNode in FGNodes:
 479         Name = FGNode.label
 480         SMARTSPattern = FGNode.smarts
 481 
 482         if Name in FunctionalGroupsMap['SMARTSPattern']:
 483             MiscUtil.PrintWarning("Ignoring duplicate functional group name: %s..." % Name)
 484         else:
 485             FunctionalGroupsMap['Names'].append(Name)
 486             FunctionalGroupsMap['SMARTSPattern'][Name] = SMARTSPattern
 487 
 488         RetrieveDataFromFunctionalGroupsHierarchy(FGNode.children)
 489 
 490 def ListFunctionalGroupsInfo():
 491     """List functional groups information"""
 492 
 493     MiscUtil.PrintInfo("\nListing available functional groups names and SMARTS patterns...")
 494     MiscUtil.PrintInfo("\nFunctionalGroupName\tSMARTSPattern")
 495     
 496     for Name in sorted(FunctionalGroupsMap['Names']):
 497         SMARTSPattern = FunctionalGroupsMap['SMARTSPattern'][Name]
 498         MiscUtil.PrintInfo("%s\t%s" % (Name, SMARTSPattern))
 499     
 500     MiscUtil.PrintInfo("")
 501 
 502 def ProcessOptions():
 503     """Process and validate command line arguments and options"""
 504     
 505     MiscUtil.PrintInfo("Processing options...")
 506     
 507     # Validate options...
 508     ValidateOptions()
 509     
 510     OptionsInfo["CombineMatches"] = Options["--combineMatches"]
 511     
 512     OptionsInfo["CombineMatchResults"] = True
 513     if re.match("^No$", Options["--combineMatches"], re.I):
 514         OptionsInfo["CombineMatchResults"] = False
 515         if Options["--outfile"]:
 516             FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 517             OptionsInfo["OutfileBasename"] = FileName
 518             OptionsInfo["OutfileExt"] = FileExt
 519     
 520     OptionsInfo["CombineOperator"] = Options["--combineOperator"]
 521     OptionsInfo["AndCombineOperatorMode"] = True
 522     if re.match("^or$", Options["--combineOperator"], re.I):
 523         OptionsInfo["AndCombineOperatorMode"] = False
 524     
 525     OptionsInfo["GroupNamesFile"] = None
 526     if not re.match("^auto$", Options["--groupNamesFile"], re.I):
 527         OptionsInfo["GroupNamesFile"] = Options["--groupNamesFile"]
 528         
 529     OptionsInfo["FunctionalGroups"] = Options["--functionalGroups"]
 530     
 531     OptionsInfo["Infile"] = Options["--infile"]
 532     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 533     
 534     OptionsInfo["Outfile"] = Options["--outfile"]
 535     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 536     
 537     OptionsInfo["Overwrite"] = Options["--overwrite"]
 538     
 539     OptionsInfo["CountMode"] = False
 540     if re.match("^count$", Options["--mode"], re.I):
 541         OptionsInfo["CountMode"] = True
 542     
 543     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 544     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 545     
 546     OptionsInfo["UseChirality"] = False
 547     if re.match("^yes$", Options["--useChirality"], re.I):
 548         OptionsInfo["UseChirality"] = True
 549 
 550 def RetrieveOptions():
 551     """Retrieve command line arguments and options"""
 552     
 553     # Get options...
 554     global Options
 555     Options = docopt(_docoptUsage_)
 556 
 557     # Set current working directory to the specified directory...
 558     WorkingDir = Options["--workingdir"]
 559     if WorkingDir:
 560         os.chdir(WorkingDir)
 561     
 562     # Handle examples option...
 563     if "--examples" in Options and Options["--examples"]:
 564         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 565         sys.exit(0)
 566     
 567     # Handle listing of functional group information...
 568     if  Options and Options["--list"]:
 569         ProcessListFunctionalGroupsOption()
 570         sys.exit(0)
 571 
 572 def ProcessListFunctionalGroupsOption():
 573     """Process list functional groups information."""
 574 
 575     # Validate and process dataFile option for listing functional groups information...
 576     OptionsInfo["GroupNamesFile"] = None
 577     if not re.match("^auto$", Options["--groupNamesFile"], re.I):
 578         MiscUtil.ValidateOptionFilePath("-g, --groupNamesFile", Options["--groupNamesFile"])
 579         OptionsInfo["GroupNamesFile"] = Options["--groupNamesFile"]
 580     
 581     RetrieveFunctionalGroupsInfo()
 582     ListFunctionalGroupsInfo()
 583     
 584 def ValidateOptions():
 585     """Validate option values"""
 586     
 587     MiscUtil.ValidateOptionTextValue("-c, --combineMatches", Options["--combineMatches"], "yes no")
 588     MiscUtil.ValidateOptionTextValue("--combineOperator", Options["--combineOperator"], "and or")
 589     
 590     if not re.match("^auto$", Options["--groupNamesFile"], re.I):
 591         MiscUtil.ValidateOptionFilePath("-g, groupNamesFile", Options["--groupNamesFile"])
 592         
 593     if re.match("^none$", Options["--functionalGroups"], re.I):
 594         MiscUtil.PrintError("The name(s) of functional groups must be specified using \"-f, --functionalGroups\" option")
 595         
 596     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 597     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 598     if Options["--outfile"]:
 599         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 600         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 601         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 602     
 603     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "retrieve count")
 604     if re.match("^retrieve$", Options["--mode"], re.I):
 605         if not Options["--outfile"]:
 606             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"retrieve\" value of \"-m, --mode\" option")
 607         
 608     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 609     
 610     MiscUtil.ValidateOptionTextValue("--useChirality", Options["--useChirality"], "yes no")
 611 
 612 # Setup a usage string for docopt...
 613 _docoptUsage_ = """
 614 RDKitSearchFunctionalGroups.py - Search for functional groups using SMARTS patterns
 615 
 616 Usage:
 617     RDKitSearchFunctionalGroups.py  [--combineMatches <yes or no>] [--combineOperator <and or or>]
 618                                            [--groupNamesFile <FileName or auto>] [--infileParams <Name,Value,...>]
 619                                            [--mode <retrieve or count>] [--mp <yes or no>] [--mpParams <Name.Value,...>]
 620                                            [--negate <yes or no>] [--outfileParams <Name,Value,...>] [--overwrite]
 621                                            [--useChirality <yes or no>] [-w <dir>] [-o <outfile>] -i <infile> -f <Name1,Name2,Name3... or All>
 622     RDKitSearchFunctionalGroups.py [--groupNamesFile <FileName or auto>] -l | --list
 623     RDKitSearchFunctionalGroups.py -h | --help | -e | --examples
 624 
 625 Description:
 626     Perform a substructure search in an input file using SMARTS patterns for functional
 627     groups and write out the matched molecules to an output file or simply count the
 628     number of matches.
 629 
 630     The SMARTS patterns for specified functional group(s) are retrieved from file,
 631     Functional_Group_Hierarchy.txt, available in RDKit data directory.
 632 
 633     The names of valid functional groups and hierarchies  are dynamically retrieved from the
 634     functional groups hierarchy file and are shown below:
 635 
 636         AcidChloride, AcidChloride.Aromatic, AcidChloride.Aliphatic
 637         Alcohol, Alcohol.Aromatic, Alcohol.Aliphatic
 638         Aldehyde, Aldehyde.Aromatic, Aldehyde.Aliphatic
 639         Amine, Amine.Primary, Amine.Primary.Aromatic, Amine.Primary.Aliphatic,
 640         Amine.Secondary, Amine.Secondary.Aromatic, Amine.Secondary.Aliphatic
 641         Amine.Tertiary, Amine.Tertiary.Aromatic, Amine.Tertiary.Aliphatic
 642         Amine.Aromatic, Amine.Aliphatic, Amine.Cyclic
 643         Azide, Azide.Aromatic, Azide.Aliphatic
 644         BoronicAcid, BoronicAcid.Aromatic, BoronicAcid.Aliphatic
 645         CarboxylicAcid, CarboxylicAcid.Aromatic, CarboxylicAcid.Aliphatic,
 646         CarboxylicAcid.AlphaAmino
 647         Halogen, Halogen.Aromatic, Halogen.Aliphatic
 648         Halogen.NotFluorine, Halogen.NotFluorine.Aliphatic,
 649         Halogen.NotFluorine.Aromatic
 650         Halogen.Bromine, Halogen.Bromine.Aliphatic, Halogen.Bromine.Aromatic,
 651         Halogen.Bromine.BromoKetone
 652         Isocyanate, Isocyanate.Aromatic, Isocyanate.Aliphatic
 653         Nitro, Nitro.Aromatic, Nitro.Aliphatic,
 654         SulfonylChloride, SulfonylChloride.Aromatic, SulfonylChloride.Aliphatic
 655         TerminalAlkyne
 656 
 657     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt)
 658 
 659     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 660 
 661 Options:
 662     -c, --combineMatches <yes or no>  [default: yes]
 663         Combine search results for matching SMARTS patterns of specified functional groups
 664         against a molecule. Possible values: yes or no.
 665         
 666         The matched molecules are written to a single output file for "yes" value. Otherwise,
 667         multiple output files are generated, one for each functional group. The names of  
 668         these files correspond to a combination of the basename of the specified output file
 669         and the name of the functional group.
 670         
 671         No output files are generated during "count" value of "-m, --mode" option.
 672     --combineOperator <and or or>  [default: and]
 673         Logical operator to use for combining match results corresponding to specified
 674         functional group names before writing out a single file. This option is ignored
 675         during "No" value of  "-c, --combineMatches" option.
 676     -e, --examples
 677         Print examples.
 678     -g, --groupNamesFile <FileName or auto>  [default: auto]
 679         Specify a file name containing data for functional groups hierarchy or use functional
 680         group hierarchy file, Functional_Group_Hierarchy.txt, available in RDKit data directory.
 681         
 682         RDKit data format: Name<tab>Smarts<tab>Label<tab>RemovalReaction (optional)
 683         
 684         The format of data in local functional group hierarchy must match format of the
 685         data in functional group file available in RDKit data directory.
 686     -f, --functionalGroups <Name1,Name2,Name3... or All>  [default: none]
 687         Functional group names for performing substructure SMARTS search. Possible values:
 688         Comma delimited list of valid functional group names or All. The current set of valid
 689         functional group names are listed in the description section.
 690         
 691         The match results for multiple functional group names are combined using 'and'
 692         operator before writing them out to single file. No merging of match results takes
 693         place during generation of individual result files corresponding to fictional group
 694         names.
 695         
 696         The functional group name may be started with an exclamation mark to negate
 697         the match result for that fictional group.
 698     -h, --help
 699         Print this help message.
 700     -i, --infile <infile>
 701         Input file name.
 702     --infileParams <Name,Value,...>  [default: auto]
 703         A comma delimited list of parameter name and value pairs for reading
 704         molecules from files. The supported parameter names for different file
 705         formats, along with their default values, are shown below:
 706             
 707             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 708             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 709                 smilesTitleLine,auto,sanitize,yes
 710             
 711         Possible values for smilesDelimiter: space, comma or tab.
 712     -l, --list
 713         List functional groups information without performing any search.
 714     -m, --mode <retrieve or count>  [default: retrieve]
 715         Specify whether to retrieve and write out matched molecules to an output
 716         file or simply count the number of matches.
 717     --mp <yes or no>  [default: no]
 718         Use multiprocessing.
 719          
 720         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 721         function employing lazy RDKit data iterable. This allows processing of
 722         arbitrary large data sets without any additional requirements memory.
 723         
 724         All input data may be optionally loaded into memory by mp.Pool.map()
 725         before starting worker processes in a process pool by setting the value
 726         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 727         
 728         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 729         data mode may adversely impact the performance. The '--mpParams' section
 730         provides additional information to tune the value of 'chunkSize'.
 731     --mpParams <Name,Value,...>  [default: auto]
 732         A comma delimited list of parameter name and value pairs for to
 733         configure multiprocessing.
 734         
 735         The supported parameter names along with their default and possible
 736         values are shown below:
 737         
 738             chunkSize, auto
 739             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 740             numProcesses, auto   [ Default: mp.cpu_count() ]
 741         
 742         These parameters are used by the following functions to configure and
 743         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 744         mp.Pool.imap().
 745         
 746         The chunkSize determines chunks of input data passed to each worker
 747         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 748         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 749         
 750         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 751         automatically converts RDKit data iterable into a list, loads all data into
 752         memory, and calculates the default chunkSize using the following method
 753         as shown in its code:
 754         
 755             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 756             if extra: chunkSize += 1
 757         
 758         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 759         and 100 data items.
 760         
 761         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 762         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 763         data into memory. Consequently, the size of input data is not known a priori.
 764         It's not possible to estimate an optimal value for the chunkSize. The default 
 765         chunkSize is set to 1.
 766         
 767         The default value for the chunkSize during 'Lazy' data mode may adversely
 768         impact the performance due to the overhead associated with exchanging
 769         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 770         a larger value during 'Lazy' input data mode, based on the size of your input
 771         data and number of processes in the process pool.
 772         
 773         The mp.Pool.map() function waits for all worker processes to process all
 774         the data and return the results. The mp.Pool.imap() function, however,
 775         returns the the results obtained from worker processes as soon as the
 776         results become available for specified chunks of data.
 777         
 778         The order of data in the results returned by both mp.Pool.map() and 
 779         mp.Pool.imap() functions always corresponds to the input data.
 780     -o, --outfile <outfile>
 781         Output file name.
 782     --outfileParams <Name,Value,...>  [default: auto]
 783         A comma delimited list of parameter name and value pairs for writing
 784         molecules to files. The supported parameter names for different file
 785         formats, along with their default values, are shown below:
 786             
 787             SD: compute2DCoords,auto,kekulize,no
 788             SMILES: kekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 789                 smilesTitleLine,yes
 790             
 791         Default value for compute2DCoords: yes for SMILES input file; no for all other
 792         file types.
 793     --overwrite
 794         Overwrite existing files.
 795     -u, --useChirality <yes or no>  [default: no]
 796         Use stereochemistry information for SMARTS search.
 797     -w, --workingdir <dir>
 798         Location of working directory which defaults to the current directory.
 799 
 800 Examples:
 801     To list names of all available functional groups along with their SMARTS
 802     patterns, type:
 803 
 804         % RDKitSearchFunctionalGroups.py -l
 805 
 806     To retrieve molecules containing amine functional group and write out a
 807     SMILES file, type: 
 808 
 809         % RDKitSearchFunctionalGroups.py -f Amine -i Sample.smi -o SampleOut.smi
 810 
 811     To retrieve molecules containing amine functional group, perform search in
 812     multiprocessing mode on all  available CPUs without loading all data into
 813     memory, and write out a SMILES file, type: 
 814 
 815         % RDKitSearchFunctionalGroups.py --mp yes -f Amine -i Sample.smi
 816           -o SampleOut.smi
 817 
 818     To retrieve molecules containing amine functional group, perform search in
 819     multiprocessing mode on all  available CPUs by loading all data into memory,
 820     and write out a SMILES file, type: 
 821 
 822         % RDKitSearchFunctionalGroups.py --mp yes --mpParams "inputDataMode,
 823           InMemory" -f Amine -i Sample.smi -o SampleOut.smi
 824 
 825     To retrieve molecules containing amine functional group, perform search in
 826     multiprocessing mode on specific number of CPUs and chunksize without loading
 827     all data into memory, and write out a SMILES file, type: 
 828 
 829         % RDKitSearchFunctionalGroups.py --mp yes --mpParams "inputDataMode,
 830           lazy,numProcesses,4,chunkSize,8" -f Amine -i Sample.smi -o
 831           SampleOut.smi
 832 
 833     To retrieve molecules containing amine functional group but not halogens and carboxylic
 834     acid functional groups and write out a SMILES file, type: 
 835 
 836         % RDKitSearchFunctionalGroups.py -f 'Amine,!Halogen,!CarboxylicAcid'
 837           -i Sample.smi -o SampleOut.smi
 838 
 839     To retrieve molecules containing amine, halogens or carboxylic  acid functional groups
 840     and write out a SMILES file, type: 
 841 
 842         % RDKitSearchFunctionalGroups.py -f 'Amine,Halogen,CarboxylicAcid'
 843           --combineOperator or -i Sample.smi -o SampleOut.smi
 844 
 845     To retrieve molecules containing amine and carboxylic acid functional groups defined in
 846     a local functional groups hierarchy file and write out individual SD files for each
 847     funcitonal group, type: 
 848 
 849         % RDKitSearchFunctionalGroups.py -f 'Amine,CarboxylicAcid' -i Sample.sdf 
 850           -g Custom_Functional_Group_Hierarchy.txt --combineMatches No -o SampleOut.sdf
 851 
 852     To count number of all functional groups in molecules without writing out an output
 853     files, type:
 854 
 855         % RDKitSearchFunctionalGroups.py -m count -f All --combineMatches no -i Sample.smi
 856 
 857     To retrieve molecule not containing aromatic alcohol and aromatic halogen functional
 858     group along with the use of chirality during substructure search and write out individual
 859     SMILES files for each functional group, type: 
 860 
 861         % RDKitSearchFunctionalGroups.py --combineMatches no -u yes
 862            -f '!Alcohol.Aromatic,!Halogen.Aromatic' -i Sample.smi -o SampleOut.smi
 863 
 864     To retrieve molecule containing amine functional group from a CSV SMILES file,
 865     SMILES strings in column 1, name in column 2, and write out a SD file, type: 
 866 
 867         % RDKitSearchFunctionalGroups.py -f Amine --infileParams
 868           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 869           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 870           -i SampleSMILES.csv -o SampleOut.sdf
 871 
 872 Author:
 873     Manish Sud(msud@san.rr.com)
 874 
 875 See also:
 876     RDKitConvertFileFormat.py, RDKitFilterPAINS.py, RDKitSearchSMARTS.py
 877 
 878 Copyright:
 879     Copyright (C) 2019 Manish Sud. All rights reserved.
 880 
 881     The functionality available in this script is implemented using RDKit, an
 882     open source toolkit for cheminformatics developed by Greg Landrum.
 883 
 884     This file is part of MayaChemTools.
 885 
 886     MayaChemTools is free software; you can redistribute it and/or modify it under
 887     the terms of the GNU Lesser General Public License as published by the Free
 888     Software Foundation; either version 3 of the License, or (at your option) any
 889     later version.
 890 
 891 """
 892 
 893 if __name__ == "__main__":
 894     main()