MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitSearchFunctionalGroups.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import multiprocessing as mp
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem import AllChem
  43     from rdkit.Chem import FunctionalGroups
  44 except ImportError as ErrMsg:
  45     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  46     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  47     sys.exit(1)
  48 
  49 # MayaChemTools imports...
  50 try:
  51     from docopt import docopt
  52     import MiscUtil
  53     import RDKitUtil
  54 except ImportError as ErrMsg:
  55     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  56     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  57     sys.exit(1)
  58 
  59 ScriptName = os.path.basename(sys.argv[0])
  60 Options = {}
  61 OptionsInfo = {}
  62 
  63 FunctionalGroupsMap = {}
  64 
  65 def main():
  66     """Start execution of the script."""
  67     
  68     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  69     
  70     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  71     
  72     # Retrieve command line arguments and options...
  73     RetrieveOptions()
  74     
  75     # Process and validate command line arguments and options...
  76     ProcessOptions()
  77     
  78     # Perform actions required by the script...
  79     PerformFunctionalGroupsSearch()
  80     
  81     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  82     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  83 
  84 def PerformFunctionalGroupsSearch():
  85     """Retrieve functional groups information and perform search."""
  86 
  87     # Process functional groups info...
  88     ProcessFunctionalGroupsInfo()
  89     
  90     # Setup pattern mols for functional group SMARTS...
  91     GroupsPatternMols = SetupFunctionalGroupsSMARTSPatterns()
  92     
  93     # Setup a molecule reader...
  94     MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
  95     Mols  = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
  96     
  97     # Set up  molecule writers...
  98     Writer,  GroupOutfilesWriters = SetupMoleculeWriters()
  99 
 100     MolCount, ValidMolCount, RemainingMolCount,  GroupsPatternMatchCountList = ProcessMolecules(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters)
 101     
 102     if Writer is not None:
 103         Writer.close()
 104     for GroupOutfileWriter in GroupOutfilesWriters:
 105         GroupOutfileWriter.close()
 106         
 107     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 108     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 109     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 110 
 111     MiscUtil.PrintInfo("\nTotal number of molecules matched against specified match criteria: %d" % RemainingMolCount)
 112     
 113     MiscUtil.PrintInfo("\nNumber of molecuels matched against individual functional groups:")
 114     MiscUtil.PrintInfo("FunctionalGroupName,MatchCount")
 115     
 116     for GroupIndex in range(0, len(OptionsInfo["SpecifiedFunctionalGroups"])):
 117         GroupName = OptionsInfo["SpecifiedFunctionalGroups"][GroupIndex]
 118         if OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"][GroupIndex]:
 119             GroupName = '!' + GroupName
 120         GroupMatchCount = GroupsPatternMatchCountList[GroupIndex]
 121         MiscUtil.PrintInfo("%s,%d" % (GroupName, GroupMatchCount))
 122 
 123 def ProcessMolecules(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters):
 124     """Process and search molecules."""
 125 
 126     if OptionsInfo["MPMode"]:
 127         return ProcessMoleculesUsingMultipleProcesses(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters)
 128     else:
 129         return ProcessMoleculesUsingSingleProcess(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters)
 130     
 131 def ProcessMoleculesUsingSingleProcess(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters):
 132     """Process and search molecules using a single process."""
 133 
 134     MiscUtil.PrintInfo("\nSearching functional groups...")
 135     
 136     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 137     CombineMatchResults = OptionsInfo["CombineMatchResults"]
 138     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 139     
 140     GroupsPatternsMatchCountList = [0] * len(OptionsInfo["SpecifiedFunctionalGroups"])
 141     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 142     
 143     FirstMol = True
 144     for Mol in Mols:
 145         MolCount += 1
 146         
 147         if Mol is None:
 148             continue
 149         
 150         if RDKitUtil.IsMolEmpty(Mol):
 151             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 152             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 153             continue
 154         
 155         ValidMolCount += 1
 156         if FirstMol:
 157             FirstMol = False
 158             if SetSMILESMolProps:
 159                 if Writer is not None:
 160                     RDKitUtil.SetWriterMolProps(Writer, Mol)
 161                 for GroupOutfileWriter in GroupOutfilesWriters:
 162                     if GroupOutfileWriter is not None:
 163                         RDKitUtil.SetWriterMolProps(GroupOutfileWriter, Mol)
 164         
 165         # Match molecule against functional group patterns...
 166         MolMatched, GroupsPatternMatchStatusList = MatchMolecule(Mol, GroupsPatternMols)
 167 
 168         # Update functional group match count...
 169         for GroupIndex, MatchStatus in enumerate(GroupsPatternMatchStatusList):
 170             if MatchStatus:
 171                 GroupsPatternsMatchCountList[GroupIndex] += 1
 172         
 173         if not MolMatched:
 174             continue
 175         
 176         RemainingMolCount += 1
 177         WriteMolecule(Writer, GroupOutfilesWriters, Mol,  Compute2DCoords, CombineMatchResults, GroupsPatternMatchStatusList)
 178     
 179     return (MolCount, ValidMolCount, RemainingMolCount,  GroupsPatternsMatchCountList)
 180 
 181 def ProcessMoleculesUsingMultipleProcesses(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters):
 182     """Process and search molecules using multiprocessing."""
 183 
 184     MiscUtil.PrintInfo("\nSearching functional groups  using multiprocessing...")
 185     
 186     MPParams = OptionsInfo["MPParams"]
 187     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 188     CombineMatchResults = OptionsInfo["CombineMatchResults"]
 189     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 190     
 191     # Setup data for initializing a worker process...
 192     MiscUtil.PrintInfo("Encoding options info and functional groups pattern molecules...")
 193     OptionsInfo["EncodedGroupPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in GroupsPatternMols]
 194     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo), MiscUtil.ObjectToBase64EncodedString(FunctionalGroupsMap))
 195 
 196     # Setup a encoded mols data iterable for a worker process...
 197     WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
 198 
 199     # Setup process pool along with data initialization for each process...
 200     MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
 201     MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
 202     
 203     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
 204     
 205     # Start processing...
 206     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
 207         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 208     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
 209         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
 210     else:
 211         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
 212     
 213     GroupsPatternsMatchCountList = [0] * len(OptionsInfo["SpecifiedFunctionalGroups"])
 214     
 215     (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3
 216     
 217     FirstMol = True
 218     for Result in Results:
 219         MolCount += 1
 220         MolIndex, EncodedMol, MolMatched, GroupsPatternMatchStatusList = Result
 221         
 222         if EncodedMol is None:
 223             continue
 224         ValidMolCount += 1
 225         
 226         Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 227         
 228         if FirstMol:
 229             FirstMol = False
 230             if SetSMILESMolProps:
 231                 if Writer is not None:
 232                     RDKitUtil.SetWriterMolProps(Writer, Mol)
 233                 for GroupOutfileWriter in GroupOutfilesWriters:
 234                     if GroupOutfileWriter is not None:
 235                         RDKitUtil.SetWriterMolProps(GroupOutfileWriter, Mol)
 236         
 237         # Update functional group match count...
 238         for GroupIndex, MatchStatus in enumerate(GroupsPatternMatchStatusList):
 239             if MatchStatus:
 240                 GroupsPatternsMatchCountList[GroupIndex] += 1
 241         
 242         if not MolMatched:
 243             continue
 244         
 245         RemainingMolCount += 1
 246         WriteMolecule(Writer, GroupOutfilesWriters, Mol,  Compute2DCoords, CombineMatchResults, GroupsPatternMatchStatusList)
 247     
 248     return (MolCount, ValidMolCount, RemainingMolCount,  GroupsPatternsMatchCountList)
 249 
 250 def InitializeWorkerProcess(*EncodedArgs):
 251     """Initialize data for a worker process."""
 252     
 253     global Options, OptionsInfo, FunctionalGroupsMap
 254 
 255     MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
 256     
 257     # Decode Options and OptionInfo...
 258     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
 259     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
 260     FunctionalGroupsMap = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[2])
 261 
 262     # Decode ChEMBLPatternMols...
 263     OptionsInfo["GroupPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedGroupPatternMols"]]
 264     
 265 def WorkerProcess(EncodedMolInfo):
 266     """Process data for a worker process."""
 267 
 268     MolIndex, EncodedMol = EncodedMolInfo
 269 
 270     if EncodedMol is None:
 271         return [MolIndex, None, False, None]
 272     
 273     Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
 274     if RDKitUtil.IsMolEmpty(Mol):
 275         MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
 276         MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 277         return [MolIndex, None, False, None]
 278         
 279     # Match molecule against functional group patterns...
 280     MolMatched, GroupsPatternMatchStatusList = MatchMolecule(Mol, OptionsInfo["GroupPatternMols"])
 281     
 282     return [MolIndex, EncodedMol, MolMatched, GroupsPatternMatchStatusList]
 283 
 284 def MatchMolecule(Mol, GroupsPatternMols):
 285     """Search for functional groups in a molecule."""
 286 
 287     GroupsPatternMatchStatusList = []
 288     
 289     # Match pattern mols...
 290     for GroupIndex in range(0, len(OptionsInfo["SpecifiedFunctionalGroups"])):
 291         Status = DoesPatternMolMatch(GroupsPatternMols[GroupIndex], Mol, OptionsInfo["UseChirality"], OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"][GroupIndex])
 292         GroupsPatternMatchStatusList.append(Status)
 293         
 294     # Match mol against all specified criteria...
 295     MolMatched = DoesMolMeetSpecifiedMatchCriteria(GroupsPatternMatchStatusList, OptionsInfo["CombineMatchResults"], OptionsInfo["AndCombineOperatorMode"])
 296 
 297     return (MolMatched, GroupsPatternMatchStatusList)
 298 
 299 def DoesMolMeetSpecifiedMatchCriteria(GroupsPatternMolsMatchStatus,  CombineMatchResults, AndCombineOperatorMode):
 300     """Match molecule using specified match criteia."""
 301 
 302     if CombineMatchResults and AndCombineOperatorMode:
 303         # Must match all specified SMARTS
 304         Status = True
 305         for MatchStatus in GroupsPatternMolsMatchStatus:
 306             if not MatchStatus:
 307                 Status = False
 308                 break
 309     else:
 310         # One match is enough...
 311         Status = False
 312         for MatchStatus in GroupsPatternMolsMatchStatus:
 313             if MatchStatus:
 314                 Status = True
 315                 break
 316     
 317     return Status
 318     
 319 def WriteMolecule(Writer, GroupOutfilesWriters, Mol, Compute2DCoords, CombineMatchResults, GroupsPatternMatchStatusList):
 320     """Write out molecule."""
 321     
 322     if OptionsInfo["CountMode"]:
 323         return
 324     
 325     if Compute2DCoords:
 326         AllChem.Compute2DCoords(Mol)
 327     
 328     if CombineMatchResults:
 329         Writer.write(Mol)
 330     else:
 331         for GroupIndex in range(0, len(GroupsPatternMatchStatusList)):
 332             if GroupsPatternMatchStatusList[GroupIndex]:
 333                 GroupOutfilesWriters[GroupIndex].write(Mol)
 334     
 335 def SetupMoleculeWriters():
 336     """Set up molecule writers for output files."""
 337 
 338     Writer = None
 339     GroupOutfilesWriters = []
 340     
 341     if OptionsInfo["CountMode"]:
 342         return (Writer, GroupOutfilesWriters)
 343     
 344     Outfile = OptionsInfo["Outfile"]
 345     CombineMatchResults = OptionsInfo["CombineMatchResults"]
 346     GroupsOutfiles = OptionsInfo["SpecifiedFunctionalGroupsOutfiles"]
 347     
 348     if CombineMatchResults:
 349         Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 350         if Writer is None:
 351             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 352         MiscUtil.PrintInfo("Generating file %s..." % Outfile)
 353     else:
 354         for GroupOutfile in GroupsOutfiles:
 355             GroupOutfileWriter = RDKitUtil.MoleculesWriter(GroupOutfile, **OptionsInfo["OutfileParams"])
 356             if GroupOutfileWriter is None:
 357                 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Writer)
 358             GroupOutfilesWriters.append(GroupOutfileWriter)
 359         
 360         GroupsCount = len(GroupsOutfiles)
 361         if GroupsCount > 4:
 362             MiscUtil.PrintInfo("Generating %d output files with the following file name format: %s<GroupName>.%s" % (GroupsCount, OptionsInfo["OutfileBasename"], OptionsInfo["OutfileExt"]))
 363         else:
 364             Delmiter = ', '
 365             OutfileNames = Delmiter.join(GroupsOutfiles)
 366             MiscUtil.PrintInfo("Generating %d output files: %s..." % (GroupsCount, OutfileNames))
 367 
 368     return (Writer, GroupOutfilesWriters)
 369     
 370 def DoesPatternMolMatch(PatternMol, Mol, UseChirality, NegateMatch):
 371     """Perform a substructure match for the presence of pattern molecule in a molecule."""
 372 
 373     MolMatched = Mol.HasSubstructMatch(PatternMol, useChirality = UseChirality)
 374     if NegateMatch:
 375         if MolMatched:
 376             MolMatched = False
 377         else:
 378             MolMatched = True
 379     
 380     return MolMatched
 381     
 382 def ProcessFunctionalGroupsInfo():
 383     """Process functional groups information."""
 384     
 385     RetrieveFunctionalGroupsInfo()
 386     ProcessSpecifiedFunctionalGroups()
 387     
 388     SetupFunctionalGroupsOutputFileNames()
 389 
 390 def ProcessSpecifiedFunctionalGroups():
 391     """Process and validate specified functional groups."""
 392     
 393     OptionsInfo["SpecifiedFunctionalGroups"] = []
 394     OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"] = []
 395     
 396     if re.match("^All$", OptionsInfo["FunctionalGroups"], re.I):
 397         OptionsInfo["SpecifiedFunctionalGroups"] = FunctionalGroupsMap['Names']
 398         OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"] = [False] * len(OptionsInfo["SpecifiedFunctionalGroups"])
 399         return
 400     
 401     # Set up a map of valid group names for checking specified group names...
 402     CanonicalGroupNameMap = {}
 403     for GroupName in FunctionalGroupsMap['Names']:
 404         CanonicalGroupNameMap[GroupName.lower()] = GroupName
 405         
 406     # Parse and validate specified names...
 407     GroupNames = re.sub(" ", "", OptionsInfo["FunctionalGroups"])
 408     if not GroupNames:
 409         MiscUtil.PrintError("No functional group name specified for \"-f, --functionalGroups\" option")
 410     
 411     SpecifiedFunctionalGroups = []
 412     SpecifiedNegateMatchStatus = []
 413     
 414     for GroupName in GroupNames.split(","):
 415         CanonicalGroupName = GroupName.lower()
 416         NegateMatchStatus = False
 417         if re.match("^!", CanonicalGroupName, re.I):
 418             NegateMatchStatus = True
 419             CanonicalGroupName = re.sub("^!", "", CanonicalGroupName)
 420         if CanonicalGroupName in CanonicalGroupNameMap:
 421             SpecifiedFunctionalGroups.append(CanonicalGroupNameMap[CanonicalGroupName])
 422             SpecifiedNegateMatchStatus.append(NegateMatchStatus)
 423         else:
 424             MiscUtil.PrintWarning("The functional group name, %s, specified using \"-f, --functionalGroups\" option is not a valid name." % (GroupName))
 425 
 426     if not len(SpecifiedFunctionalGroups):
 427         MiscUtil.PrintError("No valid functional group names specified for \"-f, --functionalGroups\" option")
 428         
 429     OptionsInfo["SpecifiedFunctionalGroups"] = SpecifiedFunctionalGroups
 430     OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"] = SpecifiedNegateMatchStatus
 431 
 432 def SetupFunctionalGroupsSMARTSPatterns():
 433     """Setup SMARTS patterns for specified functional groups."""
 434 
 435     OptionsInfo["SpecifiedFunctionalGroupsSMARTSPatterns"] = []
 436     FunctionalGroupsPatternMols = []
 437     
 438     for Name in OptionsInfo["SpecifiedFunctionalGroups"]:
 439         SMARTSPattern = FunctionalGroupsMap['SMARTSPattern'][Name]
 440         PatternMol = Chem.MolFromSmarts(SMARTSPattern)
 441         if PatternMol is None:
 442             MiscUtil.PrintError("Failed to parse SMARTS pattern, %s, for function group, %s" % (SMARTSPattern, Name))
 443         
 444         OptionsInfo["SpecifiedFunctionalGroupsSMARTSPatterns"].append(SMARTSPattern)
 445         FunctionalGroupsPatternMols.append(PatternMol)
 446     
 447     return FunctionalGroupsPatternMols
 448 
 449 def SetupFunctionalGroupsOutputFileNames():
 450     """Setup output file names for specified functional group names."""
 451 
 452     OptionsInfo["SpecifiedFunctionalGroupsOutfiles"] = []
 453     
 454     if OptionsInfo["CountMode"]:
 455         # No need of any output file...
 456         return
 457     
 458     if OptionsInfo["CombineMatchResults"]:
 459         # No need of output files for specified functional groups...
 460         return
 461     
 462     OutfileBasename = OptionsInfo["OutfileBasename"]
 463     OutfileExt = OptionsInfo["OutfileExt"]
 464     SpecifiedFunctionalGroupsOutfiles = []
 465 
 466     GroupsCount = len(OptionsInfo["SpecifiedFunctionalGroups"])
 467     for GroupIndex in range(0, GroupsCount):
 468         GroupName = OptionsInfo["SpecifiedFunctionalGroups"][GroupIndex]
 469         if OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"][GroupIndex]:
 470             GroupName = "Not" + GroupName
 471         GroupName = re.sub("\.", "", GroupName)
 472         
 473         GroupOutfile = "%s%s.%s" % (OutfileBasename, GroupName, OutfileExt)
 474         SpecifiedFunctionalGroupsOutfiles.append(GroupOutfile)
 475 
 476     OptionsInfo["SpecifiedFunctionalGroupsOutfiles"] = SpecifiedFunctionalGroupsOutfiles
 477     
 478 def RetrieveFunctionalGroupsInfo():
 479     """Retrieve functional groups information."""
 480 
 481     MiscUtil.PrintInfo("\nRetrieving data from default RDKit functional groups hierarchy file Functional_Group_Hierarchy.txt...")
 482     
 483     FunctionalGroupNamesFile = OptionsInfo["GroupNamesFile"]
 484     FunctionalGroupsNodes = FunctionalGroups.BuildFuncGroupHierarchy(FunctionalGroupNamesFile)
 485 
 486     FunctionalGroupsMap['Names'] = []
 487     FunctionalGroupsMap['SMARTSPattern'] = {}
 488     
 489     RetrieveDataFromFunctionalGroupsHierarchy(FunctionalGroupsNodes)
 490 
 491     if not len(FunctionalGroupsMap['Names']):
 492         MiscUtil.PrintError("Failed to retrieve any functional group names and SMARTS patterns...")
 493         
 494     MiscUtil.PrintInfo("Total number of functional groups present functional group hierarchy: %d" % (len(FunctionalGroupsMap['Names'])))
 495     
 496 def RetrieveDataFromFunctionalGroupsHierarchy(FGNodes):
 497     """Retrieve functional groups data by recursively visiting functional group nodes."""
 498     
 499     for FGNode in FGNodes:
 500         Name = FGNode.label
 501         SMARTSPattern = FGNode.smarts
 502 
 503         if Name in FunctionalGroupsMap['SMARTSPattern']:
 504             MiscUtil.PrintWarning("Ignoring duplicate functional group name: %s..." % Name)
 505         else:
 506             FunctionalGroupsMap['Names'].append(Name)
 507             FunctionalGroupsMap['SMARTSPattern'][Name] = SMARTSPattern
 508 
 509         RetrieveDataFromFunctionalGroupsHierarchy(FGNode.children)
 510 
 511 def ListFunctionalGroupsInfo():
 512     """List functional groups information."""
 513 
 514     MiscUtil.PrintInfo("\nListing available functional groups names and SMARTS patterns...")
 515     MiscUtil.PrintInfo("\nFunctionalGroupName\tSMARTSPattern")
 516     
 517     for Name in sorted(FunctionalGroupsMap['Names']):
 518         SMARTSPattern = FunctionalGroupsMap['SMARTSPattern'][Name]
 519         MiscUtil.PrintInfo("%s\t%s" % (Name, SMARTSPattern))
 520     
 521     MiscUtil.PrintInfo("")
 522 
 523 def ProcessOptions():
 524     """Process and validate command line arguments and options."""
 525     
 526     MiscUtil.PrintInfo("Processing options...")
 527     
 528     # Validate options...
 529     ValidateOptions()
 530     
 531     OptionsInfo["CombineMatches"] = Options["--combineMatches"]
 532     
 533     OptionsInfo["CombineMatchResults"] = True
 534     if re.match("^No$", Options["--combineMatches"], re.I):
 535         OptionsInfo["CombineMatchResults"] = False
 536         if Options["--outfile"]:
 537             FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 538             OptionsInfo["OutfileBasename"] = FileName
 539             OptionsInfo["OutfileExt"] = FileExt
 540     
 541     OptionsInfo["CombineOperator"] = Options["--combineOperator"]
 542     OptionsInfo["AndCombineOperatorMode"] = True
 543     if re.match("^or$", Options["--combineOperator"], re.I):
 544         OptionsInfo["AndCombineOperatorMode"] = False
 545     
 546     OptionsInfo["GroupNamesFile"] = None
 547     if not re.match("^auto$", Options["--groupNamesFile"], re.I):
 548         OptionsInfo["GroupNamesFile"] = Options["--groupNamesFile"]
 549         
 550     OptionsInfo["FunctionalGroups"] = Options["--functionalGroups"]
 551     
 552     OptionsInfo["Infile"] = Options["--infile"]
 553     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 554     
 555     OptionsInfo["Outfile"] = Options["--outfile"]
 556     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 557     
 558     OptionsInfo["Overwrite"] = Options["--overwrite"]
 559     
 560     OptionsInfo["CountMode"] = False
 561     if re.match("^count$", Options["--mode"], re.I):
 562         OptionsInfo["CountMode"] = True
 563     
 564     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 565     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 566     
 567     OptionsInfo["UseChirality"] = False
 568     if re.match("^yes$", Options["--useChirality"], re.I):
 569         OptionsInfo["UseChirality"] = True
 570 
 571 def RetrieveOptions():
 572     """Retrieve command line arguments and options."""
 573     
 574     # Get options...
 575     global Options
 576     Options = docopt(_docoptUsage_)
 577 
 578     # Set current working directory to the specified directory...
 579     WorkingDir = Options["--workingdir"]
 580     if WorkingDir:
 581         os.chdir(WorkingDir)
 582     
 583     # Handle examples option...
 584     if "--examples" in Options and Options["--examples"]:
 585         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 586         sys.exit(0)
 587     
 588     # Handle listing of functional group information...
 589     if  Options and Options["--list"]:
 590         ProcessListFunctionalGroupsOption()
 591         sys.exit(0)
 592 
 593 def ProcessListFunctionalGroupsOption():
 594     """Process list functional groups information."""
 595 
 596     # Validate and process dataFile option for listing functional groups information...
 597     OptionsInfo["GroupNamesFile"] = None
 598     if not re.match("^auto$", Options["--groupNamesFile"], re.I):
 599         MiscUtil.ValidateOptionFilePath("-g, --groupNamesFile", Options["--groupNamesFile"])
 600         OptionsInfo["GroupNamesFile"] = Options["--groupNamesFile"]
 601     
 602     RetrieveFunctionalGroupsInfo()
 603     ListFunctionalGroupsInfo()
 604     
 605 def ValidateOptions():
 606     """Validate option values."""
 607     
 608     MiscUtil.ValidateOptionTextValue("-c, --combineMatches", Options["--combineMatches"], "yes no")
 609     MiscUtil.ValidateOptionTextValue("--combineOperator", Options["--combineOperator"], "and or")
 610     
 611     if not re.match("^auto$", Options["--groupNamesFile"], re.I):
 612         MiscUtil.ValidateOptionFilePath("-g, groupNamesFile", Options["--groupNamesFile"])
 613         
 614     if re.match("^none$", Options["--functionalGroups"], re.I):
 615         MiscUtil.PrintError("The name(s) of functional groups must be specified using \"-f, --functionalGroups\" option")
 616         
 617     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 618     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 619     if Options["--outfile"]:
 620         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 621         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 622         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 623     
 624     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "retrieve count")
 625     if re.match("^retrieve$", Options["--mode"], re.I):
 626         if not Options["--outfile"]:
 627             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"retrieve\" value of \"-m, --mode\" option")
 628         
 629     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 630     
 631     MiscUtil.ValidateOptionTextValue("--useChirality", Options["--useChirality"], "yes no")
 632 
 633 # Setup a usage string for docopt...
 634 _docoptUsage_ = """
 635 RDKitSearchFunctionalGroups.py - Search for functional groups using SMARTS patterns
 636 
 637 Usage:
 638     RDKitSearchFunctionalGroups.py  [--combineMatches <yes or no>] [--combineOperator <and or or>]
 639                                            [--groupNamesFile <FileName or auto>] [--infileParams <Name,Value,...>]
 640                                            [--mode <retrieve or count>] [--mp <yes or no>] [--mpParams <Name,Value,...>]
 641                                            [--negate <yes or no>] [--outfileParams <Name,Value,...>] [--overwrite]
 642                                            [--useChirality <yes or no>] [-w <dir>] [-o <outfile>] -i <infile> -f <Name1,Name2,Name3... or All>
 643     RDKitSearchFunctionalGroups.py [--groupNamesFile <FileName or auto>] -l | --list
 644     RDKitSearchFunctionalGroups.py -h | --help | -e | --examples
 645 
 646 Description:
 647     Perform a substructure search in an input file using SMARTS patterns for functional
 648     groups and write out the matched molecules to an output file or simply count the
 649     number of matches.
 650 
 651     The SMARTS patterns for specified functional group(s) are retrieved from file,
 652     Functional_Group_Hierarchy.txt, available in RDKit data directory.
 653 
 654     The names of valid functional groups and hierarchies  are dynamically retrieved from the
 655     functional groups hierarchy file and are shown below:
 656 
 657         AcidChloride, AcidChloride.Aromatic, AcidChloride.Aliphatic
 658         Alcohol, Alcohol.Aromatic, Alcohol.Aliphatic
 659         Aldehyde, Aldehyde.Aromatic, Aldehyde.Aliphatic
 660         Amine, Amine.Primary, Amine.Primary.Aromatic, Amine.Primary.Aliphatic,
 661         Amine.Secondary, Amine.Secondary.Aromatic, Amine.Secondary.Aliphatic
 662         Amine.Tertiary, Amine.Tertiary.Aromatic, Amine.Tertiary.Aliphatic
 663         Amine.Aromatic, Amine.Aliphatic, Amine.Cyclic
 664         Azide, Azide.Aromatic, Azide.Aliphatic
 665         BoronicAcid, BoronicAcid.Aromatic, BoronicAcid.Aliphatic
 666         CarboxylicAcid, CarboxylicAcid.Aromatic, CarboxylicAcid.Aliphatic,
 667         CarboxylicAcid.AlphaAmino
 668         Halogen, Halogen.Aromatic, Halogen.Aliphatic
 669         Halogen.NotFluorine, Halogen.NotFluorine.Aliphatic,
 670         Halogen.NotFluorine.Aromatic
 671         Halogen.Bromine, Halogen.Bromine.Aliphatic, Halogen.Bromine.Aromatic,
 672         Halogen.Bromine.BromoKetone
 673         Isocyanate, Isocyanate.Aromatic, Isocyanate.Aliphatic
 674         Nitro, Nitro.Aromatic, Nitro.Aliphatic,
 675         SulfonylChloride, SulfonylChloride.Aromatic, SulfonylChloride.Aliphatic
 676         TerminalAlkyne
 677 
 678     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt)
 679 
 680     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 681 
 682 Options:
 683     -c, --combineMatches <yes or no>  [default: yes]
 684         Combine search results for matching SMARTS patterns of specified functional groups
 685         against a molecule. Possible values: yes or no.
 686         
 687         The matched molecules are written to a single output file for "yes" value. Otherwise,
 688         multiple output files are generated, one for each functional group. The names of  
 689         these files correspond to a combination of the basename of the specified output file
 690         and the name of the functional group.
 691         
 692         No output files are generated during "count" value of "-m, --mode" option.
 693     --combineOperator <and or or>  [default: and]
 694         Logical operator to use for combining match results corresponding to specified
 695         functional group names before writing out a single file. This option is ignored
 696         during "No" value of  "-c, --combineMatches" option.
 697     -e, --examples
 698         Print examples.
 699     -g, --groupNamesFile <FileName or auto>  [default: auto]
 700         Specify a file name containing data for functional groups hierarchy or use functional
 701         group hierarchy file, Functional_Group_Hierarchy.txt, available in RDKit data directory.
 702         
 703         RDKit data format: Name<tab>Smarts<tab>Label<tab>RemovalReaction (optional)
 704         
 705         The format of data in local functional group hierarchy must match format of the
 706         data in functional group file available in RDKit data directory.
 707     -f, --functionalGroups <Name1,Name2,Name3... or All>  [default: none]
 708         Functional group names for performing substructure SMARTS search. Possible values:
 709         Comma delimited list of valid functional group names or All. The current set of valid
 710         functional group names are listed in the description section.
 711         
 712         The match results for multiple functional group names are combined using 'and'
 713         operator before writing them out to single file. No merging of match results takes
 714         place during generation of individual result files corresponding to fictional group
 715         names.
 716         
 717         The functional group name may be started with an exclamation mark to negate
 718         the match result for that fictional group.
 719     -h, --help
 720         Print this help message.
 721     -i, --infile <infile>
 722         Input file name.
 723     --infileParams <Name,Value,...>  [default: auto]
 724         A comma delimited list of parameter name and value pairs for reading
 725         molecules from files. The supported parameter names for different file
 726         formats, along with their default values, are shown below:
 727             
 728             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 729             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 730                 smilesTitleLine,auto,sanitize,yes
 731             
 732         Possible values for smilesDelimiter: space, comma or tab.
 733     -l, --list
 734         List functional groups information without performing any search.
 735     -m, --mode <retrieve or count>  [default: retrieve]
 736         Specify whether to retrieve and write out matched molecules to an output
 737         file or simply count the number of matches.
 738     --mp <yes or no>  [default: no]
 739         Use multiprocessing.
 740          
 741         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 742         function employing lazy RDKit data iterable. This allows processing of
 743         arbitrary large data sets without any additional requirements memory.
 744         
 745         All input data may be optionally loaded into memory by mp.Pool.map()
 746         before starting worker processes in a process pool by setting the value
 747         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 748         
 749         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 750         data mode may adversely impact the performance. The '--mpParams' section
 751         provides additional information to tune the value of 'chunkSize'.
 752     --mpParams <Name,Value,...>  [default: auto]
 753         A comma delimited list of parameter name and value pairs to configure
 754         multiprocessing.
 755         
 756         The supported parameter names along with their default and possible
 757         values are shown below:
 758         
 759             chunkSize, auto
 760             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 761             numProcesses, auto   [ Default: mp.cpu_count() ]
 762         
 763         These parameters are used by the following functions to configure and
 764         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 765         mp.Pool.imap().
 766         
 767         The chunkSize determines chunks of input data passed to each worker
 768         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 769         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 770         
 771         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 772         automatically converts RDKit data iterable into a list, loads all data into
 773         memory, and calculates the default chunkSize using the following method
 774         as shown in its code:
 775         
 776             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 777             if extra: chunkSize += 1
 778         
 779         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 780         and 100 data items.
 781         
 782         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 783         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 784         data into memory. Consequently, the size of input data is not known a priori.
 785         It's not possible to estimate an optimal value for the chunkSize. The default 
 786         chunkSize is set to 1.
 787         
 788         The default value for the chunkSize during 'Lazy' data mode may adversely
 789         impact the performance due to the overhead associated with exchanging
 790         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 791         a larger value during 'Lazy' input data mode, based on the size of your input
 792         data and number of processes in the process pool.
 793         
 794         The mp.Pool.map() function waits for all worker processes to process all
 795         the data and return the results. The mp.Pool.imap() function, however,
 796         returns the the results obtained from worker processes as soon as the
 797         results become available for specified chunks of data.
 798         
 799         The order of data in the results returned by both mp.Pool.map() and 
 800         mp.Pool.imap() functions always corresponds to the input data.
 801     -o, --outfile <outfile>
 802         Output file name.
 803     --outfileParams <Name,Value,...>  [default: auto]
 804         A comma delimited list of parameter name and value pairs for writing
 805         molecules to files. The supported parameter names for different file
 806         formats, along with their default values, are shown below:
 807             
 808             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 809             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 810                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 811             
 812         Default value for compute2DCoords: yes for SMILES input file; no for all other
 813         file types.
 814     --overwrite
 815         Overwrite existing files.
 816     -u, --useChirality <yes or no>  [default: no]
 817         Use stereochemistry information for SMARTS search.
 818     -w, --workingdir <dir>
 819         Location of working directory which defaults to the current directory.
 820 
 821 Examples:
 822     To list names of all available functional groups along with their SMARTS
 823     patterns, type:
 824 
 825         % RDKitSearchFunctionalGroups.py -l
 826 
 827     To retrieve molecules containing amine functional group and write out a
 828     SMILES file, type: 
 829 
 830         % RDKitSearchFunctionalGroups.py -f Amine -i Sample.smi -o SampleOut.smi
 831 
 832     To retrieve molecules containing amine functional group, perform search in
 833     multiprocessing mode on all  available CPUs without loading all data into
 834     memory, and write out a SMILES file, type: 
 835 
 836         % RDKitSearchFunctionalGroups.py --mp yes -f Amine -i Sample.smi
 837           -o SampleOut.smi
 838 
 839     To retrieve molecules containing amine functional group, perform search in
 840     multiprocessing mode on all  available CPUs by loading all data into memory,
 841     and write out a SMILES file, type: 
 842 
 843         % RDKitSearchFunctionalGroups.py --mp yes --mpParams "inputDataMode,
 844           InMemory" -f Amine -i Sample.smi -o SampleOut.smi
 845 
 846     To retrieve molecules containing amine functional group, perform search in
 847     multiprocessing mode on specific number of CPUs and chunksize without loading
 848     all data into memory, and write out a SMILES file, type: 
 849 
 850         % RDKitSearchFunctionalGroups.py --mp yes --mpParams "inputDataMode,
 851           lazy,numProcesses,4,chunkSize,8" -f Amine -i Sample.smi -o
 852           SampleOut.smi
 853 
 854     To retrieve molecules containing amine functional group but not halogens and carboxylic
 855     acid functional groups and write out a SMILES file, type: 
 856 
 857         % RDKitSearchFunctionalGroups.py -f 'Amine,!Halogen,!CarboxylicAcid'
 858           -i Sample.smi -o SampleOut.smi
 859 
 860     To retrieve molecules containing amine, halogens or carboxylic  acid functional groups
 861     and write out a SMILES file, type: 
 862 
 863         % RDKitSearchFunctionalGroups.py -f 'Amine,Halogen,CarboxylicAcid'
 864           --combineOperator or -i Sample.smi -o SampleOut.smi
 865 
 866     To retrieve molecules containing amine and carboxylic acid functional groups defined in
 867     a local functional groups hierarchy file and write out individual SD files for each
 868     funcitonal group, type: 
 869 
 870         % RDKitSearchFunctionalGroups.py -f 'Amine,CarboxylicAcid' -i Sample.sdf 
 871           -g Custom_Functional_Group_Hierarchy.txt --combineMatches No -o SampleOut.sdf
 872 
 873     To count number of all functional groups in molecules without writing out an output
 874     files, type:
 875 
 876         % RDKitSearchFunctionalGroups.py -m count -f All --combineMatches no -i Sample.smi
 877 
 878     To retrieve molecule not containing aromatic alcohol and aromatic halogen functional
 879     group along with the use of chirality during substructure search and write out individual
 880     SMILES files for each functional group, type: 
 881 
 882         % RDKitSearchFunctionalGroups.py --combineMatches no -u yes
 883            -f '!Alcohol.Aromatic,!Halogen.Aromatic' -i Sample.smi -o SampleOut.smi
 884 
 885     To retrieve molecule containing amine functional group from a CSV SMILES file,
 886     SMILES strings in column 1, name in column 2, and write out a SD file, type: 
 887 
 888         % RDKitSearchFunctionalGroups.py -f Amine --infileParams
 889           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 890           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 891           -i SampleSMILES.csv -o SampleOut.sdf
 892 
 893 Author:
 894     Manish Sud(msud@san.rr.com)
 895 
 896 See also:
 897     RDKitConvertFileFormat.py, RDKitFilterPAINS.py, RDKitSearchSMARTS.py
 898 
 899 Copyright:
 900     Copyright (C) 2024 Manish Sud. All rights reserved.
 901 
 902     The functionality available in this script is implemented using RDKit, an
 903     open source toolkit for cheminformatics developed by Greg Landrum.
 904 
 905     This file is part of MayaChemTools.
 906 
 907     MayaChemTools is free software; you can redistribute it and/or modify it under
 908     the terms of the GNU Lesser General Public License as published by the Free
 909     Software Foundation; either version 3 of the License, or (at your option) any
 910     later version.
 911 
 912 """
 913 
 914 if __name__ == "__main__":
 915     main()