MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitPerformRGroupDecomposition.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem
  42     from rdkit.Chem import rdRGroupDecomposition as rgd
  43     from rdkit.Chem import rdFMCS
  44 except ImportError as ErrMsg:
  45     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  46     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  47     sys.exit(1)
  48 
  49 # MayaChemTools imports...
  50 try:
  51     from docopt import docopt
  52     import MiscUtil
  53     import RDKitUtil
  54 except ImportError as ErrMsg:
  55     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  56     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  57     sys.exit(1)
  58 
  59 ScriptName = os.path.basename(sys.argv[0])
  60 Options = {}
  61 OptionsInfo = {}
  62 
  63 def main():
  64     """Start execution of the script."""
  65     
  66     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  67     
  68     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  69     
  70     # Retrieve command line arguments and options...
  71     RetrieveOptions()
  72     
  73     # Process and validate command line arguments and options...
  74     ProcessOptions()
  75     
  76     # Perform actions required by the script...
  77     PerformRGroupDecomposition()
  78     
  79     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  80     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  81 
  82 def PerformRGroupDecomposition():
  83     """Perform R group decomposition."""
  84 
  85     # Retrieve molecules...
  86     Mols = RetrieveMolecules()
  87 
  88     # Identify R groups and write them out...
  89     RGroups, UnmatchedMolIndices = RetrieveRGroups(Mols)
  90     WriteRGroups(Mols, RGroups, UnmatchedMolIndices)
  91     
  92 def RetrieveRGroups(Mols):
  93     """Retrieve R groups."""
  94     
  95     CoreMols = SetupCoreScaffolds(Mols)
  96     DecompositionParams = SetupRGroupDecompositionParams()
  97     RGroupDecompositionObject = rgd.RGroupDecomposition(CoreMols, DecompositionParams)
  98     
  99     MiscUtil.PrintInfo("\nPerforming R group decomposition...")
 100     
 101     UnmatchedMolIndices = []
 102     for MolIndex, Mol in enumerate(Mols):
 103         Status = RGroupDecompositionObject.Add(Mol)
 104         if Status < 0:
 105             UnmatchedMolIndices.append(MolIndex)
 106     
 107     if not RGroupDecompositionObject.Process():
 108         MiscUtil.PrintWarning("R group decomposition failed to match any molecule to core scaffold(s)...")
 109     
 110     RGroups = RGroupDecompositionObject.GetRGroupsAsColumns(asSmiles=True)
 111 
 112     return (RGroups, UnmatchedMolIndices)
 113     
 114 def SetupCoreScaffolds(Mols):
 115     """Setup core scaffold molecules(s)."""
 116 
 117     if re.match("^(BySMARTS|BySMILES)$", OptionsInfo["CoreScaffold"], re.I):
 118         return SetupCoreScaffoldsBySMARTSOrSMILES()
 119     elif re.match("^ByMCS$", OptionsInfo["CoreScaffold"], re.I):
 120         return SetupCoreScaffoldsByMCS(Mols)
 121     else:
 122         MiscUtil.PrintError("The  value, %s, specified for  \"-c, --coreScaffold\" option is not supported." % (OptionsInfo["CoreScaffold"]))
 123         
 124 def SetupCoreScaffoldsBySMARTSOrSMILES():
 125     """Setup core scaffold molecules(s) using specified SMARTS or SMILES."""
 126 
 127     BySMARTS = True if re.match("^BySMARTS$", OptionsInfo["CoreScaffold"], re.I) else False
 128     CoreScaffoldList = OptionsInfo["SMARTSOrSMILESCoreScaffoldList"]
 129 
 130     if BySMARTS:
 131         MiscUtil.PrintInfo("\nSetting up core scaffold(s) using SMARTS...\nSMARTS core scaffold(s): %s" % " ".join(CoreScaffoldList))
 132     else:
 133         MiscUtil.PrintInfo("\nSetting up core scaffold(s) using SMILES...\nSMILES core scaffold(s): %s" % " ".join(CoreScaffoldList))
 134     
 135     CoreMols = []
 136     for Core in CoreScaffoldList:
 137         if BySMARTS:
 138             CoreMol = Chem.MolFromSmarts(Core)
 139         else:
 140             CoreMol = Chem.MolFromSmiles(Core)
 141         if CoreMol is None:
 142             MiscUtil.PrintError("Failed to generate mol for core scaffold: %s" % (Core))
 143         CoreMols.append(CoreMol)
 144 
 145     return CoreMols
 146 
 147 def SetupCoreScaffoldsByMCS(Mols):
 148     """Setup core scaffold molecule using MCS."""
 149 
 150     MiscUtil.PrintInfo("\nSetting up core scaffold using MCS...")
 151 
 152     MCSParams = OptionsInfo["MCSParams"]
 153     
 154     CoreMols = []
 155 
 156     MCSResultObject = rdFMCS.FindMCS(Mols, maximizeBonds = MCSParams["MaximizeBonds"], threshold = MCSParams["Threshold"], timeout = MCSParams["TimeOut"], verbose = MCSParams["Verbose"], matchValences = MCSParams["MatchValences"], ringMatchesRingOnly = MCSParams["RingMatchesRingOnly"], completeRingsOnly = MCSParams["CompleteRingsOnly"], matchChiralTag = MCSParams["MatchChiralTag"], atomCompare = MCSParams["AtomCompare"], bondCompare = MCSParams["BondCompare"], seedSmarts = MCSParams["SeedSMARTS"]) 
 157 
 158     if MCSResultObject.canceled:
 159         MiscUtil.PrintError("MCS failed to identify a core scaffold. Specify a different set of parameters using \"-m, --mcsParams\" option and try again.")
 160     
 161     CoreNumAtoms = MCSResultObject.numAtoms
 162     CoreNumBonds = MCSResultObject.numBonds
 163     SMARTSCore = MCSResultObject.smartsString
 164     
 165     if not len(SMARTSCore):
 166         MiscUtil.PrintError("MCS failed to identify a core scaffold. Specify a different set of parameters using \"-m, --mcsParams\" option and try again.")
 167         
 168     MiscUtil.PrintInfo("SMARTS core scaffold: %s\nNumber of atoms in core scaffold: %s\nNumber of bonds in core scaffold: %s" % (SMARTSCore, CoreNumAtoms, CoreNumBonds))
 169 
 170     if CoreNumAtoms < MCSParams["MinNumAtoms"]:
 171         MiscUtil.PrintError("Number of atoms, %d, in core scaffold identified by MCS is less than, %d, as specified by \"minNumAtoms\" parameter in  \"-m, --mcsParams\" option." % (CoreNumAtoms, MCSParams["MinNumAtoms"]))
 172     
 173     if CoreNumBonds < MCSParams["MinNumBonds"]:
 174         MiscUtil.PrintError("Number of bonds, %d, in core scaffold identified by MCS is less than, %d, as specified by \"minNumBonds\" parameter in  \"-m, --mcsParams\" option." % (CoreNumBonds, MCSParams["MinNumBonds"]))
 175     
 176     CoreMol = Chem.MolFromSmarts(SMARTSCore)
 177     CoreMols.append(CoreMol)
 178 
 179     return CoreMols
 180 
 181 def SetupRGroupDecompositionParams():
 182     """Setup R group decomposition parameters."""
 183 
 184     DecompositionParams = rgd.RGroupDecompositionParameters()
 185 
 186     DecompositionParams.alignment = OptionsInfo["DecompositionParams"]["RGroupCoreAlignment"]
 187     DecompositionParams.chunkSize = OptionsInfo["DecompositionParams"]["chunkSize"]
 188     DecompositionParams.matchingStrategy = OptionsInfo["DecompositionParams"]["RGroupMatching"]
 189     DecompositionParams.onlyMatchAtRGroups = OptionsInfo["DecompositionParams"]["matchOnlyAtRGroups"]
 190     DecompositionParams.removeAllHydrogenRGroups = OptionsInfo["DecompositionParams"]["removeHydrogenOnlyGroups"]
 191     DecompositionParams.removeHydrogensPostMatch = OptionsInfo["DecompositionParams"]["removeHydrogensPostMatch"]
 192 
 193     return DecompositionParams
 194     
 195 def WriteRGroups(Mols, RGroups, UnmatchedMolIndices):
 196     """Write out R groups."""
 197 
 198     Outfile = OptionsInfo["Outfile"]
 199     UnmatchedOutfile = OptionsInfo["UnmatchedOutfile"]
 200     RemoveUnmatchedMode = OptionsInfo["RemoveUnmatchedMode"]
 201     
 202     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 203     
 204     TextOutFileMode = OptionsInfo["TextOutFileMode"]
 205     TextOutFileDelim = OptionsInfo["TextOutFileDelim"]
 206     Quote = OptionsInfo["TextOutQuote"]
 207 
 208     SMILESIsomeric = OptionsInfo["OutfileParams"]["SMILESIsomeric"]
 209     SMILESKekulize = OptionsInfo["OutfileParams"]["SMILESKekulize"]
 210     
 211     # Setup writers...
 212     Writer = None
 213     UnmatchedWriter = None
 214     if TextOutFileMode:
 215         Writer = open(Outfile, "w")
 216         if RemoveUnmatchedMode:
 217             UnmatchedWriter = open(UnmatchedOutfile, "w")
 218     else:
 219         Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 220         if RemoveUnmatchedMode:
 221             UnmatchedWriter = RDKitUtil.MoleculesWriter(UnmatchedOutfile, **OptionsInfo["OutfileParams"])
 222         
 223     if Writer is None:
 224         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 225     if RemoveUnmatchedMode:
 226         if UnmatchedWriter is None:
 227             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % UnmatchedOutfile)
 228     
 229     if RemoveUnmatchedMode:
 230         MiscUtil.PrintInfo("\nGenerating files: %s %s..." % (Outfile, UnmatchedOutfile))
 231     else:
 232         MiscUtil.PrintInfo("\nGenerating file %s..." % Outfile)
 233         
 234     # Set up data keys and labels for  core and R groups...
 235     RGroupsDataKeys = []
 236     RGroupsDataLabels = []
 237     CoreDataLabelPresent = False
 238     for DataLabel in sorted(RGroups):
 239         if re.match("^Core", DataLabel, re.I):
 240             CoreDataLabelPresent = True
 241             RGroupsDataKeys.append(DataLabel)
 242             RGroupsDataLabels.append("SMILES%s" % DataLabel)
 243         elif re.match("^R", DataLabel, re.I):
 244             RGroupsDataKeys.append(DataLabel)
 245             RGroupsDataLabels.append("SMILES%s" % DataLabel)
 246         else:
 247             MiscUtil.PrintWarning("Ignoring unknown R group data label, %s, found during R group decomposition..." % DataLabel)
 248 
 249     if CoreDataLabelPresent:
 250         RGroupsCategoriesCount = len(RGroupsDataLabels) - 1
 251     else:
 252         RGroupsCategoriesCount = len(RGroupsDataLabels)
 253     RGroupsMolUnmatchedCount = len(UnmatchedMolIndices)
 254     RGroupsMolMatchedCount = len(Mols) - RGroupsMolUnmatchedCount
 255     
 256     # Wite out headers for a text file...
 257     if TextOutFileMode:
 258         LineWords = ["SMILES","Name"]
 259         
 260         if RemoveUnmatchedMode:
 261             Line = MiscUtil.JoinWords(LineWords, TextOutFileDelim, Quote)
 262             UnmatchedWriter.write("%s\n" % Line)
 263         
 264         LineWords.extend(RGroupsDataLabels)
 265         Line = MiscUtil.JoinWords(LineWords, TextOutFileDelim, Quote)
 266         Writer.write("%s\n" % Line)
 267 
 268     MolCount = 0
 269     RGroupsResultIndex = -1
 270     
 271     for MolIndex, Mol in enumerate(Mols):
 272         MolCount += 1
 273 
 274         UnmatchedMol = False
 275         if MolIndex in UnmatchedMolIndices:
 276             UnmatchedMol = True
 277         
 278         if UnmatchedMol:
 279             RGroupsDataSMILES = [""] * len(RGroupsDataKeys)
 280         else:
 281             RGroupsResultIndex += 1
 282             RGroupsDataSMILES = [RGroups[RGroupsDataKey][RGroupsResultIndex] for RGroupsDataKey in RGroupsDataKeys]
 283 
 284         if TextOutFileMode:
 285             # Write out text file including SMILES file...
 286             MolSMILES = Chem.MolToSmiles(Mol, isomericSmiles = SMILESIsomeric, kekuleSmiles = SMILESKekulize)
 287             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 288             LineWords = [MolSMILES, MolName]
 289 
 290             if UnmatchedMol and RemoveUnmatchedMode:
 291                 Line = MiscUtil.JoinWords(LineWords, TextOutFileDelim, Quote)
 292                 UnmatchedWriter.write("%s\n" % Line)
 293             else:
 294                 LineWords.extend(RGroupsDataSMILES)
 295                 Line = MiscUtil.JoinWords(LineWords, TextOutFileDelim, Quote)
 296                 Writer.write("%s\n" % Line)
 297         else:
 298             # Write out SD file...
 299             if Compute2DCoords:
 300                 AllChem.Compute2DCoords(Mol)
 301             
 302             if UnmatchedMol and RemoveUnmatchedMode:
 303                 UnmatchedWriter.write(Mol)
 304             else:
 305                 for (Name, Value) in zip(RGroupsDataLabels, RGroupsDataSMILES):
 306                     Mol.SetProp(Name, Value)
 307                 Writer.write(Mol)
 308     
 309     if Writer is not None:
 310         Writer.close()
 311     if UnmatchedWriter is not None:
 312         UnmatchedWriter.close()
 313 
 314     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 315     MiscUtil.PrintInfo("Number of  R group categories: %d" % RGroupsCategoriesCount)
 316     MiscUtil.PrintInfo("Number of  matched molecules containing core scaffold(s): %d" % RGroupsMolMatchedCount)
 317     MiscUtil.PrintInfo("Number of  unmatched molecules containing no core scaffold(s): %d" % RGroupsMolUnmatchedCount)
 318     
 319 def RetrieveMolecules():
 320     """Retrieve molecules."""
 321 
 322     Infile = OptionsInfo["Infile"]
 323     
 324     # Read molecules...
 325     MiscUtil.PrintInfo("\nReading file %s..." % Infile)
 326     OptionsInfo["InfileParams"]["AllowEmptyMols"] = False
 327     ValidMols, MolCount, ValidMolCount  = RDKitUtil.ReadAndValidateMolecules(Infile, **OptionsInfo["InfileParams"])
 328     
 329     MiscUtil.PrintInfo("Total number of molecules: %d" % MolCount)
 330     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 331     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 332 
 333     return ValidMols
 334 
 335 def ProcessDecompositionParameters():
 336     """Set up and process decomposition parameters."""
 337 
 338     SetupDecompositionParameters()
 339     ProcessSpecifiedDecompositionParameters()
 340 
 341 def SetupDecompositionParameters():
 342     """Set up default decomposition parameters."""
 343     
 344     OptionsInfo["DecompositionParams"] = {"RGroupCoreAlignment": rgd.RGroupCoreAlignment.MCS, "RGroupMatching": rgd.RGroupMatching.GreedyChunks, "chunkSize": 5, "matchOnlyAtRGroups": False, "removeHydrogenOnlyGroups": True, "removeHydrogensPostMatch": False}
 345     
 346 def ProcessSpecifiedDecompositionParameters():
 347     """Process specified decomposition parameters."""
 348 
 349     if re.match("^auto$", OptionsInfo["SpecifiedDecompositionParams"], re.I):
 350         # Nothing to process...
 351         return
 352 
 353     # Parse specified parameters...
 354     DecompositionParams = re.sub(" ", "", OptionsInfo["SpecifiedDecompositionParams"])
 355     if not DecompositionParams:
 356         MiscUtil.PrintError("No valid parameter name and value pairs specified using \"-d, --decompositionParams\" option.")
 357 
 358     DecompositionParamsWords = DecompositionParams.split(",")
 359     if len(DecompositionParamsWords) % 2:
 360         MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"-d, --decompositionParams\" option must be an even number." % (len(DecompositionParamsWords)))
 361     
 362     # Setup  canonical parameter names...
 363     ValidParamNames = []
 364     CanonicalParamNamesMap = {}
 365     for ParamName in sorted(OptionsInfo["DecompositionParams"]):
 366         ValidParamNames.append(ParamName)
 367         CanonicalParamNamesMap[ParamName.lower()] = ParamName
 368 
 369     # Validate and set paramater names and value...
 370     for Index in range(0, len(DecompositionParamsWords), 2):
 371         Name = DecompositionParamsWords[Index]
 372         Value = DecompositionParamsWords[Index + 1]
 373 
 374         CanonicalName = Name.lower()
 375         if  not CanonicalName in CanonicalParamNamesMap:
 376             MiscUtil.PrintError("The parameter name, %s, specified using \"-d, --decompositionParams\" option is not a valid name. Supported parameter names: %s" % (Name,  " ".join(ValidParamNames)))
 377 
 378         ParamName = CanonicalParamNamesMap[CanonicalName]
 379         if re.match("^RGroupCoreAlignment$", ParamName, re.I):
 380             if re.match("^MCS$", Value, re.I):
 381                 ParamValue = rgd.RGroupCoreAlignment.MCS
 382             elif re.match("^None$", Value, re.I):
 383                 ParamValue = rgd.RGroupCoreAlignment.names['None']
 384             else:
 385                 MiscUtil.PrintError("The parameter value, %s, specified using \"-d, --decompositionParams\" option  for parameter, %s, is not a valid value. Supported values: MCS None" % (Value, Name))
 386         elif re.match("^RGroupMatching$", ParamName, re.I):
 387             if re.match("^Greedy$", Value, re.I):
 388                 ParamValue = rgd.RGroupMatching.Greedy
 389             elif re.match("^GreedyChunks$", Value, re.I):
 390                 ParamValue = rgd.RGroupMatching.GreedyChunks
 391             elif re.match("^Exhaustive$", Value, re.I):
 392                 ParamValue = rgd.RGroupMatching.Exhaustive
 393             else:
 394                 MiscUtil.PrintError("The parameter value, %s, specified using \"-d, --decompositionParams\" option  for parameter, %s, is not a valid value. Supported values: Greedy GreedyChunks Exhaustive" % (Value, Name))
 395         elif re.match("^chunkSize$", ParamName, re.I):
 396             Value = int(Value)
 397             if Value <= 0 :
 398                 MiscUtil.PrintError("The parameter value, %s, specified using \"-d, --decompositionParams\" option  for parameter, %s, is not a valid value. Supported values: > 0" % (Value, Name))
 399             ParamValue = Value
 400         else:
 401             if not re.match("^(Yes|No|True|False)$", Value, re.I):
 402                 MiscUtil.PrintError("The parameter value, %s, specified using \"-d, --decompositionParams\" option  for parameter, %s, is not a valid value. Supported values: Yes No True False" % (Value, Name))
 403             ParamValue = False
 404             if re.match("^(Yes|True)$", Value, re.I):
 405                 ParamValue = True
 406         
 407         # Set value...
 408         OptionsInfo["DecompositionParams"][ParamName] = ParamValue
 409 
 410 def ProcessMCSParameters():
 411     """Set up and process MCS parameters."""
 412 
 413     SetupMCSParameters()
 414     ProcessSpecifiedMCSParameters()
 415 
 416 def SetupMCSParameters():
 417     """Set up default MCS parameters."""
 418     
 419     OptionsInfo["MCSParams"] = {"MaximizeBonds": True, "Threshold": 0.9, "TimeOut": 3600, "Verbose": False, "MatchValences": True, "MatchChiralTag": False, "RingMatchesRingOnly": True, "CompleteRingsOnly": True, "AtomCompare": rdFMCS.AtomCompare.CompareElements, "BondCompare": rdFMCS.BondCompare.CompareOrder, "SeedSMARTS": "", "MinNumAtoms": 1, "MinNumBonds": 0}
 420     
 421 def ProcessSpecifiedMCSParameters():
 422     """Process specified MCS parameters."""
 423 
 424     if re.match("^auto$", OptionsInfo["SpecifiedMCSParams"], re.I):
 425         # Nothing to process...
 426         return
 427     
 428     # Parse specified parameters...
 429     MCSParams = re.sub(" ", "", OptionsInfo["SpecifiedMCSParams"])
 430     if not MCSParams:
 431         MiscUtil.PrintError("No valid parameter name and value pairs specified using \"-m, --mcsParams\" option.")
 432 
 433     MCSParamsWords = MCSParams.split(",")
 434     if len(MCSParamsWords) % 2:
 435         MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"-m, --mcsParams\" option must be an even number." % (len(MCSParamsWords)))
 436     
 437     # Setup  canonical parameter names...
 438     ValidParamNames = []
 439     CanonicalParamNamesMap = {}
 440     for ParamName in sorted(OptionsInfo["MCSParams"]):
 441         ValidParamNames.append(ParamName)
 442         CanonicalParamNamesMap[ParamName.lower()] = ParamName
 443 
 444     # Validate and set paramater names and value...
 445     for Index in range(0, len(MCSParamsWords), 2):
 446         Name = MCSParamsWords[Index]
 447         Value = MCSParamsWords[Index + 1]
 448 
 449         CanonicalName = Name.lower()
 450         if  not CanonicalName in CanonicalParamNamesMap:
 451             MiscUtil.PrintError("The parameter name, %s, specified using \"-m, --mcsParams\" option is not a valid name. Supported parameter names: %s" % (Name,  " ".join(ValidParamNames)))
 452 
 453         ParamName = CanonicalParamNamesMap[CanonicalName]
 454         if re.match("^Threshold$", ParamName, re.I):
 455             Value = float(Value)
 456             if Value <= 0.0 or Value > 1.0 :
 457                 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option  for parameter, %s, is not a valid value. Supported values: > 0 and <= 1.0" % (Value, Name))
 458             ParamValue = Value
 459         elif re.match("^Timeout$", ParamName, re.I):
 460             Value = int(Value)
 461             if Value <= 0:
 462                 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option  for parameter, %s, is not a valid value. Supported values: > 0" % (Value, Name))
 463             ParamValue = Value
 464         elif re.match("^MinNumAtoms$", ParamName, re.I):
 465             Value = int(Value)
 466             if Value < 1:
 467                 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option  for parameter, %s, is not a valid value. Supported values: >= 1" % (Value, Name))
 468             ParamValue = Value
 469         elif re.match("^MinNumBonds$", ParamName, re.I):
 470             Value = int(Value)
 471             if Value < 0:
 472                 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option  for parameter, %s, is not a valid value. Supported values: >=0 " % (Value, Name))
 473             ParamValue = Value
 474         elif re.match("^AtomCompare$", ParamName, re.I):
 475             if re.match("^CompareAny$", Value, re.I):
 476                 ParamValue = rdFMCS.AtomCompare.CompareAny
 477             elif re.match("^CompareElements$", Value, re.I):
 478                 ParamValue = Chem.rdFMCS.AtomCompare.CompareElements
 479             elif re.match("^CompareIsotopes$", Value, re.I):
 480                 ParamValue = Chem.rdFMCS.AtomCompare.CompareIsotopes
 481             else:
 482                 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option  for parameter, %s, is not a valid value. Supported values: CompareAny CompareElements CompareIsotopes" % (Value, Name))
 483         elif re.match("^BondCompare$", ParamName, re.I):
 484             if re.match("^CompareAny$", Value, re.I):
 485                 ParamValue = Chem.rdFMCS.BondCompare.CompareAny
 486             elif re.match("^CompareOrder$", Value, re.I):
 487                 ParamValue = rdFMCS.BondCompare.CompareOrder
 488             elif re.match("^CompareOrderExact$", Value, re.I):
 489                 ParamValue = rdFMCS.BondCompare.CompareOrderExact
 490             else:
 491                 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option  for parameter, %s, is not a valid value. Supported values: CompareAny CompareOrder CompareOrderExact" % (Value, Name))
 492         elif re.match("^SeedSMARTS$", ParamName, re.I):
 493             if not len(Value):
 494                 MiscUtil.PrintError("The parameter value specified using \"-m, --mcsParams\" option  for parameter, %s, is empty. " % (Name))
 495             ParamValue = Value
 496         else:
 497             if not re.match("^(Yes|No|True|False)$", Value, re.I):
 498                 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option  for parameter, %s, is not a valid value. Supported values: Yes No True False" % (Value, Name))
 499             ParamValue = False
 500             if re.match("^(Yes|True)$", Value, re.I):
 501                 ParamValue = True
 502         
 503         # Set value...
 504         OptionsInfo["MCSParams"][ParamName] = ParamValue
 505 
 506 def ProcessOptions():
 507     """Process and validate command line arguments and options."""
 508     
 509     MiscUtil.PrintInfo("Processing options...")
 510     
 511     # Validate options...
 512     ValidateOptions()
 513     
 514     OptionsInfo["CoreScaffold"] = Options["--coreScaffold"]
 515     
 516     OptionsInfo["Infile"] = Options["--infile"]
 517     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 518     
 519     OptionsInfo["Outfile"] = Options["--outfile"]
 520     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 521     
 522     TextOutFileMode = False
 523     TextOutFileDelim = ""
 524     
 525     if MiscUtil.CheckFileExt(Options["--outfile"], "csv"):
 526         TextOutFileMode = True
 527         TextOutFileDelim = ","
 528     elif MiscUtil.CheckFileExt(Options["--outfile"], "tsv txt"):
 529         TextOutFileMode = True
 530         TextOutFileDelim = "\t"
 531     
 532     OptionsInfo["TextOutFileMode"] = TextOutFileMode
 533     OptionsInfo["TextOutFileDelim"] = TextOutFileDelim
 534 
 535     TextOutQuote = False
 536     if re.match("^auto$", Options["--quote"], re.I):
 537         if MiscUtil.CheckFileExt(Options["--outfile"], "csv"):
 538             TextOutQuote = True
 539     else:
 540         if re.match("^yes$", Options["--quote"], re.I):
 541             TextOutQuote = True
 542     OptionsInfo["TextOutQuote"] = TextOutQuote
 543     
 544     OptionsInfo["Overwrite"] = Options["--overwrite"]
 545 
 546     RemoveUnmatchedMode = False
 547     UnmatchedOutfile = None
 548     if re.match("^yes$", Options["--removeUnmatched"], re.I):
 549         RemoveUnmatchedMode = True
 550         FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"])
 551         UnmatchedOutfile = "%sUnmatched.%s" % (FileName, FileExt)
 552     OptionsInfo["RemoveUnmatchedMode"] = RemoveUnmatchedMode
 553     OptionsInfo["UnmatchedOutfile"] = UnmatchedOutfile
 554 
 555     OptionsInfo["SpecifiedDecompositionParams"] = Options["--decompositionParams"]
 556     ProcessDecompositionParameters()
 557     
 558     OptionsInfo["SpecifiedMCSParams"] = Options["--mcsParams"]
 559     ProcessMCSParameters()
 560     
 561     SMARTSOrSMILESCoreScaffold = ""
 562     SMARTSOrSMILESCoreScaffoldList = []
 563     if not re.match("^none$", Options["--smartsOrSmilesCoreScaffold"], re.I) or len(Options["--smartsOrSmilesCoreScaffold"]):
 564         if re.match("^(BySMARTS|BySMILES)$", Options["--coreScaffold"], re.I):
 565             SMARTSOrSMILESCoreScaffold = re.sub(" ", "", Options["--smartsOrSmilesCoreScaffold"])
 566             if not SMARTSOrSMILESCoreScaffold:
 567                 MiscUtil.PrintError("A non empty value must be specified for \"-s, --smartsOrSmilesCoreScaffold\" during %s value of \"-c, --coreScaffold\" option " % (Options["--coreScaffold"]))
 568             SMARTSOrSMILESCoreScaffoldList = SMARTSOrSMILESCoreScaffold.split(",")
 569     OptionsInfo["SMARTSOrSMILESCoreScaffold"] = SMARTSOrSMILESCoreScaffold
 570     OptionsInfo["SMARTSOrSMILESCoreScaffoldList"] = SMARTSOrSMILESCoreScaffoldList
 571     
 572 def RetrieveOptions():
 573     """Retrieve command line arguments and options."""
 574     
 575     # Get options...
 576     global Options
 577     Options = docopt(_docoptUsage_)
 578     
 579     # Set current working directory to the specified directory...
 580     WorkingDir = Options["--workingdir"]
 581     if WorkingDir:
 582         os.chdir(WorkingDir)
 583     
 584     # Handle examples option...
 585     if "--examples" in Options and Options["--examples"]:
 586         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 587         sys.exit(0)
 588 
 589 def ValidateOptions():
 590     """Validate option values."""
 591     
 592     MiscUtil.ValidateOptionTextValue("-c, --coreScaffold", Options["--coreScaffold"], "ByMCS BySMARTS BySMILES")
 593     
 594     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 595     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi csv tsv txt")
 596     
 597     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd csv tsv txt")
 598     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 599     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 600     
 601     if re.match("^none$", Options["--smartsOrSmilesCoreScaffold"], re.I) or not len(Options["--smartsOrSmilesCoreScaffold"]):
 602         if re.match("^(BySMARTS|BySMILES)$", Options["--coreScaffold"], re.I):
 603             MiscUtil.PrintError("A non empty value must be specified for \"-s, --smartsOrSmilesCoreScaffold\" during %s value of \"-c, --coreScaffold\" option " % (Options["--coreScaffold"]))
 604     else:
 605         if not re.match("^(BySMARTS|BySMILES)$", Options["--coreScaffold"], re.I):
 606             MiscUtil.PrintError("%s value of \"-s, --smartsOrSmilesCoreScaffold\" is not allowed during %s value of \"-c, --coreScaffold\" option " % (Options["--smartsOrSmilesCoreScaffold"], Options["--coreScaffold"]))
 607     
 608     MiscUtil.ValidateOptionTextValue("-q, --quote", Options["--quote"], "yes no auto")
 609     MiscUtil.ValidateOptionTextValue("-r, --removeUnmatched", Options["--removeUnmatched"], "yes no")
 610     
 611 # Setup a usage string for docopt...
 612 _docoptUsage_ = """
 613 RDKitPerformRGroupDecomposition.py - Perform R group decomposition analysis
 614 
 615 Usage:
 616     RDKitPerformRGroupDecomposition.py [--coreScaffold <ByMCS, BySMARTS or BySMILES>]
 617                                        [--decompositionParams <Name,Value,...>]
 618                                        [--infileParams <Name,Value,...>] [--mcsParams <Name,Value,...>]
 619                                        [--outfileParams <Name,Value,...>] [--overwrite] [--quote <yes or no>]
 620                                        [--removeUnmatched <yes or no>] [--smartsOrSmilesCoreScaffold <text>]
 621                                        [-w <dir>] -i <infile> -o <outfile> 
 622     RDKitPerformRGroupDecomposition.py -h | --help | -e | --examples
 623 
 624 Description:
 625     Perform R group decomposition for a set of molecules in a series containing
 626     a common core scaffold. The core scaffold is identified by a SMARTS string,
 627     SMILES string, or using maximum common substructure (MCS) search.
 628     Multiple core scaffolds may be specified using SMARTS or SMILES strings for
 629     set of molecules corresponding to multiple series.
 630 
 631     The core scaffolds along with appropriate R groups are written out as SMILES
 632     strings to a SD or text file. The unmatched molecules without any specified
 633     core scaffold are written to a different output file.
 634 
 635     The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi,
 636     .txt, .csv, .tsv)
 637 
 638     The supported output file formats are: SD (.sdf, .sd), CSV/TSV (.csv, .tsv, .txt)
 639 
 640 Options:
 641     -c, --coreScaffold <ByMCS, BySMARTS or BySMILES>  [default: ByMCS]
 642         Specify a core scaffold for a set of molecules in a series. The core scaffold
 643         is identified by an explicit SMARTS string, SMILES string, or using maximum
 644         common substructure (MCS) search. Multiple core scaffolds may be specified
 645         using SMARTS or SMILES strings for set of molecules corresponding to multiple
 646         series.
 647     -d, --decompositionParams <Name,Value,...>  [default: auto]
 648         Parameter values to use during R group decomposition for a series of molecules.
 649         In general, it is a comma delimited list of parameter name and value pairs. The
 650         supported parameter names along with their default values are shown below:
 651             
 652             RGroupCoreAlignment,MCS, RGroupMatching,GreedyChunks,chunkSize,5,
 653             matchOnlyAtRGroups,no,removeHydrogenOnlyGroups,yes,
 654             removeHydrogensPostMatch,no
 655             
 656         A brief description of each supported parameter taken from  RDKit documentation,
 657         along with their possible values, is as follows.
 658         
 659         RGroupCoreAlignment - Mapping of core labels:
 660         
 661             MCS - Map core labels to each other using MCS
 662             None - No mapping
 663         
 664         RGroupMatching: Greedy, GreedyChunks, Exhaustive
 665         
 666         matchOnlyAtRGroups - Allow R group decomposition only at specified R groups.
 667         Possible values: yes, no.
 668         
 669         removeHydrogenOnlyGroups - Remove all R groups that only have hydrogens.
 670         Possible values: yes, no.
 671         
 672         removeHydrogensPostMatch - Remove all hydrogens from the output molecules.
 673         Possible values: yes, no.
 674     -e, --examples
 675         Print examples.
 676     -h, --help
 677         Print this help message.
 678     -i, --infile <infile>
 679         Input file name.
 680     --infileParams <Name,Value,...>  [default: auto]
 681         A comma delimited list of parameter name and value pairs for reading 
 682         molecules from files. The supported parameter names for different file
 683         formats, along with their default values, are shown below:
 684             
 685             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 686             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 687                 smilesTitleLine,auto,sanitize,yes
 688             
 689         Possible values for smilesDelimiter: space, comma or tab.
 690     -m, --mcsParams <Name,Value,...>  [default: auto]
 691         Parameter values to use for identifying a maximum common substructure
 692         (MCS) in a series of molecules. In general, it is a comma delimited list of
 693         parameter name and value pairs. The supported parameter names along with
 694         their default values are shown below:
 695             
 696             atomCompare,CompareElements,bondCompare,CompareOrder,
 697             maximizeBonds,yes,matchValences,yes,matchChiralTag,no,
 698             minNumAtoms,1,minNumBonds,0,ringMatchesRingOnly,yes,
 699             completeRingsOnly,yes,threshold,1.0,timeOut,3600,seedSMARTS,none
 700             
 701         Possible values for atomCompare: CompareAny, CompareElements,
 702         CompareIsotopes. Possible values for bondCompare: CompareAny,
 703         CompareOrder, CompareOrderExact.
 704         
 705         A brief description of MCS parameters taken from RDKit documentation is
 706         as follows:
 707             
 708             atomCompare - Controls match between two atoms
 709             bondCompare - Controls match between two bonds
 710             maximizeBonds - Maximize number of bonds instead of atoms
 711             matchValences - Include atom valences in the MCS match
 712             matchChiralTag - Include atom chirality in the MCS match
 713             minNumAtoms - Minimum number of atoms in the MCS match
 714             minNumBonds - Minimum number of bonds in the MCS match
 715             ringMatchesRingOnly - Ring bonds only match other ring bonds
 716             completeRingsOnly - Partial rings not allowed during the match
 717             threshold - Fraction of the dataset that must contain the MCS
 718             seedSMARTS - SMARTS string as the seed of the MCS
 719             timeout - Timeout for the MCS calculation in seconds
 720             
 721     -o, --outfile <outfile>
 722         Output file name.
 723     --outfileParams <Name,Value,...>  [default: auto]
 724         A comma delimited list of parameter name and value pairs for writing
 725         molecules to files. The supported parameter names for different file
 726         formats, along with their default values, are shown below:
 727             
 728             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 729             SMILES: smilesKekulize,no,smilesIsomeric,yes
 730             
 731         Default value for compute2DCoords: yes for SMILES input file; no for all other
 732         file types. The kekulize and smilesIsomeric parameters are also used during
 733         generation of SMILES strings for CSV/TSV files.
 734     --overwrite
 735         Overwrite existing files.
 736     -q, --quote <yes or no>  [default: auto]
 737         Quote SMILES strings and molecule names before writing them out to text
 738         files. Possible values: yes or no. Default: yes for CSV (.csv) text files; no for
 739         TSV (.tsv) and TXT (.txt) text files.
 740     -r, --removeUnmatched <yes or no>  [default: no]
 741         Remove unmatched molecules containing no specified core scaffold from the
 742         output file and write them to a different output file.
 743     -s, --smartsOrSmilesCoreScaffold <text>  [default: none]
 744         SMARTS or SMILES string to use for core scaffold during 'SMARTS' or 'SMILES'
 745         value of '-c, --coreScaffold' option. Multiple core scaffolds may be specified using a
 746         comma delimited set of SMARTS or SMILES strings.
 747     -w, --workingdir <dir>
 748         Location of working directory which defaults to the current directory.
 749 
 750 Examples:
 751     To perform R group decomposition for a set of molecules in a series using MCS
 752     to identify a core scaffold and write out a CSV file containing R groups, type:
 753 
 754         % RDKitPerformRGroupDecomposition.py -i SampleSeriesD3R.smi
 755           -o SampleSeriesD3ROut.csv
 756 
 757     To perform R group decomposition for a set of molecules in a series using a
 758     specified core scaffold and write out a SD file containing R groups, type:
 759 
 760         % RDKitPerformRGroupDecomposition.py  -c BySMARTS
 761           -s "Nc1nccc(-c2cnc(CNCc3ccccc3)c2)n1" -i SampleSeriesD3R.smi
 762           -o SampleSeriesD3ROut.sdf
 763 
 764     To perform R group decomposition for a set of molecules in a series using MCS
 765     to identify a core scaffold and write out CSV files containing matched and
 766     unmatched molecules without quoting values, type:
 767 
 768         % RDKitPerformRGroupDecomposition.py -c ByMCS -r yes -q no
 769           -i SampleSeriesD3R.sdf -o SampleSeriesD3ROut.csv
 770 
 771     To perform R group decomposition for a set of molecules in multiple series using
 772     specified core scaffolds and write out a TSV file containing R groups, type:
 773 
 774         % RDKitPerformRGroupDecomposition.py  -c BySMARTS
 775           -s "Nc1nccc(-c2cnc(CNCc3ccccc3)c2)n1,[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]:
 776           [#6]:1" -i SampleMultipleSeriesD3R.smi -o
 777           SampleMultipleSeriesD3ROut.tsv
 778 
 779     To perform R group decomposition for a set of molecules in a CSV SMILES file,
 780     SMILES strings in  olumn 1, name in column 2, and write out a CSV file containing
 781     R groups, type:
 782      
 783         % RDKitPerformRGroupDecomposition.py --infileParams 
 784           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 785           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 786           -i SampleSeriesD3R.smi -o SampleSeriesD3ROut.csv
 787 
 788 Author:
 789     Manish Sud(msud@san.rr.com)
 790 
 791 See also:
 792     RDKitConvertFileFormat.py, RDKitSearchFunctionalGroups.py, RDKitSearchSMARTS.py
 793 
 794 Copyright:
 795     Copyright (C) 2024 Manish Sud. All rights reserved.
 796 
 797     The functionality available in this script is implemented using RDKit, an
 798     open source toolkit for cheminformatics developed by Greg Landrum.
 799 
 800     This file is part of MayaChemTools.
 801 
 802     MayaChemTools is free software; you can redistribute it and/or modify it under
 803     the terms of the GNU Lesser General Public License as published by the Free
 804     Software Foundation; either version 3 of the License, or (at your option) any
 805     later version.
 806 
 807 """
 808 
 809 if __name__ == "__main__":
 810     main()