1 #!/bin/env python 2 # 3 # File: RDKitPerformRGroupDecomposition.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2025 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 37 # RDKit imports... 38 try: 39 from rdkit import rdBase 40 from rdkit import Chem 41 from rdkit.Chem import AllChem 42 from rdkit.Chem import rdRGroupDecomposition as rgd 43 from rdkit.Chem import rdFMCS 44 except ImportError as ErrMsg: 45 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 46 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 47 sys.exit(1) 48 49 # MayaChemTools imports... 50 try: 51 from docopt import docopt 52 import MiscUtil 53 import RDKitUtil 54 except ImportError as ErrMsg: 55 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 56 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 57 sys.exit(1) 58 59 ScriptName = os.path.basename(sys.argv[0]) 60 Options = {} 61 OptionsInfo = {} 62 63 def main(): 64 """Start execution of the script.""" 65 66 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 67 68 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 69 70 # Retrieve command line arguments and options... 71 RetrieveOptions() 72 73 # Process and validate command line arguments and options... 74 ProcessOptions() 75 76 # Perform actions required by the script... 77 PerformRGroupDecomposition() 78 79 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 80 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 81 82 def PerformRGroupDecomposition(): 83 """Perform R group decomposition.""" 84 85 # Retrieve molecules... 86 Mols = RetrieveMolecules() 87 88 # Identify R groups and write them out... 89 RGroups, UnmatchedMolIndices = RetrieveRGroups(Mols) 90 WriteRGroups(Mols, RGroups, UnmatchedMolIndices) 91 92 def RetrieveRGroups(Mols): 93 """Retrieve R groups.""" 94 95 CoreMols = SetupCoreScaffolds(Mols) 96 DecompositionParams = SetupRGroupDecompositionParams() 97 RGroupDecompositionObject = rgd.RGroupDecomposition(CoreMols, DecompositionParams) 98 99 MiscUtil.PrintInfo("\nPerforming R group decomposition...") 100 101 UnmatchedMolIndices = [] 102 for MolIndex, Mol in enumerate(Mols): 103 Status = RGroupDecompositionObject.Add(Mol) 104 if Status < 0: 105 UnmatchedMolIndices.append(MolIndex) 106 107 if not RGroupDecompositionObject.Process(): 108 MiscUtil.PrintWarning("R group decomposition failed to match any molecule to core scaffold(s)...") 109 110 RGroups = RGroupDecompositionObject.GetRGroupsAsColumns(asSmiles=True) 111 112 return (RGroups, UnmatchedMolIndices) 113 114 def SetupCoreScaffolds(Mols): 115 """Setup core scaffold molecules(s).""" 116 117 if re.match("^(BySMARTS|BySMILES)$", OptionsInfo["CoreScaffold"], re.I): 118 return SetupCoreScaffoldsBySMARTSOrSMILES() 119 elif re.match("^ByMCS$", OptionsInfo["CoreScaffold"], re.I): 120 return SetupCoreScaffoldsByMCS(Mols) 121 else: 122 MiscUtil.PrintError("The value, %s, specified for \"-c, --coreScaffold\" option is not supported." % (OptionsInfo["CoreScaffold"])) 123 124 def SetupCoreScaffoldsBySMARTSOrSMILES(): 125 """Setup core scaffold molecules(s) using specified SMARTS or SMILES.""" 126 127 BySMARTS = True if re.match("^BySMARTS$", OptionsInfo["CoreScaffold"], re.I) else False 128 CoreScaffoldList = OptionsInfo["SMARTSOrSMILESCoreScaffoldList"] 129 130 if BySMARTS: 131 MiscUtil.PrintInfo("\nSetting up core scaffold(s) using SMARTS...\nSMARTS core scaffold(s): %s" % " ".join(CoreScaffoldList)) 132 else: 133 MiscUtil.PrintInfo("\nSetting up core scaffold(s) using SMILES...\nSMILES core scaffold(s): %s" % " ".join(CoreScaffoldList)) 134 135 CoreMols = [] 136 for Core in CoreScaffoldList: 137 if BySMARTS: 138 CoreMol = Chem.MolFromSmarts(Core) 139 else: 140 CoreMol = Chem.MolFromSmiles(Core) 141 if CoreMol is None: 142 MiscUtil.PrintError("Failed to generate mol for core scaffold: %s" % (Core)) 143 CoreMols.append(CoreMol) 144 145 return CoreMols 146 147 def SetupCoreScaffoldsByMCS(Mols): 148 """Setup core scaffold molecule using MCS.""" 149 150 MiscUtil.PrintInfo("\nSetting up core scaffold using MCS...") 151 152 MCSParams = OptionsInfo["MCSParams"] 153 154 CoreMols = [] 155 156 MCSResultObject = rdFMCS.FindMCS(Mols, maximizeBonds = MCSParams["MaximizeBonds"], threshold = MCSParams["Threshold"], timeout = MCSParams["TimeOut"], verbose = MCSParams["Verbose"], matchValences = MCSParams["MatchValences"], ringMatchesRingOnly = MCSParams["RingMatchesRingOnly"], completeRingsOnly = MCSParams["CompleteRingsOnly"], matchChiralTag = MCSParams["MatchChiralTag"], atomCompare = MCSParams["AtomCompare"], bondCompare = MCSParams["BondCompare"], seedSmarts = MCSParams["SeedSMARTS"]) 157 158 if MCSResultObject.canceled: 159 MiscUtil.PrintError("MCS failed to identify a core scaffold. Specify a different set of parameters using \"-m, --mcsParams\" option and try again.") 160 161 CoreNumAtoms = MCSResultObject.numAtoms 162 CoreNumBonds = MCSResultObject.numBonds 163 SMARTSCore = MCSResultObject.smartsString 164 165 if not len(SMARTSCore): 166 MiscUtil.PrintError("MCS failed to identify a core scaffold. Specify a different set of parameters using \"-m, --mcsParams\" option and try again.") 167 168 MiscUtil.PrintInfo("SMARTS core scaffold: %s\nNumber of atoms in core scaffold: %s\nNumber of bonds in core scaffold: %s" % (SMARTSCore, CoreNumAtoms, CoreNumBonds)) 169 170 if CoreNumAtoms < MCSParams["MinNumAtoms"]: 171 MiscUtil.PrintError("Number of atoms, %d, in core scaffold identified by MCS is less than, %d, as specified by \"minNumAtoms\" parameter in \"-m, --mcsParams\" option." % (CoreNumAtoms, MCSParams["MinNumAtoms"])) 172 173 if CoreNumBonds < MCSParams["MinNumBonds"]: 174 MiscUtil.PrintError("Number of bonds, %d, in core scaffold identified by MCS is less than, %d, as specified by \"minNumBonds\" parameter in \"-m, --mcsParams\" option." % (CoreNumBonds, MCSParams["MinNumBonds"])) 175 176 CoreMol = Chem.MolFromSmarts(SMARTSCore) 177 CoreMols.append(CoreMol) 178 179 return CoreMols 180 181 def SetupRGroupDecompositionParams(): 182 """Setup R group decomposition parameters.""" 183 184 DecompositionParams = rgd.RGroupDecompositionParameters() 185 186 DecompositionParams.alignment = OptionsInfo["DecompositionParams"]["RGroupCoreAlignment"] 187 DecompositionParams.chunkSize = OptionsInfo["DecompositionParams"]["chunkSize"] 188 DecompositionParams.matchingStrategy = OptionsInfo["DecompositionParams"]["RGroupMatching"] 189 DecompositionParams.onlyMatchAtRGroups = OptionsInfo["DecompositionParams"]["matchOnlyAtRGroups"] 190 DecompositionParams.removeAllHydrogenRGroups = OptionsInfo["DecompositionParams"]["removeHydrogenOnlyGroups"] 191 DecompositionParams.removeHydrogensPostMatch = OptionsInfo["DecompositionParams"]["removeHydrogensPostMatch"] 192 193 return DecompositionParams 194 195 def WriteRGroups(Mols, RGroups, UnmatchedMolIndices): 196 """Write out R groups.""" 197 198 Outfile = OptionsInfo["Outfile"] 199 UnmatchedOutfile = OptionsInfo["UnmatchedOutfile"] 200 RemoveUnmatchedMode = OptionsInfo["RemoveUnmatchedMode"] 201 202 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 203 204 TextOutFileMode = OptionsInfo["TextOutFileMode"] 205 TextOutFileDelim = OptionsInfo["TextOutFileDelim"] 206 Quote = OptionsInfo["TextOutQuote"] 207 208 SMILESIsomeric = OptionsInfo["OutfileParams"]["SMILESIsomeric"] 209 SMILESKekulize = OptionsInfo["OutfileParams"]["SMILESKekulize"] 210 211 # Setup writers... 212 Writer = None 213 UnmatchedWriter = None 214 if TextOutFileMode: 215 Writer = open(Outfile, "w") 216 if RemoveUnmatchedMode: 217 UnmatchedWriter = open(UnmatchedOutfile, "w") 218 else: 219 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"]) 220 if RemoveUnmatchedMode: 221 UnmatchedWriter = RDKitUtil.MoleculesWriter(UnmatchedOutfile, **OptionsInfo["OutfileParams"]) 222 223 if Writer is None: 224 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 225 if RemoveUnmatchedMode: 226 if UnmatchedWriter is None: 227 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % UnmatchedOutfile) 228 229 if RemoveUnmatchedMode: 230 MiscUtil.PrintInfo("\nGenerating files: %s %s..." % (Outfile, UnmatchedOutfile)) 231 else: 232 MiscUtil.PrintInfo("\nGenerating file %s..." % Outfile) 233 234 # Set up data keys and labels for core and R groups... 235 RGroupsDataKeys = [] 236 RGroupsDataLabels = [] 237 CoreDataLabelPresent = False 238 for DataLabel in sorted(RGroups): 239 if re.match("^Core", DataLabel, re.I): 240 CoreDataLabelPresent = True 241 RGroupsDataKeys.append(DataLabel) 242 RGroupsDataLabels.append("SMILES%s" % DataLabel) 243 elif re.match("^R", DataLabel, re.I): 244 RGroupsDataKeys.append(DataLabel) 245 RGroupsDataLabels.append("SMILES%s" % DataLabel) 246 else: 247 MiscUtil.PrintWarning("Ignoring unknown R group data label, %s, found during R group decomposition..." % DataLabel) 248 249 if CoreDataLabelPresent: 250 RGroupsCategoriesCount = len(RGroupsDataLabels) - 1 251 else: 252 RGroupsCategoriesCount = len(RGroupsDataLabels) 253 RGroupsMolUnmatchedCount = len(UnmatchedMolIndices) 254 RGroupsMolMatchedCount = len(Mols) - RGroupsMolUnmatchedCount 255 256 # Wite out headers for a text file... 257 if TextOutFileMode: 258 LineWords = ["SMILES","Name"] 259 260 if RemoveUnmatchedMode: 261 Line = MiscUtil.JoinWords(LineWords, TextOutFileDelim, Quote) 262 UnmatchedWriter.write("%s\n" % Line) 263 264 LineWords.extend(RGroupsDataLabels) 265 Line = MiscUtil.JoinWords(LineWords, TextOutFileDelim, Quote) 266 Writer.write("%s\n" % Line) 267 268 MolCount = 0 269 RGroupsResultIndex = -1 270 271 for MolIndex, Mol in enumerate(Mols): 272 MolCount += 1 273 274 UnmatchedMol = False 275 if MolIndex in UnmatchedMolIndices: 276 UnmatchedMol = True 277 278 if UnmatchedMol: 279 RGroupsDataSMILES = [""] * len(RGroupsDataKeys) 280 else: 281 RGroupsResultIndex += 1 282 RGroupsDataSMILES = [RGroups[RGroupsDataKey][RGroupsResultIndex] for RGroupsDataKey in RGroupsDataKeys] 283 284 if TextOutFileMode: 285 # Write out text file including SMILES file... 286 MolSMILES = Chem.MolToSmiles(Mol, isomericSmiles = SMILESIsomeric, kekuleSmiles = SMILESKekulize) 287 MolName = RDKitUtil.GetMolName(Mol, MolCount) 288 LineWords = [MolSMILES, MolName] 289 290 if UnmatchedMol and RemoveUnmatchedMode: 291 Line = MiscUtil.JoinWords(LineWords, TextOutFileDelim, Quote) 292 UnmatchedWriter.write("%s\n" % Line) 293 else: 294 LineWords.extend(RGroupsDataSMILES) 295 Line = MiscUtil.JoinWords(LineWords, TextOutFileDelim, Quote) 296 Writer.write("%s\n" % Line) 297 else: 298 # Write out SD file... 299 if Compute2DCoords: 300 AllChem.Compute2DCoords(Mol) 301 302 if UnmatchedMol and RemoveUnmatchedMode: 303 UnmatchedWriter.write(Mol) 304 else: 305 for (Name, Value) in zip(RGroupsDataLabels, RGroupsDataSMILES): 306 Mol.SetProp(Name, Value) 307 Writer.write(Mol) 308 309 if Writer is not None: 310 Writer.close() 311 if UnmatchedWriter is not None: 312 UnmatchedWriter.close() 313 314 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 315 MiscUtil.PrintInfo("Number of R group categories: %d" % RGroupsCategoriesCount) 316 MiscUtil.PrintInfo("Number of matched molecules containing core scaffold(s): %d" % RGroupsMolMatchedCount) 317 MiscUtil.PrintInfo("Number of unmatched molecules containing no core scaffold(s): %d" % RGroupsMolUnmatchedCount) 318 319 def RetrieveMolecules(): 320 """Retrieve molecules.""" 321 322 Infile = OptionsInfo["Infile"] 323 324 # Read molecules... 325 MiscUtil.PrintInfo("\nReading file %s..." % Infile) 326 OptionsInfo["InfileParams"]["AllowEmptyMols"] = False 327 ValidMols, MolCount, ValidMolCount = RDKitUtil.ReadAndValidateMolecules(Infile, **OptionsInfo["InfileParams"]) 328 329 MiscUtil.PrintInfo("Total number of molecules: %d" % MolCount) 330 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 331 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount)) 332 333 return ValidMols 334 335 def ProcessDecompositionParameters(): 336 """Set up and process decomposition parameters.""" 337 338 SetupDecompositionParameters() 339 ProcessSpecifiedDecompositionParameters() 340 341 def SetupDecompositionParameters(): 342 """Set up default decomposition parameters.""" 343 344 OptionsInfo["DecompositionParams"] = {"RGroupCoreAlignment": rgd.RGroupCoreAlignment.MCS, "RGroupMatching": rgd.RGroupMatching.GreedyChunks, "chunkSize": 5, "matchOnlyAtRGroups": False, "removeHydrogenOnlyGroups": True, "removeHydrogensPostMatch": False} 345 346 def ProcessSpecifiedDecompositionParameters(): 347 """Process specified decomposition parameters.""" 348 349 if re.match("^auto$", OptionsInfo["SpecifiedDecompositionParams"], re.I): 350 # Nothing to process... 351 return 352 353 # Parse specified parameters... 354 DecompositionParams = re.sub(" ", "", OptionsInfo["SpecifiedDecompositionParams"]) 355 if not DecompositionParams: 356 MiscUtil.PrintError("No valid parameter name and value pairs specified using \"-d, --decompositionParams\" option.") 357 358 DecompositionParamsWords = DecompositionParams.split(",") 359 if len(DecompositionParamsWords) % 2: 360 MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"-d, --decompositionParams\" option must be an even number." % (len(DecompositionParamsWords))) 361 362 # Setup canonical parameter names... 363 ValidParamNames = [] 364 CanonicalParamNamesMap = {} 365 for ParamName in sorted(OptionsInfo["DecompositionParams"]): 366 ValidParamNames.append(ParamName) 367 CanonicalParamNamesMap[ParamName.lower()] = ParamName 368 369 # Validate and set paramater names and value... 370 for Index in range(0, len(DecompositionParamsWords), 2): 371 Name = DecompositionParamsWords[Index] 372 Value = DecompositionParamsWords[Index + 1] 373 374 CanonicalName = Name.lower() 375 if not CanonicalName in CanonicalParamNamesMap: 376 MiscUtil.PrintError("The parameter name, %s, specified using \"-d, --decompositionParams\" option is not a valid name. Supported parameter names: %s" % (Name, " ".join(ValidParamNames))) 377 378 ParamName = CanonicalParamNamesMap[CanonicalName] 379 if re.match("^RGroupCoreAlignment$", ParamName, re.I): 380 if re.match("^MCS$", Value, re.I): 381 ParamValue = rgd.RGroupCoreAlignment.MCS 382 elif re.match("^None$", Value, re.I): 383 ParamValue = rgd.RGroupCoreAlignment.names['None'] 384 else: 385 MiscUtil.PrintError("The parameter value, %s, specified using \"-d, --decompositionParams\" option for parameter, %s, is not a valid value. Supported values: MCS None" % (Value, Name)) 386 elif re.match("^RGroupMatching$", ParamName, re.I): 387 if re.match("^Greedy$", Value, re.I): 388 ParamValue = rgd.RGroupMatching.Greedy 389 elif re.match("^GreedyChunks$", Value, re.I): 390 ParamValue = rgd.RGroupMatching.GreedyChunks 391 elif re.match("^Exhaustive$", Value, re.I): 392 ParamValue = rgd.RGroupMatching.Exhaustive 393 else: 394 MiscUtil.PrintError("The parameter value, %s, specified using \"-d, --decompositionParams\" option for parameter, %s, is not a valid value. Supported values: Greedy GreedyChunks Exhaustive" % (Value, Name)) 395 elif re.match("^chunkSize$", ParamName, re.I): 396 Value = int(Value) 397 if Value <= 0 : 398 MiscUtil.PrintError("The parameter value, %s, specified using \"-d, --decompositionParams\" option for parameter, %s, is not a valid value. Supported values: > 0" % (Value, Name)) 399 ParamValue = Value 400 else: 401 if not re.match("^(Yes|No|True|False)$", Value, re.I): 402 MiscUtil.PrintError("The parameter value, %s, specified using \"-d, --decompositionParams\" option for parameter, %s, is not a valid value. Supported values: Yes No True False" % (Value, Name)) 403 ParamValue = False 404 if re.match("^(Yes|True)$", Value, re.I): 405 ParamValue = True 406 407 # Set value... 408 OptionsInfo["DecompositionParams"][ParamName] = ParamValue 409 410 def ProcessMCSParameters(): 411 """Set up and process MCS parameters.""" 412 413 SetupMCSParameters() 414 ProcessSpecifiedMCSParameters() 415 416 def SetupMCSParameters(): 417 """Set up default MCS parameters.""" 418 419 OptionsInfo["MCSParams"] = {"MaximizeBonds": True, "Threshold": 0.9, "TimeOut": 3600, "Verbose": False, "MatchValences": True, "MatchChiralTag": False, "RingMatchesRingOnly": True, "CompleteRingsOnly": True, "AtomCompare": rdFMCS.AtomCompare.CompareElements, "BondCompare": rdFMCS.BondCompare.CompareOrder, "SeedSMARTS": "", "MinNumAtoms": 1, "MinNumBonds": 0} 420 421 def ProcessSpecifiedMCSParameters(): 422 """Process specified MCS parameters.""" 423 424 if re.match("^auto$", OptionsInfo["SpecifiedMCSParams"], re.I): 425 # Nothing to process... 426 return 427 428 # Parse specified parameters... 429 MCSParams = re.sub(" ", "", OptionsInfo["SpecifiedMCSParams"]) 430 if not MCSParams: 431 MiscUtil.PrintError("No valid parameter name and value pairs specified using \"-m, --mcsParams\" option.") 432 433 MCSParamsWords = MCSParams.split(",") 434 if len(MCSParamsWords) % 2: 435 MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"-m, --mcsParams\" option must be an even number." % (len(MCSParamsWords))) 436 437 # Setup canonical parameter names... 438 ValidParamNames = [] 439 CanonicalParamNamesMap = {} 440 for ParamName in sorted(OptionsInfo["MCSParams"]): 441 ValidParamNames.append(ParamName) 442 CanonicalParamNamesMap[ParamName.lower()] = ParamName 443 444 # Validate and set paramater names and value... 445 for Index in range(0, len(MCSParamsWords), 2): 446 Name = MCSParamsWords[Index] 447 Value = MCSParamsWords[Index + 1] 448 449 CanonicalName = Name.lower() 450 if not CanonicalName in CanonicalParamNamesMap: 451 MiscUtil.PrintError("The parameter name, %s, specified using \"-m, --mcsParams\" option is not a valid name. Supported parameter names: %s" % (Name, " ".join(ValidParamNames))) 452 453 ParamName = CanonicalParamNamesMap[CanonicalName] 454 if re.match("^Threshold$", ParamName, re.I): 455 Value = float(Value) 456 if Value <= 0.0 or Value > 1.0 : 457 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option for parameter, %s, is not a valid value. Supported values: > 0 and <= 1.0" % (Value, Name)) 458 ParamValue = Value 459 elif re.match("^Timeout$", ParamName, re.I): 460 Value = int(Value) 461 if Value <= 0: 462 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option for parameter, %s, is not a valid value. Supported values: > 0" % (Value, Name)) 463 ParamValue = Value 464 elif re.match("^MinNumAtoms$", ParamName, re.I): 465 Value = int(Value) 466 if Value < 1: 467 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option for parameter, %s, is not a valid value. Supported values: >= 1" % (Value, Name)) 468 ParamValue = Value 469 elif re.match("^MinNumBonds$", ParamName, re.I): 470 Value = int(Value) 471 if Value < 0: 472 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option for parameter, %s, is not a valid value. Supported values: >=0 " % (Value, Name)) 473 ParamValue = Value 474 elif re.match("^AtomCompare$", ParamName, re.I): 475 if re.match("^CompareAny$", Value, re.I): 476 ParamValue = rdFMCS.AtomCompare.CompareAny 477 elif re.match("^CompareElements$", Value, re.I): 478 ParamValue = Chem.rdFMCS.AtomCompare.CompareElements 479 elif re.match("^CompareIsotopes$", Value, re.I): 480 ParamValue = Chem.rdFMCS.AtomCompare.CompareIsotopes 481 else: 482 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option for parameter, %s, is not a valid value. Supported values: CompareAny CompareElements CompareIsotopes" % (Value, Name)) 483 elif re.match("^BondCompare$", ParamName, re.I): 484 if re.match("^CompareAny$", Value, re.I): 485 ParamValue = Chem.rdFMCS.BondCompare.CompareAny 486 elif re.match("^CompareOrder$", Value, re.I): 487 ParamValue = rdFMCS.BondCompare.CompareOrder 488 elif re.match("^CompareOrderExact$", Value, re.I): 489 ParamValue = rdFMCS.BondCompare.CompareOrderExact 490 else: 491 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option for parameter, %s, is not a valid value. Supported values: CompareAny CompareOrder CompareOrderExact" % (Value, Name)) 492 elif re.match("^SeedSMARTS$", ParamName, re.I): 493 if not len(Value): 494 MiscUtil.PrintError("The parameter value specified using \"-m, --mcsParams\" option for parameter, %s, is empty. " % (Name)) 495 ParamValue = Value 496 else: 497 if not re.match("^(Yes|No|True|False)$", Value, re.I): 498 MiscUtil.PrintError("The parameter value, %s, specified using \"-m, --mcsParams\" option for parameter, %s, is not a valid value. Supported values: Yes No True False" % (Value, Name)) 499 ParamValue = False 500 if re.match("^(Yes|True)$", Value, re.I): 501 ParamValue = True 502 503 # Set value... 504 OptionsInfo["MCSParams"][ParamName] = ParamValue 505 506 def ProcessOptions(): 507 """Process and validate command line arguments and options.""" 508 509 MiscUtil.PrintInfo("Processing options...") 510 511 # Validate options... 512 ValidateOptions() 513 514 OptionsInfo["CoreScaffold"] = Options["--coreScaffold"] 515 516 OptionsInfo["Infile"] = Options["--infile"] 517 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 518 519 OptionsInfo["Outfile"] = Options["--outfile"] 520 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 521 522 TextOutFileMode = False 523 TextOutFileDelim = "" 524 525 if MiscUtil.CheckFileExt(Options["--outfile"], "csv"): 526 TextOutFileMode = True 527 TextOutFileDelim = "," 528 elif MiscUtil.CheckFileExt(Options["--outfile"], "tsv txt"): 529 TextOutFileMode = True 530 TextOutFileDelim = "\t" 531 532 OptionsInfo["TextOutFileMode"] = TextOutFileMode 533 OptionsInfo["TextOutFileDelim"] = TextOutFileDelim 534 535 TextOutQuote = False 536 if re.match("^auto$", Options["--quote"], re.I): 537 if MiscUtil.CheckFileExt(Options["--outfile"], "csv"): 538 TextOutQuote = True 539 else: 540 if re.match("^yes$", Options["--quote"], re.I): 541 TextOutQuote = True 542 OptionsInfo["TextOutQuote"] = TextOutQuote 543 544 OptionsInfo["Overwrite"] = Options["--overwrite"] 545 546 RemoveUnmatchedMode = False 547 UnmatchedOutfile = None 548 if re.match("^yes$", Options["--removeUnmatched"], re.I): 549 RemoveUnmatchedMode = True 550 FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"]) 551 UnmatchedOutfile = "%sUnmatched.%s" % (FileName, FileExt) 552 OptionsInfo["RemoveUnmatchedMode"] = RemoveUnmatchedMode 553 OptionsInfo["UnmatchedOutfile"] = UnmatchedOutfile 554 555 OptionsInfo["SpecifiedDecompositionParams"] = Options["--decompositionParams"] 556 ProcessDecompositionParameters() 557 558 OptionsInfo["SpecifiedMCSParams"] = Options["--mcsParams"] 559 ProcessMCSParameters() 560 561 SMARTSOrSMILESCoreScaffold = "" 562 SMARTSOrSMILESCoreScaffoldList = [] 563 if not re.match("^none$", Options["--smartsOrSmilesCoreScaffold"], re.I) or len(Options["--smartsOrSmilesCoreScaffold"]): 564 if re.match("^(BySMARTS|BySMILES)$", Options["--coreScaffold"], re.I): 565 SMARTSOrSMILESCoreScaffold = re.sub(" ", "", Options["--smartsOrSmilesCoreScaffold"]) 566 if not SMARTSOrSMILESCoreScaffold: 567 MiscUtil.PrintError("A non empty value must be specified for \"-s, --smartsOrSmilesCoreScaffold\" during %s value of \"-c, --coreScaffold\" option " % (Options["--coreScaffold"])) 568 SMARTSOrSMILESCoreScaffoldList = SMARTSOrSMILESCoreScaffold.split(",") 569 OptionsInfo["SMARTSOrSMILESCoreScaffold"] = SMARTSOrSMILESCoreScaffold 570 OptionsInfo["SMARTSOrSMILESCoreScaffoldList"] = SMARTSOrSMILESCoreScaffoldList 571 572 def RetrieveOptions(): 573 """Retrieve command line arguments and options.""" 574 575 # Get options... 576 global Options 577 Options = docopt(_docoptUsage_) 578 579 # Set current working directory to the specified directory... 580 WorkingDir = Options["--workingdir"] 581 if WorkingDir: 582 os.chdir(WorkingDir) 583 584 # Handle examples option... 585 if "--examples" in Options and Options["--examples"]: 586 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 587 sys.exit(0) 588 589 def ValidateOptions(): 590 """Validate option values.""" 591 592 MiscUtil.ValidateOptionTextValue("-c, --coreScaffold", Options["--coreScaffold"], "ByMCS BySMARTS BySMILES") 593 594 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 595 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi csv tsv txt") 596 597 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd csv tsv txt") 598 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 599 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 600 601 if re.match("^none$", Options["--smartsOrSmilesCoreScaffold"], re.I) or not len(Options["--smartsOrSmilesCoreScaffold"]): 602 if re.match("^(BySMARTS|BySMILES)$", Options["--coreScaffold"], re.I): 603 MiscUtil.PrintError("A non empty value must be specified for \"-s, --smartsOrSmilesCoreScaffold\" during %s value of \"-c, --coreScaffold\" option " % (Options["--coreScaffold"])) 604 else: 605 if not re.match("^(BySMARTS|BySMILES)$", Options["--coreScaffold"], re.I): 606 MiscUtil.PrintError("%s value of \"-s, --smartsOrSmilesCoreScaffold\" is not allowed during %s value of \"-c, --coreScaffold\" option " % (Options["--smartsOrSmilesCoreScaffold"], Options["--coreScaffold"])) 607 608 MiscUtil.ValidateOptionTextValue("-q, --quote", Options["--quote"], "yes no auto") 609 MiscUtil.ValidateOptionTextValue("-r, --removeUnmatched", Options["--removeUnmatched"], "yes no") 610 611 # Setup a usage string for docopt... 612 _docoptUsage_ = """ 613 RDKitPerformRGroupDecomposition.py - Perform R group decomposition analysis 614 615 Usage: 616 RDKitPerformRGroupDecomposition.py [--coreScaffold <ByMCS, BySMARTS or BySMILES>] 617 [--decompositionParams <Name,Value,...>] 618 [--infileParams <Name,Value,...>] [--mcsParams <Name,Value,...>] 619 [--outfileParams <Name,Value,...>] [--overwrite] [--quote <yes or no>] 620 [--removeUnmatched <yes or no>] [--smartsOrSmilesCoreScaffold <text>] 621 [-w <dir>] -i <infile> -o <outfile> 622 RDKitPerformRGroupDecomposition.py -h | --help | -e | --examples 623 624 Description: 625 Perform R group decomposition for a set of molecules in a series containing 626 a common core scaffold. The core scaffold is identified by a SMARTS string, 627 SMILES string, or using maximum common substructure (MCS) search. 628 Multiple core scaffolds may be specified using SMARTS or SMILES strings for 629 set of molecules corresponding to multiple series. 630 631 The core scaffolds along with appropriate R groups are written out as SMILES 632 strings to a SD or text file. The unmatched molecules without any specified 633 core scaffold are written to a different output file. 634 635 The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi, 636 .txt, .csv, .tsv) 637 638 The supported output file formats are: SD (.sdf, .sd), CSV/TSV (.csv, .tsv, .txt) 639 640 Options: 641 -c, --coreScaffold <ByMCS, BySMARTS or BySMILES> [default: ByMCS] 642 Specify a core scaffold for a set of molecules in a series. The core scaffold 643 is identified by an explicit SMARTS string, SMILES string, or using maximum 644 common substructure (MCS) search. Multiple core scaffolds may be specified 645 using SMARTS or SMILES strings for set of molecules corresponding to multiple 646 series. 647 -d, --decompositionParams <Name,Value,...> [default: auto] 648 Parameter values to use during R group decomposition for a series of molecules. 649 In general, it is a comma delimited list of parameter name and value pairs. The 650 supported parameter names along with their default values are shown below: 651 652 RGroupCoreAlignment,MCS, RGroupMatching,GreedyChunks,chunkSize,5, 653 matchOnlyAtRGroups,no,removeHydrogenOnlyGroups,yes, 654 removeHydrogensPostMatch,no 655 656 A brief description of each supported parameter taken from RDKit documentation, 657 along with their possible values, is as follows. 658 659 RGroupCoreAlignment - Mapping of core labels: 660 661 MCS - Map core labels to each other using MCS 662 None - No mapping 663 664 RGroupMatching: Greedy, GreedyChunks, Exhaustive 665 666 matchOnlyAtRGroups - Allow R group decomposition only at specified R groups. 667 Possible values: yes, no. 668 669 removeHydrogenOnlyGroups - Remove all R groups that only have hydrogens. 670 Possible values: yes, no. 671 672 removeHydrogensPostMatch - Remove all hydrogens from the output molecules. 673 Possible values: yes, no. 674 -e, --examples 675 Print examples. 676 -h, --help 677 Print this help message. 678 -i, --infile <infile> 679 Input file name. 680 --infileParams <Name,Value,...> [default: auto] 681 A comma delimited list of parameter name and value pairs for reading 682 molecules from files. The supported parameter names for different file 683 formats, along with their default values, are shown below: 684 685 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 686 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 687 smilesTitleLine,auto,sanitize,yes 688 689 Possible values for smilesDelimiter: space, comma or tab. 690 -m, --mcsParams <Name,Value,...> [default: auto] 691 Parameter values to use for identifying a maximum common substructure 692 (MCS) in a series of molecules. In general, it is a comma delimited list of 693 parameter name and value pairs. The supported parameter names along with 694 their default values are shown below: 695 696 atomCompare,CompareElements,bondCompare,CompareOrder, 697 maximizeBonds,yes,matchValences,yes,matchChiralTag,no, 698 minNumAtoms,1,minNumBonds,0,ringMatchesRingOnly,yes, 699 completeRingsOnly,yes,threshold,1.0,timeOut,3600,seedSMARTS,none 700 701 Possible values for atomCompare: CompareAny, CompareElements, 702 CompareIsotopes. Possible values for bondCompare: CompareAny, 703 CompareOrder, CompareOrderExact. 704 705 A brief description of MCS parameters taken from RDKit documentation is 706 as follows: 707 708 atomCompare - Controls match between two atoms 709 bondCompare - Controls match between two bonds 710 maximizeBonds - Maximize number of bonds instead of atoms 711 matchValences - Include atom valences in the MCS match 712 matchChiralTag - Include atom chirality in the MCS match 713 minNumAtoms - Minimum number of atoms in the MCS match 714 minNumBonds - Minimum number of bonds in the MCS match 715 ringMatchesRingOnly - Ring bonds only match other ring bonds 716 completeRingsOnly - Partial rings not allowed during the match 717 threshold - Fraction of the dataset that must contain the MCS 718 seedSMARTS - SMARTS string as the seed of the MCS 719 timeout - Timeout for the MCS calculation in seconds 720 721 -o, --outfile <outfile> 722 Output file name. 723 --outfileParams <Name,Value,...> [default: auto] 724 A comma delimited list of parameter name and value pairs for writing 725 molecules to files. The supported parameter names for different file 726 formats, along with their default values, are shown below: 727 728 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 729 SMILES: smilesKekulize,no,smilesIsomeric,yes 730 731 Default value for compute2DCoords: yes for SMILES input file; no for all other 732 file types. The kekulize and smilesIsomeric parameters are also used during 733 generation of SMILES strings for CSV/TSV files. 734 --overwrite 735 Overwrite existing files. 736 -q, --quote <yes or no> [default: auto] 737 Quote SMILES strings and molecule names before writing them out to text 738 files. Possible values: yes or no. Default: yes for CSV (.csv) text files; no for 739 TSV (.tsv) and TXT (.txt) text files. 740 -r, --removeUnmatched <yes or no> [default: no] 741 Remove unmatched molecules containing no specified core scaffold from the 742 output file and write them to a different output file. 743 -s, --smartsOrSmilesCoreScaffold <text> [default: none] 744 SMARTS or SMILES string to use for core scaffold during 'SMARTS' or 'SMILES' 745 value of '-c, --coreScaffold' option. Multiple core scaffolds may be specified using a 746 comma delimited set of SMARTS or SMILES strings. 747 -w, --workingdir <dir> 748 Location of working directory which defaults to the current directory. 749 750 Examples: 751 To perform R group decomposition for a set of molecules in a series using MCS 752 to identify a core scaffold and write out a CSV file containing R groups, type: 753 754 % RDKitPerformRGroupDecomposition.py -i SampleSeriesD3R.smi 755 -o SampleSeriesD3ROut.csv 756 757 To perform R group decomposition for a set of molecules in a series using a 758 specified core scaffold and write out a SD file containing R groups, type: 759 760 % RDKitPerformRGroupDecomposition.py -c BySMARTS 761 -s "Nc1nccc(-c2cnc(CNCc3ccccc3)c2)n1" -i SampleSeriesD3R.smi 762 -o SampleSeriesD3ROut.sdf 763 764 To perform R group decomposition for a set of molecules in a series using MCS 765 to identify a core scaffold and write out CSV files containing matched and 766 unmatched molecules without quoting values, type: 767 768 % RDKitPerformRGroupDecomposition.py -c ByMCS -r yes -q no 769 -i SampleSeriesD3R.sdf -o SampleSeriesD3ROut.csv 770 771 To perform R group decomposition for a set of molecules in multiple series using 772 specified core scaffolds and write out a TSV file containing R groups, type: 773 774 % RDKitPerformRGroupDecomposition.py -c BySMARTS 775 -s "Nc1nccc(-c2cnc(CNCc3ccccc3)c2)n1,[#6]-[#6]1:[#6]:[#6]:[#6]:[#6]: 776 [#6]:1" -i SampleMultipleSeriesD3R.smi -o 777 SampleMultipleSeriesD3ROut.tsv 778 779 To perform R group decomposition for a set of molecules in a CSV SMILES file, 780 SMILES strings in olumn 1, name in column 2, and write out a CSV file containing 781 R groups, type: 782 783 % RDKitPerformRGroupDecomposition.py --infileParams 784 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 785 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 786 -i SampleSeriesD3R.smi -o SampleSeriesD3ROut.csv 787 788 Author: 789 Manish Sud(msud@san.rr.com) 790 791 See also: 792 RDKitConvertFileFormat.py, RDKitSearchFunctionalGroups.py, RDKitSearchSMARTS.py 793 794 Copyright: 795 Copyright (C) 2025 Manish Sud. All rights reserved. 796 797 The functionality available in this script is implemented using RDKit, an 798 open source toolkit for cheminformatics developed by Greg Landrum. 799 800 This file is part of MayaChemTools. 801 802 MayaChemTools is free software; you can redistribute it and/or modify it under 803 the terms of the GNU Lesser General Public License as published by the Free 804 Software Foundation; either version 3 of the License, or (at your option) any 805 later version. 806 807 """ 808 809 if __name__ == "__main__": 810 main()