#!/bin/env python
#
# File: RDKitEnumerateCompoundLibrary.py
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# The functionality available in this script is implemented using RDKit, an
# open source toolkit for cheminformatics developed by Greg Landrum.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability or fitness
# for a particular purpose.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

from __future__ import print_function

# Add local python path to the global path and import standard library modules...
import os
import sys
sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
import time
import re

# RDKit imports...
try:
    from rdkit import rdBase
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from rdkit.Chem import FunctionalGroups
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
    sys.exit(1)

# MayaChemTools imports...
try:
    from docopt import docopt
    import MiscUtil
    import RDKitUtil
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
    sys.exit(1)

ScriptName = os.path.basename(sys.argv[0])

# Raw docopt result and the processed/validated option values derived from it...
Options = {}
OptionsInfo = {}

# Reaction names ('Names') and their SMARTS patterns ('SMARTSPattern') read
# from the reaction names file...
RxnNamesMap = {}

def main():
    """Start execution of the script."""

    MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))

    (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()

    # Retrieve command line arguments and options...
    RetrieveOptions()

    # Process and validate command line arguments and options...
    ProcessOptions()

    # Perform actions required by the script...
    PerformChemicalLibraryEnumeration()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
    MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))

def PerformChemicalLibraryEnumeration():
    """Resolve a named reaction to its SMARTS, if needed, and enumerate the
    compound library."""

    ProcessReactionNamesInfo()
    PerformEnumeration()

def PerformEnumeration():
    """Enumerate virtual compound library.

    Set up the reaction from the SMARTS pattern selected by the mode option,
    check that the number of reactant files matches the number of reactant
    templates, read the reactants, and write out each product molecule along
    with a summary of total, valid, and ignored product counts.
    """

    ReactantFilesList = OptionsInfo["ReactantFilesList"]
    Outfile = OptionsInfo["Outfile"]

    if OptionsInfo["RxnByNameMode"]:
        RxnSMARTSPattern = OptionsInfo["RxnNameSMARTS"]
    else:
        RxnSMARTSPattern = OptionsInfo["SpecifiedSMARTS"]

    # Set up a reaction and match number of reactants in rxn SMARTS against number of
    # reactant files...
    # NOTE(review): the code below relies on MiscUtil.PrintError terminating the
    # script; confirm against MiscUtil.
    MiscUtil.PrintInfo("\nValidating reaction SMARTS...")
    try:
        Rxn = AllChem.ReactionFromSmarts(RxnSMARTSPattern)
    except Exception as ErrMsg:
        MiscUtil.PrintError("Failed to validate reaction SMARTS %s\n%s\n" % (RxnSMARTSPattern, ErrMsg))

    RxnReactantsCount = Rxn.GetNumReactantTemplates()
    ReactantFilesCount = len(ReactantFilesList)
    if ReactantFilesCount != RxnReactantsCount:
        MiscUtil.PrintError("The number of specified reactant files, %d, must match number of reactants, %d, in reaction SMARTS" % (ReactantFilesCount, RxnReactantsCount))

    # Retrieve reactant molecules...
    ReactantsMolsList = RetrieveReactantsMolecules()

    # Set up a molecule writer...
    Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output file %s " % Outfile)

    MiscUtil.PrintInfo("\nGenerating file %s..." % Outfile)

    # Set up reaction. Reactants are returned along with products only when their
    # names are needed to build product names...
    ReturnReactants = bool(OptionsInfo["UseReactantNames"])
    RxnProducts = AllChem.EnumerateLibraryFromReaction(Rxn, ReactantsMolsList, ReturnReactants)

    Compute2DCoords = OptionsInfo["Compute2DCoords"]
    Sanitize = OptionsInfo["Sanitize"]

    ProdMolCount = 0
    ValidProdMolCount = 0

    # Generate product molecules and write them out. The enumeration results are
    # consumed one at a time instead of being collected into a list, so the whole
    # library is never held in memory. The writer is closed even when a product
    # raises an unexpected exception...
    try:
        for RxnResult in RxnProducts:
            if ReturnReactants:
                Products, Reactants = RxnResult
                ReactantMolNames = [ReactantMol.GetProp("_Name") for ReactantMol in Reactants]
                ProdMolNamePrefix = "_".join(ReactantMolNames) + "_"
            else:
                Products = RxnResult
                ProdMolNamePrefix = ""

            for ProdMol in Products:
                ProdMolCount += 1

                # Set product name: <ReactName1>_<ReactName2>..._Prod<Num> or Prod<Num>...
                ProdMol.SetProp("_Name", "%sProd%d" % (ProdMolNamePrefix, ProdMolCount))

                if WriteProductMolecule(Writer, ProdMol, Sanitize, Compute2DCoords):
                    ValidProdMolCount += 1
    finally:
        if Writer is not None:
            Writer.close()

    # Report counts whenever any product was generated, including the case where
    # every product was rejected during sanitization or coordinate generation...
    if ProdMolCount:
        MiscUtil.PrintInfo("\nTotal number of product molecules: %d" % ProdMolCount)
        MiscUtil.PrintInfo("Number of valid product molecules: %d" % ValidProdMolCount)
        MiscUtil.PrintInfo("Number of ignored product molecules: %d" % (ProdMolCount - ValidProdMolCount))
    else:
        MiscUtil.PrintInfo("\nThe compound library enumeration failed to generate any product molecules.\nCheck to make sure the reactants specified in input files match their corresponding specifications in reaction SMARTS and try again.")

def WriteProductMolecule(Writer, ProdMol, Sanitize, Compute2DCoords):
    """Prepare and write out product molecule.

    Arguments:
        Writer: Molecule writer providing a write() method.
        ProdMol: Product molecule to write.
        Sanitize (bool): Sanitize the molecule before writing.
        Compute2DCoords (bool): Compute 2D coordinates before writing.

    Returns:
        bool: True when the molecule was written; False when it was ignored
            because sanitization or 2D coordinate generation failed.
    """

    if Sanitize:
        try:
            Chem.SanitizeMol(ProdMol)
        except (RuntimeError, ValueError):
            MiscUtil.PrintWarning("Ignoring product molecule: Failed to sanitize...\n")
            return False

    if Compute2DCoords:
        try:
            AllChem.Compute2DCoords(ProdMol)
        except (RuntimeError, ValueError):
            MiscUtil.PrintWarning("Ignoring product molecule: Failed to compute 2D coordinates...\n")
            return False

    Writer.write(ProdMol)

    return True

def RetrieveReactantsMolecules():
    """Retrieve reactant molecules from each reactant file and return a list
    containing lists of molecules for each reactant file.

    Missing (None) and empty molecules are skipped. When product names are
    built from reactant names, unnamed reactants are assigned the name
    React<FileNum>Mol<MolNum>.
    """

    MiscUtil.PrintInfo("\nProcessing reactant file(s)...")

    ReactantsMolsList = []
    UseReactantNames = OptionsInfo["UseReactantNames"]

    for FileIndex, ReactantFile in enumerate(OptionsInfo["ReactantFilesList"]):
        ReactantCount = FileIndex + 1

        MiscUtil.PrintInfo("\nProcessing reactant file: %s..." % ReactantFile)

        Mols = RDKitUtil.ReadMolecules(ReactantFile, **OptionsInfo["InfileParams"])

        ValidMols = []
        MolCount = 0
        ValidMolCount = 0

        for Mol in Mols:
            MolCount += 1
            if Mol is None:
                continue

            if RDKitUtil.IsMolEmpty(Mol):
                MolName = RDKitUtil.GetMolName(Mol, MolCount)
                MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
                continue

            ValidMolCount += 1

            # Check and set mol name...
            if UseReactantNames:
                MolName = RDKitUtil.GetMolName(Mol)
                if not len(MolName):
                    MolName = "React%dMol%d" % (ReactantCount, MolCount)
                    Mol.SetProp("_Name", MolName)

            ValidMols.append(Mol)

        ReactantsMolsList.append(ValidMols)

        MiscUtil.PrintInfo("Total number of molecules: %d" % MolCount)
        MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
        MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))

    return ReactantsMolsList

def ProcessReactionNamesInfo():
    """Process reaction names information.

    Nothing is done unless the reaction is specified by name.
    """

    if not OptionsInfo["RxnByNameMode"]:
        return

    RetrieveReactionNamesInfo()
    ProcessSpecifiedReactionName()

def ProcessSpecifiedReactionName():
    """Process and validate specified reaction name.

    The name is matched case-insensitively against the names read from the
    reaction names file and the SMARTS pattern of the matching reaction is
    stored in OptionsInfo["RxnNameSMARTS"].
    """

    OptionsInfo["RxnNameSMARTS"] = None

    # Set up a map of valid rxn names for checking the specified rxn name...
    CanonicalRxnNameMap = {}
    for Name in RxnNamesMap['Names']:
        CanonicalRxnNameMap[Name.lower()] = Name

    CanonicalRxnName = OptionsInfo["RxnName"].lower()
    if CanonicalRxnName not in CanonicalRxnNameMap:
        MiscUtil.PrintError("The rxn name, %s, specified using \"-r, --rxnName\" option is not a valid name." % (OptionsInfo["RxnName"]))

    Name = CanonicalRxnNameMap[CanonicalRxnName]
    OptionsInfo["RxnNameSMARTS"] = RxnNamesMap['SMARTSPattern'][Name]

def ProcessListReactionNamesOption():
    """Process list reaction names information."""

    ProcessReactionNamesFileOption()
    ProcessColumnOptions()

    RetrieveReactionNamesInfo()
    ListAndValidateReactionNamesInfo()

def RetrieveReactionNamesInfo():
    """Retrieve reaction names information.

    Read the reaction names file, skipping its header line, and fill
    RxnNamesMap with reaction names and their SMARTS patterns. Duplicate
    names and lines too short to contain both columns are ignored with a
    warning.
    """

    RxnNamesFile = OptionsInfo["RxnNamesFile"]

    MiscUtil.PrintInfo("\nRetrieving reaction names and SMARTS patterns from file %s" % (RxnNamesFile))

    if not os.path.exists(RxnNamesFile):
        MiscUtil.PrintError("The reaction names file, %s, doesn't exist.\n" % (RxnNamesFile))

    IgnoreHeaderLine = True
    RxnLinesWords = MiscUtil.GetTextLinesWords(RxnNamesFile, OptionsInfo["RxnNamesFileDelimiter"], OptionsInfo["RxnNamesFileQuote"], IgnoreHeaderLine)

    RxnNamesMap['Names'] = []
    RxnNamesMap['SMARTSPattern'] = {}

    RxnNameColindex = OptionsInfo["RxnNameColnum"] - 1
    RxnSMARTSColindex = OptionsInfo["RxnSMARTSColnum"] - 1
    MaxColindex = max(RxnNameColindex, RxnSMARTSColindex)

    for LineWords in RxnLinesWords:
        # A blank or truncated line must not abort the whole run...
        if len(LineWords) <= MaxColindex:
            MiscUtil.PrintWarning("Ignoring line with missing reaction name or SMARTS column: %s..." % LineWords)
            continue

        Name = LineWords[RxnNameColindex]
        SMARTSPattern = LineWords[RxnSMARTSColindex]

        if Name in RxnNamesMap['SMARTSPattern']:
            MiscUtil.PrintWarning("Ignoring duplicate reaction name: %s..." % Name)
        else:
            RxnNamesMap['Names'].append(Name)
            RxnNamesMap['SMARTSPattern'][Name] = SMARTSPattern

    if not len(RxnNamesMap['Names']):
        MiscUtil.PrintError("Failed to retrieve any reaction names and SMARTS patterns...")

    MiscUtil.PrintInfo("Total number of reactions present in reaction names and SMARTS file: %d" % (len(RxnNamesMap['Names'])))

def ListAndValidateReactionNamesInfo():
    """List and validate reaction names information."""

    ListReactionNamesInfo()
    ValidateReactionNamesInfo()

def ListReactionNamesInfo():
    """List reaction names information."""

    MiscUtil.PrintInfo("\nListing available reaction names and SMARTS patterns...")
    MiscUtil.PrintInfo("\nReactionName\tSMARTSPattern")

    RxnCount = 0
    for Name in sorted(RxnNamesMap['Names']):
        RxnCount += 1
        SMARTSPattern = RxnNamesMap['SMARTSPattern'][Name]
        MiscUtil.PrintInfo("%s\t%s" % (Name, SMARTSPattern))

    MiscUtil.PrintInfo("\nTotal number of reactions: %s" % RxnCount)

def ValidateReactionNamesInfo():
    """Validate reaction names information.

    Try to set up a reaction for each SMARTS pattern and report the number of
    valid and invalid reactions.
    """

    MiscUtil.PrintInfo("\nValidating reaction SMARTS patterns...")

    RxnCount = 0
    ValidRxnCount = 0
    for Name in sorted(RxnNamesMap['Names']):
        RxnCount += 1
        SMARTSPattern = RxnNamesMap['SMARTSPattern'][Name]
        try:
            AllChem.ReactionFromSmarts(SMARTSPattern)
            ValidRxnCount += 1
        except Exception as ErrMsg:
            MiscUtil.PrintInfo("\nFailed to validate reaction SMARTS. ReactionName: %s; SMARTSPattern: %s\n%s\n" % (Name, SMARTSPattern, ErrMsg))

    InvalidRxnCount = RxnCount - ValidRxnCount
    MiscUtil.PrintInfo("\nTotal number of reactions: %s\nNumber of valid reactions: %s\nNumber of invalid reactions: %s" % (RxnCount, ValidRxnCount, InvalidRxnCount))

    MiscUtil.PrintInfo("")

def ProcessReactionNamesFileOption():
    """Process reaction names file option.

    Use the specified file or, for 'auto', ReactionNamesAndSMARTS.csv from
    the MayaChemTools data directory.
    """

    RxnNamesFile = None
    if not re.match("^auto$", Options["--rxnNamesFile"], re.I):
        MiscUtil.ValidateOptionFilePath("--rxnNamesFile", Options["--rxnNamesFile"])
        RxnNamesFile = Options["--rxnNamesFile"]

    if RxnNamesFile is None:
        MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath()
        RxnNamesFile = os.path.join(MayaChemToolsDataDir, "ReactionNamesAndSMARTS.csv")

    OptionsInfo["RxnNamesFile"] = RxnNamesFile
    OptionsInfo["RxnNamesFileDelimiter"] = ','
    OptionsInfo["RxnNamesFileQuote"] = '"'

def ProcessColumnOptions():
    """Process column options."""

    ProcessColumnModeOption()
    RetrieveColumnNames()

    ProcessReactionNameColOption()
    ProcessReactionSMARTSColOption()

def ProcessColumnModeOption():
    """Process column mode option."""

    CollabelMode, ColnumMode = [False, False]
    Colmode = Options["--colmode"]
    if re.match("^collabel$", Colmode, re.I):
        CollabelMode = True
    elif re.match("^colnum$", Colmode, re.I):
        ColnumMode = True
    else:
        MiscUtil.PrintError("The value, %s, specified for option \"-c, --colmode\" is not valid. Supported values: collabel or colnum\n" % (Colmode))

    OptionsInfo["Colmode"] = Colmode
    OptionsInfo["CollabelMode"] = CollabelMode
    OptionsInfo["ColnumMode"] = ColnumMode

def RetrieveColumnNames():
    """Retrieve column names.

    The first line of the reaction names file supplies the column names.
    Maps between column names and 1-based column numbers are stored in
    OptionsInfo.
    """

    RxnNamesFile = OptionsInfo["RxnNamesFile"]
    IgnoreHeaderLine = False
    RxnLinesWords = MiscUtil.GetTextLinesWords(RxnNamesFile, OptionsInfo["RxnNamesFileDelimiter"], OptionsInfo["RxnNamesFileQuote"], IgnoreHeaderLine)

    # An empty file yields no lines at all; report it like an empty first line...
    Colnames = RxnLinesWords[0] if len(RxnLinesWords) else []

    if len(Colnames) == 0:
        MiscUtil.PrintError("The first line in reaction names file, %s, is empty. It must contain column names.\n" % OptionsInfo["RxnNamesFile"])

    ColnameToColnumMap = {}
    ColnumToColnameMap = {}
    for ColIndex, Colname in enumerate(Colnames):
        Colnum = ColIndex + 1
        ColnameToColnumMap[Colname] = Colnum
        ColnumToColnameMap[Colnum] = Colname

    OptionsInfo["Colnames"] = Colnames
    OptionsInfo["ColCount"] = len(Colnames)
    OptionsInfo["ColnameToColnumMap"] = ColnameToColnumMap
    OptionsInfo["ColnumToColnameMap"] = ColnumToColnameMap

def ProcessReactionNameColOption():
    """Process reaction name column option.

    For 'auto', the column named 'RxnName' is used.
    """

    RxnNameCol = Options["--colRxnName"]
    if re.match("^auto$", RxnNameCol, re.I):
        Colname = "RxnName"
        if Colname not in OptionsInfo["ColnameToColnumMap"]:
            MiscUtil.PrintError("The reaction name column name, %s, doesn't exist in reaction names file. You must specify a valid reaction name column name or number using \"--colRxnName\" option.\n" % Colname)

        Colnum = OptionsInfo["ColnameToColnumMap"][Colname]
        RxnNameColSpec = Colnum if OptionsInfo["ColnumMode"] else Colname
    else:
        RxnNameColSpec = RxnNameCol

    RxnNameColname, RxnNameColnum = ProcessColumnSpecification("--colRxnName", RxnNameColSpec)

    OptionsInfo["RxnNameCol"] = RxnNameCol
    OptionsInfo["RxnNameColname"] = RxnNameColname
    OptionsInfo["RxnNameColnum"] = RxnNameColnum

def ProcessReactionSMARTSColOption():
    """Process reaction SMARTS column option.

    For 'auto', the column named 'RxnSMARTS' is used.
    """

    RxnSMARTSCol = Options["--colRxnSMARTS"]
    if re.match("^auto$", RxnSMARTSCol, re.I):
        Colname = "RxnSMARTS"
        if Colname not in OptionsInfo["ColnameToColnumMap"]:
            MiscUtil.PrintError("The reaction SMARTS column name, %s, doesn't exist in reaction names file. You must specify a valid reaction SMARTS column name or number using \"--colRxnSMARTS\" option.\n" % Colname)

        Colnum = OptionsInfo["ColnameToColnumMap"][Colname]
        RxnSMARTSColSpec = Colnum if OptionsInfo["ColnumMode"] else Colname
    else:
        RxnSMARTSColSpec = RxnSMARTSCol

    RxnSMARTSColname, RxnSMARTSColnum = ProcessColumnSpecification("--colRxnSMARTS", RxnSMARTSColSpec)

    OptionsInfo["RxnSMARTSCol"] = RxnSMARTSCol
    OptionsInfo["RxnSMARTSColname"] = RxnSMARTSColname
    OptionsInfo["RxnSMARTSColnum"] = RxnSMARTSColnum

def ProcessColumnSpecification(OptionName, Colspec):
    """Process column specification corresponding to a column name or number.

    Arguments:
        OptionName (str): Command line option name used in error messages.
        Colspec (str or int): Column number in 'colnum' mode; column name in
            'collabel' mode.

    Returns:
        tuple: Column name and 1-based column number.
    """

    Colname, Colnum = [None, None]
    if OptionsInfo["ColnumMode"]:
        # Report a non-numeric value as an option error instead of a traceback...
        try:
            Colnum = int(Colspec)
        except (TypeError, ValueError):
            MiscUtil.PrintError("The column number, %s, specified using \"%s\" option is not valid. You must specify a valid column number. Valid values: >= 1 and <= %s\n" % (Colspec, OptionName, OptionsInfo["ColCount"]))

        if Colnum not in OptionsInfo["ColnumToColnameMap"]:
            MiscUtil.PrintError("The column number, %s, specified using \"%s\" option doesn't exist in reaction names file. You must specify a valid column number. Valid values: >= 1 and <= %s\n" % (Colnum, OptionName, OptionsInfo["ColCount"]))
        Colname = OptionsInfo["ColnumToColnameMap"][Colnum]
    else:
        Colname = Colspec
        if Colname not in OptionsInfo["ColnameToColnumMap"]:
            MiscUtil.PrintError("The column name, %s, specified using \"%s\" option doesn't exist in reaction names file. You must specify a valid column name. Valid values: %s\n" % (Colname, OptionName, " ".join(OptionsInfo["Colnames"])))
        Colnum = OptionsInfo["ColnameToColnumMap"][Colname]

    return (Colname, Colnum)


def ProcessOptions():
    """Process and validate command line arguments and options."""

    MiscUtil.PrintInfo("Processing options...")

    # Validate options...
    ValidateOptions()

    OptionsInfo["Compute2DCoords"] = bool(re.match("^yes$", Options["--compute2DCoords"], re.I))

    OptionsInfo["Mode"] = Options["--mode"]
    OptionsInfo["RxnByNameMode"] = bool(re.match("^RxnByName$", Options["--mode"], re.I))

    OptionsInfo["ProdMolNamesMode"] = Options["--prodMolNames"]
    OptionsInfo["UseReactantNames"] = bool(re.match("^UseReactants$", Options["--prodMolNames"], re.I))

    OptionsInfo["RxnName"] = Options["--rxnName"]
    OptionsInfo["RxnNameSMARTS"] = None
    if OptionsInfo["RxnByNameMode"]:
        if not Options["--rxnName"]:
            MiscUtil.PrintError("No rxn name specified using \"-r, --rxnName\" option during \"RxnByName\" value of \"-m, --mode\" option")

        ProcessReactionNamesFileOption()
        ProcessColumnOptions()

    ReactantFiles = re.sub(" ", "", Options["--infiles"])
    ReactantFilesList = ReactantFiles.split(",")
    OptionsInfo["ReactantFiles"] = ReactantFiles
    OptionsInfo["ReactantFilesList"] = ReactantFilesList

    OptionsInfo["SpecifiedSMARTS"] = Options["--smartsRxn"]
    if not OptionsInfo["RxnByNameMode"]:
        if not Options["--smartsRxn"]:
            MiscUtil.PrintError("No rxn SMARTS pattern specified using \"-s, --smartsRxn\" option during \"RxnBySMARTS\" value of \"-m, --mode\" option")

    OptionsInfo["Outfile"] = Options["--outfile"]
    OptionsInfo["Overwrite"] = Options["--overwrite"]

    # Use first reactant file as input file as all input files have the same format...
    OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], ReactantFilesList[0])

    # No need to pass any input or output file name due to absence of any auto parameter...
    OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"])

    OptionsInfo["Sanitize"] = bool(re.match("^yes$", Options["--sanitize"], re.I))

def RetrieveOptions():
    """Retrieve command line arguments and options.

    The examples and list options are handled here and end the script.
    """

    # Get options...
    global Options
    Options = docopt(_docoptUsage_)

    # Set current working directory to the specified directory...
    WorkingDir = Options["--workingdir"]
    if WorkingDir:
        os.chdir(WorkingDir)

    # Handle examples option...
    if "--examples" in Options and Options["--examples"]:
        MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
        sys.exit(0)

    # Handle listing of reaction names information...
    if Options and Options["--list"]:
        ProcessListReactionNamesOption()
        sys.exit(0)

def ValidateOptions():
    """Validate option values."""

    MiscUtil.ValidateOptionTextValue("-c, --colmode", Options["--colmode"], "collabel colnum")

    MiscUtil.ValidateOptionTextValue("--compute2DCoords", Options["--compute2DCoords"], "yes no")

    MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "RxnByName RxnBySMARTS")
    MiscUtil.ValidateOptionTextValue("-p, --prodMolNames", Options["--prodMolNames"], "UseReactants Sequential")

    if not re.match("^auto$", Options["--rxnNamesFile"], re.I):
        MiscUtil.ValidateOptionFilePath("--rxnNamesFile", Options["--rxnNamesFile"])

    MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
    MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])

    ReactantFiles = re.sub(" ", "", Options["--infiles"])
    if not ReactantFiles:
        MiscUtil.PrintError("No reactant files specified for \"-i, --infiles\" option")

    ReactantFilesList = ReactantFiles.split(",")

    # Validate file extensions...
    for ReactantFile in ReactantFilesList:
        MiscUtil.ValidateOptionFilePath("-i, --infiles", ReactantFile)
        MiscUtil.ValidateOptionFileExt("-i, --infiles", ReactantFile, "sdf sd smi csv tsv txt")
        MiscUtil.ValidateOptionsDistinctFileNames("-i, --infiles", ReactantFile, "-o, --outfile", Options["--outfile"])

    # Match file formats: every reactant file must have the format of the first one...
    FirstFileFormat = None
    for ReactantFile in ReactantFilesList:
        FileFormat = ""
        if MiscUtil.CheckFileExt(ReactantFile, "sdf sd"):
            FileFormat = "SD"
        elif MiscUtil.CheckFileExt(ReactantFile, "smi csv tsv txt"):
            FileFormat = "SMILES"
        else:
            MiscUtil.PrintError("The file name specified , %s, for option \"-i, --infiles\" is not valid. Supported file formats: sdf sd smi csv tsv txt\n" % ReactantFile)

        if FirstFileFormat is None:
            FirstFileFormat = FileFormat
            continue

        if FileFormat != FirstFileFormat:
            MiscUtil.PrintError("All reactant file names - %s - specified using option \"-i, --infiles\" must have the same file format.\n" % ReactantFiles)

    MiscUtil.ValidateOptionTextValue("--sanitize", Options["--sanitize"], "yes no")

# Setup a usage string for docopt...
#
# NOTE(review): the whitespace in this string is significant to docopt 0.6.x:
# two spaces must separate an option from its "[default: ...]" text, and the
# blank-looking separator lines inside the "Options:" section must contain
# whitespace, as a truly empty line ends the section. Confirm against the
# bundled docopt before reformatting.
_docoptUsage_ = """
RDKitEnumerateCompoundLibrary.py - Enumerate a virtual compound library

Usage:
    RDKitEnumerateCompoundLibrary.py [--colmode <collabel or colnum>] [--colRxnName <text or number>]
                                     [--colRxnSMARTS <text or number>] [--compute2DCoords <yes or no>] [--infileParams <Name,Value,...>]
                                     [--mode <RxnByName or RxnBySMARTS>] [--outfileParams <Name,Value,...>] [--overwrite]
                                     [--prodMolNames <UseReactants or Sequential>] [--rxnName <text>]
                                     [--rxnNamesFile <FileName or auto>] [--smartsRxn <text>] [--sanitize <yes or no>]
                                     [-w <dir>] -i <ReactantFile1,...> -o <outfile>
    RDKitEnumerateCompoundLibrary.py [--colmode <collabel or colnum>] [--colRxnName <text or number>] [--colRxnSMARTS <text or number>]
                                     [--rxnNamesFile <FileName or auto>] -l | --list
    RDKitEnumerateCompoundLibrary.py -h | --help | -e | --examples

Description:
    Perform a combinatorial enumeration of a virtual library of molecules for a reaction specified
    using a reaction name or SMARTS pattern and reactant input files.

    The SMARTS patterns for supported reactions names [ Ref 134 ] are retrieved from file,
    ReactionNamesAndSMARTS.csv, available in MayaChemTools data directory. The current
    list of supported reaction names is shown below:

    '1,2,4_triazole_acetohydrazide', '1,2,4_triazole_carboxylic_acid_ester', 3_nitrile_pyridine,
    Benzimidazole_derivatives_aldehyde, Benzimidazole_derivatives_carboxylic_acid_ester,
    Benzofuran, Benzothiazole, Benzothiophene, Benzoxazole_aromatic_aldehyde,
    Benzoxazole_carboxylic_acid, Buchwald_Hartwig, Decarboxylative_coupling, Fischer_indole,
    Friedlaender_chinoline, Grignard_alcohol, Grignard_carbonyl, Heck_non_terminal_vinyl,
    Heck_terminal_vinyl, Heteroaromatic_nuc_sub, Huisgen_Cu_catalyzed_1,4_subst,
    Huisgen_disubst_alkyne, Huisgen_Ru_catalyzed_1,5_subst, Imidazole, Indole, Mitsunobu_imide,
    Mitsunobu_phenole, Mitsunobu_sulfonamide, Mitsunobu_tetrazole_1, Mitsunobu_tetrazole_2,
    Mitsunobu_tetrazole_3, Mitsunobu_tetrazole_4, N_arylation_heterocycles, Negishi,
    Niementowski_quinazoline, Nucl_sub_aromatic_ortho_nitro, Nucl_sub_aromatic_para_nitro,
    Oxadiazole, Paal_Knorr_pyrrole, Phthalazinone, Pictet_Spengler, Piperidine_indole,
    Pyrazole, Reductive_amination, Schotten_Baumann_amide, Sonogashira, Spiro_chromanone,
    Stille, Sulfon_amide, Suzuki, Tetrazole_connect_regioisomer_1, Tetrazole_connect_regioisomer_2,
    Tetrazole_terminal, Thiazole, Thiourea, Triaryl_imidazole, Urea, Williamson_ether, Wittig

    The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt)

    The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)

Options:
    -c, --colmode <collabel or colnum>  [default: collabel]
        Use column number or name for the specification of columns in a CSV
        file containing reaction names along with reaction SMARTS. You may
        specify a reaction names file using '--rxnNamesFile' option.
    --colRxnName <text or number>  [default: auto]
        Column name or number corresponding to reaction names. The default value
        is automatically set based on the value of '-c, --colmode': 'RxnName' for
        'collabel'; Reaction name column number for 'colnum'.
    --colRxnSMARTS <text or number>  [default: auto]
        Column name or number corresponding to reaction SMARTS strings. The default
        value is automatically set based on the value of '-c, --colmode': 'RxnSMARTS'
        for 'collabel'; Reaction SMARTS column number for 'colnum'.
    --compute2DCoords <yes or no>  [default: yes]
        Compute 2D coordinates of product molecules before writing them out.
    -i, --infiles <ReactantFile1, ReactantFile2...>
        Comma delimited list of reactant file names for enumerating a compound library
        using reaction SMARTS. The number of reactant files must match number of
        reaction components in reaction SMARTS. All reactant input files must have
        the same format.
    --infileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for reading
        molecules from files. The supported parameter names for different file
        formats, along with their default values, are shown below:
        
            SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
            SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
                smilesTitleLine,auto,sanitize,yes
        
        Possible values for smilesDelimiter: space, comma or tab. These parameters apply
        to all reactant input files, which must have the same file format.
    -e, --examples
        Print examples.
    -h, --help
        Print this help message.
    -l, --list
        List available reaction names along with corresponding SMARTS patterns without
        performing any enumeration. In addition, reaction SMARTS patterns are validated.
    -m, --mode <RxnByName or RxnBySMARTS>  [default: RxnByName]
        Indicate whether a reaction is specified by a reaction name or a SMARTS pattern.
        Possible values: RxnByName or RxnBySMARTS.
    -o, --outfile <outfile>
        Output file name.
    --outfileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for writing
        molecules to files. The supported parameter names for different file
        formats, along with their default values, are shown below:
        
            SD: kekulize,yes,forceV3000,no
            SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
                smilesTitleLine,yes
        
    -p, --prodMolNames <UseReactants or Sequential>  [default: UseReactants]
        Generate names of product molecules using reactant names or assign names in
        a sequential order. Possible values: UseReactants or Sequential. Format of
        molecule names: UseReactants - <ReactName1>_<ReactName2>..._Prod<Num>;
        Sequential - Prod<Num>
    --overwrite
        Overwrite existing files.
    -r, --rxnName <text>
        Name of a reaction to use for enumerating a compound library. This option
        is only used during 'RxnByName' value of '-m, --mode' option.
    --rxnNamesFile <FileName or auto>  [default: auto]
        Specify a file name containing data for names of reactions and SMARTS patterns or
        use default file, ReactionNamesAndSMARTS.csv, available in MayaChemTools data
        directory.
        
        Default reactions SMARTS file format: RxnName,RxnSMARTS.
        
        The local file format is assumed to be same as the default file format. You may
        explicitly specify column names or numbers for reaction name and reaction
        SMARTS using '--colRxnName' and '--colRxnSMARTS' options.
    -s, --smartsRxn <text>
        SMARTS pattern of a reaction to use for enumerating a compound library. This
        option is only used during 'RxnBySMARTS' value of '-m, --mode' option.
    --sanitize <yes or no>  [default: yes]
        Sanitize product molecules before writing them out.
    -w, --workingdir <dir>
        Location of working directory which defaults to the current directory.

Examples:
    To list all available reaction names along with their SMARTS pattern, type:

        % RDKitEnumerateCompoundLibrary.py -l

    To perform a combinatorial enumeration of a virtual compound library corresponding
    to named amide reaction, Schotten_Baumann_amide, and write out a SMILES file
    type:

        % RDKitEnumerateCompoundLibrary.py -r Schotten_Baumann_amide
          -i 'SampleAcids.smi,SampleAmines.smi' -o SampleOutCmpdLibrary.smi

    To run the previous command using a local reaction names file with explicit
    specification of column names containing reaction names and SMARTS, and write
    out a SMILES file type:

        % RDKitEnumerateCompoundLibrary.py -r Schotten_Baumann_amide
          --rxnNamesFile ReactionNamesAndSMARTS.csv
          --colmode collabel --colRxnName RxnName --colRxnSMARTS RxnSMARTS
          -i 'SampleAcids.smi,SampleAmines.smi' -o SampleOutCmpdLibrary.smi

    To perform a combinatorial enumeration of a virtual compound library corresponding
    to an amide reaction specified using a SMARTS pattern and write out a SD file containing
    sanitized molecules, computed 2D coordinates, and generation of molecule names from
    reactant names, type:

        % RDKitEnumerateCompoundLibrary.py -m RxnBySMARTS
          -s '[O:2]=[C:1][OH].[N:3]>>[O:2]=[C:1][N:3]'
          -i 'SampleAcids.smi,SampleAmines.smi' -o SampleOutCmpdLibrary.sdf

    To perform a combinatorial enumeration of a virtual compound library corresponding
    to an amide reaction specified using a SMARTS pattern and write out a SD file containing
    unsanitized molecules, without generating 2D coordinates, and a sequential generation
    of molecule names, type:

        % RDKitEnumerateCompoundLibrary.py -m RxnBySMARTS --compute2DCoords no --sanitize no
          -p Sequential -s '[O:2]=[C:1][OH].[N:3]>>[O:2]=[C:1][N:3]'
          -i 'SampleAcids.smi,SampleAmines.smi' -o SampleOutCmpdLibrary.sdf

Author:
    Manish Sud(msud@san.rr.com)

See also:
    RDKitConvertFileFormat.py, RDKitFilterPAINS.py, RDKitSearchFunctionalGroups.py,
    RDKitSearchSMARTS.py

Copyright:
    Copyright (C) 2024 Manish Sud. All rights reserved.

    The functionality available in this script is implemented using RDKit, an
    open source toolkit for cheminformatics developed by Greg Landrum.

    This file is part of MayaChemTools.

    MayaChemTools is free software; you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option) any
    later version.

"""

if __name__ == "__main__":
    main()