1 #!/bin/env python 2 # 3 # File: RDKitGenerateMolecularFrameworks.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import csv 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem import AllChem 43 from rdkit.Chem.Scaffolds import MurckoScaffold 44 except ImportError as ErrMsg: 45 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 46 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 47 sys.exit(1) 48 49 # MayaChemTools imports... 50 try: 51 from docopt import docopt 52 import MiscUtil 53 import RDKitUtil 54 except ImportError as ErrMsg: 55 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 56 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 57 sys.exit(1) 58 59 ScriptName = os.path.basename(sys.argv[0]) 60 Options = {} 61 OptionsInfo = {} 62 63 def main(): 64 """Start execution of the script.""" 65 66 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 67 68 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 69 70 # Retrieve command line arguments and options... 71 RetrieveOptions() 72 73 # Process and validate command line arguments and options... 74 ProcessOptions() 75 76 # Perform actions required by the script... 77 GenerateMolecularFrameworks() 78 79 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 80 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 81 82 def GenerateMolecularFrameworks(): 83 """Generate Bemis Murcko molecular framworks.""" 84 85 Infile = OptionsInfo["Infile"] 86 Outfile = OptionsInfo["Outfile"] 87 88 UseChirality = OptionsInfo["UseChirality"] 89 90 RemoveDuplicateFrameworks = OptionsInfo["RemoveDuplicateFrameworks"] 91 UseGraphFrameworks = OptionsInfo["UseGraphFrameworks"] 92 93 SortFrameworks = OptionsInfo["SortFrameworks"] 94 if SortFrameworks: 95 FrameworkMolIDs = [] 96 FrameworkMolIDToMolMap = {} 97 FrameworkMolIDToAtomCountMap = {} 98 99 DuplicateFrameworkMolIDs = [] 100 DuplicateFrameworkMolIDToMolMap = {} 101 DuplicateFrameworkMolIDToAtomCountMap = {} 102 103 DuplicatesOutfile = "" 104 if RemoveDuplicateFrameworks: 105 DuplicatesOutfile = OptionsInfo["DuplicatesOutfile"] 106 107 # Setup a molecule reader... 108 MiscUtil.PrintInfo("\nProcessing file %s..." % Infile) 109 Mols = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"]) 110 111 # Set up a molecular framework writer... 112 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"]) 113 if Writer is None: 114 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 115 116 # Set up a duplicate molecular framework writer... 117 if RemoveDuplicateFrameworks: 118 DuplicatesWriter = RDKitUtil.MoleculesWriter(DuplicatesOutfile, **OptionsInfo["OutfileParams"]) 119 if Writer is None: 120 MiscUtil.PrintError("Failed to setup a writer for duplicates output fie %s " % DuplicatesOutfile) 121 122 if RemoveDuplicateFrameworks: 123 MiscUtil.PrintInfo("Generating files: %s and %s..." % (Outfile, DuplicatesOutfile)) 124 else: 125 MiscUtil.PrintInfo("Generating file %s..." % Outfile) 126 127 # Process molecules... 128 MolCount = 0 129 ValidMolCount = 0 130 131 FrameworksCount = 0 132 UniqueFrameworksCount = 0 133 DuplicateFrameworksCount = 0 134 135 CanonicalSMILESMap = {} 136 137 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 138 139 for Mol in Mols: 140 MolCount += 1 141 142 if Mol is None: 143 continue 144 145 if RDKitUtil.IsMolEmpty(Mol): 146 MolName = RDKitUtil.GetMolName(Mol, MolCount) 147 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 148 continue 149 150 ValidMolCount += 1 151 152 if UseGraphFrameworks: 153 FrameworksMol = MurckoScaffold.MakeScaffoldGeneric(Mol) 154 else: 155 FrameworksMol = MurckoScaffold.GetScaffoldForMol(Mol) 156 157 if Compute2DCoords: 158 AllChem.Compute2DCoords(FrameworksMol) 159 160 if SortFrameworks: 161 HeavyAtomCount = FrameworksMol.GetNumHeavyAtoms() 162 163 FrameworksCount += 1 164 165 if RemoveDuplicateFrameworks: 166 CanonicalSMILES = Chem.MolToSmiles(FrameworksMol, isomericSmiles = UseChirality, canonical = True) 167 if CanonicalSMILES in CanonicalSMILESMap: 168 DuplicateFrameworksCount += 1 169 if SortFrameworks: 170 # Track duplicate frameworks... 171 DuplicateFrameworkMolIDs.append(DuplicateFrameworksCount) 172 DuplicateFrameworkMolIDToMolMap[DuplicateFrameworksCount] = FrameworksMol 173 DuplicateFrameworkMolIDToAtomCountMap[DuplicateFrameworksCount] = HeavyAtomCount 174 else: 175 # Write it out... 176 DuplicatesWriter.write(FrameworksMol) 177 else: 178 UniqueFrameworksCount += 1 179 CanonicalSMILESMap[CanonicalSMILES] = CanonicalSMILES 180 if SortFrameworks: 181 # Track unique frameworks... 182 FrameworkMolIDs.append(UniqueFrameworksCount) 183 FrameworkMolIDToMolMap[UniqueFrameworksCount] = FrameworksMol 184 FrameworkMolIDToAtomCountMap[UniqueFrameworksCount] = HeavyAtomCount 185 else: 186 # Write it out... 187 Writer.write(FrameworksMol) 188 elif SortFrameworks: 189 # Track for sorting... 190 FrameworkMolIDs.append(FrameworksCount) 191 FrameworkMolIDToMolMap[FrameworksCount] = FrameworksMol 192 FrameworkMolIDToAtomCountMap[FrameworksCount] = HeavyAtomCount 193 else: 194 # Write it out... 195 Writer.write(FrameworksMol) 196 197 if SortFrameworks: 198 ReverseOrder = OptionsInfo["DescendingSortOrder"] 199 SortAndWriteFrameworks(Writer, FrameworkMolIDs, FrameworkMolIDToMolMap, FrameworkMolIDToAtomCountMap, ReverseOrder) 200 if RemoveDuplicateFrameworks: 201 SortAndWriteFrameworks(DuplicatesWriter, DuplicateFrameworkMolIDs, DuplicateFrameworkMolIDToMolMap, DuplicateFrameworkMolIDToAtomCountMap, ReverseOrder) 202 203 Writer.close() 204 if RemoveDuplicateFrameworks: 205 DuplicatesWriter.close() 206 207 MiscUtil.PrintInfo("\nTotal number of molecular frameworks: %d" % FrameworksCount) 208 if RemoveDuplicateFrameworks: 209 MiscUtil.PrintInfo("Number of unique molecular frameworks: %d" % UniqueFrameworksCount) 210 MiscUtil.PrintInfo("Number of duplicate molecular frameworks: %d" % DuplicateFrameworksCount) 211 212 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 213 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 214 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount)) 215 216 def SortAndWriteFrameworks(MolWriter, MolIDs, MolIDToMolMap, MolIDToAtomCountMap, ReverseOrder): 217 """Sort frameworks and write them out.""" 218 SortedMolIDs = sorted(MolIDs, key = lambda MolID: MolIDToAtomCountMap[MolID], reverse = ReverseOrder) 219 for MolID in SortedMolIDs: 220 FrameworksMol = MolIDToMolMap[MolID] 221 MolWriter.write(FrameworksMol) 222 223 def ProcessOptions(): 224 """Process and validate command line arguments and options.""" 225 226 MiscUtil.PrintInfo("Processing options...") 227 228 # Validate options... 229 ValidateOptions() 230 231 OptionsInfo["Infile"] = Options["--infile"] 232 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 233 234 OptionsInfo["Outfile"] = Options["--outfile"] 235 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 236 237 OptionsInfo["Overwrite"] = Options["--overwrite"] 238 239 OptionsInfo["Mode"] = Options["--mode"] 240 OptionsInfo["UseGraphFrameworks"] = False 241 if re.match("^GraphFrameworks$", OptionsInfo["Mode"], re.I): 242 OptionsInfo["UseGraphFrameworks"] = True 243 244 OptionsInfo["RemoveDuplicates"] = Options["--removeDuplicates"] 245 OptionsInfo["RemoveDuplicateFrameworks"] = False 246 if re.match("^Yes$", OptionsInfo["RemoveDuplicates"], re.I): 247 OptionsInfo["RemoveDuplicateFrameworks"] = True 248 249 # Setup outfile for writing out duplicates... 250 OptionsInfo["DuplicatesOutfile"] = "" 251 if OptionsInfo["RemoveDuplicateFrameworks"]: 252 FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"]) 253 OptionsInfo["DuplicatesOutfile"] = "%sDuplicates.%s" % (FileName, FileExt) 254 255 OptionsInfo["Sort"] = Options["--sort"] 256 OptionsInfo["SortFrameworks"] = False 257 if re.match("^Yes$", OptionsInfo["Sort"], re.I): 258 OptionsInfo["SortFrameworks"] = True 259 260 OptionsInfo["SortOrder"] = Options["--sortOrder"] 261 OptionsInfo["DescendingSortOrder"] = False 262 if re.match("^Descending$", OptionsInfo["SortOrder"], re.I): 263 OptionsInfo["DescendingSortOrder"] = True 264 265 OptionsInfo["UseChirality"] = False 266 if re.match("^yes$", Options["--useChirality"], re.I): 267 OptionsInfo["UseChirality"] = True 268 269 def RetrieveOptions(): 270 """Retrieve command line arguments and options.""" 271 272 # Get options... 273 global Options 274 Options = docopt(_docoptUsage_) 275 276 # Set current working directory to the specified directory... 277 WorkingDir = Options["--workingdir"] 278 if WorkingDir: 279 os.chdir(WorkingDir) 280 281 # Handle examples option... 282 if "--examples" in Options and Options["--examples"]: 283 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 284 sys.exit(0) 285 286 def ValidateOptions(): 287 """Validate option values.""" 288 289 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 290 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv") 291 292 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi") 293 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 294 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 295 296 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "GraphFrameworks AtomicFrameworks") 297 298 MiscUtil.ValidateOptionTextValue("-r, --removeDuplicates", Options["--removeDuplicates"], "yes no") 299 MiscUtil.ValidateOptionTextValue("-s, --sort", Options["--sort"], "yes no") 300 MiscUtil.ValidateOptionTextValue("--sortOrder", Options["--sortOrder"], "ascending descending") 301 302 MiscUtil.ValidateOptionTextValue("--useChirality", Options["--useChirality"], "yes no") 303 304 # Setup a usage string for docopt... 305 _docoptUsage_ = """ 306 RDKitGenerateMolecularFrameworks.py - Generate Bemis Murcko molecular frameworks 307 308 Usage: 309 RDKitGenerateMolecularFrameworks.py [--infileParams <Name,Value,...>] 310 [--mode <GraphFrameworks or AtomicFrameworks> ] 311 [ --outfileParams <Name,Value,...> ] [--overwrite] [--removeDuplicates <yes or no>] 312 [--sort <yes or no>] [--sortOrder <ascending or descending>] 313 [--useChirality <yes or no>] [-w <dir>] -i <infile> -o <outfile> 314 RDKitGenerateMolecularFrameworks.py -h | --help | -e | --examples 315 316 Description: 317 Generate Bemis Murcko [ Ref 133 ] molecular frameworks for molecules. Two types of molecular 318 frameworks can be generated: Graph or atomic frameworks. The graph molecular framework 319 is a generic framework. The atom type, hybridization, and bond order is ignore during its 320 generation. All atoms are set to carbon atoms and all bonds are single bonds. The atom type, 321 hybridization, and bond order is preserved during generation of atomic molecular frameworks. 322 323 The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt) 324 325 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi) 326 327 Options: 328 -e, --examples 329 Print examples. 330 -h, --help 331 Print this help message. 332 -i, --infile <infile> 333 Input file name. 334 --infileParams <Name,Value,...> [default: auto] 335 A comma delimited list of parameter name and value pairs for reading 336 molecules from files. The supported parameter names for different file 337 formats, along with their default values, are shown below: 338 339 SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes 340 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 341 smilesTitleLine,auto,sanitize,yes 342 343 Possible values for smilesDelimiter: space, comma or tab. 344 -m, --mode <GraphFrameworks or AtomicFrameworks> [default: GraphFrameworks] 345 Type of molecular frameworks to generate for molecules. Possible values: GraphFrameworks 346 or AtomicFrameworks. The graph molecular framework is a generic framework. The atom type, 347 hybridization, and bond order is ignore during its generation. All atoms are set to carbon atoms 348 and all bonds are single bonds. The atom type, hybridization, and bond order is preserved 349 during the generation of atomic molecular frameworks. 350 -o, --outfile <outfile> 351 Output file name. 352 --outfileParams <Name,Value,...> [default: auto] 353 A comma delimited list of parameter name and value pairs for writing 354 molecules to files. The supported parameter names for different file 355 formats, along with their default values, are shown below: 356 357 SD: compute2DCoords,auto,forceV3000,no 358 SMILES: smilesDelimiter,space,smilesTitleLine,yes 359 360 Default value for compute2DCoords: yes for SMILES input file; no for all other 361 file types. 362 --overwrite 363 Overwrite existing files. 364 -r, --removeDuplicates <yes or no> [default: no] 365 Remove duplicate molecular frameworks. Possible values: yes or no. The duplicate 366 molecular franworks are identified using canonical SMILES. The removed frameworks 367 are written to a separate output file. 368 -s, --sort <yes or no> [default: no] 369 Sort molecular frameworks by heavy atom count. Possible values: yes or no. 370 --sortOrder <ascending or descending> [default: ascending] 371 Sorting order for molecular frameworks. Possible values: ascending or descending. 372 -u, --useChirality <yes or no> [default: yes] 373 Use stereochemistry for generation of canonical SMILES strings to identify 374 duplicate molecular frameworks. 375 -w, --workingdir <dir> 376 Location of working directory which defaults to the current directory. 377 378 Examples: 379 To generate graph molecular framworks for molecules and write out a SMILES file, 380 type: 381 382 % RDKitGenerateMolecularFrameworks.py -i Sample.smi -o SampleOut.smi 383 384 To generate graph molecular framworks, remove duplicate frameworks for molecules 385 and write out SD files for unique and duplicate frameworks, type: 386 387 % RDKitGenerateMolecularFrameworks.py -m GraphFrameworks -r yes 388 -i Sample.sdf -o SampleOut.sdf 389 390 To generate atomic molecular framworks, remove duplicate frameworks, sort 391 framworks by heavy atom count in ascending order, write out SMILES files for 392 unique and duplicate frameworks, type: 393 394 % RDKitGenerateMolecularFrameworks.py -m AtomicFrameworks -r yes 395 -s yes -i Sample.smi -o SampleOut.smi 396 397 To generate graph molecular framworks for molecules in a CSV SMILES file, 398 SMILES strings in column 1, name in olumn 2, emove duplicate frameworks, 399 sort framworks by heavy atom count in decending order and write out a SD 400 file, type: 401 402 % RDKitGenerateMolecularFrameworks.py -m AtomicFrameworks 403 --removeDuplicates yes -s yes --sortOrder descending --infileParams 404 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 405 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 406 -i SampleSMILES.csv -o SampleOut.sdf 407 408 Author: 409 Manish Sud(msud@san.rr.com) 410 411 See also: 412 RDKitConvertFileFormat.py, RDKitDrawMolecules.py, RDKitSearchFunctionalGroups.py, 413 RDKitSearchSMARTS.py 414 415 Copyright: 416 Copyright (C) 2024 Manish Sud. All rights reserved. 417 418 The functionality available in this script is implemented using RDKit, an 419 open source toolkit for cheminformatics developed by Greg Landrum. 420 421 This file is part of MayaChemTools. 422 423 MayaChemTools is free software; you can redistribute it and/or modify it under 424 the terms of the GNU Lesser General Public License as published by the Free 425 Software Foundation; either version 3 of the License, or (at your option) any 426 later version. 427 428 """ 429 430 if __name__ == "__main__": 431 main()