MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitGenerateMolecularFrameworks.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 import csv
  37 
  38 # RDKit imports...
  39 try:
  40     from rdkit import rdBase
  41     from rdkit import Chem
  42     from rdkit.Chem import AllChem
  43     from rdkit.Chem.Scaffolds import MurckoScaffold
  44 except ImportError as ErrMsg:
  45     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  46     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  47     sys.exit(1)
  48 
  49 # MayaChemTools imports...
  50 try:
  51     from docopt import docopt
  52     import MiscUtil
  53     import RDKitUtil
  54 except ImportError as ErrMsg:
  55     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  56     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  57     sys.exit(1)
  58 
  59 ScriptName = os.path.basename(sys.argv[0])
  60 Options = {}
  61 OptionsInfo = {}
  62 
  63 def main():
  64     """Start execution of the script."""
  65     
  66     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  67     
  68     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  69     
  70     # Retrieve command line arguments and options...
  71     RetrieveOptions()
  72     
  73     # Process and validate command line arguments and options...
  74     ProcessOptions()
  75     
  76     # Perform actions required by the script...
  77     GenerateMolecularFrameworks()
  78     
  79     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  80     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  81 
  82 def GenerateMolecularFrameworks():
  83     """Generate Bemis Murcko molecular framworks."""
  84     
  85     Infile = OptionsInfo["Infile"]
  86     Outfile = OptionsInfo["Outfile"]
  87 
  88     UseChirality = OptionsInfo["UseChirality"]
  89 
  90     RemoveDuplicateFrameworks = OptionsInfo["RemoveDuplicateFrameworks"]
  91     UseGraphFrameworks = OptionsInfo["UseGraphFrameworks"]
  92     
  93     SortFrameworks = OptionsInfo["SortFrameworks"]
  94     if SortFrameworks:
  95         FrameworkMolIDs = []
  96         FrameworkMolIDToMolMap = {}
  97         FrameworkMolIDToAtomCountMap = {}
  98         
  99         DuplicateFrameworkMolIDs = []
 100         DuplicateFrameworkMolIDToMolMap = {}
 101         DuplicateFrameworkMolIDToAtomCountMap = {}
 102         
 103     DuplicatesOutfile = ""
 104     if RemoveDuplicateFrameworks:
 105         DuplicatesOutfile = OptionsInfo["DuplicatesOutfile"]
 106 
 107     # Setup a molecule reader...
 108     MiscUtil.PrintInfo("\nProcessing file %s..." % Infile)
 109     Mols  = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"])
 110     
 111     # Set up a molecular framework  writer...
 112     Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 113     if Writer is None:
 114         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 115     
 116     # Set up a duplicate molecular framework writer...    
 117     if RemoveDuplicateFrameworks:
 118         DuplicatesWriter = RDKitUtil.MoleculesWriter(DuplicatesOutfile, **OptionsInfo["OutfileParams"])
 119         if Writer is None:
 120             MiscUtil.PrintError("Failed to setup a writer for duplicates output fie %s " % DuplicatesOutfile)
 121         
 122     if RemoveDuplicateFrameworks:
 123         MiscUtil.PrintInfo("Generating files: %s and %s..." % (Outfile, DuplicatesOutfile))
 124     else:
 125         MiscUtil.PrintInfo("Generating file %s..." % Outfile)
 126 
 127     # Process molecules...
 128     MolCount = 0
 129     ValidMolCount = 0
 130     
 131     FrameworksCount = 0
 132     UniqueFrameworksCount = 0
 133     DuplicateFrameworksCount = 0
 134     
 135     CanonicalSMILESMap = {}
 136     
 137     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 138     
 139     for Mol in Mols:
 140         MolCount += 1
 141         
 142         if Mol is None:
 143             continue
 144         
 145         if RDKitUtil.IsMolEmpty(Mol):
 146             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 147             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 148             continue
 149         
 150         ValidMolCount += 1
 151 
 152         if UseGraphFrameworks:
 153             FrameworksMol = MurckoScaffold.MakeScaffoldGeneric(Mol)
 154         else:
 155             FrameworksMol = MurckoScaffold.GetScaffoldForMol(Mol)
 156 
 157         if Compute2DCoords:
 158             AllChem.Compute2DCoords(FrameworksMol)
 159             
 160         if SortFrameworks:
 161             HeavyAtomCount = FrameworksMol.GetNumHeavyAtoms()
 162 
 163         FrameworksCount += 1
 164         
 165         if RemoveDuplicateFrameworks:
 166             CanonicalSMILES = Chem.MolToSmiles(FrameworksMol, isomericSmiles = UseChirality, canonical = True)
 167             if CanonicalSMILES in CanonicalSMILESMap:
 168                 DuplicateFrameworksCount += 1
 169                 if SortFrameworks:
 170                     # Track duplicate frameworks...
 171                     DuplicateFrameworkMolIDs.append(DuplicateFrameworksCount)
 172                     DuplicateFrameworkMolIDToMolMap[DuplicateFrameworksCount] = FrameworksMol
 173                     DuplicateFrameworkMolIDToAtomCountMap[DuplicateFrameworksCount] = HeavyAtomCount
 174                 else:
 175                     # Write it out...
 176                     DuplicatesWriter.write(FrameworksMol)
 177             else:
 178                 UniqueFrameworksCount += 1
 179                 CanonicalSMILESMap[CanonicalSMILES] = CanonicalSMILES
 180                 if SortFrameworks:
 181                     # Track unique frameworks...
 182                     FrameworkMolIDs.append(UniqueFrameworksCount)
 183                     FrameworkMolIDToMolMap[UniqueFrameworksCount] = FrameworksMol
 184                     FrameworkMolIDToAtomCountMap[UniqueFrameworksCount] = HeavyAtomCount
 185                 else:
 186                     # Write it out...
 187                     Writer.write(FrameworksMol)
 188         elif SortFrameworks:
 189             # Track for sorting...
 190             FrameworkMolIDs.append(FrameworksCount)
 191             FrameworkMolIDToMolMap[FrameworksCount] = FrameworksMol
 192             FrameworkMolIDToAtomCountMap[FrameworksCount] = HeavyAtomCount
 193         else:
 194             # Write it out...
 195             Writer.write(FrameworksMol)
 196             
 197     if SortFrameworks:
 198         ReverseOrder = OptionsInfo["DescendingSortOrder"]
 199         SortAndWriteFrameworks(Writer, FrameworkMolIDs, FrameworkMolIDToMolMap, FrameworkMolIDToAtomCountMap, ReverseOrder)
 200         if RemoveDuplicateFrameworks:
 201             SortAndWriteFrameworks(DuplicatesWriter, DuplicateFrameworkMolIDs, DuplicateFrameworkMolIDToMolMap, DuplicateFrameworkMolIDToAtomCountMap, ReverseOrder)
 202     
 203     Writer.close()
 204     if RemoveDuplicateFrameworks:
 205         DuplicatesWriter.close()
 206 
 207     MiscUtil.PrintInfo("\nTotal number of molecular frameworks: %d" % FrameworksCount)
 208     if RemoveDuplicateFrameworks:
 209         MiscUtil.PrintInfo("Number of unique molecular frameworks: %d" % UniqueFrameworksCount)
 210         MiscUtil.PrintInfo("Number of duplicate molecular frameworks: %d" % DuplicateFrameworksCount)
 211         
 212     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 213     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 214     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 215 
 216 def SortAndWriteFrameworks(MolWriter, MolIDs, MolIDToMolMap, MolIDToAtomCountMap, ReverseOrder):
 217     """Sort frameworks and write them out."""
 218     SortedMolIDs = sorted(MolIDs, key = lambda MolID: MolIDToAtomCountMap[MolID], reverse = ReverseOrder)
 219     for MolID in SortedMolIDs:
 220         FrameworksMol = MolIDToMolMap[MolID]
 221         MolWriter.write(FrameworksMol)
 222 
 223 def ProcessOptions():
 224     """Process and validate command line arguments and options."""
 225     
 226     MiscUtil.PrintInfo("Processing options...")
 227     
 228     # Validate options...
 229     ValidateOptions()
 230     
 231     OptionsInfo["Infile"] = Options["--infile"]
 232     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 233     
 234     OptionsInfo["Outfile"] = Options["--outfile"]
 235     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 236     
 237     OptionsInfo["Overwrite"] = Options["--overwrite"]
 238 
 239     OptionsInfo["Mode"] = Options["--mode"]
 240     OptionsInfo["UseGraphFrameworks"] = False
 241     if re.match("^GraphFrameworks$", OptionsInfo["Mode"], re.I):
 242         OptionsInfo["UseGraphFrameworks"] = True
 243     
 244     OptionsInfo["RemoveDuplicates"] = Options["--removeDuplicates"]
 245     OptionsInfo["RemoveDuplicateFrameworks"] = False
 246     if re.match("^Yes$", OptionsInfo["RemoveDuplicates"], re.I):
 247         OptionsInfo["RemoveDuplicateFrameworks"] = True
 248         
 249     # Setup outfile for writing out duplicates...
 250     OptionsInfo["DuplicatesOutfile"] = ""
 251     if OptionsInfo["RemoveDuplicateFrameworks"]:
 252         FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"])
 253         OptionsInfo["DuplicatesOutfile"] = "%sDuplicates.%s" % (FileName, FileExt)
 254 
 255     OptionsInfo["Sort"] = Options["--sort"]
 256     OptionsInfo["SortFrameworks"] = False
 257     if re.match("^Yes$", OptionsInfo["Sort"], re.I):
 258         OptionsInfo["SortFrameworks"] = True
 259     
 260     OptionsInfo["SortOrder"] = Options["--sortOrder"]
 261     OptionsInfo["DescendingSortOrder"] = False
 262     if re.match("^Descending$", OptionsInfo["SortOrder"], re.I):
 263         OptionsInfo["DescendingSortOrder"] = True
 264     
 265     OptionsInfo["UseChirality"] = False
 266     if re.match("^yes$", Options["--useChirality"], re.I):
 267         OptionsInfo["UseChirality"] = True
 268 
 269 def RetrieveOptions():
 270     """Retrieve command line arguments and options."""
 271     
 272     # Get options...
 273     global Options
 274     Options = docopt(_docoptUsage_)
 275     
 276     # Set current working directory to the specified directory...
 277     WorkingDir = Options["--workingdir"]
 278     if WorkingDir:
 279         os.chdir(WorkingDir)
 280     
 281     # Handle examples option...
 282     if "--examples" in Options and Options["--examples"]:
 283         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 284         sys.exit(0)
 285 
 286 def ValidateOptions():
 287     """Validate option values."""
 288     
 289     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 290     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 291     
 292     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 293     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 294     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 295     
 296     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "GraphFrameworks AtomicFrameworks")
 297     
 298     MiscUtil.ValidateOptionTextValue("-r, --removeDuplicates", Options["--removeDuplicates"], "yes no")
 299     MiscUtil.ValidateOptionTextValue("-s, --sort", Options["--sort"], "yes no")
 300     MiscUtil.ValidateOptionTextValue("--sortOrder", Options["--sortOrder"], "ascending descending")
 301 
 302     MiscUtil.ValidateOptionTextValue("--useChirality", Options["--useChirality"], "yes no")
 303     
 304 # Setup a usage string for docopt...
 305 _docoptUsage_ = """
 306 RDKitGenerateMolecularFrameworks.py - Generate Bemis Murcko molecular frameworks
 307 
 308 Usage:
 309     RDKitGenerateMolecularFrameworks.py [--infileParams <Name,Value,...>]
 310                                          [--mode <GraphFrameworks or AtomicFrameworks> ]
 311                                          [ --outfileParams <Name,Value,...> ]  [--overwrite] [--removeDuplicates <yes or no>]
 312                                          [--sort <yes or no>] [--sortOrder <ascending or descending>]
 313                                          [--useChirality <yes or no>] [-w <dir>] -i <infile> -o <outfile>
 314     RDKitGenerateMolecularFrameworks.py -h | --help | -e | --examples
 315 
 316 Description:
 317     Generate Bemis Murcko [ Ref 133 ] molecular frameworks for molecules. Two types of molecular
 318     frameworks can be generated: Graph or atomic frameworks. The graph molecular framework
 319     is a generic framework. The atom type, hybridization, and bond order is ignore during its
 320     generation. All atoms are set to carbon atoms and all bonds are single bonds. The atom type,
 321     hybridization, and bond order is preserved during generation of atomic molecular frameworks.
 322 
 323     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt)
 324 
 325     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 326 
 327 Options:
 328     -e, --examples
 329         Print examples.
 330     -h, --help
 331         Print this help message.
 332     -i, --infile <infile>
 333         Input file name.
 334     --infileParams <Name,Value,...>  [default: auto]
 335         A comma delimited list of parameter name and value pairs for reading
 336         molecules from files. The supported parameter names for different file
 337         formats, along with their default values, are shown below:
 338             
 339             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 340             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 341                 smilesTitleLine,auto,sanitize,yes
 342             
 343         Possible values for smilesDelimiter: space, comma or tab.
 344     -m, --mode <GraphFrameworks or AtomicFrameworks>  [default: GraphFrameworks]
 345         Type of molecular frameworks to generate for molecules. Possible values: GraphFrameworks
 346          or AtomicFrameworks. The graph molecular framework is a generic framework. The atom type,
 347          hybridization, and bond order is ignore during its generation. All atoms are set to carbon atoms
 348          and all bonds are single bonds. The atom type, hybridization, and bond order is preserved
 349          during the generation of atomic molecular frameworks.
 350     -o, --outfile <outfile>
 351         Output file name.
 352     --outfileParams <Name,Value,...>  [default: auto]
 353         A comma delimited list of parameter name and value pairs for writing
 354         molecules to files. The supported parameter names for different file
 355         formats, along with their default values, are shown below:
 356             
 357             SD: compute2DCoords,auto,forceV3000,no
 358             SMILES: smilesDelimiter,space,smilesTitleLine,yes
 359             
 360         Default value for compute2DCoords: yes for SMILES input file; no for all other
 361         file types.
 362     --overwrite
 363         Overwrite existing files.
 364     -r, --removeDuplicates <yes or no>  [default: no]
 365         Remove duplicate molecular frameworks. Possible values: yes or no. The duplicate
 366         molecular franworks are identified using canonical SMILES. The removed frameworks
 367         are written to a separate output file.
 368     -s, --sort <yes or no>  [default: no]
 369         Sort molecular frameworks by heavy atom count. Possible values: yes or no.
 370     --sortOrder <ascending or descending>  [default: ascending]
 371         Sorting order for molecular frameworks. Possible values: ascending or descending.
 372     -u, --useChirality <yes or no>  [default: yes]
 373         Use stereochemistry for generation of canonical SMILES strings to identify
 374         duplicate molecular frameworks.
 375     -w, --workingdir <dir>
 376         Location of working directory which defaults to the current directory.
 377 
 378 Examples:
 379     To generate graph molecular framworks for molecules and write out a SMILES file,
 380     type:
 381 
 382         % RDKitGenerateMolecularFrameworks.py -i Sample.smi -o SampleOut.smi
 383 
 384     To generate graph molecular framworks, remove duplicate frameworks for molecules
 385     and write out SD files for unique and duplicate frameworks, type:
 386 
 387         % RDKitGenerateMolecularFrameworks.py -m GraphFrameworks -r yes
 388           -i Sample.sdf -o SampleOut.sdf
 389 
 390     To generate atomic molecular framworks, remove duplicate frameworks, sort
 391     framworks by heavy atom count in ascending order, write out SMILES files for
 392     unique and duplicate frameworks, type:
 393 
 394         % RDKitGenerateMolecularFrameworks.py -m AtomicFrameworks -r yes
 395           -s yes -i Sample.smi -o SampleOut.smi
 396 
 397     To generate graph molecular framworks for molecules in a CSV SMILES file,
 398     SMILES strings in column 1, name in olumn 2, emove duplicate frameworks,
 399     sort framworks by heavy atom count in decending order and write out a SD
 400     file, type:
 401 
 402         % RDKitGenerateMolecularFrameworks.py -m AtomicFrameworks
 403           --removeDuplicates yes -s yes --sortOrder descending --infileParams
 404           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 405           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 406           -i SampleSMILES.csv -o SampleOut.sdf
 407 
 408 Author:
 409     Manish Sud(msud@san.rr.com)
 410 
 411 See also:
 412     RDKitConvertFileFormat.py, RDKitDrawMolecules.py, RDKitSearchFunctionalGroups.py,
 413     RDKitSearchSMARTS.py
 414 
 415 Copyright:
 416     Copyright (C) 2024 Manish Sud. All rights reserved.
 417 
 418     The functionality available in this script is implemented using RDKit, an
 419     open source toolkit for cheminformatics developed by Greg Landrum.
 420 
 421     This file is part of MayaChemTools.
 422 
 423     MayaChemTools is free software; you can redistribute it and/or modify it under
 424     the terms of the GNU Lesser General Public License as published by the Free
 425     Software Foundation; either version 3 of the License, or (at your option) any
 426     later version.
 427 
 428 """
 429 
 430 if __name__ == "__main__":
 431     main()