MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitCalculateRMSD.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem, rdMolAlign
  42 except ImportError as ErrMsg:
  43     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  44     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  45     sys.exit(1)
  46 
  47 # MayaChemTools imports...
  48 try:
  49     from docopt import docopt
  50     import MiscUtil
  51     import RDKitUtil
  52 except ImportError as ErrMsg:
  53     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  54     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  55     sys.exit(1)
  56 
  57 ScriptName = os.path.basename(sys.argv[0])
  58 Options = {}
  59 OptionsInfo = {}
  60 
  61 def main():
  62     """Start execution of the script."""
  63     
  64     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  65     
  66     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  67     
  68     # Retrieve command line arguments and options...
  69     RetrieveOptions()
  70     
  71     # Process and validate command line arguments and options...
  72     ProcessOptions()
  73     
  74     # Perform actions required by the script...
  75     CalculateRMSD()
  76     
  77     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  78     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  79 
  80 def CalculateRMSD():
  81     """Calculate RMSD values."""
  82     
  83     if not re.match("^(OneToOne|AllToAll|FirstToAll)$", OptionsInfo["Mode"], re.I):
  84         MiscUtil.PrintError("RMSD couldn't be calculated: Specified mode, %s, is not supported" % OptionsInfo["Mode"])
  85         
  86     RefFile = OptionsInfo["RefFile"]
  87     ProbeFile = OptionsInfo["ProbeFile"]
  88     
  89     Outfile = OptionsInfo["Outfile"]
  90     OutDelim = OptionsInfo["OutDelim"]
  91 
  92     # Read reference and probe molecules...
  93     OptionsInfo["InfileParams"]["AllowEmptyMols"] = False
  94     
  95     MiscUtil.PrintInfo("\nProcessing file %s..." % (RefFile))
  96     ValidRefMols, RefMolCount, ValidRefMolCount  = RDKitUtil.ReadAndValidateMolecules(RefFile, **OptionsInfo["InfileParams"])
  97     
  98     MiscUtil.PrintInfo("Processing file %s..." % (ProbeFile))
  99     ValidProbeMols, ProbeMolCount, ValidProbeMolCount  = RDKitUtil.ReadAndValidateMolecules(ProbeFile, **OptionsInfo["InfileParams"])
 100 
 101     # Set up output file...
 102     MiscUtil.PrintInfo("\nGenerating file %s...\n" % Outfile)
 103     OutFH = open(Outfile, "w")
 104     if OutFH is None:
 105         MiscUtil.PrintError("Couldn't open output file: %s.\n" % (Outfile))
 106     
 107     Line = "RefMolID%sProbeMolID%sRMSD\n" % (OutDelim, OutDelim)
 108     OutFH.write(Line)
 109     
 110     if re.match("^OneToOne$", OptionsInfo["Mode"], re.I):
 111         CalculateOneToOneRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim)
 112     elif re.match("^AllToAll$", OptionsInfo["Mode"], re.I):
 113         CalculateAllToAllRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim)
 114     elif re.match("^FirstToAll$", OptionsInfo["Mode"], re.I):
 115         CalculateFirstToAllRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim)
 116     else:
 117         MiscUtil.PrintError("RMSD couldn't be calculated: Specified mode, %s, is not supported" % OptionsInfo["Mode"])
 118 
 119     OutFH.close()
 120     
 121     MiscUtil.PrintInfo("\nTotal number of molecules: Reference - %d; Probe - %d" % (RefMolCount, ProbeMolCount))
 122     MiscUtil.PrintInfo("Number of valid molecules: Reference - %d; Probe - %d" % (ValidRefMolCount, ValidProbeMolCount))
 123     MiscUtil.PrintInfo("Number of ignored molecules:  Reference - %d; Probe - %d" % ((RefMolCount - ValidRefMolCount), (ProbeMolCount - ValidProbeMolCount)))
 124     
 125 def CalculateOneToOneRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim):
 126     """Calculate pairwise RMSD values."""
 127     
 128     ValidRefMolCount = len(ValidRefMols)
 129     ValidProbeMolCount = len(ValidProbeMols)
 130     
 131     MolCount = ValidRefMolCount
 132     if ValidRefMolCount > ValidProbeMolCount:
 133         MolCount = ValidProbeMolCount
 134     
 135     if ValidRefMolCount != ValidProbeMolCount:
 136         MiscUtil.PrintWarning("Number of valid reference molecules, %d,  is not equal to number of valid probe molecules, %d .\n" % (ValidRefMolCount, ValidProbeMolCount))
 137         MiscUtil.PrintWarning("Pairwise RMSD will be calculated only for first %s molecules.\n" % (MolCount))
 138 
 139     # Process molecules...
 140     for MolIndex in range(0, MolCount):
 141         RefMol = ValidRefMols[MolIndex]
 142         ProbeMol = ValidProbeMols[MolIndex]
 143 
 144         RefMolName = RDKitUtil.GetMolName(RefMol, (MolIndex + 1))
 145         ProbeMolName = RDKitUtil.GetMolName(ProbeMol, (MolIndex + 1))
 146 
 147         RMSD = CalculateRMSDValue(RefMol, ProbeMol)
 148         
 149         Line = "%s%s%s%s%s\n" % (RefMolName, OutDelim, ProbeMolName, OutDelim, RMSD)
 150         OutFH.write(Line)
 151         
 152 def CalculateAllToAllRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim):
 153     """Calculate RMSD values between all pairs of molecules."""
 154     
 155     # Process molecules...
 156     RefMolCount = 0
 157     for RefMol in ValidRefMols:
 158         RefMolCount += 1
 159         RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount)
 160 
 161         ProbeMolCount = 0
 162         for ProbeMol in ValidProbeMols:
 163             ProbeMolCount += 1
 164             ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount)
 165             
 166             RMSD = CalculateRMSDValue(RefMol, ProbeMol)
 167 
 168             Line = "%s%s%s%s%s\n" % (RefMolName, OutDelim, ProbeMolName, OutDelim, RMSD)
 169             OutFH.write(Line)
 170         
 171 def CalculateFirstToAllRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim):
 172     """Calculate RMSD values between first reference molecues and all probe molecules."""
 173     
 174     # Process molecules...
 175     RefMol = ValidRefMols[0]
 176     RefMolCount = 1
 177     RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount)
 178 
 179     ProbeMolCount = 0
 180     for ProbeMol in ValidProbeMols:
 181         ProbeMolCount += 1
 182         ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount)
 183             
 184         RMSD = CalculateRMSDValue(RefMol, ProbeMol)
 185 
 186         Line = "%s%s%s%s%s\n" % (RefMolName, OutDelim, ProbeMolName, OutDelim, RMSD)
 187         OutFH.write(Line)
 188         
 189 def CalculateRMSDValue(RefMol, ProbeMol):
 190     """Calculate RMSD value for a pair of molecules and return it as a string."""
 191 
 192     try:
 193         if OptionsInfo["UseBestRMSD"]:
 194             RMSD = AllChem.GetBestRMS(ProbeMol, RefMol)
 195         else:
 196             RMSD = rdMolAlign.AlignMol(ProbeMol, RefMol, maxIters = OptionsInfo["MaxIters"])
 197         RMSD = "%.2f" % RMSD
 198     except (RuntimeError, ValueError):
 199         RMSD = "None"
 200         
 201     return RMSD
 202 
 203 def ProcessOptions():
 204     """Process and validate command line arguments and options."""
 205     
 206     MiscUtil.PrintInfo("Processing options...")
 207     
 208     # Validate options...
 209     ValidateOptions()
 210     
 211     OptionsInfo["CalcRMSD"] = Options["--calcRMSD"]
 212     OptionsInfo["UseBestRMSD"] = False
 213     if re.match("^BestRMSD$", OptionsInfo["CalcRMSD"], re.I):
 214         OptionsInfo["UseBestRMSD"] = True
 215     
 216     OptionsInfo["MaxIters"] = int(Options["--maxIters"])
 217     
 218     OptionsInfo["Mode"] = Options["--mode"]
 219     
 220     OptionsInfo["RefFile"] = Options["--reffile"]
 221     OptionsInfo["ProbeFile"] = Options["--probefile"]
 222 
 223     # No need for any RDKit specific --outfileParams....
 224     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"])
 225     
 226     OptionsInfo["Outfile"] = Options["--outfile"]
 227     
 228     OptionsInfo["Overwrite"] = Options["--overwrite"]
 229     
 230     OptionsInfo["OutDelim"] = " "
 231     if MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "csv"):
 232         OptionsInfo["OutDelim"] = ","
 233     elif MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "tsv txt"):
 234         OptionsInfo["OutDelim"] = "\t"
 235     else:
 236         MiscUtil.PrintError("The file name specified , %s, for option \"--outfile\" is not valid. Supported file formats: csv tsv txt\n" % (OptionsInfo["Outfile"]))
 237 
 238 def RetrieveOptions():
 239     """Retrieve command line arguments and options."""
 240     
 241     # Get options...
 242     global Options
 243     Options = docopt(_docoptUsage_)
 244     
 245     # Set current working directory to the specified directory...
 246     WorkingDir = Options["--workingdir"]
 247     if WorkingDir:
 248         os.chdir(WorkingDir)
 249     
 250     # Handle examples option...
 251     if "--examples" in Options and Options["--examples"]:
 252         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 253         sys.exit(0)
 254 
 255 def ValidateOptions():
 256     """Validate option values."""
 257     
 258     MiscUtil.ValidateOptionTextValue("--calcRMSD", Options["--calcRMSD"], "RMSD BestRMSD")
 259     
 260     MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0})
 261     MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "OneToOne  AllToAll FirstToAll")
 262     
 263     MiscUtil.ValidateOptionFilePath("-r, --reffile", Options["--reffile"])
 264     MiscUtil.ValidateOptionFileExt("-r, --reffile", Options["--reffile"], "sdf sd mol")
 265     
 266     MiscUtil.ValidateOptionFilePath("-p, --probefile", Options["--probefile"])
 267     MiscUtil.ValidateOptionFileExt("-p, --probefile", Options["--probefile"], "sdf sd mol")
 268         
 269     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "csv tsv txt")
 270     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 271     MiscUtil.ValidateOptionsDistinctFileNames("-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"])
 272     MiscUtil.ValidateOptionsDistinctFileNames("-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"])
 273     
 274 # Setup a usage string for docopt...
 275 _docoptUsage_ = """
 276 RDKitCalculateRMSD.py - Calculate RMSD between molecules
 277 
 278 Usage:
 279     RDKitCalculateRMSD.py [--calcRMSD <RMSD, BestRMSD>] [--infileParams <Name,Value,...>]
 280                           [--maxIters <number>] [--mode <OneToOne, AllToAll, FirstToAll>]
 281                           [--overwrite] [-w <dir>] -r <reffile> -p <probefile> -o <outfile> 
 282     RDKitCalculateRMSD.py -h | --help | -e | --examples
 283 
 284 Description:
 285     Calculate Root Mean Square Distance (RMSD) between a set of similar molecules in
 286     reference and probe input files. The RDKit function fails to calculate RMSD values for
 287     dissimilar molecules. Consequently, a text string 'None' is written out as a RMSD value
 288     for dissimilar molecule pairs.
 289 
 290     The supported input file formats are: Mol (.mol), SD (.sdf, .sd)
 291 
 292     The supported output file formats are:  CSV (.csv), TSV (.tsv, .txt)
 293 
 294 Options:
 295     -c, --calcRMSD <RMSD, BestRMSD>  [default: RMSD]
 296         Methodology for calculating RMSD values. Possible values: RMSD, BestRMSD.
 297         During BestRMSMode mode, the RDKit 'function AllChem.GetBestRMS' is used to
 298         align and calculate RMSD. This function calculates optimal RMSD for aligning two
 299         molecules, taking symmetry into account. Otherwise, the RMSD value is calculated
 300         using 'AllChem.AlignMol function' without changing the atom order. A word to the
 301         wise from RDKit documentation: The AllChem.GetBestRMS function will attempt to
 302         align all permutations of matching atom orders in both molecules, for some molecules
 303         it will lead to 'combinatorial explosion'.
 304     --infileParams <Name,Value,...>  [default: auto]
 305         A comma delimited list of parameter name and value pairs for reading
 306         molecules from files. The supported parameter names for different file
 307         formats, along with their default values, are shown below:
 308             
 309             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 310             
 311     --maxIters <number>  [default: 50]
 312         Maximum number of iterations to perform for each molecule pair during minimization
 313         of RMSD values. This option is ignored during BestRMSD mode.
 314     -m, --mode <OneToOne, AllToAll, FirstToAll>  [default: OneToOne]
 315         Specify how molecules are handled in reference and probe input files during
 316         calculation of RMSD between reference and probe molecules.  Possible values:
 317         OneToOne, AllToAll and AllToFirst. For OneToOne mode, the number of molecules
 318         in reference file must be equal to the number of molecules in probe file. The RMSD
 319         is calculated for each pair of molecules in the reference and probe file and written
 320         to the output file. For AllToAll mode, the RMSD is calculated for each reference
 321         molecule against all probe molecules. For FirstToAll mode, however, the RMSD
 322         is only calculated for the first reference molecule against all probe molecules.
 323     -e, --examples
 324         Print examples.
 325     -h, --help
 326         Print this help message.
 327     -p, --probefile <probefile>
 328         Probe input file name.
 329     -r, --reffile <reffile>
 330         Reference input file name.
 331     -o, --outfile <outfile>
 332         Output file name for writing out RMSD values. Supported text file extensions: csv or tsv.
 333     --overwrite
 334         Overwrite existing files.
 335     -w, --workingdir <dir>
 336         Location of working directory which defaults to the current directory.
 337 
 338 Examples:
 339     To calculate RMSD between pair of molecules in reference and probe input
 340     3D SD files and write out a CSV file containing calculated RMSD values along with
 341     appropriate molecule IDs, type:
 342 
 343         % RDKitCalculateRMSD.py  -r Sample3DRef.sdf -p Sample3DProb.sdf
 344           -o SampleOut.csv
 345 
 346     To calculate RMSD between all molecules in reference and probe input
 347     3D SD files and write out a CSV file containing calculated RMSD values along with
 348     appropriate molecule IDs, type:
 349 
 350         % RDKitCalculateRMSD.py  -m AllToAll -r Sample3DRef.sdf -p
 351           Sample3DProb.sdf -o SampleOut.csv
 352 
 353     To calculate best RMSD between first  molecule in reference all probe molecules
 354     in 3D SD files and write out a TSV file containing calculated RMSD values along with
 355     appropriate molecule IDs, type:
 356 
 357         % RDKitCalculateRMSD.py  -m FirstToAll --calcRMSD BestRMSD -r
 358           Sample3DRef.sdf -p Sample3DProb.sdf -o SampleOut.tsv
 359 
 360     To calculate RMSD between all molecules in reference and probe molecules input
 361     3D SD files without removing hydrogens and write out a TSV file containing
 362     calculated RMSD values along with appropriate molecule IDs, type:
 363 
 364         % RDKitCalculateRMSD.py  -m AllToAll --infileParams
 365           "removeHydrogens,no" -r Sample3DRef.sdf  -p Sample3DProb.sdf
 366           -o SampleOut.tsv
 367 
 368 Author:
 369     Manish Sud(msud@san.rr.com)
 370 
 371 See also:
 372     RDKitCalculateMolecularDescriptors.py, RDKitCompareMoleculeShapes.py, RDKitConvertFileFormat.py,
 373     RDKitGenerateConformers.py, RDKitPerformMinimization.py
 374 
 375 Copyright:
 376     Copyright (C) 2024 Manish Sud. All rights reserved.
 377 
 378     The functionality available in this script is implemented using RDKit, an
 379     open source toolkit for cheminformatics developed by Greg Landrum.
 380 
 381     This file is part of MayaChemTools.
 382 
 383     MayaChemTools is free software; you can redistribute it and/or modify it under
 384     the terms of the GNU Lesser General Public License as published by the Free
 385     Software Foundation; either version 3 of the License, or (at your option) any
 386     later version.
 387 
 388 """
 389 
 390 if __name__ == "__main__":
 391     main()