MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitCompareMoleculeShapes.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem, rdMolAlign, rdShapeHelpers
  42 except ImportError as ErrMsg:
  43     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  44     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  45     sys.exit(1)
  46 
  47 # MayaChemTools imports...
  48 try:
  49     from docopt import docopt
  50     import MiscUtil
  51     import RDKitUtil
  52 except ImportError as ErrMsg:
  53     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  54     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  55     sys.exit(1)
  56 
  57 ScriptName = os.path.basename(sys.argv[0])
  58 Options = {}
  59 OptionsInfo = {}
  60 
  61 def main():
  62     """Start execution of the script"""
  63     
  64     MiscUtil.PrintInfo("\n%s (RDK v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, time.asctime()))
  65     
  66     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  67     
  68     # Retrieve command line arguments and options...
  69     RetrieveOptions()
  70     
  71     # Process and validate command line arguments and options...
  72     ProcessOptions()
  73     
  74     # Perform actions required by the script...
  75     CompareMoleculeShapes()
  76     
  77     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  78     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  79 
  80 def CompareMoleculeShapes():
  81     """Compare shape of molecules."""
  82     
  83     if not re.match("^(OneToOne|AllToAll|FirstToAll)$", OptionsInfo["Mode"], re.I):
  84         MiscUtil.PrintError("Shape comparison couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"])
  85         
  86     if not re.match("^(Open3A|CrippenOpen3A)$", OptionsInfo["Alignment"], re.I):
  87         MiscUtil.PrintError("Shape couldn't be performed: Specified alignment mode, %s, is not supported" % OptionsInfo["Alignment"])
  88         
  89     RefFile = OptionsInfo["RefFile"]
  90     ProbeFile = OptionsInfo["ProbeFile"]
  91     
  92     Outfile = OptionsInfo["Outfile"]
  93     OutDelim = OptionsInfo["OutDelim"]
  94 
  95     # Read reference and probe molecules...
  96     OptionsInfo["InfileParams"]["AllowEmptyMols"] = False
  97     
  98     MiscUtil.PrintInfo("\nProcessing file %s..." % (RefFile))
  99     ValidRefMols, RefMolCount, ValidRefMolCount  = RDKitUtil.ReadAndValidateMolecules(RefFile, **OptionsInfo["InfileParams"])
 100     
 101     MiscUtil.PrintInfo("Processing file %s..." % (ProbeFile))
 102     ValidProbeMols, ProbeMolCount, ValidProbeMolCount  = RDKitUtil.ReadAndValidateMolecules(ProbeFile, **OptionsInfo["InfileParams"])
 103 
 104     # Set up output file...
 105     MiscUtil.PrintInfo("Generating file %s...\n" % Outfile)
 106     OutFH = open(Outfile, "w")
 107     if OutFH is None:
 108         MiscUtil.PrintError("Couldn't open output file: %s.\n" % (Outfile))
 109 
 110     if OptionsInfo["UseCrippenOpen3A"]:
 111         Line = "RefMolID%sProbeMolID%sCrippenOpen3AScore" % (OutDelim, OutDelim)
 112     else:
 113         Line = "RefMolID%sProbeMolID%sOpen3AScore" % (OutDelim, OutDelim)
 114         
 115     if OptionsInfo["CalcTanimotoDistance"]:
 116         Line = "%s%sTanimotoDistance" % (Line, OutDelim)
 117     if OptionsInfo["CalcProtrudeDistance"]:
 118         Line = "%s%sProtrudeDistance" % (Line, OutDelim)
 119     OutFH.write("%s\n" % Line)
 120         
 121     if re.match("^OneToOne$", OptionsInfo["Mode"], re.I):
 122         PerformOneToOneShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim)
 123     elif re.match("^AllToAll$", OptionsInfo["Mode"], re.I):
 124         PerformAllToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim)
 125     elif re.match("^FirstToAll$", OptionsInfo["Mode"], re.I):
 126         PerformFirstToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim)
 127     else:
 128         MiscUtil.PrintError("Shape comaprison couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"])
 129 
 130     OutFH.close()
 131     
 132     MiscUtil.PrintInfo("\nTotal number of molecules: Reference - %d; Probe - %d" % (RefMolCount, ProbeMolCount))
 133     MiscUtil.PrintInfo("Number of valid molecules: Reference - %d; Probe - %d" % (ValidRefMolCount, ValidProbeMolCount))
 134     MiscUtil.PrintInfo("Number of ignored molecules:  Reference - %d; Probe - %d" % ((RefMolCount - ValidRefMolCount), (ProbeMolCount - ValidProbeMolCount)))
 135     
 136 def PerformOneToOneShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim):
 137     """Perform pairwise shape comparison"""
 138     
 139     ValidRefMolCount = len(ValidRefMols)
 140     ValidProbeMolCount = len(ValidProbeMols)
 141     
 142     MolCount = ValidRefMolCount
 143     if ValidRefMolCount > ValidProbeMolCount:
 144         MolCount = ValidProbeMolCount
 145     
 146     if ValidRefMolCount != ValidProbeMolCount:
 147         MiscUtil.PrintWarning("Number of valid reference molecules, %d,  is not equal to number of valid probe molecules, %d .\n" % (ValidRefMolCount, ValidProbeMolCount))
 148         MiscUtil.PrintWarning("Pairwise shape comparison will be performed only for first %s molecules.\n" % (MolCount))
 149 
 150     # Process molecules...
 151     for MolIndex in range(0, MolCount):
 152         RefMol = ValidRefMols[MolIndex]
 153         ProbeMol = ValidProbeMols[MolIndex]
 154 
 155         RefMolName = RDKitUtil.GetMolName(RefMol, (MolIndex + 1))
 156         ProbeMolName = RDKitUtil.GetMolName(ProbeMol, (MolIndex + 1))
 157 
 158         PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim)
 159         
 160 def PerformAllToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim):
 161     """Perform shape comparison between all pairs of molecules."""
 162     
 163     # Process molecules...
 164     RefMolCount = 0
 165     for RefMol in ValidRefMols:
 166         RefMolCount += 1
 167         RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount)
 168 
 169         ProbeMolCount = 0
 170         for ProbeMol in ValidProbeMols:
 171             ProbeMolCount += 1
 172             ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount)
 173             
 174             PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim)
 175         
 176 def PerformFirstToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim):
 177     """Perform shape comparison between first reference molecues and all probe molecules. """
 178     
 179     # Process molecules...
 180     RefMol = ValidRefMols[0]
 181     RefMolCount = 1
 182     RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount)
 183 
 184     ProbeMolCount = 0
 185     for ProbeMol in ValidProbeMols:
 186         ProbeMolCount += 1
 187         ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount)
 188         
 189         PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim)
 190 
 191 def PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim):
 192     """Perform shape comparison and write to output file."""
 193 
 194     AlignmentScore = PerformShapeAlignment(RefMol, ProbeMol)
 195     AlignmentScore = "%.2f" % AlignmentScore
 196     
 197     LineWords = []
 198     LineWords.extend([RefMolName, ProbeMolName, AlignmentScore])
 199     
 200     if OptionsInfo["CalcTanimotoDistance"]:
 201         TanimotoDistance = CalculateTanimotoShapeDistance(RefMol, ProbeMol)
 202         LineWords.append(TanimotoDistance)
 203     
 204     if OptionsInfo["CalcProtrudeDistance"]:
 205         ProtrudeDistance = CalculateProtrudeShapeDistance(RefMol, ProbeMol)
 206         LineWords.append(ProtrudeDistance)
 207     
 208     Line = OutDelim.join(LineWords)
 209     OutFH.write("%s\n" % Line)
 210     
 211 def PerformShapeAlignment(RefMol, ProbeMol):
 212     """Perform shape alignment and return alignment score."""
 213     
 214     if OptionsInfo["UseCrippenOpen3A"]:
 215         CrippenO3A = rdMolAlign.GetCrippenO3A(ProbeMol, RefMol)
 216         Score = CrippenO3A.Align()
 217     else:
 218         O3A = rdMolAlign.GetO3A(ProbeMol, RefMol)
 219         Score = O3A.Align()
 220 
 221     return Score
 222         
 223 def CalculateTanimotoShapeDistance(RefMol, ProbeMol):
 224     """Calculate Tanimoto shape for a pair of already aligned molecules and return it as a string"""
 225 
 226     Distance = rdShapeHelpers.ShapeTanimotoDist(ProbeMol, RefMol)
 227     Distance = "%.2f" % Distance
 228 
 229     return Distance
 230 
 231 def CalculateProtrudeShapeDistance(RefMol, ProbeMol):
 232     """Calculate protrude shape for a pair of already aligned molecules and return it as a string"""
 233 
 234     Distance = rdShapeHelpers.ShapeProtrudeDist(ProbeMol, RefMol)
 235     Distance = "%.2f" % Distance
 236 
 237     return Distance
 238 
 239 def ProcessOptions():
 240     """Process and validate command line arguments and options"""
 241     
 242     MiscUtil.PrintInfo("Processing options...")
 243     
 244     # Validate options...
 245     ValidateOptions()
 246     
 247     OptionsInfo["Alignment"] = Options["--alignment"]
 248     OptionsInfo["UseCrippenOpen3A"] = False
 249     if re.match("^CrippenOpen3A$", OptionsInfo["Alignment"], re.I):
 250         OptionsInfo["UseCrippenOpen3A"] = True
 251     
 252     OptionsInfo["Distance"] = Options["--distance"]
 253     
 254     OptionsInfo["CalcTanimotoDistance"] = False
 255     OptionsInfo["CalcProtrudeDistance"] = False
 256     if re.match("^Tanimoto$", OptionsInfo["Distance"], re.I):
 257         OptionsInfo["CalcTanimotoDistance"] = True
 258     elif re.match("^Protrude$", OptionsInfo["Distance"], re.I):
 259         OptionsInfo["CalcProtrudeDistance"] = True
 260     else:
 261         OptionsInfo["CalcTanimotoDistance"] = True
 262         OptionsInfo["CalcProtrudeDistance"] = True
 263     
 264     OptionsInfo["MaxIters"] = int(Options["--maxIters"])
 265     OptionsInfo["Mode"] = Options["--mode"]
 266     
 267     OptionsInfo["RefFile"] = Options["--reffile"]
 268     OptionsInfo["ProbeFile"] = Options["--probefile"]
 269     
 270     OptionsInfo["Outfile"] = Options["--outfile"]
 271     OptionsInfo["Overwrite"] = Options["--overwrite"]
 272     
 273     # No need for any RDKit specific --outfileParams....
 274     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"])
 275     
 276     OptionsInfo["OutDelim"] = " "
 277     if MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "csv"):
 278         OptionsInfo["OutDelim"] = ","
 279     elif MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "tsv txt"):
 280         OptionsInfo["OutDelim"] = "\t"
 281     else:
 282         MiscUtil.PrintError("The file name specified , %s, for option \"--outfile\" is not valid. Supported file formats: csv tsv txt\n" % (OptionsInfo["Outfile"]))
 283     
 284 def RetrieveOptions():
 285     """Retrieve command line arguments and options"""
 286     
 287     # Get options...
 288     global Options
 289     Options = docopt(_docoptUsage_)
 290 
 291     # Set current working directory to the specified directory...
 292     WorkingDir = Options["--workingdir"]
 293     if WorkingDir:
 294         os.chdir(WorkingDir)
 295     
 296     # Handle examples option...
 297     if "--examples" in Options and Options["--examples"]:
 298         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 299         sys.exit(0)
 300 
 301 def ValidateOptions():
 302     """Validate option values"""
 303     
 304     MiscUtil.ValidateOptionTextValue("Alignment", Options["--alignment"], "Open3A CrippenOpen3A")
 305     MiscUtil.ValidateOptionTextValue("Distance", Options["--distance"], "Tanimoto Protrude Both")
 306     
 307     MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0})
 308     
 309     MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "OneToOne  AllToAll FirstToAll")
 310     
 311     MiscUtil.ValidateOptionFilePath("-r, --reffile", Options["--reffile"])
 312     MiscUtil.ValidateOptionFileExt("-r, --reffile", Options["--reffile"], "sdf sd mol")
 313     
 314     MiscUtil.ValidateOptionFilePath("-p, --probefile", Options["--probefile"])
 315     MiscUtil.ValidateOptionFileExt("-p, --probefile", Options["--probefile"], "sdf sd mol")
 316     
 317     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "csv tsv txt")
 318     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 319     
 320     MiscUtil.ValidateOptionsDistinctFileNames("-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"])
 321     MiscUtil.ValidateOptionsDistinctFileNames("-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"])
 322 
 323 # Setup a usage string for docopt...
 324 _docoptUsage_ = """
 325 RDKitCompareMoleculeShapes.py - Compare shapes of molecules
 326 
 327 Usage:
 328     RDKitCompareMoleculeShapes.py [--alignment <Open3A, CrippenOpen3A>]
 329                                   [--distance <Tanimoto, Protrude, Both>]  [--infileParams <Name,Value,...>]
 330                                   [--maxIters <number>] [--mode <OneToOne, AllToAll, FirstToAll>]
 331                                   [--overwrite] [-w <dir>] -r <reffile> -p <probefile> -o <outfile> 
 332     RDKitCompareMoleculeShapes.py -h | --help | -e | --examples
 333 
 334 Description:
 335     Compare shapes of molecules between a set of molecules in reference and probe
 336     input files. The molecules are aligned using either Open 3DAlign or Crippen Open
 337     3DAlign before calculating shape Tanimoto and protrude distances.
 338 
 339     The supported input file formats are: Mol (.mol), SD (.sdf, .sd)
 340 
 341     The supported output file formats are: CSV (.csv), TSV (.tsv, .txt)
 342 
 343 Options:
 344     -a, --alignment <Open3A, CrippenOpen3A>  [default: Open3A]
 345         Alignment methodology to use for aligning molecules before calculating Tanimoto and
 346         protrude shape distances. Possible values: Open3A or CrippenOpen3A. Open 3DAlign
 347         (Open3A) [ Ref 132 ] overlays molecules based on MMFF atom types and charges.
 348         Crippen Open 3DAlign (CrippenOpen3A) uses Crippen logP contributions to overlay
 349         molecules.
 350     -d, --distance <Tanimoto, Protrude, Both>  [default: Both]
 351         Shape comparison distance to calculate for comparing shapes of molecules. Possible
 352         values: Tanimoto, Protrude, or Both. Shape Tanimoto distance takes the volume
 353         overlay into account during the calculation of distance. Shape protrude distance,
 354         however, focuses on the volume mismatch.
 355     --infileParams <Name,Value,...>  [default: auto]
 356         A comma delimited list of parameter name and value pairs for reading
 357         molecules from files. The supported parameter names for different file
 358         formats, along with their default values, are shown below:
 359             
 360             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 361             
 362     --maxIters <number>  [default: 50]
 363         Maximum number of iterations to perform for each molecule pair during alignment.
 364     -m, --mode <OneToOne, AllToAll, FirstToAll>  [default: OneToOne]
 365         Specify how molecules are handled in reference and probe input files during
 366         comparison of shapes between reference and probe molecules.  Possible values:
 367         OneToOne, AllToAll and AllToFirst. For OneToOne mode, the molecule shapes are
 368         calculated for each pair of molecules in the reference and probe file and the shape
 369         distances are written to the output file. For AllToAll mode, the shape distances are
 370         calculated for each reference molecule against all probe molecules. For FirstToAll mode,
 371         however, the shape distances are only calculated for the first reference molecule
 372         against all probe molecules.
 373     -e, --examples
 374         Print examples.
 375     -h, --help
 376         Print this help message.
 377     -p, --probefile <probefile>
 378         Probe input file name.
 379     -r, --reffile <reffile>
 380         Reference input file name.
 381     -o, --outfile <outfile>
 382         Output file name for writing out shape distances. Supported text file extensions: csv or tsv.
 383     --overwrite
 384         Overwrite existing files.
 385     -w, --workingdir <dir>
 386         Location of working directory which defaults to the current directory.
 387 
 388 Examples:
 389     To perform shape alignment using Open3A methodology between pair of molecules in
 390     reference and probe molecules in 3D SD files, calculate both Tanimoto and protrude
 391     distances, and write out a CSV file containing calculated distance values along with
 392     appropriate molecule IDs, type:
 393 
 394         % RDKitCompareMoleculeShapes.py  -r Sample3DRef.sdf -p Sample3DProb.sdf
 395           -o SampleOut.csv
 396 
 397     To perform shape alignment using Crippen Open3A methodology between all molecules in
 398     reference and probe molecules in 3D SD files, calculate only Tanimoto distance, and write
 399     out a TSV file containing calculated distance value along with appropriate molecule IDs, type:
 400 
 401         % RDKitCompareMoleculeShapes.py  -m AllToAll -a CrippenOpen3A -d Tanimoto
 402           -r Sample3DRef.sdf -p Sample3DProb.sdf -o SampleOut.csv
 403 
 404     To perform shape alignment using Open3A methodology between first reference molecule
 405     against all probe molecules in 3D SD files without removing hydrogens , calculate both
 406     Tanimoto and protrude distances, and write out a CSV file containing calculated distance values along with
 407     appropriate molecule IDs, type:
 408 
 409         % RDKitCompareMoleculeShapes.py -m FirstToAll -a Open3A -d Both 
 410           --infileParams "removeHydrogens,no" -r Sample3DRef.sdf
 411           -p Sample3DProb.sdf -o SampleOut.csv
 412 
 413 Author:
 414     Manish Sud(msud@san.rr.com)
 415 
 416 See also:
 417     RDKitCalculateRMSD.py, RDKitCalculateMolecularDescriptors.py, RDKitConvertFileFormat.py,
 418     RDKitGenerateConformers.py, RDKitPerformMinimization.py
 419 
 420 Copyright:
 421     Copyright (C) 2019 Manish Sud. All rights reserved.
 422 
 423     The functionality available in this script is implemented using RDKit, an
 424     open source toolkit for cheminformatics developed by Greg Landrum.
 425 
 426     This file is part of MayaChemTools.
 427 
 428     MayaChemTools is free software; you can redistribute it and/or modify it under
 429     the terms of the GNU Lesser General Public License as published by the Free
 430     Software Foundation; either version 3 of the License, or (at your option) any
 431     later version.
 432 
 433 """
 434 
 435 if __name__ == "__main__":
 436     main()