MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitCompareMoleculeShapes.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2026 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 import os
  32 import sys
  33 import time
  34 import re
  35 
  36 # RDKit imports...
  37 try:
  38     from rdkit import rdBase
  39     from rdkit.Chem import rdMolAlign, rdShapeHelpers
  40 except ImportError as ErrMsg:
  41     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  42     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  43     sys.exit(1)
  44 
  45 # MayaChemTools imports...
  46 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  47 try:
  48     from docopt import docopt
  49     import MiscUtil
  50     import RDKitUtil
  51 except ImportError as ErrMsg:
  52     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  53     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  54     sys.exit(1)
  55 
  56 ScriptName = os.path.basename(sys.argv[0])
  57 Options = {}
  58 OptionsInfo = {}
  59 
  60 
  61 def main():
  62     """Start execution of the script."""
  63 
  64     MiscUtil.PrintInfo(
  65         "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
  66         % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
  67     )
  68 
  69     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  70 
  71     # Retrieve command line arguments and options...
  72     RetrieveOptions()
  73 
  74     # Process and validate command line arguments and options...
  75     ProcessOptions()
  76 
  77     # Perform actions required by the script...
  78     CompareMoleculeShapes()
  79 
  80     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  81     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  82 
  83 
  84 def CompareMoleculeShapes():
  85     """Compare shape of molecules."""
  86 
  87     if not re.match("^(OneToOne|AllToAll|FirstToAll)$", OptionsInfo["Mode"], re.I):
  88         MiscUtil.PrintError(
  89             "Shape comparison couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"]
  90         )
  91 
  92     if not re.match("^(Open3A|CrippenOpen3A)$", OptionsInfo["Alignment"], re.I):
  93         MiscUtil.PrintError(
  94             "Shape couldn't be performed: Specified alignment mode, %s, is not supported" % OptionsInfo["Alignment"]
  95         )
  96 
  97     RefFile = OptionsInfo["RefFile"]
  98     ProbeFile = OptionsInfo["ProbeFile"]
  99 
 100     Outfile = OptionsInfo["Outfile"]
 101     OutDelim = OptionsInfo["OutDelim"]
 102 
 103     # Read reference and probe molecules...
 104     OptionsInfo["InfileParams"]["AllowEmptyMols"] = False
 105 
 106     MiscUtil.PrintInfo("\nProcessing file %s..." % (RefFile))
 107     ValidRefMols, RefMolCount, ValidRefMolCount = RDKitUtil.ReadAndValidateMolecules(
 108         RefFile, **OptionsInfo["InfileParams"]
 109     )
 110 
 111     MiscUtil.PrintInfo("Processing file %s..." % (ProbeFile))
 112     ValidProbeMols, ProbeMolCount, ValidProbeMolCount = RDKitUtil.ReadAndValidateMolecules(
 113         ProbeFile, **OptionsInfo["InfileParams"]
 114     )
 115 
 116     # Set up output file...
 117     MiscUtil.PrintInfo("Generating file %s...\n" % Outfile)
 118     OutFH = open(Outfile, "w")
 119     if OutFH is None:
 120         MiscUtil.PrintError("Couldn't open output file: %s.\n" % (Outfile))
 121 
 122     if OptionsInfo["UseCrippenOpen3A"]:
 123         Line = "RefMolID%sProbeMolID%sCrippenOpen3AScore" % (OutDelim, OutDelim)
 124     else:
 125         Line = "RefMolID%sProbeMolID%sOpen3AScore" % (OutDelim, OutDelim)
 126 
 127     if OptionsInfo["CalcTanimotoDistance"]:
 128         Line = "%s%sTanimotoDistance" % (Line, OutDelim)
 129     if OptionsInfo["CalcProtrudeDistance"]:
 130         Line = "%s%sProtrudeDistance" % (Line, OutDelim)
 131     OutFH.write("%s\n" % Line)
 132 
 133     if re.match("^OneToOne$", OptionsInfo["Mode"], re.I):
 134         PerformOneToOneShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim)
 135     elif re.match("^AllToAll$", OptionsInfo["Mode"], re.I):
 136         PerformAllToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim)
 137     elif re.match("^FirstToAll$", OptionsInfo["Mode"], re.I):
 138         PerformFirstToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim)
 139     else:
 140         MiscUtil.PrintError(
 141             "Shape comaprison couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"]
 142         )
 143 
 144     OutFH.close()
 145 
 146     MiscUtil.PrintInfo("\nTotal number of molecules: Reference - %d; Probe - %d" % (RefMolCount, ProbeMolCount))
 147     MiscUtil.PrintInfo("Number of valid molecules: Reference - %d; Probe - %d" % (ValidRefMolCount, ValidProbeMolCount))
 148     MiscUtil.PrintInfo(
 149         "Number of ignored molecules:  Reference - %d; Probe - %d"
 150         % ((RefMolCount - ValidRefMolCount), (ProbeMolCount - ValidProbeMolCount))
 151     )
 152 
 153 
 154 def PerformOneToOneShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim):
 155     """Perform pairwise shape comparison."""
 156 
 157     ValidRefMolCount = len(ValidRefMols)
 158     ValidProbeMolCount = len(ValidProbeMols)
 159 
 160     MolCount = ValidRefMolCount
 161     if ValidRefMolCount > ValidProbeMolCount:
 162         MolCount = ValidProbeMolCount
 163 
 164     if ValidRefMolCount != ValidProbeMolCount:
 165         MiscUtil.PrintWarning(
 166             "Number of valid reference molecules, %d,  is not equal to number of valid probe molecules, %d .\n"
 167             % (ValidRefMolCount, ValidProbeMolCount)
 168         )
 169         MiscUtil.PrintWarning("Pairwise shape comparison will be performed only for first %s molecules.\n" % (MolCount))
 170 
 171     # Process molecules...
 172     for MolIndex in range(0, MolCount):
 173         RefMol = ValidRefMols[MolIndex]
 174         ProbeMol = ValidProbeMols[MolIndex]
 175 
 176         RefMolName = RDKitUtil.GetMolName(RefMol, (MolIndex + 1))
 177         ProbeMolName = RDKitUtil.GetMolName(ProbeMol, (MolIndex + 1))
 178 
 179         PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim)
 180 
 181 
 182 def PerformAllToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim):
 183     """Perform shape comparison between all pairs of molecules."""
 184 
 185     # Process molecules...
 186     RefMolCount = 0
 187     for RefMol in ValidRefMols:
 188         RefMolCount += 1
 189         RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount)
 190 
 191         ProbeMolCount = 0
 192         for ProbeMol in ValidProbeMols:
 193             ProbeMolCount += 1
 194             ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount)
 195 
 196             PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim)
 197 
 198 
 199 def PerformFirstToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim):
 200     """Perform shape comparison between first reference molecues and all probe molecules."""
 201 
 202     # Process molecules...
 203     RefMol = ValidRefMols[0]
 204     RefMolCount = 1
 205     RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount)
 206 
 207     ProbeMolCount = 0
 208     for ProbeMol in ValidProbeMols:
 209         ProbeMolCount += 1
 210         ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount)
 211 
 212         PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim)
 213 
 214 
 215 def PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim):
 216     """Perform shape comparison and write to output file."""
 217 
 218     AlignmentScore = PerformShapeAlignment(RefMol, ProbeMol)
 219     AlignmentScore = "%.2f" % AlignmentScore
 220 
 221     LineWords = []
 222     LineWords.extend([RefMolName, ProbeMolName, AlignmentScore])
 223 
 224     if OptionsInfo["CalcTanimotoDistance"]:
 225         TanimotoDistance = CalculateTanimotoShapeDistance(RefMol, ProbeMol)
 226         LineWords.append(TanimotoDistance)
 227 
 228     if OptionsInfo["CalcProtrudeDistance"]:
 229         ProtrudeDistance = CalculateProtrudeShapeDistance(RefMol, ProbeMol)
 230         LineWords.append(ProtrudeDistance)
 231 
 232     Line = OutDelim.join(LineWords)
 233     OutFH.write("%s\n" % Line)
 234 
 235 
 236 def PerformShapeAlignment(RefMol, ProbeMol):
 237     """Perform shape alignment and return alignment score."""
 238 
 239     if OptionsInfo["UseCrippenOpen3A"]:
 240         CrippenO3A = rdMolAlign.GetCrippenO3A(ProbeMol, RefMol)
 241         Score = CrippenO3A.Align()
 242     else:
 243         O3A = rdMolAlign.GetO3A(ProbeMol, RefMol)
 244         Score = O3A.Align()
 245 
 246     return Score
 247 
 248 
 249 def CalculateTanimotoShapeDistance(RefMol, ProbeMol):
 250     """Calculate Tanimoto shape for a pair of already aligned molecules and return it as a string."""
 251 
 252     Distance = rdShapeHelpers.ShapeTanimotoDist(ProbeMol, RefMol)
 253     Distance = "%.2f" % Distance
 254 
 255     return Distance
 256 
 257 
 258 def CalculateProtrudeShapeDistance(RefMol, ProbeMol):
 259     """Calculate protrude shape for a pair of already aligned molecules and return it as a string."""
 260 
 261     Distance = rdShapeHelpers.ShapeProtrudeDist(ProbeMol, RefMol)
 262     Distance = "%.2f" % Distance
 263 
 264     return Distance
 265 
 266 
 267 def ProcessOptions():
 268     """Process and validate command line arguments and options."""
 269 
 270     MiscUtil.PrintInfo("Processing options...")
 271 
 272     # Validate options...
 273     ValidateOptions()
 274 
 275     OptionsInfo["Alignment"] = Options["--alignment"]
 276     OptionsInfo["UseCrippenOpen3A"] = False
 277     if re.match("^CrippenOpen3A$", OptionsInfo["Alignment"], re.I):
 278         OptionsInfo["UseCrippenOpen3A"] = True
 279 
 280     OptionsInfo["Distance"] = Options["--distance"]
 281 
 282     OptionsInfo["CalcTanimotoDistance"] = False
 283     OptionsInfo["CalcProtrudeDistance"] = False
 284     if re.match("^Tanimoto$", OptionsInfo["Distance"], re.I):
 285         OptionsInfo["CalcTanimotoDistance"] = True
 286     elif re.match("^Protrude$", OptionsInfo["Distance"], re.I):
 287         OptionsInfo["CalcProtrudeDistance"] = True
 288     else:
 289         OptionsInfo["CalcTanimotoDistance"] = True
 290         OptionsInfo["CalcProtrudeDistance"] = True
 291 
 292     OptionsInfo["MaxIters"] = int(Options["--maxIters"])
 293     OptionsInfo["Mode"] = Options["--mode"]
 294 
 295     OptionsInfo["RefFile"] = Options["--reffile"]
 296     OptionsInfo["ProbeFile"] = Options["--probefile"]
 297 
 298     OptionsInfo["Outfile"] = Options["--outfile"]
 299     OptionsInfo["Overwrite"] = Options["--overwrite"]
 300 
 301     # No need for any RDKit specific --outfileParams....
 302     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"])
 303 
 304     OptionsInfo["OutDelim"] = " "
 305     if MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "csv"):
 306         OptionsInfo["OutDelim"] = ","
 307     elif MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "tsv txt"):
 308         OptionsInfo["OutDelim"] = "\t"
 309     else:
 310         MiscUtil.PrintError(
 311             'The file name specified , %s, for option "--outfile" is not valid. Supported file formats: csv tsv txt\n'
 312             % (OptionsInfo["Outfile"])
 313         )
 314 
 315 
 316 def RetrieveOptions():
 317     """Retrieve command line arguments and options."""
 318 
 319     # Get options...
 320     global Options
 321     Options = docopt(_docoptUsage_)
 322 
 323     # Set current working directory to the specified directory...
 324     WorkingDir = Options["--workingdir"]
 325     if WorkingDir:
 326         os.chdir(WorkingDir)
 327 
 328     # Handle examples option...
 329     if "--examples" in Options and Options["--examples"]:
 330         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 331         sys.exit(0)
 332 
 333 
 334 def ValidateOptions():
 335     """Validate option values."""
 336 
 337     MiscUtil.ValidateOptionTextValue("Alignment", Options["--alignment"], "Open3A CrippenOpen3A")
 338     MiscUtil.ValidateOptionTextValue("Distance", Options["--distance"], "Tanimoto Protrude Both")
 339 
 340     MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0})
 341 
 342     MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "OneToOne  AllToAll FirstToAll")
 343 
 344     MiscUtil.ValidateOptionFilePath("-r, --reffile", Options["--reffile"])
 345     MiscUtil.ValidateOptionFileExt("-r, --reffile", Options["--reffile"], "sdf sd mol")
 346 
 347     MiscUtil.ValidateOptionFilePath("-p, --probefile", Options["--probefile"])
 348     MiscUtil.ValidateOptionFileExt("-p, --probefile", Options["--probefile"], "sdf sd mol")
 349 
 350     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "csv tsv txt")
 351     MiscUtil.ValidateOptionsOutputFileOverwrite(
 352         "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
 353     )
 354 
 355     MiscUtil.ValidateOptionsDistinctFileNames(
 356         "-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"]
 357     )
 358     MiscUtil.ValidateOptionsDistinctFileNames(
 359         "-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"]
 360     )
 361 
 362 
 363 # Setup a usage string for docopt...
 364 _docoptUsage_ = """
 365 RDKitCompareMoleculeShapes.py - Compare shapes of molecules
 366 
 367 Usage:
 368     RDKitCompareMoleculeShapes.py [--alignment <Open3A, CrippenOpen3A>]
 369                                   [--distance <Tanimoto, Protrude, Both>]  [--infileParams <Name,Value,...>]
 370                                   [--maxIters <number>] [--mode <OneToOne, AllToAll, FirstToAll>]
 371                                   [--overwrite] [-w <dir>] -r <reffile> -p <probefile> -o <outfile> 
 372     RDKitCompareMoleculeShapes.py -h | --help | -e | --examples
 373 
 374 Description:
 375     Compare shapes of molecules between a set of molecules in reference and probe
 376     input files. The molecules are aligned using either Open 3DAlign or Crippen Open
 377     3DAlign before calculating shape Tanimoto and protrude distances.
 378 
 379     The supported input file formats are: Mol (.mol), SD (.sdf, .sd)
 380 
 381     The supported output file formats are: CSV (.csv), TSV (.tsv, .txt)
 382 
 383 Options:
 384     -a, --alignment <Open3A, CrippenOpen3A>  [default: Open3A]
 385         Alignment methodology to use for aligning molecules before calculating Tanimoto and
 386         protrude shape distances. Possible values: Open3A or CrippenOpen3A. Open 3DAlign
 387         (Open3A) [ Ref 132 ] overlays molecules based on MMFF atom types and charges.
 388         Crippen Open 3DAlign (CrippenOpen3A) uses Crippen logP contributions to overlay
 389         molecules.
 390     -d, --distance <Tanimoto, Protrude, Both>  [default: Both]
 391         Shape comparison distance to calculate for comparing shapes of molecules. Possible
 392         values: Tanimoto, Protrude, or Both. Shape Tanimoto distance takes the volume
 393         overlay into account during the calculation of distance. Shape protrude distance,
 394         however, focuses on the volume mismatch.
 395     --infileParams <Name,Value,...>  [default: auto]
 396         A comma delimited list of parameter name and value pairs for reading
 397         molecules from files. The supported parameter names for different file
 398         formats, along with their default values, are shown below:
 399             
 400             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 401             
 402     --maxIters <number>  [default: 50]
 403         Maximum number of iterations to perform for each molecule pair during alignment.
 404     -m, --mode <OneToOne, AllToAll, FirstToAll>  [default: OneToOne]
 405         Specify how molecules are handled in reference and probe input files during
 406         comparison of shapes between reference and probe molecules.  Possible values:
 407         OneToOne, AllToAll and AllToFirst. For OneToOne mode, the molecule shapes are
 408         calculated for each pair of molecules in the reference and probe file and the shape
 409         distances are written to the output file. For AllToAll mode, the shape distances are
 410         calculated for each reference molecule against all probe molecules. For FirstToAll mode,
 411         however, the shape distances are only calculated for the first reference molecule
 412         against all probe molecules.
 413     -e, --examples
 414         Print examples.
 415     -h, --help
 416         Print this help message.
 417     -p, --probefile <probefile>
 418         Probe input file name.
 419     -r, --reffile <reffile>
 420         Reference input file name.
 421     -o, --outfile <outfile>
 422         Output file name for writing out shape distances. Supported text file extensions: csv or tsv.
 423     --overwrite
 424         Overwrite existing files.
 425     -w, --workingdir <dir>
 426         Location of working directory which defaults to the current directory.
 427 
 428 Examples:
 429     To perform shape alignment using Open3A methodology between pair of molecules in
 430     reference and probe input in 3D SD files, calculate both Tanimoto and protrude
 431     distances, and write out a CSV file containing calculated distance values along with
 432     appropriate molecule IDs, type:
 433 
 434         % RDKitCompareMoleculeShapes.py  -r Sample3DRef.sdf -p Sample3DProb.sdf
 435           -o SampleOut.csv
 436 
 437     To perform shape alignment using Crippen Open3A methodology between all molecules in
 438     reference and probe molecules in 3D SD files, calculate only Tanimoto distance, and write
 439     out a TSV file containing calculated distance value along with appropriate molecule IDs, type:
 440 
 441         % RDKitCompareMoleculeShapes.py  -m AllToAll -a CrippenOpen3A -d Tanimoto
 442           -r Sample3DRef.sdf -p Sample3DProb.sdf -o SampleOut.csv
 443 
 444     To perform shape alignment using Open3A methodology between first reference molecule
 445     against all probe molecules in 3D SD files without removing hydrogens , calculate both
 446     Tanimoto and protrude distances, and write out a CSV file containing calculated distance values along with
 447     appropriate molecule IDs, type:
 448 
 449         % RDKitCompareMoleculeShapes.py -m FirstToAll -a Open3A -d Both 
 450           --infileParams "removeHydrogens,no" -r Sample3DRef.sdf
 451           -p Sample3DProb.sdf -o SampleOut.csv
 452 
 453 Author:
 454     Manish Sud(msud@san.rr.com)
 455 
 456 See also:
 457     RDKitCalculateRMSD.py, RDKitCalculateMolecularDescriptors.py, RDKitConvertFileFormat.py,
 458     RDKitGenerateConformers.py, RDKitPerformMinimization.py
 459 
 460 Copyright:
 461     Copyright (C) 2026 Manish Sud. All rights reserved.
 462 
 463     The functionality available in this script is implemented using RDKit, an
 464     open source toolkit for cheminformatics developed by Greg Landrum.
 465 
 466     This file is part of MayaChemTools.
 467 
 468     MayaChemTools is free software; you can redistribute it and/or modify it under
 469     the terms of the GNU Lesser General Public License as published by the Free
 470     Software Foundation; either version 3 of the License, or (at your option) any
 471     later version.
 472 
 473 """
 474 
 475 if __name__ == "__main__":
 476     main()