MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitAlignMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2026 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 import os
  32 import sys
  33 import time
  34 import re
  35 
  36 # RDKit imports...
  37 try:
  38     from rdkit import rdBase
  39     from rdkit.Chem import AllChem, rdMolAlign
  40 except ImportError as ErrMsg:
  41     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  42     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  43     sys.exit(1)
  44 
  45 # MayaChemTools imports...
  46 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  47 try:
  48     from docopt import docopt
  49     import MiscUtil
  50     import RDKitUtil
  51 except ImportError as ErrMsg:
  52     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  53     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  54     sys.exit(1)
  55 
  56 ScriptName = os.path.basename(sys.argv[0])
  57 Options = {}
  58 OptionsInfo = {}
  59 
  60 
  61 def main():
  62     """Start execution of the script."""
  63 
  64     MiscUtil.PrintInfo(
  65         "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
  66         % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
  67     )
  68 
  69     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  70 
  71     # Retrieve command line arguments and options...
  72     RetrieveOptions()
  73 
  74     # Process and validate command line arguments and options...
  75     ProcessOptions()
  76 
  77     # Perform actions required by the script...
  78     AlignMolecules()
  79 
  80     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  81     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  82 
  83 
  84 def AlignMolecules():
  85     """Align molecules."""
  86 
  87     if not re.match("^(OneToOne|FirstToAll)$", OptionsInfo["Mode"], re.I):
  88         MiscUtil.PrintError(
  89             "Alignment couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"]
  90         )
  91 
  92     RefFile = OptionsInfo["RefFile"]
  93     ProbeFile = OptionsInfo["ProbeFile"]
  94 
  95     # Read reference and probe molecules...
  96     OptionsInfo["InfileParams"]["AllowEmptyMols"] = False
  97 
  98     MiscUtil.PrintInfo("\nProcessing file %s..." % (RefFile))
  99     ValidRefMols, RefMolCount, ValidRefMolCount = RDKitUtil.ReadAndValidateMolecules(
 100         RefFile, **OptionsInfo["InfileParams"]
 101     )
 102 
 103     MiscUtil.PrintInfo("Processing file %s..." % (ProbeFile))
 104     ValidProbeMols, ProbeMolCount, ValidProbeMolCount = RDKitUtil.ReadAndValidateMolecules(
 105         ProbeFile, **OptionsInfo["InfileParams"]
 106     )
 107 
 108     # Set up a molecule writer...
 109     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 110     if Writer is None:
 111         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 112     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 113 
 114     AlignmentFailedCount = 0
 115     if re.match("^OneToOne$", OptionsInfo["Mode"], re.I):
 116         AlignmentFailedCount = PerformOneToOneAlignment(ValidRefMols, ValidProbeMols, Writer)
 117     elif re.match("^FirstToAll$", OptionsInfo["Mode"], re.I):
 118         AlignmentFailedCount = PerformFirstToAllAlignment(ValidRefMols, ValidProbeMols, Writer)
 119     else:
 120         MiscUtil.PrintError(
 121             "Alignment couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"]
 122         )
 123 
 124     if Writer is not None:
 125         Writer.close()
 126 
 127     MiscUtil.PrintInfo("\nTotal number of molecules: Reference - %d; Probe - %d" % (RefMolCount, ProbeMolCount))
 128     MiscUtil.PrintInfo("Number of valid molecules: Reference - %d; Probe - %d" % (ValidRefMolCount, ValidProbeMolCount))
 129     MiscUtil.PrintInfo("Number of probe molecules failed during alignment: %d" % AlignmentFailedCount)
 130     MiscUtil.PrintInfo(
 131         "Number of ignored molecules:  Reference - %d; Probe - %d"
 132         % ((RefMolCount - ValidRefMolCount), (ProbeMolCount - ValidProbeMolCount + AlignmentFailedCount))
 133     )
 134 
 135 
 136 def PerformOneToOneAlignment(ValidRefMols, ValidProbeMols, Writer):
 137     """Perform pairwise alignment."""
 138 
 139     ValidRefMolCount = len(ValidRefMols)
 140     ValidProbeMolCount = len(ValidProbeMols)
 141 
 142     MolCount = ValidRefMolCount
 143     if ValidRefMolCount > ValidProbeMolCount:
 144         MolCount = ValidProbeMolCount
 145 
 146     if ValidRefMolCount != ValidProbeMolCount:
 147         MiscUtil.PrintWarning(
 148             "Number of valid reference molecules, %d,  is not equal to number of valid probe molecules, %d .\n"
 149             % (ValidRefMolCount, ValidProbeMolCount)
 150         )
 151         MiscUtil.PrintWarning("Pairwise alignment will be performed only for first %s molecules.\n" % (MolCount))
 152 
 153     # Process molecules...
 154     AlignmentFailedCount = 0
 155     for MolIndex in range(0, MolCount):
 156         RefMol = ValidRefMols[MolIndex]
 157         ProbeMol = ValidProbeMols[MolIndex]
 158 
 159         RefMolName = RDKitUtil.GetMolName(RefMol, (MolIndex + 1))
 160         ProbeMolName = RDKitUtil.GetMolName(ProbeMol, (MolIndex + 1))
 161 
 162         Status = PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer)
 163         if not Status:
 164             AlignmentFailedCount += 1
 165 
 166     return AlignmentFailedCount
 167 
 168 
 169 def PerformFirstToAllAlignment(ValidRefMols, ValidProbeMols, Writer):
 170     """Perform alignment between first reference molecues and all probe molecules."""
 171 
 172     # Process molecules...
 173     RefMol = ValidRefMols[0]
 174     RefMolCount = 1
 175     RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount)
 176 
 177     ProbeMolCount = 0
 178     AlignmentFailedCount = 0
 179     for ProbeMol in ValidProbeMols:
 180         ProbeMolCount += 1
 181         ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount)
 182 
 183         Status = PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer)
 184         if not Status:
 185             AlignmentFailedCount += 1
 186 
 187     return AlignmentFailedCount
 188 
 189 
 190 def PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer):
 191     """Perform alignment and write to output file."""
 192 
 193     Status = True
 194     try:
 195         if OptionsInfo["UseRMSD"]:
 196             rdMolAlign.AlignMol(ProbeMol, RefMol, maxIters=OptionsInfo["MaxIters"])
 197         elif OptionsInfo["UseBestRMSD"]:
 198             AllChem.GetBestRMS(ProbeMol, RefMol)
 199         elif OptionsInfo["UseOpen3A"]:
 200             O3A = rdMolAlign.GetO3A(ProbeMol, RefMol)
 201             O3A.Align()
 202         elif OptionsInfo["UseCrippenOpen3A"]:
 203             CrippenO3A = rdMolAlign.GetCrippenO3A(ProbeMol, RefMol)
 204             CrippenO3A.Align()
 205         else:
 206             MiscUtil.PrintError(
 207                 "Alignment couldn't be performed: Specified alignment value, %s, is not supported"
 208                 % OptionsInfo["Alignment"]
 209             )
 210     except (RuntimeError, ValueError):
 211         Status = False
 212         MiscUtil.PrintWarning(
 213             "Alignment failed between reference molecule, %s, and probe molecule, %s.\nWriting unaligned probe molecule...\n"
 214             % (RefMolName, ProbeMolName)
 215         )
 216 
 217     # Write out aligned probe molecule...
 218     Writer.write(ProbeMol)
 219 
 220     return Status
 221 
 222 
 223 def ProcessOptions():
 224     """Process and validate command line arguments and options."""
 225 
 226     MiscUtil.PrintInfo("Processing options...")
 227 
 228     # Validate options...
 229     ValidateOptions()
 230 
 231     OptionsInfo["Alignment"] = Options["--alignment"]
 232 
 233     OptionsInfo["UseRMSD"] = False
 234     OptionsInfo["UseBestRMSD"] = False
 235 
 236     OptionsInfo["UseOpen3A"] = False
 237     OptionsInfo["UseCrippenOpen3A"] = False
 238 
 239     # Process alignment
 240     if re.match("^RMSD$", OptionsInfo["Alignment"], re.I):
 241         OptionsInfo["UseRMSD"] = True
 242     elif re.match("^BestRMSD$", OptionsInfo["Alignment"], re.I):
 243         OptionsInfo["UseBestRMSD"] = True
 244     elif re.match("^Open3A$", OptionsInfo["Alignment"], re.I):
 245         OptionsInfo["UseOpen3A"] = True
 246     elif re.match("^CrippenOpen3A$", OptionsInfo["Alignment"], re.I):
 247         OptionsInfo["UseCrippenOpen3A"] = True
 248 
 249     OptionsInfo["MaxIters"] = int(Options["--maxIters"])
 250 
 251     OptionsInfo["Mode"] = Options["--mode"]
 252 
 253     OptionsInfo["RefFile"] = Options["--reffile"]
 254     OptionsInfo["ProbeFile"] = Options["--probefile"]
 255 
 256     # No need for any RDKit specific --outfileParams....
 257     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"])
 258 
 259     OptionsInfo["Outfile"] = Options["--outfile"]
 260     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters(
 261         "--outfileParams", Options["--outfileParams"]
 262     )
 263 
 264     OptionsInfo["Overwrite"] = Options["--overwrite"]
 265 
 266 
 267 def RetrieveOptions():
 268     """Retrieve command line arguments and options."""
 269 
 270     # Get options...
 271     global Options
 272     Options = docopt(_docoptUsage_)
 273 
 274     # Set current working directory to the specified directory...
 275     WorkingDir = Options["--workingdir"]
 276     if WorkingDir:
 277         os.chdir(WorkingDir)
 278 
 279     # Handle examples option...
 280     if "--examples" in Options and Options["--examples"]:
 281         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 282         sys.exit(0)
 283 
 284 
 285 def ValidateOptions():
 286     """Validate option values."""
 287 
 288     MiscUtil.ValidateOptionTextValue("--alignment", Options["--alignment"], "Open3A CrippenOpen3A RMSD BestRMSD")
 289 
 290     MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0})
 291     MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "OneToOne  FirstToAll")
 292 
 293     MiscUtil.ValidateOptionFilePath("-r, --reffile", Options["--reffile"])
 294     MiscUtil.ValidateOptionFileExt("-r, --reffile", Options["--reffile"], "sdf sd mol")
 295 
 296     MiscUtil.ValidateOptionFilePath("-p, --probefile", Options["--probefile"])
 297     MiscUtil.ValidateOptionFileExt("-p, --probefile", Options["--probefile"], "sdf sd mol")
 298 
 299     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sd sdf")
 300     MiscUtil.ValidateOptionsOutputFileOverwrite(
 301         "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
 302     )
 303     MiscUtil.ValidateOptionsDistinctFileNames(
 304         "-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"]
 305     )
 306     MiscUtil.ValidateOptionsDistinctFileNames(
 307         "-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"]
 308     )
 309 
 310     MiscUtil.ValidateOptionsDistinctFileNames(
 311         "-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"]
 312     )
 313     MiscUtil.ValidateOptionsDistinctFileNames(
 314         "-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"]
 315     )
 316 
 317 
 318 # Setup a usage string for docopt...
 319 _docoptUsage_ = """
 320 RDKitAlignMolecules.py - Align molecules by RMSD or shape
 321 
 322 Usage:
 323     RDKitAlignMolecules.py [--alignment <Open3A, CrippenOpen3A, RMSD, BestRMSD>]
 324                            [--infileParams <Name,Value,...>] [--maxIters <number>]
 325                            [--mode <OneToOne, FirstToAll>] [ --outfileParams <Name,Value,...> ] 
 326                            [--overwrite] [-w <dir>] -r <reffile> -p <probefile> -o <outfile> 
 327     RDKitAlignMolecules.py -h | --help | -e | --examples
 328 
 329 Description:
 330     Perform alignment between a set of similar molecules in reference and probe
 331     input files. The molecules are aligned either by Root Mean Square Distance (RMSD)
 332     between molecules or overlying their shapes (Open3A or CrippenOpen3A).
 333     The RDKit function fails to calculate RMSD values for dissimilar molecules. Consequently,
 334     unaligned probe molecules are written to the output file for dissimilar molecule pairs.
 335 
 336     The supported input file formats are: Mol (.mol), SD (.sdf, .sd)
 337 
 338     The supported output file formats are:  SD (.sdf, .sd)
 339 
 340 Options:
 341     -a, --alignment <Open3A, CrippenOpen3A, RMSD, BestRMSD>  [default: Open3A]
 342         Alignment methodology to use for aligning molecules. Possible values: Open3A,
 343         CrippenOpen3A, RMSD, BestRMSD.
 344         
 345         The Open3A and CrippenOpen3A allow alignment of molecules using their shapes
 346         Open 3DAlign (Open3A) [ Ref 132 ] overlays molecules based on MMFF atom types
 347         and charges. Crippen Open 3DAlign (CrippenOpen3A) uses Crippen logP contributions
 348         to overlay molecules.
 349         
 350         During BestRMSMode mode, the RDKit 'function AllChem.GetBestRMS' is used to
 351         align and calculate RMSD. This function calculates optimal RMSD for aligning two
 352         molecules, taking symmetry into account. Otherwise, the RMSD value is calculated
 353         using 'AllChem.AlignMol function' without changing the atom order. A word to the
 354         wise from RDKit documentation: The AllChem.GetBestRMS function will attempt to
 355         align all permutations of matching atom orders in both molecules, for some molecules
 356         it will lead to 'combinatorial explosion'.
 357     --infileParams <Name,Value,...>  [default: auto]
 358         A comma delimited list of parameter name and value pairs for reading
 359         molecules from files. The supported parameter names for different file
 360         formats, along with their default values, are shown below:
 361             
 362             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 363             
 364     --maxIters <number>  [default: 50]
 365         Maximum number of iterations to perform for each molecule pair during minimization
 366         of RMSD values. This option is ignored during BestRMSD mode.
 367     -m, --mode <OneToOne, FirstToAll>  [default: OneToOne]
 368         Specify how molecules are handled in reference and probe input files during
 369         alignment of molecules between reference and probe molecules.  Possible values:
 370         OneToOne and  FirstToAll. For OneToOne mode, the alignment is performed
 371         for each pair of molecules in the reference and probe file, and the aligned probe
 372         molecule is written the output file. For FirstToAll mode, the alignment is only
 373         performed between the first reference molecule against all probe molecules.
 374     -e, --examples
 375         Print examples.
 376     -h, --help
 377         Print this help message.
 378     -o, --outfile <outfile>
 379         Output file name for writing out aligned probe molecules values. Supported
 380         file extensions: sdf or sd.
 381     --outfileParams <Name,Value,...>  [default: auto]
 382         A comma delimited list of parameter name and value pairs for writing
 383         molecules to files. The supported parameter names for different file
 384         formats, along with their default values, are shown below:
 385             
 386             SD: kekulize,yes,forceV3000,no
 387             
 388     -p, --probefile <probefile>
 389         Probe input file name.
 390     -r, --reffile <reffile>
 391         Reference input file name.
 392     --overwrite
 393         Overwrite existing files.
 394     -w, --workingdir <dir>
 395         Location of working directory which defaults to the current directory.
 396 
 397 Examples:
 398     To perform shape alignment using Open3A methodology between pairs of molecules in
 399     reference and probe input 3D SD files and write out a SD file containing aligned
 400     molecules, type:
 401 
 402         % RDKitAlignMolecules.py  -r Sample3DRef.sdf -p Sample3DProb.sdf
 403           -o SampleOut.sdf
 404 
 405     To perform alignment using RMSD methodology between pairs of molecules in
 406     reference and probe input 3D SD files and write out a SD file containing aligned
 407     molecules, type:
 408 
 409         % RDKitAlignMolecules.py  -a RMSD -r Sample3DRef.sdf -p Sample3DProb.sdf
 410           -o SampleOut.sdf
 411 
 412     To perform alignment using Open3A methodology  between first reference molecule
 413     against all probe molecules in 3D SD files without removing hydrogens , and write out
 414     a SD file containing aligned molecules, type:
 415 
 416         % RDKitAlignMolecules.py -m FirstToAll -a Open3A
 417           --infileParams "removeHydrogens,no" -r Sample3DRef.sdf
 418           -p Sample3DProb.sdf -o SampleOut.sdf
 419 
 420 Author:
 421     Manish Sud(msud@san.rr.com)
 422 
 423 See also:
 424     RDKitCalculateMolecularDescriptors.py, RDKitCompareMoleculeShapes.py, RDKitCalculateRMSD.py,
 425     RDKitConvertFileFormat.py, RDKitGenerateConformers.py, RDKitPerformMinimization.py
 426 
 427 Copyright:
 428     Copyright (C) 2026 Manish Sud. All rights reserved.
 429 
 430     The functionality available in this script is implemented using RDKit, an
 431     open source toolkit for cheminformatics developed by Greg Landrum.
 432 
 433     This file is part of MayaChemTools.
 434 
 435     MayaChemTools is free software; you can redistribute it and/or modify it under
 436     the terms of the GNU Lesser General Public License as published by the Free
 437     Software Foundation; either version 3 of the License, or (at your option) any
 438     later version.
 439 
 440 """
 441 
 442 if __name__ == "__main__":
 443     main()