MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitAlignMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2025 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem, rdMolAlign
  42 except ImportError as ErrMsg:
  43     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  44     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  45     sys.exit(1)
  46 
  47 # MayaChemTools imports...
  48 try:
  49     from docopt import docopt
  50     import MiscUtil
  51     import RDKitUtil
  52 except ImportError as ErrMsg:
  53     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  54     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  55     sys.exit(1)
  56 
  57 ScriptName = os.path.basename(sys.argv[0])
  58 Options = {}
  59 OptionsInfo = {}
  60 
  61 def main():
  62     """Start execution of the script."""
  63     
  64     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  65     
  66     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  67     
  68     # Retrieve command line arguments and options...
  69     RetrieveOptions()
  70     
  71     # Process and validate command line arguments and options...
  72     ProcessOptions()
  73     
  74     # Perform actions required by the script...
  75     AlignMolecules()
  76     
  77     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  78     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  79 
  80 def AlignMolecules():
  81     """Align molecules."""
  82     
  83     if not re.match("^(OneToOne|FirstToAll)$", OptionsInfo["Mode"], re.I):
  84         MiscUtil.PrintError("Alignment couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"])
  85         
  86     RefFile = OptionsInfo["RefFile"]
  87     ProbeFile = OptionsInfo["ProbeFile"]
  88     
  89     Outfile = OptionsInfo["Outfile"]
  90 
  91     # Read reference and probe molecules...
  92     OptionsInfo["InfileParams"]["AllowEmptyMols"] = False
  93     
  94     MiscUtil.PrintInfo("\nProcessing file %s..." % (RefFile))
  95     ValidRefMols, RefMolCount, ValidRefMolCount  = RDKitUtil.ReadAndValidateMolecules(RefFile, **OptionsInfo["InfileParams"])
  96     
  97     MiscUtil.PrintInfo("Processing file %s..." % (ProbeFile))
  98     ValidProbeMols, ProbeMolCount, ValidProbeMolCount  = RDKitUtil.ReadAndValidateMolecules(ProbeFile, **OptionsInfo["InfileParams"])
  99 
 100     # Set up a molecule writer...
 101     Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
 102     if Writer is None:
 103         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
 104     MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"])
 105 
 106     AlignmentFailedCount = 0
 107     if re.match("^OneToOne$", OptionsInfo["Mode"], re.I):
 108         AlignmentFailedCount = PerformOneToOneAlignment(ValidRefMols, ValidProbeMols, Writer)
 109     elif re.match("^FirstToAll$", OptionsInfo["Mode"], re.I):
 110          AlignmentFailedCount = PerformFirstToAllAlignment(ValidRefMols, ValidProbeMols, Writer)
 111     else:
 112         MiscUtil.PrintError("Alignment couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"])
 113 
 114     if Writer is not None:
 115         Writer.close()
 116     
 117     MiscUtil.PrintInfo("\nTotal number of molecules: Reference - %d; Probe - %d" % (RefMolCount, ProbeMolCount))
 118     MiscUtil.PrintInfo("Number of valid molecules: Reference - %d; Probe - %d" % (ValidRefMolCount, ValidProbeMolCount))
 119     MiscUtil.PrintInfo("Number of probe molecules failed during alignment: %d" % AlignmentFailedCount)
 120     MiscUtil.PrintInfo("Number of ignored molecules:  Reference - %d; Probe - %d" % ((RefMolCount - ValidRefMolCount), (ProbeMolCount - ValidProbeMolCount + AlignmentFailedCount)))
 121     
 122 def PerformOneToOneAlignment(ValidRefMols, ValidProbeMols, Writer):
 123     """Perform pairwise alignment."""
 124     
 125     ValidRefMolCount = len(ValidRefMols)
 126     ValidProbeMolCount = len(ValidProbeMols)
 127     
 128     MolCount = ValidRefMolCount
 129     if ValidRefMolCount > ValidProbeMolCount:
 130         MolCount = ValidProbeMolCount
 131     
 132     if ValidRefMolCount != ValidProbeMolCount:
 133         MiscUtil.PrintWarning("Number of valid reference molecules, %d,  is not equal to number of valid probe molecules, %d .\n" % (ValidRefMolCount, ValidProbeMolCount))
 134         MiscUtil.PrintWarning("Pairwise alignment will be performed only for first %s molecules.\n" % (MolCount))
 135 
 136     # Process molecules...
 137     AlignmentFailedCount = 0
 138     for MolIndex in range(0, MolCount):
 139         RefMol = ValidRefMols[MolIndex]
 140         ProbeMol = ValidProbeMols[MolIndex]
 141 
 142         RefMolName = RDKitUtil.GetMolName(RefMol, (MolIndex + 1))
 143         ProbeMolName = RDKitUtil.GetMolName(ProbeMol, (MolIndex + 1))
 144 
 145         Status = PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer)
 146         if not Status:
 147             AlignmentFailedCount += 1
 148     
 149     return AlignmentFailedCount
 150 
 151 def PerformFirstToAllAlignment(ValidRefMols, ValidProbeMols, Writer):
 152     """Perform alignment between first reference molecues and all probe molecules."""
 153     
 154     # Process molecules...
 155     RefMol = ValidRefMols[0]
 156     RefMolCount = 1
 157     RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount)
 158 
 159     ProbeMolCount = 0
 160     AlignmentFailedCount = 0
 161     for ProbeMol in ValidProbeMols:
 162         ProbeMolCount += 1
 163         ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount)
 164             
 165         Status = PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer)
 166         if not Status:
 167             AlignmentFailedCount += 1
 168 
 169     return AlignmentFailedCount
 170 
 171 def PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer):
 172     """Perform alignment and write to output file."""
 173 
 174     Status = True
 175     try:
 176         if OptionsInfo["UseRMSD"]:
 177             RMSD = rdMolAlign.AlignMol(ProbeMol, RefMol, maxIters = OptionsInfo["MaxIters"])
 178         elif OptionsInfo["UseBestRMSD"]:
 179             RMSD = AllChem.GetBestRMS(ProbeMol, RefMol)
 180         elif OptionsInfo["UseOpen3A"]:
 181             O3A = rdMolAlign.GetO3A(ProbeMol, RefMol)
 182             Score = O3A.Align()
 183         elif OptionsInfo["UseCrippenOpen3A"]:
 184             CrippenO3A = rdMolAlign.GetCrippenO3A(ProbeMol, RefMol)
 185             Score = CrippenO3A.Align()
 186         else:
 187             MiscUtil.PrintError("Alignment couldn't be performed: Specified alignment value, %s, is not supported" % OptionsInfo["Alignment"])
 188     except (RuntimeError, ValueError):
 189         Status = False
 190         MiscUtil.PrintWarning("Alignment failed between reference molecule, %s, and probe molecule, %s.\nWriting unaligned probe molecule...\n" % (RefMolName, ProbeMolName))
 191 
 192     # Write out aligned probe molecule...
 193     Writer.write(ProbeMol)
 194     
 195     return Status
 196 
 197 def ProcessOptions():
 198     """Process and validate command line arguments and options."""
 199     
 200     MiscUtil.PrintInfo("Processing options...")
 201     
 202     # Validate options...
 203     ValidateOptions()
 204     
 205     OptionsInfo["Alignment"] = Options["--alignment"]
 206     
 207     OptionsInfo["UseRMSD"] = False
 208     OptionsInfo["UseBestRMSD"] = False
 209     
 210     OptionsInfo["UseOpen3A"] = False
 211     OptionsInfo["UseCrippenOpen3A"] = False
 212 
 213     # Process alignment
 214     if re.match("^RMSD$", OptionsInfo["Alignment"], re.I):
 215         OptionsInfo["UseRMSD"] = True
 216     elif re.match("^BestRMSD$", OptionsInfo["Alignment"], re.I):
 217         OptionsInfo["UseBestRMSD"] = True
 218     elif re.match("^Open3A$", OptionsInfo["Alignment"], re.I):
 219         OptionsInfo["UseOpen3A"] = True
 220     elif re.match("^CrippenOpen3A$", OptionsInfo["Alignment"], re.I):
 221         OptionsInfo["UseCrippenOpen3A"] = True
 222     
 223     OptionsInfo["MaxIters"] = int(Options["--maxIters"])
 224     
 225     OptionsInfo["Mode"] = Options["--mode"]
 226     
 227     OptionsInfo["RefFile"] = Options["--reffile"]
 228     OptionsInfo["ProbeFile"] = Options["--probefile"]
 229 
 230     # No need for any RDKit specific --outfileParams....
 231     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"])
 232     
 233     OptionsInfo["Outfile"] = Options["--outfile"]
 234     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"])
 235     
 236     OptionsInfo["Overwrite"] = Options["--overwrite"]
 237     
 238 def RetrieveOptions():
 239     """Retrieve command line arguments and options."""
 240     
 241     # Get options...
 242     global Options
 243     Options = docopt(_docoptUsage_)
 244 
 245     # Set current working directory to the specified directory...
 246     WorkingDir = Options["--workingdir"]
 247     if WorkingDir:
 248         os.chdir(WorkingDir)
 249     
 250     # Handle examples option...
 251     if "--examples" in Options and Options["--examples"]:
 252         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 253         sys.exit(0)
 254 
 255 def ValidateOptions():
 256     """Validate option values."""
 257     
 258     MiscUtil.ValidateOptionTextValue("--alignment", Options["--alignment"], "Open3A CrippenOpen3A RMSD BestRMSD")
 259     
 260     MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0})
 261     MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "OneToOne  FirstToAll")
 262     
 263     MiscUtil.ValidateOptionFilePath("-r, --reffile", Options["--reffile"])
 264     MiscUtil.ValidateOptionFileExt("-r, --reffile", Options["--reffile"], "sdf sd mol")
 265     
 266     MiscUtil.ValidateOptionFilePath("-p, --probefile", Options["--probefile"])
 267     MiscUtil.ValidateOptionFileExt("-p, --probefile", Options["--probefile"], "sdf sd mol")
 268         
 269     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sd sdf")
 270     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 271     MiscUtil.ValidateOptionsDistinctFileNames("-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"])
 272     MiscUtil.ValidateOptionsDistinctFileNames("-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"])
 273     
 274     MiscUtil.ValidateOptionsDistinctFileNames("-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"])
 275     MiscUtil.ValidateOptionsDistinctFileNames("-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"])
 276     
 277 # Setup a usage string for docopt...
 278 _docoptUsage_ = """
 279 RDKitAlignMolecules.py - Align molecules by RMSD or shape
 280 
 281 Usage:
 282     RDKitAlignMolecules.py [--alignment <Open3A, CrippenOpen3A, RMSD, BestRMSD>]
 283                            [--infileParams <Name,Value,...>] [--maxIters <number>]
 284                            [--mode <OneToOne, FirstToAll>] [ --outfileParams <Name,Value,...> ] 
 285                            [--overwrite] [-w <dir>] -r <reffile> -p <probefile> -o <outfile> 
 286     RDKitAlignMolecules.py -h | --help | -e | --examples
 287 
 288 Description:
 289     Perform alignment between a set of similar molecules in reference and probe
 290     input files. The molecules are aligned either by Root Mean Square Distance (RMSD)
 291     between molecules or overlying their shapes (Open3A or CrippenOpen3A).
 292     The RDKit function fails to calculate RMSD values for dissimilar molecules. Consequently,
 293     unaligned probe molecules are written to the output file for dissimilar molecule pairs.
 294 
 295     The supported input file formats are: Mol (.mol), SD (.sdf, .sd)
 296 
 297     The supported output file formats are:  SD (.sdf, .sd)
 298 
 299 Options:
 300     -a, --alignment <Open3A, CrippenOpen3A, RMSD, BestRMSD>  [default: Open3A]
 301         Alignment methodology to use for aligning molecules. Possible values: Open3A,
 302         CrippenOpen3A, RMSD, BestRMSD.
 303         
 304         The Open3A and CrippenOpen3A allow alignment of molecules using their shapes
 305         Open 3DAlign (Open3A) [ Ref 132 ] overlays molecules based on MMFF atom types
 306         and charges. Crippen Open 3DAlign (CrippenOpen3A) uses Crippen logP contributions
 307         to overlay molecules.
 308         
 309         During BestRMSMode mode, the RDKit 'function AllChem.GetBestRMS' is used to
 310         align and calculate RMSD. This function calculates optimal RMSD for aligning two
 311         molecules, taking symmetry into account. Otherwise, the RMSD value is calculated
 312         using 'AllChem.AlignMol function' without changing the atom order. A word to the
 313         wise from RDKit documentation: The AllChem.GetBestRMS function will attempt to
 314         align all permutations of matching atom orders in both molecules, for some molecules
 315         it will lead to 'combinatorial explosion'.
 316     --infileParams <Name,Value,...>  [default: auto]
 317         A comma delimited list of parameter name and value pairs for reading
 318         molecules from files. The supported parameter names for different file
 319         formats, along with their default values, are shown below:
 320             
 321             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 322             
 323     --maxIters <number>  [default: 50]
 324         Maximum number of iterations to perform for each molecule pair during minimization
 325         of RMSD values. This option is ignored during BestRMSD mode.
 326     -m, --mode <OneToOne, FirstToAll>  [default: OneToOne]
 327         Specify how molecules are handled in reference and probe input files during
 328         alignment of molecules between reference and probe molecules.  Possible values:
 329         OneToOne and  FirstToAll. For OneToOne mode, the alignment is performed
 330         for each pair of molecules in the reference and probe file, and the aligned probe
 331         molecule is written the output file. For FirstToAll mode, the alignment is only
 332         performed between the first reference molecule against all probe molecules.
 333     -e, --examples
 334         Print examples.
 335     -h, --help
 336         Print this help message.
 337     -o, --outfile <outfile>
 338         Output file name for writing out aligned probe molecules values. Supported
 339         file extensions: sdf or sd.
 340     --outfileParams <Name,Value,...>  [default: auto]
 341         A comma delimited list of parameter name and value pairs for writing
 342         molecules to files. The supported parameter names for different file
 343         formats, along with their default values, are shown below:
 344             
 345             SD: kekulize,yes,forceV3000,no
 346             
 347     -p, --probefile <probefile>
 348         Probe input file name.
 349     -r, --reffile <reffile>
 350         Reference input file name.
 351     --overwrite
 352         Overwrite existing files.
 353     -w, --workingdir <dir>
 354         Location of working directory which defaults to the current directory.
 355 
 356 Examples:
 357     To perform shape alignment using Open3A methodology between pairs of molecules in
 358     reference and probe input 3D SD files and write out a SD file containing aligned
 359     molecules, type:
 360 
 361         % RDKitAlignMolecules.py  -r Sample3DRef.sdf -p Sample3DProb.sdf
 362           -o SampleOut.sdf
 363 
 364     To perform alignment using RMSD methodology between pairs of molecules in
 365     reference and probe input 3D SD files and write out a SD file containing aligned
 366     molecules, type:
 367 
 368         % RDKitAlignMolecules.py  -a RMSD -r Sample3DRef.sdf -p Sample3DProb.sdf
 369           -o SampleOut.sdf
 370 
 371     To perform alignment using Open3A methodology  between first reference molecule
 372     against all probe molecules in 3D SD files without removing hydrogens , and write out
 373     a SD file containing aligned molecules, type:
 374 
 375         % RDKitAlignMolecules.py -m FirstToAll -a Open3A
 376           --infileParams "removeHydrogens,no" -r Sample3DRef.sdf
 377           -p Sample3DProb.sdf -o SampleOut.sdf
 378 
 379 Author:
 380     Manish Sud(msud@san.rr.com)
 381 
 382 See also:
 383     RDKitCalculateMolecularDescriptors.py, RDKitCompareMoleculeShapes.py, RDKitCalculateRMSD.py,
 384     RDKitConvertFileFormat.py, RDKitGenerateConformers.py, RDKitPerformMinimization.py
 385 
 386 Copyright:
 387     Copyright (C) 2025 Manish Sud. All rights reserved.
 388 
 389     The functionality available in this script is implemented using RDKit, an
 390     open source toolkit for cheminformatics developed by Greg Landrum.
 391 
 392     This file is part of MayaChemTools.
 393 
 394     MayaChemTools is free software; you can redistribute it and/or modify it under
 395     the terms of the GNU Lesser General Public License as published by the Free
 396     Software Foundation; either version 3 of the License, or (at your option) any
 397     later version.
 398 
 399 """
 400 
 401 if __name__ == "__main__":
 402     main()