1 #!/bin/env python 2 # 3 # File: RDKitAlignMolecules.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2026 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 import os 32 import sys 33 import time 34 import re 35 36 # RDKit imports... 37 try: 38 from rdkit import rdBase 39 from rdkit.Chem import AllChem, rdMolAlign 40 except ImportError as ErrMsg: 41 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 42 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 43 sys.exit(1) 44 45 # MayaChemTools imports... 46 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 47 try: 48 from docopt import docopt 49 import MiscUtil 50 import RDKitUtil 51 except ImportError as ErrMsg: 52 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 53 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 54 sys.exit(1) 55 56 ScriptName = os.path.basename(sys.argv[0]) 57 Options = {} 58 OptionsInfo = {} 59 60 61 def main(): 62 """Start execution of the script.""" 63 64 MiscUtil.PrintInfo( 65 "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" 66 % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()) 67 ) 68 69 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 70 71 # Retrieve command line arguments and options... 72 RetrieveOptions() 73 74 # Process and validate command line arguments and options... 75 ProcessOptions() 76 77 # Perform actions required by the script... 78 AlignMolecules() 79 80 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 81 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 82 83 84 def AlignMolecules(): 85 """Align molecules.""" 86 87 if not re.match("^(OneToOne|FirstToAll)$", OptionsInfo["Mode"], re.I): 88 MiscUtil.PrintError( 89 "Alignment couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"] 90 ) 91 92 RefFile = OptionsInfo["RefFile"] 93 ProbeFile = OptionsInfo["ProbeFile"] 94 95 # Read reference and probe molecules... 96 OptionsInfo["InfileParams"]["AllowEmptyMols"] = False 97 98 MiscUtil.PrintInfo("\nProcessing file %s..." % (RefFile)) 99 ValidRefMols, RefMolCount, ValidRefMolCount = RDKitUtil.ReadAndValidateMolecules( 100 RefFile, **OptionsInfo["InfileParams"] 101 ) 102 103 MiscUtil.PrintInfo("Processing file %s..." % (ProbeFile)) 104 ValidProbeMols, ProbeMolCount, ValidProbeMolCount = RDKitUtil.ReadAndValidateMolecules( 105 ProbeFile, **OptionsInfo["InfileParams"] 106 ) 107 108 # Set up a molecule writer... 109 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 110 if Writer is None: 111 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 112 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"]) 113 114 AlignmentFailedCount = 0 115 if re.match("^OneToOne$", OptionsInfo["Mode"], re.I): 116 AlignmentFailedCount = PerformOneToOneAlignment(ValidRefMols, ValidProbeMols, Writer) 117 elif re.match("^FirstToAll$", OptionsInfo["Mode"], re.I): 118 AlignmentFailedCount = PerformFirstToAllAlignment(ValidRefMols, ValidProbeMols, Writer) 119 else: 120 MiscUtil.PrintError( 121 "Alignment couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"] 122 ) 123 124 if Writer is not None: 125 Writer.close() 126 127 MiscUtil.PrintInfo("\nTotal number of molecules: Reference - %d; Probe - %d" % (RefMolCount, ProbeMolCount)) 128 MiscUtil.PrintInfo("Number of valid molecules: Reference - %d; Probe - %d" % (ValidRefMolCount, ValidProbeMolCount)) 129 MiscUtil.PrintInfo("Number of probe molecules failed during alignment: %d" % AlignmentFailedCount) 130 MiscUtil.PrintInfo( 131 "Number of ignored molecules: Reference - %d; Probe - %d" 132 % ((RefMolCount - ValidRefMolCount), (ProbeMolCount - ValidProbeMolCount + AlignmentFailedCount)) 133 ) 134 135 136 def PerformOneToOneAlignment(ValidRefMols, ValidProbeMols, Writer): 137 """Perform pairwise alignment.""" 138 139 ValidRefMolCount = len(ValidRefMols) 140 ValidProbeMolCount = len(ValidProbeMols) 141 142 MolCount = ValidRefMolCount 143 if ValidRefMolCount > ValidProbeMolCount: 144 MolCount = ValidProbeMolCount 145 146 if ValidRefMolCount != ValidProbeMolCount: 147 MiscUtil.PrintWarning( 148 "Number of valid reference molecules, %d, is not equal to number of valid probe molecules, %d .\n" 149 % (ValidRefMolCount, ValidProbeMolCount) 150 ) 151 MiscUtil.PrintWarning("Pairwise alignment will be performed only for first %s molecules.\n" % (MolCount)) 152 153 # Process molecules... 154 AlignmentFailedCount = 0 155 for MolIndex in range(0, MolCount): 156 RefMol = ValidRefMols[MolIndex] 157 ProbeMol = ValidProbeMols[MolIndex] 158 159 RefMolName = RDKitUtil.GetMolName(RefMol, (MolIndex + 1)) 160 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, (MolIndex + 1)) 161 162 Status = PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer) 163 if not Status: 164 AlignmentFailedCount += 1 165 166 return AlignmentFailedCount 167 168 169 def PerformFirstToAllAlignment(ValidRefMols, ValidProbeMols, Writer): 170 """Perform alignment between first reference molecues and all probe molecules.""" 171 172 # Process molecules... 173 RefMol = ValidRefMols[0] 174 RefMolCount = 1 175 RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount) 176 177 ProbeMolCount = 0 178 AlignmentFailedCount = 0 179 for ProbeMol in ValidProbeMols: 180 ProbeMolCount += 1 181 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount) 182 183 Status = PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer) 184 if not Status: 185 AlignmentFailedCount += 1 186 187 return AlignmentFailedCount 188 189 190 def PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer): 191 """Perform alignment and write to output file.""" 192 193 Status = True 194 try: 195 if OptionsInfo["UseRMSD"]: 196 rdMolAlign.AlignMol(ProbeMol, RefMol, maxIters=OptionsInfo["MaxIters"]) 197 elif OptionsInfo["UseBestRMSD"]: 198 AllChem.GetBestRMS(ProbeMol, RefMol) 199 elif OptionsInfo["UseOpen3A"]: 200 O3A = rdMolAlign.GetO3A(ProbeMol, RefMol) 201 O3A.Align() 202 elif OptionsInfo["UseCrippenOpen3A"]: 203 CrippenO3A = rdMolAlign.GetCrippenO3A(ProbeMol, RefMol) 204 CrippenO3A.Align() 205 else: 206 MiscUtil.PrintError( 207 "Alignment couldn't be performed: Specified alignment value, %s, is not supported" 208 % OptionsInfo["Alignment"] 209 ) 210 except (RuntimeError, ValueError): 211 Status = False 212 MiscUtil.PrintWarning( 213 "Alignment failed between reference molecule, %s, and probe molecule, %s.\nWriting unaligned probe molecule...\n" 214 % (RefMolName, ProbeMolName) 215 ) 216 217 # Write out aligned probe molecule... 218 Writer.write(ProbeMol) 219 220 return Status 221 222 223 def ProcessOptions(): 224 """Process and validate command line arguments and options.""" 225 226 MiscUtil.PrintInfo("Processing options...") 227 228 # Validate options... 229 ValidateOptions() 230 231 OptionsInfo["Alignment"] = Options["--alignment"] 232 233 OptionsInfo["UseRMSD"] = False 234 OptionsInfo["UseBestRMSD"] = False 235 236 OptionsInfo["UseOpen3A"] = False 237 OptionsInfo["UseCrippenOpen3A"] = False 238 239 # Process alignment 240 if re.match("^RMSD$", OptionsInfo["Alignment"], re.I): 241 OptionsInfo["UseRMSD"] = True 242 elif re.match("^BestRMSD$", OptionsInfo["Alignment"], re.I): 243 OptionsInfo["UseBestRMSD"] = True 244 elif re.match("^Open3A$", OptionsInfo["Alignment"], re.I): 245 OptionsInfo["UseOpen3A"] = True 246 elif re.match("^CrippenOpen3A$", OptionsInfo["Alignment"], re.I): 247 OptionsInfo["UseCrippenOpen3A"] = True 248 249 OptionsInfo["MaxIters"] = int(Options["--maxIters"]) 250 251 OptionsInfo["Mode"] = Options["--mode"] 252 253 OptionsInfo["RefFile"] = Options["--reffile"] 254 OptionsInfo["ProbeFile"] = Options["--probefile"] 255 256 # No need for any RDKit specific --outfileParams.... 257 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"]) 258 259 OptionsInfo["Outfile"] = Options["--outfile"] 260 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters( 261 "--outfileParams", Options["--outfileParams"] 262 ) 263 264 OptionsInfo["Overwrite"] = Options["--overwrite"] 265 266 267 def RetrieveOptions(): 268 """Retrieve command line arguments and options.""" 269 270 # Get options... 271 global Options 272 Options = docopt(_docoptUsage_) 273 274 # Set current working directory to the specified directory... 275 WorkingDir = Options["--workingdir"] 276 if WorkingDir: 277 os.chdir(WorkingDir) 278 279 # Handle examples option... 280 if "--examples" in Options and Options["--examples"]: 281 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 282 sys.exit(0) 283 284 285 def ValidateOptions(): 286 """Validate option values.""" 287 288 MiscUtil.ValidateOptionTextValue("--alignment", Options["--alignment"], "Open3A CrippenOpen3A RMSD BestRMSD") 289 290 MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0}) 291 MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "OneToOne FirstToAll") 292 293 MiscUtil.ValidateOptionFilePath("-r, --reffile", Options["--reffile"]) 294 MiscUtil.ValidateOptionFileExt("-r, --reffile", Options["--reffile"], "sdf sd mol") 295 296 MiscUtil.ValidateOptionFilePath("-p, --probefile", Options["--probefile"]) 297 MiscUtil.ValidateOptionFileExt("-p, --probefile", Options["--probefile"], "sdf sd mol") 298 299 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sd sdf") 300 MiscUtil.ValidateOptionsOutputFileOverwrite( 301 "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"] 302 ) 303 MiscUtil.ValidateOptionsDistinctFileNames( 304 "-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"] 305 ) 306 MiscUtil.ValidateOptionsDistinctFileNames( 307 "-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"] 308 ) 309 310 MiscUtil.ValidateOptionsDistinctFileNames( 311 "-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"] 312 ) 313 MiscUtil.ValidateOptionsDistinctFileNames( 314 "-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"] 315 ) 316 317 318 # Setup a usage string for docopt... 319 _docoptUsage_ = """ 320 RDKitAlignMolecules.py - Align molecules by RMSD or shape 321 322 Usage: 323 RDKitAlignMolecules.py [--alignment <Open3A, CrippenOpen3A, RMSD, BestRMSD>] 324 [--infileParams <Name,Value,...>] [--maxIters <number>] 325 [--mode <OneToOne, FirstToAll>] [ --outfileParams <Name,Value,...> ] 326 [--overwrite] [-w <dir>] -r <reffile> -p <probefile> -o <outfile> 327 RDKitAlignMolecules.py -h | --help | -e | --examples 328 329 Description: 330 Perform alignment between a set of similar molecules in reference and probe 331 input files. The molecules are aligned either by Root Mean Square Distance (RMSD) 332 between molecules or overlying their shapes (Open3A or CrippenOpen3A). 333 The RDKit function fails to calculate RMSD values for dissimilar molecules. Consequently, 334 unaligned probe molecules are written to the output file for dissimilar molecule pairs. 335 336 The supported input file formats are: Mol (.mol), SD (.sdf, .sd) 337 338 The supported output file formats are: SD (.sdf, .sd) 339 340 Options: 341 -a, --alignment <Open3A, CrippenOpen3A, RMSD, BestRMSD> [default: Open3A] 342 Alignment methodology to use for aligning molecules. Possible values: Open3A, 343 CrippenOpen3A, RMSD, BestRMSD. 344 345 The Open3A and CrippenOpen3A allow alignment of molecules using their shapes 346 Open 3DAlign (Open3A) [ Ref 132 ] overlays molecules based on MMFF atom types 347 and charges. Crippen Open 3DAlign (CrippenOpen3A) uses Crippen logP contributions 348 to overlay molecules. 349 350 During BestRMSMode mode, the RDKit 'function AllChem.GetBestRMS' is used to 351 align and calculate RMSD. This function calculates optimal RMSD for aligning two 352 molecules, taking symmetry into account. Otherwise, the RMSD value is calculated 353 using 'AllChem.AlignMol function' without changing the atom order. A word to the 354 wise from RDKit documentation: The AllChem.GetBestRMS function will attempt to 355 align all permutations of matching atom orders in both molecules, for some molecules 356 it will lead to 'combinatorial explosion'. 357 --infileParams <Name,Value,...> [default: auto] 358 A comma delimited list of parameter name and value pairs for reading 359 molecules from files. The supported parameter names for different file 360 formats, along with their default values, are shown below: 361 362 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 363 364 --maxIters <number> [default: 50] 365 Maximum number of iterations to perform for each molecule pair during minimization 366 of RMSD values. This option is ignored during BestRMSD mode. 367 -m, --mode <OneToOne, FirstToAll> [default: OneToOne] 368 Specify how molecules are handled in reference and probe input files during 369 alignment of molecules between reference and probe molecules. Possible values: 370 OneToOne and FirstToAll. For OneToOne mode, the alignment is performed 371 for each pair of molecules in the reference and probe file, and the aligned probe 372 molecule is written the output file. For FirstToAll mode, the alignment is only 373 performed between the first reference molecule against all probe molecules. 374 -e, --examples 375 Print examples. 376 -h, --help 377 Print this help message. 378 -o, --outfile <outfile> 379 Output file name for writing out aligned probe molecules values. Supported 380 file extensions: sdf or sd. 381 --outfileParams <Name,Value,...> [default: auto] 382 A comma delimited list of parameter name and value pairs for writing 383 molecules to files. The supported parameter names for different file 384 formats, along with their default values, are shown below: 385 386 SD: kekulize,yes,forceV3000,no 387 388 -p, --probefile <probefile> 389 Probe input file name. 390 -r, --reffile <reffile> 391 Reference input file name. 392 --overwrite 393 Overwrite existing files. 394 -w, --workingdir <dir> 395 Location of working directory which defaults to the current directory. 396 397 Examples: 398 To perform shape alignment using Open3A methodology between pairs of molecules in 399 reference and probe input 3D SD files and write out a SD file containing aligned 400 molecules, type: 401 402 % RDKitAlignMolecules.py -r Sample3DRef.sdf -p Sample3DProb.sdf 403 -o SampleOut.sdf 404 405 To perform alignment using RMSD methodology between pairs of molecules in 406 reference and probe input 3D SD files and write out a SD file containing aligned 407 molecules, type: 408 409 % RDKitAlignMolecules.py -a RMSD -r Sample3DRef.sdf -p Sample3DProb.sdf 410 -o SampleOut.sdf 411 412 To perform alignment using Open3A methodology between first reference molecule 413 against all probe molecules in 3D SD files without removing hydrogens , and write out 414 a SD file containing aligned molecules, type: 415 416 % RDKitAlignMolecules.py -m FirstToAll -a Open3A 417 --infileParams "removeHydrogens,no" -r Sample3DRef.sdf 418 -p Sample3DProb.sdf -o SampleOut.sdf 419 420 Author: 421 Manish Sud(msud@san.rr.com) 422 423 See also: 424 RDKitCalculateMolecularDescriptors.py, RDKitCompareMoleculeShapes.py, RDKitCalculateRMSD.py, 425 RDKitConvertFileFormat.py, RDKitGenerateConformers.py, RDKitPerformMinimization.py 426 427 Copyright: 428 Copyright (C) 2026 Manish Sud. All rights reserved. 429 430 The functionality available in this script is implemented using RDKit, an 431 open source toolkit for cheminformatics developed by Greg Landrum. 432 433 This file is part of MayaChemTools. 434 435 MayaChemTools is free software; you can redistribute it and/or modify it under 436 the terms of the GNU Lesser General Public License as published by the Free 437 Software Foundation; either version 3 of the License, or (at your option) any 438 later version. 439 440 """ 441 442 if __name__ == "__main__": 443 main()