1 #!/bin/env python 2 # 3 # File: RDKitAlignMolecules.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2025 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 37 # RDKit imports... 38 try: 39 from rdkit import rdBase 40 from rdkit import Chem 41 from rdkit.Chem import AllChem, rdMolAlign 42 except ImportError as ErrMsg: 43 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 44 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 45 sys.exit(1) 46 47 # MayaChemTools imports... 48 try: 49 from docopt import docopt 50 import MiscUtil 51 import RDKitUtil 52 except ImportError as ErrMsg: 53 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 54 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 55 sys.exit(1) 56 57 ScriptName = os.path.basename(sys.argv[0]) 58 Options = {} 59 OptionsInfo = {} 60 61 def main(): 62 """Start execution of the script.""" 63 64 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 65 66 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 67 68 # Retrieve command line arguments and options... 69 RetrieveOptions() 70 71 # Process and validate command line arguments and options... 72 ProcessOptions() 73 74 # Perform actions required by the script... 75 AlignMolecules() 76 77 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 78 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 79 80 def AlignMolecules(): 81 """Align molecules.""" 82 83 if not re.match("^(OneToOne|FirstToAll)$", OptionsInfo["Mode"], re.I): 84 MiscUtil.PrintError("Alignment couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"]) 85 86 RefFile = OptionsInfo["RefFile"] 87 ProbeFile = OptionsInfo["ProbeFile"] 88 89 Outfile = OptionsInfo["Outfile"] 90 91 # Read reference and probe molecules... 92 OptionsInfo["InfileParams"]["AllowEmptyMols"] = False 93 94 MiscUtil.PrintInfo("\nProcessing file %s..." % (RefFile)) 95 ValidRefMols, RefMolCount, ValidRefMolCount = RDKitUtil.ReadAndValidateMolecules(RefFile, **OptionsInfo["InfileParams"]) 96 97 MiscUtil.PrintInfo("Processing file %s..." % (ProbeFile)) 98 ValidProbeMols, ProbeMolCount, ValidProbeMolCount = RDKitUtil.ReadAndValidateMolecules(ProbeFile, **OptionsInfo["InfileParams"]) 99 100 # Set up a molecule writer... 101 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 102 if Writer is None: 103 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 104 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"]) 105 106 AlignmentFailedCount = 0 107 if re.match("^OneToOne$", OptionsInfo["Mode"], re.I): 108 AlignmentFailedCount = PerformOneToOneAlignment(ValidRefMols, ValidProbeMols, Writer) 109 elif re.match("^FirstToAll$", OptionsInfo["Mode"], re.I): 110 AlignmentFailedCount = PerformFirstToAllAlignment(ValidRefMols, ValidProbeMols, Writer) 111 else: 112 MiscUtil.PrintError("Alignment couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"]) 113 114 if Writer is not None: 115 Writer.close() 116 117 MiscUtil.PrintInfo("\nTotal number of molecules: Reference - %d; Probe - %d" % (RefMolCount, ProbeMolCount)) 118 MiscUtil.PrintInfo("Number of valid molecules: Reference - %d; Probe - %d" % (ValidRefMolCount, ValidProbeMolCount)) 119 MiscUtil.PrintInfo("Number of probe molecules failed during alignment: %d" % AlignmentFailedCount) 120 MiscUtil.PrintInfo("Number of ignored molecules: Reference - %d; Probe - %d" % ((RefMolCount - ValidRefMolCount), (ProbeMolCount - ValidProbeMolCount + AlignmentFailedCount))) 121 122 def PerformOneToOneAlignment(ValidRefMols, ValidProbeMols, Writer): 123 """Perform pairwise alignment.""" 124 125 ValidRefMolCount = len(ValidRefMols) 126 ValidProbeMolCount = len(ValidProbeMols) 127 128 MolCount = ValidRefMolCount 129 if ValidRefMolCount > ValidProbeMolCount: 130 MolCount = ValidProbeMolCount 131 132 if ValidRefMolCount != ValidProbeMolCount: 133 MiscUtil.PrintWarning("Number of valid reference molecules, %d, is not equal to number of valid probe molecules, %d .\n" % (ValidRefMolCount, ValidProbeMolCount)) 134 MiscUtil.PrintWarning("Pairwise alignment will be performed only for first %s molecules.\n" % (MolCount)) 135 136 # Process molecules... 137 AlignmentFailedCount = 0 138 for MolIndex in range(0, MolCount): 139 RefMol = ValidRefMols[MolIndex] 140 ProbeMol = ValidProbeMols[MolIndex] 141 142 RefMolName = RDKitUtil.GetMolName(RefMol, (MolIndex + 1)) 143 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, (MolIndex + 1)) 144 145 Status = PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer) 146 if not Status: 147 AlignmentFailedCount += 1 148 149 return AlignmentFailedCount 150 151 def PerformFirstToAllAlignment(ValidRefMols, ValidProbeMols, Writer): 152 """Perform alignment between first reference molecues and all probe molecules.""" 153 154 # Process molecules... 155 RefMol = ValidRefMols[0] 156 RefMolCount = 1 157 RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount) 158 159 ProbeMolCount = 0 160 AlignmentFailedCount = 0 161 for ProbeMol in ValidProbeMols: 162 ProbeMolCount += 1 163 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount) 164 165 Status = PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer) 166 if not Status: 167 AlignmentFailedCount += 1 168 169 return AlignmentFailedCount 170 171 def PerformAlignmentAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, Writer): 172 """Perform alignment and write to output file.""" 173 174 Status = True 175 try: 176 if OptionsInfo["UseRMSD"]: 177 RMSD = rdMolAlign.AlignMol(ProbeMol, RefMol, maxIters = OptionsInfo["MaxIters"]) 178 elif OptionsInfo["UseBestRMSD"]: 179 RMSD = AllChem.GetBestRMS(ProbeMol, RefMol) 180 elif OptionsInfo["UseOpen3A"]: 181 O3A = rdMolAlign.GetO3A(ProbeMol, RefMol) 182 Score = O3A.Align() 183 elif OptionsInfo["UseCrippenOpen3A"]: 184 CrippenO3A = rdMolAlign.GetCrippenO3A(ProbeMol, RefMol) 185 Score = CrippenO3A.Align() 186 else: 187 MiscUtil.PrintError("Alignment couldn't be performed: Specified alignment value, %s, is not supported" % OptionsInfo["Alignment"]) 188 except (RuntimeError, ValueError): 189 Status = False 190 MiscUtil.PrintWarning("Alignment failed between reference molecule, %s, and probe molecule, %s.\nWriting unaligned probe molecule...\n" % (RefMolName, ProbeMolName)) 191 192 # Write out aligned probe molecule... 193 Writer.write(ProbeMol) 194 195 return Status 196 197 def ProcessOptions(): 198 """Process and validate command line arguments and options.""" 199 200 MiscUtil.PrintInfo("Processing options...") 201 202 # Validate options... 203 ValidateOptions() 204 205 OptionsInfo["Alignment"] = Options["--alignment"] 206 207 OptionsInfo["UseRMSD"] = False 208 OptionsInfo["UseBestRMSD"] = False 209 210 OptionsInfo["UseOpen3A"] = False 211 OptionsInfo["UseCrippenOpen3A"] = False 212 213 # Process alignment 214 if re.match("^RMSD$", OptionsInfo["Alignment"], re.I): 215 OptionsInfo["UseRMSD"] = True 216 elif re.match("^BestRMSD$", OptionsInfo["Alignment"], re.I): 217 OptionsInfo["UseBestRMSD"] = True 218 elif re.match("^Open3A$", OptionsInfo["Alignment"], re.I): 219 OptionsInfo["UseOpen3A"] = True 220 elif re.match("^CrippenOpen3A$", OptionsInfo["Alignment"], re.I): 221 OptionsInfo["UseCrippenOpen3A"] = True 222 223 OptionsInfo["MaxIters"] = int(Options["--maxIters"]) 224 225 OptionsInfo["Mode"] = Options["--mode"] 226 227 OptionsInfo["RefFile"] = Options["--reffile"] 228 OptionsInfo["ProbeFile"] = Options["--probefile"] 229 230 # No need for any RDKit specific --outfileParams.... 231 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"]) 232 233 OptionsInfo["Outfile"] = Options["--outfile"] 234 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"]) 235 236 OptionsInfo["Overwrite"] = Options["--overwrite"] 237 238 def RetrieveOptions(): 239 """Retrieve command line arguments and options.""" 240 241 # Get options... 242 global Options 243 Options = docopt(_docoptUsage_) 244 245 # Set current working directory to the specified directory... 246 WorkingDir = Options["--workingdir"] 247 if WorkingDir: 248 os.chdir(WorkingDir) 249 250 # Handle examples option... 251 if "--examples" in Options and Options["--examples"]: 252 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 253 sys.exit(0) 254 255 def ValidateOptions(): 256 """Validate option values.""" 257 258 MiscUtil.ValidateOptionTextValue("--alignment", Options["--alignment"], "Open3A CrippenOpen3A RMSD BestRMSD") 259 260 MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0}) 261 MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "OneToOne FirstToAll") 262 263 MiscUtil.ValidateOptionFilePath("-r, --reffile", Options["--reffile"]) 264 MiscUtil.ValidateOptionFileExt("-r, --reffile", Options["--reffile"], "sdf sd mol") 265 266 MiscUtil.ValidateOptionFilePath("-p, --probefile", Options["--probefile"]) 267 MiscUtil.ValidateOptionFileExt("-p, --probefile", Options["--probefile"], "sdf sd mol") 268 269 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sd sdf") 270 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 271 MiscUtil.ValidateOptionsDistinctFileNames("-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"]) 272 MiscUtil.ValidateOptionsDistinctFileNames("-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"]) 273 274 MiscUtil.ValidateOptionsDistinctFileNames("-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"]) 275 MiscUtil.ValidateOptionsDistinctFileNames("-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"]) 276 277 # Setup a usage string for docopt... 278 _docoptUsage_ = """ 279 RDKitAlignMolecules.py - Align molecules by RMSD or shape 280 281 Usage: 282 RDKitAlignMolecules.py [--alignment <Open3A, CrippenOpen3A, RMSD, BestRMSD>] 283 [--infileParams <Name,Value,...>] [--maxIters <number>] 284 [--mode <OneToOne, FirstToAll>] [ --outfileParams <Name,Value,...> ] 285 [--overwrite] [-w <dir>] -r <reffile> -p <probefile> -o <outfile> 286 RDKitAlignMolecules.py -h | --help | -e | --examples 287 288 Description: 289 Perform alignment between a set of similar molecules in reference and probe 290 input files. The molecules are aligned either by Root Mean Square Distance (RMSD) 291 between molecules or overlying their shapes (Open3A or CrippenOpen3A). 292 The RDKit function fails to calculate RMSD values for dissimilar molecules. Consequently, 293 unaligned probe molecules are written to the output file for dissimilar molecule pairs. 294 295 The supported input file formats are: Mol (.mol), SD (.sdf, .sd) 296 297 The supported output file formats are: SD (.sdf, .sd) 298 299 Options: 300 -a, --alignment <Open3A, CrippenOpen3A, RMSD, BestRMSD> [default: Open3A] 301 Alignment methodology to use for aligning molecules. Possible values: Open3A, 302 CrippenOpen3A, RMSD, BestRMSD. 303 304 The Open3A and CrippenOpen3A allow alignment of molecules using their shapes 305 Open 3DAlign (Open3A) [ Ref 132 ] overlays molecules based on MMFF atom types 306 and charges. Crippen Open 3DAlign (CrippenOpen3A) uses Crippen logP contributions 307 to overlay molecules. 308 309 During BestRMSMode mode, the RDKit 'function AllChem.GetBestRMS' is used to 310 align and calculate RMSD. This function calculates optimal RMSD for aligning two 311 molecules, taking symmetry into account. Otherwise, the RMSD value is calculated 312 using 'AllChem.AlignMol function' without changing the atom order. A word to the 313 wise from RDKit documentation: The AllChem.GetBestRMS function will attempt to 314 align all permutations of matching atom orders in both molecules, for some molecules 315 it will lead to 'combinatorial explosion'. 316 --infileParams <Name,Value,...> [default: auto] 317 A comma delimited list of parameter name and value pairs for reading 318 molecules from files. The supported parameter names for different file 319 formats, along with their default values, are shown below: 320 321 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 322 323 --maxIters <number> [default: 50] 324 Maximum number of iterations to perform for each molecule pair during minimization 325 of RMSD values. This option is ignored during BestRMSD mode. 326 -m, --mode <OneToOne, FirstToAll> [default: OneToOne] 327 Specify how molecules are handled in reference and probe input files during 328 alignment of molecules between reference and probe molecules. Possible values: 329 OneToOne and FirstToAll. For OneToOne mode, the alignment is performed 330 for each pair of molecules in the reference and probe file, and the aligned probe 331 molecule is written the output file. For FirstToAll mode, the alignment is only 332 performed between the first reference molecule against all probe molecules. 333 -e, --examples 334 Print examples. 335 -h, --help 336 Print this help message. 337 -o, --outfile <outfile> 338 Output file name for writing out aligned probe molecules values. Supported 339 file extensions: sdf or sd. 340 --outfileParams <Name,Value,...> [default: auto] 341 A comma delimited list of parameter name and value pairs for writing 342 molecules to files. The supported parameter names for different file 343 formats, along with their default values, are shown below: 344 345 SD: kekulize,yes,forceV3000,no 346 347 -p, --probefile <probefile> 348 Probe input file name. 349 -r, --reffile <reffile> 350 Reference input file name. 351 --overwrite 352 Overwrite existing files. 353 -w, --workingdir <dir> 354 Location of working directory which defaults to the current directory. 355 356 Examples: 357 To perform shape alignment using Open3A methodology between pairs of molecules in 358 reference and probe input 3D SD files and write out a SD file containing aligned 359 molecules, type: 360 361 % RDKitAlignMolecules.py -r Sample3DRef.sdf -p Sample3DProb.sdf 362 -o SampleOut.sdf 363 364 To perform alignment using RMSD methodology between pairs of molecules in 365 reference and probe input 3D SD files and write out a SD file containing aligned 366 molecules, type: 367 368 % RDKitAlignMolecules.py -a RMSD -r Sample3DRef.sdf -p Sample3DProb.sdf 369 -o SampleOut.sdf 370 371 To perform alignment using Open3A methodology between first reference molecule 372 against all probe molecules in 3D SD files without removing hydrogens , and write out 373 a SD file containing aligned molecules, type: 374 375 % RDKitAlignMolecules.py -m FirstToAll -a Open3A 376 --infileParams "removeHydrogens,no" -r Sample3DRef.sdf 377 -p Sample3DProb.sdf -o SampleOut.sdf 378 379 Author: 380 Manish Sud(msud@san.rr.com) 381 382 See also: 383 RDKitCalculateMolecularDescriptors.py, RDKitCompareMoleculeShapes.py, RDKitCalculateRMSD.py, 384 RDKitConvertFileFormat.py, RDKitGenerateConformers.py, RDKitPerformMinimization.py 385 386 Copyright: 387 Copyright (C) 2025 Manish Sud. All rights reserved. 388 389 The functionality available in this script is implemented using RDKit, an 390 open source toolkit for cheminformatics developed by Greg Landrum. 391 392 This file is part of MayaChemTools. 393 394 MayaChemTools is free software; you can redistribute it and/or modify it under 395 the terms of the GNU Lesser General Public License as published by the Free 396 Software Foundation; either version 3 of the License, or (at your option) any 397 later version. 398 399 """ 400 401 if __name__ == "__main__": 402 main()