1 #!/bin/env python 2 # 3 # File: RDKitCalculateRMSD.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 37 # RDKit imports... 38 try: 39 from rdkit import rdBase 40 from rdkit import Chem 41 from rdkit.Chem import AllChem, rdMolAlign 42 except ImportError as ErrMsg: 43 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 44 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 45 sys.exit(1) 46 47 # MayaChemTools imports... 48 try: 49 from docopt import docopt 50 import MiscUtil 51 import RDKitUtil 52 except ImportError as ErrMsg: 53 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 54 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 55 sys.exit(1) 56 57 ScriptName = os.path.basename(sys.argv[0]) 58 Options = {} 59 OptionsInfo = {} 60 61 def main(): 62 """Start execution of the script.""" 63 64 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 65 66 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 67 68 # Retrieve command line arguments and options... 69 RetrieveOptions() 70 71 # Process and validate command line arguments and options... 72 ProcessOptions() 73 74 # Perform actions required by the script... 75 CalculateRMSD() 76 77 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 78 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 79 80 def CalculateRMSD(): 81 """Calculate RMSD values.""" 82 83 if not re.match("^(OneToOne|AllToAll|FirstToAll)$", OptionsInfo["Mode"], re.I): 84 MiscUtil.PrintError("RMSD couldn't be calculated: Specified mode, %s, is not supported" % OptionsInfo["Mode"]) 85 86 RefFile = OptionsInfo["RefFile"] 87 ProbeFile = OptionsInfo["ProbeFile"] 88 89 Outfile = OptionsInfo["Outfile"] 90 OutDelim = OptionsInfo["OutDelim"] 91 92 # Read reference and probe molecules... 93 OptionsInfo["InfileParams"]["AllowEmptyMols"] = False 94 95 MiscUtil.PrintInfo("\nProcessing file %s..." % (RefFile)) 96 ValidRefMols, RefMolCount, ValidRefMolCount = RDKitUtil.ReadAndValidateMolecules(RefFile, **OptionsInfo["InfileParams"]) 97 98 MiscUtil.PrintInfo("Processing file %s..." % (ProbeFile)) 99 ValidProbeMols, ProbeMolCount, ValidProbeMolCount = RDKitUtil.ReadAndValidateMolecules(ProbeFile, **OptionsInfo["InfileParams"]) 100 101 # Set up output file... 102 MiscUtil.PrintInfo("\nGenerating file %s...\n" % Outfile) 103 OutFH = open(Outfile, "w") 104 if OutFH is None: 105 MiscUtil.PrintError("Couldn't open output file: %s.\n" % (Outfile)) 106 107 Line = "RefMolID%sProbeMolID%sRMSD\n" % (OutDelim, OutDelim) 108 OutFH.write(Line) 109 110 if re.match("^OneToOne$", OptionsInfo["Mode"], re.I): 111 CalculateOneToOneRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim) 112 elif re.match("^AllToAll$", OptionsInfo["Mode"], re.I): 113 CalculateAllToAllRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim) 114 elif re.match("^FirstToAll$", OptionsInfo["Mode"], re.I): 115 CalculateFirstToAllRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim) 116 else: 117 MiscUtil.PrintError("RMSD couldn't be calculated: Specified mode, %s, is not supported" % OptionsInfo["Mode"]) 118 119 OutFH.close() 120 121 MiscUtil.PrintInfo("\nTotal number of molecules: Reference - %d; Probe - %d" % (RefMolCount, ProbeMolCount)) 122 MiscUtil.PrintInfo("Number of valid molecules: Reference - %d; Probe - %d" % (ValidRefMolCount, ValidProbeMolCount)) 123 MiscUtil.PrintInfo("Number of ignored molecules: Reference - %d; Probe - %d" % ((RefMolCount - ValidRefMolCount), (ProbeMolCount - ValidProbeMolCount))) 124 125 def CalculateOneToOneRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim): 126 """Calculate pairwise RMSD values.""" 127 128 ValidRefMolCount = len(ValidRefMols) 129 ValidProbeMolCount = len(ValidProbeMols) 130 131 MolCount = ValidRefMolCount 132 if ValidRefMolCount > ValidProbeMolCount: 133 MolCount = ValidProbeMolCount 134 135 if ValidRefMolCount != ValidProbeMolCount: 136 MiscUtil.PrintWarning("Number of valid reference molecules, %d, is not equal to number of valid probe molecules, %d .\n" % (ValidRefMolCount, ValidProbeMolCount)) 137 MiscUtil.PrintWarning("Pairwise RMSD will be calculated only for first %s molecules.\n" % (MolCount)) 138 139 # Process molecules... 140 for MolIndex in range(0, MolCount): 141 RefMol = ValidRefMols[MolIndex] 142 ProbeMol = ValidProbeMols[MolIndex] 143 144 RefMolName = RDKitUtil.GetMolName(RefMol, (MolIndex + 1)) 145 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, (MolIndex + 1)) 146 147 RMSD = CalculateRMSDValue(RefMol, ProbeMol) 148 149 Line = "%s%s%s%s%s\n" % (RefMolName, OutDelim, ProbeMolName, OutDelim, RMSD) 150 OutFH.write(Line) 151 152 def CalculateAllToAllRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim): 153 """Calculate RMSD values between all pairs of molecules.""" 154 155 # Process molecules... 156 RefMolCount = 0 157 for RefMol in ValidRefMols: 158 RefMolCount += 1 159 RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount) 160 161 ProbeMolCount = 0 162 for ProbeMol in ValidProbeMols: 163 ProbeMolCount += 1 164 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount) 165 166 RMSD = CalculateRMSDValue(RefMol, ProbeMol) 167 168 Line = "%s%s%s%s%s\n" % (RefMolName, OutDelim, ProbeMolName, OutDelim, RMSD) 169 OutFH.write(Line) 170 171 def CalculateFirstToAllRMSDValues(ValidRefMols, ValidProbeMols, OutFH, OutDelim): 172 """Calculate RMSD values between first reference molecues and all probe molecules.""" 173 174 # Process molecules... 175 RefMol = ValidRefMols[0] 176 RefMolCount = 1 177 RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount) 178 179 ProbeMolCount = 0 180 for ProbeMol in ValidProbeMols: 181 ProbeMolCount += 1 182 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount) 183 184 RMSD = CalculateRMSDValue(RefMol, ProbeMol) 185 186 Line = "%s%s%s%s%s\n" % (RefMolName, OutDelim, ProbeMolName, OutDelim, RMSD) 187 OutFH.write(Line) 188 189 def CalculateRMSDValue(RefMol, ProbeMol): 190 """Calculate RMSD value for a pair of molecules and return it as a string.""" 191 192 try: 193 if OptionsInfo["UseBestRMSD"]: 194 RMSD = AllChem.GetBestRMS(ProbeMol, RefMol) 195 else: 196 RMSD = rdMolAlign.AlignMol(ProbeMol, RefMol, maxIters = OptionsInfo["MaxIters"]) 197 RMSD = "%.2f" % RMSD 198 except (RuntimeError, ValueError): 199 RMSD = "None" 200 201 return RMSD 202 203 def ProcessOptions(): 204 """Process and validate command line arguments and options.""" 205 206 MiscUtil.PrintInfo("Processing options...") 207 208 # Validate options... 209 ValidateOptions() 210 211 OptionsInfo["CalcRMSD"] = Options["--calcRMSD"] 212 OptionsInfo["UseBestRMSD"] = False 213 if re.match("^BestRMSD$", OptionsInfo["CalcRMSD"], re.I): 214 OptionsInfo["UseBestRMSD"] = True 215 216 OptionsInfo["MaxIters"] = int(Options["--maxIters"]) 217 218 OptionsInfo["Mode"] = Options["--mode"] 219 220 OptionsInfo["RefFile"] = Options["--reffile"] 221 OptionsInfo["ProbeFile"] = Options["--probefile"] 222 223 # No need for any RDKit specific --outfileParams.... 224 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"]) 225 226 OptionsInfo["Outfile"] = Options["--outfile"] 227 228 OptionsInfo["Overwrite"] = Options["--overwrite"] 229 230 OptionsInfo["OutDelim"] = " " 231 if MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "csv"): 232 OptionsInfo["OutDelim"] = "," 233 elif MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "tsv txt"): 234 OptionsInfo["OutDelim"] = "\t" 235 else: 236 MiscUtil.PrintError("The file name specified , %s, for option \"--outfile\" is not valid. Supported file formats: csv tsv txt\n" % (OptionsInfo["Outfile"])) 237 238 def RetrieveOptions(): 239 """Retrieve command line arguments and options.""" 240 241 # Get options... 242 global Options 243 Options = docopt(_docoptUsage_) 244 245 # Set current working directory to the specified directory... 246 WorkingDir = Options["--workingdir"] 247 if WorkingDir: 248 os.chdir(WorkingDir) 249 250 # Handle examples option... 251 if "--examples" in Options and Options["--examples"]: 252 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 253 sys.exit(0) 254 255 def ValidateOptions(): 256 """Validate option values.""" 257 258 MiscUtil.ValidateOptionTextValue("--calcRMSD", Options["--calcRMSD"], "RMSD BestRMSD") 259 260 MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0}) 261 MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "OneToOne AllToAll FirstToAll") 262 263 MiscUtil.ValidateOptionFilePath("-r, --reffile", Options["--reffile"]) 264 MiscUtil.ValidateOptionFileExt("-r, --reffile", Options["--reffile"], "sdf sd mol") 265 266 MiscUtil.ValidateOptionFilePath("-p, --probefile", Options["--probefile"]) 267 MiscUtil.ValidateOptionFileExt("-p, --probefile", Options["--probefile"], "sdf sd mol") 268 269 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "csv tsv txt") 270 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 271 MiscUtil.ValidateOptionsDistinctFileNames("-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"]) 272 MiscUtil.ValidateOptionsDistinctFileNames("-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"]) 273 274 # Setup a usage string for docopt... 275 _docoptUsage_ = """ 276 RDKitCalculateRMSD.py - Calculate RMSD between molecules 277 278 Usage: 279 RDKitCalculateRMSD.py [--calcRMSD <RMSD, BestRMSD>] [--infileParams <Name,Value,...>] 280 [--maxIters <number>] [--mode <OneToOne, AllToAll, FirstToAll>] 281 [--overwrite] [-w <dir>] -r <reffile> -p <probefile> -o <outfile> 282 RDKitCalculateRMSD.py -h | --help | -e | --examples 283 284 Description: 285 Calculate Root Mean Square Distance (RMSD) between a set of similar molecules in 286 reference and probe input files. The RDKit function fails to calculate RMSD values for 287 dissimilar molecules. Consequently, a text string 'None' is written out as a RMSD value 288 for dissimilar molecule pairs. 289 290 The supported input file formats are: Mol (.mol), SD (.sdf, .sd) 291 292 The supported output file formats are: CSV (.csv), TSV (.tsv, .txt) 293 294 Options: 295 -c, --calcRMSD <RMSD, BestRMSD> [default: RMSD] 296 Methodology for calculating RMSD values. Possible values: RMSD, BestRMSD. 297 During BestRMSMode mode, the RDKit 'function AllChem.GetBestRMS' is used to 298 align and calculate RMSD. This function calculates optimal RMSD for aligning two 299 molecules, taking symmetry into account. Otherwise, the RMSD value is calculated 300 using 'AllChem.AlignMol function' without changing the atom order. A word to the 301 wise from RDKit documentation: The AllChem.GetBestRMS function will attempt to 302 align all permutations of matching atom orders in both molecules, for some molecules 303 it will lead to 'combinatorial explosion'. 304 --infileParams <Name,Value,...> [default: auto] 305 A comma delimited list of parameter name and value pairs for reading 306 molecules from files. The supported parameter names for different file 307 formats, along with their default values, are shown below: 308 309 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 310 311 --maxIters <number> [default: 50] 312 Maximum number of iterations to perform for each molecule pair during minimization 313 of RMSD values. This option is ignored during BestRMSD mode. 314 -m, --mode <OneToOne, AllToAll, FirstToAll> [default: OneToOne] 315 Specify how molecules are handled in reference and probe input files during 316 calculation of RMSD between reference and probe molecules. Possible values: 317 OneToOne, AllToAll and AllToFirst. For OneToOne mode, the number of molecules 318 in reference file must be equal to the number of molecules in probe file. The RMSD 319 is calculated for each pair of molecules in the reference and probe file and written 320 to the output file. For AllToAll mode, the RMSD is calculated for each reference 321 molecule against all probe molecules. For FirstToAll mode, however, the RMSD 322 is only calculated for the first reference molecule against all probe molecules. 323 -e, --examples 324 Print examples. 325 -h, --help 326 Print this help message. 327 -p, --probefile <probefile> 328 Probe input file name. 329 -r, --reffile <reffile> 330 Reference input file name. 331 -o, --outfile <outfile> 332 Output file name for writing out RMSD values. Supported text file extensions: csv or tsv. 333 --overwrite 334 Overwrite existing files. 335 -w, --workingdir <dir> 336 Location of working directory which defaults to the current directory. 337 338 Examples: 339 To calculate RMSD between pair of molecules in reference and probe input 340 3D SD files and write out a CSV file containing calculated RMSD values along with 341 appropriate molecule IDs, type: 342 343 % RDKitCalculateRMSD.py -r Sample3DRef.sdf -p Sample3DProb.sdf 344 -o SampleOut.csv 345 346 To calculate RMSD between all molecules in reference and probe input 347 3D SD files and write out a CSV file containing calculated RMSD values along with 348 appropriate molecule IDs, type: 349 350 % RDKitCalculateRMSD.py -m AllToAll -r Sample3DRef.sdf -p 351 Sample3DProb.sdf -o SampleOut.csv 352 353 To calculate best RMSD between first molecule in reference all probe molecules 354 in 3D SD files and write out a TSV file containing calculated RMSD values along with 355 appropriate molecule IDs, type: 356 357 % RDKitCalculateRMSD.py -m FirstToAll --calcRMSD BestRMSD -r 358 Sample3DRef.sdf -p Sample3DProb.sdf -o SampleOut.tsv 359 360 To calculate RMSD between all molecules in reference and probe molecules input 361 3D SD files without removing hydrogens and write out a TSV file containing 362 calculated RMSD values along with appropriate molecule IDs, type: 363 364 % RDKitCalculateRMSD.py -m AllToAll --infileParams 365 "removeHydrogens,no" -r Sample3DRef.sdf -p Sample3DProb.sdf 366 -o SampleOut.tsv 367 368 Author: 369 Manish Sud(msud@san.rr.com) 370 371 See also: 372 RDKitCalculateMolecularDescriptors.py, RDKitCompareMoleculeShapes.py, RDKitConvertFileFormat.py, 373 RDKitGenerateConformers.py, RDKitPerformMinimization.py 374 375 Copyright: 376 Copyright (C) 2024 Manish Sud. All rights reserved. 377 378 The functionality available in this script is implemented using RDKit, an 379 open source toolkit for cheminformatics developed by Greg Landrum. 380 381 This file is part of MayaChemTools. 382 383 MayaChemTools is free software; you can redistribute it and/or modify it under 384 the terms of the GNU Lesser General Public License as published by the Free 385 Software Foundation; either version 3 of the License, or (at your option) any 386 later version. 387 388 """ 389 390 if __name__ == "__main__": 391 main()