1 #!/bin/env python 2 # 3 # File: RDKitCompareMoleculeShapes.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2025 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 37 # RDKit imports... 38 try: 39 from rdkit import rdBase 40 from rdkit import Chem 41 from rdkit.Chem import AllChem, rdMolAlign, rdShapeHelpers 42 except ImportError as ErrMsg: 43 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 44 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 45 sys.exit(1) 46 47 # MayaChemTools imports... 48 try: 49 from docopt import docopt 50 import MiscUtil 51 import RDKitUtil 52 except ImportError as ErrMsg: 53 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 54 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 55 sys.exit(1) 56 57 ScriptName = os.path.basename(sys.argv[0]) 58 Options = {} 59 OptionsInfo = {} 60 61 def main(): 62 """Start execution of the script.""" 63 64 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 65 66 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 67 68 # Retrieve command line arguments and options... 69 RetrieveOptions() 70 71 # Process and validate command line arguments and options... 72 ProcessOptions() 73 74 # Perform actions required by the script... 75 CompareMoleculeShapes() 76 77 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 78 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 79 80 def CompareMoleculeShapes(): 81 """Compare shape of molecules.""" 82 83 if not re.match("^(OneToOne|AllToAll|FirstToAll)$", OptionsInfo["Mode"], re.I): 84 MiscUtil.PrintError("Shape comparison couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"]) 85 86 if not re.match("^(Open3A|CrippenOpen3A)$", OptionsInfo["Alignment"], re.I): 87 MiscUtil.PrintError("Shape couldn't be performed: Specified alignment mode, %s, is not supported" % OptionsInfo["Alignment"]) 88 89 RefFile = OptionsInfo["RefFile"] 90 ProbeFile = OptionsInfo["ProbeFile"] 91 92 Outfile = OptionsInfo["Outfile"] 93 OutDelim = OptionsInfo["OutDelim"] 94 95 # Read reference and probe molecules... 96 OptionsInfo["InfileParams"]["AllowEmptyMols"] = False 97 98 MiscUtil.PrintInfo("\nProcessing file %s..." % (RefFile)) 99 ValidRefMols, RefMolCount, ValidRefMolCount = RDKitUtil.ReadAndValidateMolecules(RefFile, **OptionsInfo["InfileParams"]) 100 101 MiscUtil.PrintInfo("Processing file %s..." % (ProbeFile)) 102 ValidProbeMols, ProbeMolCount, ValidProbeMolCount = RDKitUtil.ReadAndValidateMolecules(ProbeFile, **OptionsInfo["InfileParams"]) 103 104 # Set up output file... 105 MiscUtil.PrintInfo("Generating file %s...\n" % Outfile) 106 OutFH = open(Outfile, "w") 107 if OutFH is None: 108 MiscUtil.PrintError("Couldn't open output file: %s.\n" % (Outfile)) 109 110 if OptionsInfo["UseCrippenOpen3A"]: 111 Line = "RefMolID%sProbeMolID%sCrippenOpen3AScore" % (OutDelim, OutDelim) 112 else: 113 Line = "RefMolID%sProbeMolID%sOpen3AScore" % (OutDelim, OutDelim) 114 115 if OptionsInfo["CalcTanimotoDistance"]: 116 Line = "%s%sTanimotoDistance" % (Line, OutDelim) 117 if OptionsInfo["CalcProtrudeDistance"]: 118 Line = "%s%sProtrudeDistance" % (Line, OutDelim) 119 OutFH.write("%s\n" % Line) 120 121 if re.match("^OneToOne$", OptionsInfo["Mode"], re.I): 122 PerformOneToOneShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim) 123 elif re.match("^AllToAll$", OptionsInfo["Mode"], re.I): 124 PerformAllToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim) 125 elif re.match("^FirstToAll$", OptionsInfo["Mode"], re.I): 126 PerformFirstToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim) 127 else: 128 MiscUtil.PrintError("Shape comaprison couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"]) 129 130 OutFH.close() 131 132 MiscUtil.PrintInfo("\nTotal number of molecules: Reference - %d; Probe - %d" % (RefMolCount, ProbeMolCount)) 133 MiscUtil.PrintInfo("Number of valid molecules: Reference - %d; Probe - %d" % (ValidRefMolCount, ValidProbeMolCount)) 134 MiscUtil.PrintInfo("Number of ignored molecules: Reference - %d; Probe - %d" % ((RefMolCount - ValidRefMolCount), (ProbeMolCount - ValidProbeMolCount))) 135 136 def PerformOneToOneShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim): 137 """Perform pairwise shape comparison.""" 138 139 ValidRefMolCount = len(ValidRefMols) 140 ValidProbeMolCount = len(ValidProbeMols) 141 142 MolCount = ValidRefMolCount 143 if ValidRefMolCount > ValidProbeMolCount: 144 MolCount = ValidProbeMolCount 145 146 if ValidRefMolCount != ValidProbeMolCount: 147 MiscUtil.PrintWarning("Number of valid reference molecules, %d, is not equal to number of valid probe molecules, %d .\n" % (ValidRefMolCount, ValidProbeMolCount)) 148 MiscUtil.PrintWarning("Pairwise shape comparison will be performed only for first %s molecules.\n" % (MolCount)) 149 150 # Process molecules... 151 for MolIndex in range(0, MolCount): 152 RefMol = ValidRefMols[MolIndex] 153 ProbeMol = ValidProbeMols[MolIndex] 154 155 RefMolName = RDKitUtil.GetMolName(RefMol, (MolIndex + 1)) 156 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, (MolIndex + 1)) 157 158 PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim) 159 160 def PerformAllToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim): 161 """Perform shape comparison between all pairs of molecules.""" 162 163 # Process molecules... 164 RefMolCount = 0 165 for RefMol in ValidRefMols: 166 RefMolCount += 1 167 RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount) 168 169 ProbeMolCount = 0 170 for ProbeMol in ValidProbeMols: 171 ProbeMolCount += 1 172 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount) 173 174 PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim) 175 176 def PerformFirstToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim): 177 """Perform shape comparison between first reference molecues and all probe molecules.""" 178 179 # Process molecules... 180 RefMol = ValidRefMols[0] 181 RefMolCount = 1 182 RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount) 183 184 ProbeMolCount = 0 185 for ProbeMol in ValidProbeMols: 186 ProbeMolCount += 1 187 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount) 188 189 PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim) 190 191 def PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim): 192 """Perform shape comparison and write to output file.""" 193 194 AlignmentScore = PerformShapeAlignment(RefMol, ProbeMol) 195 AlignmentScore = "%.2f" % AlignmentScore 196 197 LineWords = [] 198 LineWords.extend([RefMolName, ProbeMolName, AlignmentScore]) 199 200 if OptionsInfo["CalcTanimotoDistance"]: 201 TanimotoDistance = CalculateTanimotoShapeDistance(RefMol, ProbeMol) 202 LineWords.append(TanimotoDistance) 203 204 if OptionsInfo["CalcProtrudeDistance"]: 205 ProtrudeDistance = CalculateProtrudeShapeDistance(RefMol, ProbeMol) 206 LineWords.append(ProtrudeDistance) 207 208 Line = OutDelim.join(LineWords) 209 OutFH.write("%s\n" % Line) 210 211 def PerformShapeAlignment(RefMol, ProbeMol): 212 """Perform shape alignment and return alignment score.""" 213 214 if OptionsInfo["UseCrippenOpen3A"]: 215 CrippenO3A = rdMolAlign.GetCrippenO3A(ProbeMol, RefMol) 216 Score = CrippenO3A.Align() 217 else: 218 O3A = rdMolAlign.GetO3A(ProbeMol, RefMol) 219 Score = O3A.Align() 220 221 return Score 222 223 def CalculateTanimotoShapeDistance(RefMol, ProbeMol): 224 """Calculate Tanimoto shape for a pair of already aligned molecules and return it as a string.""" 225 226 Distance = rdShapeHelpers.ShapeTanimotoDist(ProbeMol, RefMol) 227 Distance = "%.2f" % Distance 228 229 return Distance 230 231 def CalculateProtrudeShapeDistance(RefMol, ProbeMol): 232 """Calculate protrude shape for a pair of already aligned molecules and return it as a string.""" 233 234 Distance = rdShapeHelpers.ShapeProtrudeDist(ProbeMol, RefMol) 235 Distance = "%.2f" % Distance 236 237 return Distance 238 239 def ProcessOptions(): 240 """Process and validate command line arguments and options.""" 241 242 MiscUtil.PrintInfo("Processing options...") 243 244 # Validate options... 245 ValidateOptions() 246 247 OptionsInfo["Alignment"] = Options["--alignment"] 248 OptionsInfo["UseCrippenOpen3A"] = False 249 if re.match("^CrippenOpen3A$", OptionsInfo["Alignment"], re.I): 250 OptionsInfo["UseCrippenOpen3A"] = True 251 252 OptionsInfo["Distance"] = Options["--distance"] 253 254 OptionsInfo["CalcTanimotoDistance"] = False 255 OptionsInfo["CalcProtrudeDistance"] = False 256 if re.match("^Tanimoto$", OptionsInfo["Distance"], re.I): 257 OptionsInfo["CalcTanimotoDistance"] = True 258 elif re.match("^Protrude$", OptionsInfo["Distance"], re.I): 259 OptionsInfo["CalcProtrudeDistance"] = True 260 else: 261 OptionsInfo["CalcTanimotoDistance"] = True 262 OptionsInfo["CalcProtrudeDistance"] = True 263 264 OptionsInfo["MaxIters"] = int(Options["--maxIters"]) 265 OptionsInfo["Mode"] = Options["--mode"] 266 267 OptionsInfo["RefFile"] = Options["--reffile"] 268 OptionsInfo["ProbeFile"] = Options["--probefile"] 269 270 OptionsInfo["Outfile"] = Options["--outfile"] 271 OptionsInfo["Overwrite"] = Options["--overwrite"] 272 273 # No need for any RDKit specific --outfileParams.... 274 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"]) 275 276 OptionsInfo["OutDelim"] = " " 277 if MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "csv"): 278 OptionsInfo["OutDelim"] = "," 279 elif MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "tsv txt"): 280 OptionsInfo["OutDelim"] = "\t" 281 else: 282 MiscUtil.PrintError("The file name specified , %s, for option \"--outfile\" is not valid. Supported file formats: csv tsv txt\n" % (OptionsInfo["Outfile"])) 283 284 def RetrieveOptions(): 285 """Retrieve command line arguments and options.""" 286 287 # Get options... 288 global Options 289 Options = docopt(_docoptUsage_) 290 291 # Set current working directory to the specified directory... 292 WorkingDir = Options["--workingdir"] 293 if WorkingDir: 294 os.chdir(WorkingDir) 295 296 # Handle examples option... 297 if "--examples" in Options and Options["--examples"]: 298 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 299 sys.exit(0) 300 301 def ValidateOptions(): 302 """Validate option values.""" 303 304 MiscUtil.ValidateOptionTextValue("Alignment", Options["--alignment"], "Open3A CrippenOpen3A") 305 MiscUtil.ValidateOptionTextValue("Distance", Options["--distance"], "Tanimoto Protrude Both") 306 307 MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0}) 308 309 MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "OneToOne AllToAll FirstToAll") 310 311 MiscUtil.ValidateOptionFilePath("-r, --reffile", Options["--reffile"]) 312 MiscUtil.ValidateOptionFileExt("-r, --reffile", Options["--reffile"], "sdf sd mol") 313 314 MiscUtil.ValidateOptionFilePath("-p, --probefile", Options["--probefile"]) 315 MiscUtil.ValidateOptionFileExt("-p, --probefile", Options["--probefile"], "sdf sd mol") 316 317 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "csv tsv txt") 318 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 319 320 MiscUtil.ValidateOptionsDistinctFileNames("-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"]) 321 MiscUtil.ValidateOptionsDistinctFileNames("-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"]) 322 323 # Setup a usage string for docopt... 324 _docoptUsage_ = """ 325 RDKitCompareMoleculeShapes.py - Compare shapes of molecules 326 327 Usage: 328 RDKitCompareMoleculeShapes.py [--alignment <Open3A, CrippenOpen3A>] 329 [--distance <Tanimoto, Protrude, Both>] [--infileParams <Name,Value,...>] 330 [--maxIters <number>] [--mode <OneToOne, AllToAll, FirstToAll>] 331 [--overwrite] [-w <dir>] -r <reffile> -p <probefile> -o <outfile> 332 RDKitCompareMoleculeShapes.py -h | --help | -e | --examples 333 334 Description: 335 Compare shapes of molecules between a set of molecules in reference and probe 336 input files. The molecules are aligned using either Open 3DAlign or Crippen Open 337 3DAlign before calculating shape Tanimoto and protrude distances. 338 339 The supported input file formats are: Mol (.mol), SD (.sdf, .sd) 340 341 The supported output file formats are: CSV (.csv), TSV (.tsv, .txt) 342 343 Options: 344 -a, --alignment <Open3A, CrippenOpen3A> [default: Open3A] 345 Alignment methodology to use for aligning molecules before calculating Tanimoto and 346 protrude shape distances. Possible values: Open3A or CrippenOpen3A. Open 3DAlign 347 (Open3A) [ Ref 132 ] overlays molecules based on MMFF atom types and charges. 348 Crippen Open 3DAlign (CrippenOpen3A) uses Crippen logP contributions to overlay 349 molecules. 350 -d, --distance <Tanimoto, Protrude, Both> [default: Both] 351 Shape comparison distance to calculate for comparing shapes of molecules. Possible 352 values: Tanimoto, Protrude, or Both. Shape Tanimoto distance takes the volume 353 overlay into account during the calculation of distance. Shape protrude distance, 354 however, focuses on the volume mismatch. 355 --infileParams <Name,Value,...> [default: auto] 356 A comma delimited list of parameter name and value pairs for reading 357 molecules from files. The supported parameter names for different file 358 formats, along with their default values, are shown below: 359 360 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 361 362 --maxIters <number> [default: 50] 363 Maximum number of iterations to perform for each molecule pair during alignment. 364 -m, --mode <OneToOne, AllToAll, FirstToAll> [default: OneToOne] 365 Specify how molecules are handled in reference and probe input files during 366 comparison of shapes between reference and probe molecules. Possible values: 367 OneToOne, AllToAll and AllToFirst. For OneToOne mode, the molecule shapes are 368 calculated for each pair of molecules in the reference and probe file and the shape 369 distances are written to the output file. For AllToAll mode, the shape distances are 370 calculated for each reference molecule against all probe molecules. For FirstToAll mode, 371 however, the shape distances are only calculated for the first reference molecule 372 against all probe molecules. 373 -e, --examples 374 Print examples. 375 -h, --help 376 Print this help message. 377 -p, --probefile <probefile> 378 Probe input file name. 379 -r, --reffile <reffile> 380 Reference input file name. 381 -o, --outfile <outfile> 382 Output file name for writing out shape distances. Supported text file extensions: csv or tsv. 383 --overwrite 384 Overwrite existing files. 385 -w, --workingdir <dir> 386 Location of working directory which defaults to the current directory. 387 388 Examples: 389 To perform shape alignment using Open3A methodology between pair of molecules in 390 reference and probe input in 3D SD files, calculate both Tanimoto and protrude 391 distances, and write out a CSV file containing calculated distance values along with 392 appropriate molecule IDs, type: 393 394 % RDKitCompareMoleculeShapes.py -r Sample3DRef.sdf -p Sample3DProb.sdf 395 -o SampleOut.csv 396 397 To perform shape alignment using Crippen Open3A methodology between all molecules in 398 reference and probe molecules in 3D SD files, calculate only Tanimoto distance, and write 399 out a TSV file containing calculated distance value along with appropriate molecule IDs, type: 400 401 % RDKitCompareMoleculeShapes.py -m AllToAll -a CrippenOpen3A -d Tanimoto 402 -r Sample3DRef.sdf -p Sample3DProb.sdf -o SampleOut.csv 403 404 To perform shape alignment using Open3A methodology between first reference molecule 405 against all probe molecules in 3D SD files without removing hydrogens , calculate both 406 Tanimoto and protrude distances, and write out a CSV file containing calculated distance values along with 407 appropriate molecule IDs, type: 408 409 % RDKitCompareMoleculeShapes.py -m FirstToAll -a Open3A -d Both 410 --infileParams "removeHydrogens,no" -r Sample3DRef.sdf 411 -p Sample3DProb.sdf -o SampleOut.csv 412 413 Author: 414 Manish Sud(msud@san.rr.com) 415 416 See also: 417 RDKitCalculateRMSD.py, RDKitCalculateMolecularDescriptors.py, RDKitConvertFileFormat.py, 418 RDKitGenerateConformers.py, RDKitPerformMinimization.py 419 420 Copyright: 421 Copyright (C) 2025 Manish Sud. All rights reserved. 422 423 The functionality available in this script is implemented using RDKit, an 424 open source toolkit for cheminformatics developed by Greg Landrum. 425 426 This file is part of MayaChemTools. 427 428 MayaChemTools is free software; you can redistribute it and/or modify it under 429 the terms of the GNU Lesser General Public License as published by the Free 430 Software Foundation; either version 3 of the License, or (at your option) any 431 later version. 432 433 """ 434 435 if __name__ == "__main__": 436 main()