1 #!/bin/env python 2 # 3 # File: RDKitCompareMoleculeShapes.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2026 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 import os 32 import sys 33 import time 34 import re 35 36 # RDKit imports... 37 try: 38 from rdkit import rdBase 39 from rdkit.Chem import rdMolAlign, rdShapeHelpers 40 except ImportError as ErrMsg: 41 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 42 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 43 sys.exit(1) 44 45 # MayaChemTools imports... 46 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 47 try: 48 from docopt import docopt 49 import MiscUtil 50 import RDKitUtil 51 except ImportError as ErrMsg: 52 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 53 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 54 sys.exit(1) 55 56 ScriptName = os.path.basename(sys.argv[0]) 57 Options = {} 58 OptionsInfo = {} 59 60 61 def main(): 62 """Start execution of the script.""" 63 64 MiscUtil.PrintInfo( 65 "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" 66 % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()) 67 ) 68 69 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 70 71 # Retrieve command line arguments and options... 72 RetrieveOptions() 73 74 # Process and validate command line arguments and options... 75 ProcessOptions() 76 77 # Perform actions required by the script... 78 CompareMoleculeShapes() 79 80 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 81 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 82 83 84 def CompareMoleculeShapes(): 85 """Compare shape of molecules.""" 86 87 if not re.match("^(OneToOne|AllToAll|FirstToAll)$", OptionsInfo["Mode"], re.I): 88 MiscUtil.PrintError( 89 "Shape comparison couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"] 90 ) 91 92 if not re.match("^(Open3A|CrippenOpen3A)$", OptionsInfo["Alignment"], re.I): 93 MiscUtil.PrintError( 94 "Shape couldn't be performed: Specified alignment mode, %s, is not supported" % OptionsInfo["Alignment"] 95 ) 96 97 RefFile = OptionsInfo["RefFile"] 98 ProbeFile = OptionsInfo["ProbeFile"] 99 100 Outfile = OptionsInfo["Outfile"] 101 OutDelim = OptionsInfo["OutDelim"] 102 103 # Read reference and probe molecules... 104 OptionsInfo["InfileParams"]["AllowEmptyMols"] = False 105 106 MiscUtil.PrintInfo("\nProcessing file %s..." % (RefFile)) 107 ValidRefMols, RefMolCount, ValidRefMolCount = RDKitUtil.ReadAndValidateMolecules( 108 RefFile, **OptionsInfo["InfileParams"] 109 ) 110 111 MiscUtil.PrintInfo("Processing file %s..." % (ProbeFile)) 112 ValidProbeMols, ProbeMolCount, ValidProbeMolCount = RDKitUtil.ReadAndValidateMolecules( 113 ProbeFile, **OptionsInfo["InfileParams"] 114 ) 115 116 # Set up output file... 117 MiscUtil.PrintInfo("Generating file %s...\n" % Outfile) 118 OutFH = open(Outfile, "w") 119 if OutFH is None: 120 MiscUtil.PrintError("Couldn't open output file: %s.\n" % (Outfile)) 121 122 if OptionsInfo["UseCrippenOpen3A"]: 123 Line = "RefMolID%sProbeMolID%sCrippenOpen3AScore" % (OutDelim, OutDelim) 124 else: 125 Line = "RefMolID%sProbeMolID%sOpen3AScore" % (OutDelim, OutDelim) 126 127 if OptionsInfo["CalcTanimotoDistance"]: 128 Line = "%s%sTanimotoDistance" % (Line, OutDelim) 129 if OptionsInfo["CalcProtrudeDistance"]: 130 Line = "%s%sProtrudeDistance" % (Line, OutDelim) 131 OutFH.write("%s\n" % Line) 132 133 if re.match("^OneToOne$", OptionsInfo["Mode"], re.I): 134 PerformOneToOneShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim) 135 elif re.match("^AllToAll$", OptionsInfo["Mode"], re.I): 136 PerformAllToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim) 137 elif re.match("^FirstToAll$", OptionsInfo["Mode"], re.I): 138 PerformFirstToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim) 139 else: 140 MiscUtil.PrintError( 141 "Shape comaprison couldn't be performed: Specified mode, %s, is not supported" % OptionsInfo["Mode"] 142 ) 143 144 OutFH.close() 145 146 MiscUtil.PrintInfo("\nTotal number of molecules: Reference - %d; Probe - %d" % (RefMolCount, ProbeMolCount)) 147 MiscUtil.PrintInfo("Number of valid molecules: Reference - %d; Probe - %d" % (ValidRefMolCount, ValidProbeMolCount)) 148 MiscUtil.PrintInfo( 149 "Number of ignored molecules: Reference - %d; Probe - %d" 150 % ((RefMolCount - ValidRefMolCount), (ProbeMolCount - ValidProbeMolCount)) 151 ) 152 153 154 def PerformOneToOneShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim): 155 """Perform pairwise shape comparison.""" 156 157 ValidRefMolCount = len(ValidRefMols) 158 ValidProbeMolCount = len(ValidProbeMols) 159 160 MolCount = ValidRefMolCount 161 if ValidRefMolCount > ValidProbeMolCount: 162 MolCount = ValidProbeMolCount 163 164 if ValidRefMolCount != ValidProbeMolCount: 165 MiscUtil.PrintWarning( 166 "Number of valid reference molecules, %d, is not equal to number of valid probe molecules, %d .\n" 167 % (ValidRefMolCount, ValidProbeMolCount) 168 ) 169 MiscUtil.PrintWarning("Pairwise shape comparison will be performed only for first %s molecules.\n" % (MolCount)) 170 171 # Process molecules... 172 for MolIndex in range(0, MolCount): 173 RefMol = ValidRefMols[MolIndex] 174 ProbeMol = ValidProbeMols[MolIndex] 175 176 RefMolName = RDKitUtil.GetMolName(RefMol, (MolIndex + 1)) 177 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, (MolIndex + 1)) 178 179 PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim) 180 181 182 def PerformAllToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim): 183 """Perform shape comparison between all pairs of molecules.""" 184 185 # Process molecules... 186 RefMolCount = 0 187 for RefMol in ValidRefMols: 188 RefMolCount += 1 189 RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount) 190 191 ProbeMolCount = 0 192 for ProbeMol in ValidProbeMols: 193 ProbeMolCount += 1 194 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount) 195 196 PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim) 197 198 199 def PerformFirstToAllShapeComparison(ValidRefMols, ValidProbeMols, OutFH, OutDelim): 200 """Perform shape comparison between first reference molecues and all probe molecules.""" 201 202 # Process molecules... 203 RefMol = ValidRefMols[0] 204 RefMolCount = 1 205 RefMolName = RDKitUtil.GetMolName(RefMol, RefMolCount) 206 207 ProbeMolCount = 0 208 for ProbeMol in ValidProbeMols: 209 ProbeMolCount += 1 210 ProbeMolName = RDKitUtil.GetMolName(ProbeMol, ProbeMolCount) 211 212 PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim) 213 214 215 def PerformShapeComparisonAndWrieOutput(RefMol, ProbeMol, RefMolName, ProbeMolName, OutFH, OutDelim): 216 """Perform shape comparison and write to output file.""" 217 218 AlignmentScore = PerformShapeAlignment(RefMol, ProbeMol) 219 AlignmentScore = "%.2f" % AlignmentScore 220 221 LineWords = [] 222 LineWords.extend([RefMolName, ProbeMolName, AlignmentScore]) 223 224 if OptionsInfo["CalcTanimotoDistance"]: 225 TanimotoDistance = CalculateTanimotoShapeDistance(RefMol, ProbeMol) 226 LineWords.append(TanimotoDistance) 227 228 if OptionsInfo["CalcProtrudeDistance"]: 229 ProtrudeDistance = CalculateProtrudeShapeDistance(RefMol, ProbeMol) 230 LineWords.append(ProtrudeDistance) 231 232 Line = OutDelim.join(LineWords) 233 OutFH.write("%s\n" % Line) 234 235 236 def PerformShapeAlignment(RefMol, ProbeMol): 237 """Perform shape alignment and return alignment score.""" 238 239 if OptionsInfo["UseCrippenOpen3A"]: 240 CrippenO3A = rdMolAlign.GetCrippenO3A(ProbeMol, RefMol) 241 Score = CrippenO3A.Align() 242 else: 243 O3A = rdMolAlign.GetO3A(ProbeMol, RefMol) 244 Score = O3A.Align() 245 246 return Score 247 248 249 def CalculateTanimotoShapeDistance(RefMol, ProbeMol): 250 """Calculate Tanimoto shape for a pair of already aligned molecules and return it as a string.""" 251 252 Distance = rdShapeHelpers.ShapeTanimotoDist(ProbeMol, RefMol) 253 Distance = "%.2f" % Distance 254 255 return Distance 256 257 258 def CalculateProtrudeShapeDistance(RefMol, ProbeMol): 259 """Calculate protrude shape for a pair of already aligned molecules and return it as a string.""" 260 261 Distance = rdShapeHelpers.ShapeProtrudeDist(ProbeMol, RefMol) 262 Distance = "%.2f" % Distance 263 264 return Distance 265 266 267 def ProcessOptions(): 268 """Process and validate command line arguments and options.""" 269 270 MiscUtil.PrintInfo("Processing options...") 271 272 # Validate options... 273 ValidateOptions() 274 275 OptionsInfo["Alignment"] = Options["--alignment"] 276 OptionsInfo["UseCrippenOpen3A"] = False 277 if re.match("^CrippenOpen3A$", OptionsInfo["Alignment"], re.I): 278 OptionsInfo["UseCrippenOpen3A"] = True 279 280 OptionsInfo["Distance"] = Options["--distance"] 281 282 OptionsInfo["CalcTanimotoDistance"] = False 283 OptionsInfo["CalcProtrudeDistance"] = False 284 if re.match("^Tanimoto$", OptionsInfo["Distance"], re.I): 285 OptionsInfo["CalcTanimotoDistance"] = True 286 elif re.match("^Protrude$", OptionsInfo["Distance"], re.I): 287 OptionsInfo["CalcProtrudeDistance"] = True 288 else: 289 OptionsInfo["CalcTanimotoDistance"] = True 290 OptionsInfo["CalcProtrudeDistance"] = True 291 292 OptionsInfo["MaxIters"] = int(Options["--maxIters"]) 293 OptionsInfo["Mode"] = Options["--mode"] 294 295 OptionsInfo["RefFile"] = Options["--reffile"] 296 OptionsInfo["ProbeFile"] = Options["--probefile"] 297 298 OptionsInfo["Outfile"] = Options["--outfile"] 299 OptionsInfo["Overwrite"] = Options["--overwrite"] 300 301 # No need for any RDKit specific --outfileParams.... 302 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"]) 303 304 OptionsInfo["OutDelim"] = " " 305 if MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "csv"): 306 OptionsInfo["OutDelim"] = "," 307 elif MiscUtil.CheckFileExt(OptionsInfo["Outfile"], "tsv txt"): 308 OptionsInfo["OutDelim"] = "\t" 309 else: 310 MiscUtil.PrintError( 311 'The file name specified , %s, for option "--outfile" is not valid. Supported file formats: csv tsv txt\n' 312 % (OptionsInfo["Outfile"]) 313 ) 314 315 316 def RetrieveOptions(): 317 """Retrieve command line arguments and options.""" 318 319 # Get options... 320 global Options 321 Options = docopt(_docoptUsage_) 322 323 # Set current working directory to the specified directory... 324 WorkingDir = Options["--workingdir"] 325 if WorkingDir: 326 os.chdir(WorkingDir) 327 328 # Handle examples option... 329 if "--examples" in Options and Options["--examples"]: 330 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 331 sys.exit(0) 332 333 334 def ValidateOptions(): 335 """Validate option values.""" 336 337 MiscUtil.ValidateOptionTextValue("Alignment", Options["--alignment"], "Open3A CrippenOpen3A") 338 MiscUtil.ValidateOptionTextValue("Distance", Options["--distance"], "Tanimoto Protrude Both") 339 340 MiscUtil.ValidateOptionIntegerValue("--maxIters", Options["--maxIters"], {">": 0}) 341 342 MiscUtil.ValidateOptionTextValue("--mode", Options["--mode"], "OneToOne AllToAll FirstToAll") 343 344 MiscUtil.ValidateOptionFilePath("-r, --reffile", Options["--reffile"]) 345 MiscUtil.ValidateOptionFileExt("-r, --reffile", Options["--reffile"], "sdf sd mol") 346 347 MiscUtil.ValidateOptionFilePath("-p, --probefile", Options["--probefile"]) 348 MiscUtil.ValidateOptionFileExt("-p, --probefile", Options["--probefile"], "sdf sd mol") 349 350 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "csv tsv txt") 351 MiscUtil.ValidateOptionsOutputFileOverwrite( 352 "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"] 353 ) 354 355 MiscUtil.ValidateOptionsDistinctFileNames( 356 "-r, --reffile", Options["--reffile"], "-o, --outfile", Options["--outfile"] 357 ) 358 MiscUtil.ValidateOptionsDistinctFileNames( 359 "-p, --probefile", Options["--probefile"], "-o, --outfile", Options["--outfile"] 360 ) 361 362 363 # Setup a usage string for docopt... 364 _docoptUsage_ = """ 365 RDKitCompareMoleculeShapes.py - Compare shapes of molecules 366 367 Usage: 368 RDKitCompareMoleculeShapes.py [--alignment <Open3A, CrippenOpen3A>] 369 [--distance <Tanimoto, Protrude, Both>] [--infileParams <Name,Value,...>] 370 [--maxIters <number>] [--mode <OneToOne, AllToAll, FirstToAll>] 371 [--overwrite] [-w <dir>] -r <reffile> -p <probefile> -o <outfile> 372 RDKitCompareMoleculeShapes.py -h | --help | -e | --examples 373 374 Description: 375 Compare shapes of molecules between a set of molecules in reference and probe 376 input files. The molecules are aligned using either Open 3DAlign or Crippen Open 377 3DAlign before calculating shape Tanimoto and protrude distances. 378 379 The supported input file formats are: Mol (.mol), SD (.sdf, .sd) 380 381 The supported output file formats are: CSV (.csv), TSV (.tsv, .txt) 382 383 Options: 384 -a, --alignment <Open3A, CrippenOpen3A> [default: Open3A] 385 Alignment methodology to use for aligning molecules before calculating Tanimoto and 386 protrude shape distances. Possible values: Open3A or CrippenOpen3A. Open 3DAlign 387 (Open3A) [ Ref 132 ] overlays molecules based on MMFF atom types and charges. 388 Crippen Open 3DAlign (CrippenOpen3A) uses Crippen logP contributions to overlay 389 molecules. 390 -d, --distance <Tanimoto, Protrude, Both> [default: Both] 391 Shape comparison distance to calculate for comparing shapes of molecules. Possible 392 values: Tanimoto, Protrude, or Both. Shape Tanimoto distance takes the volume 393 overlay into account during the calculation of distance. Shape protrude distance, 394 however, focuses on the volume mismatch. 395 --infileParams <Name,Value,...> [default: auto] 396 A comma delimited list of parameter name and value pairs for reading 397 molecules from files. The supported parameter names for different file 398 formats, along with their default values, are shown below: 399 400 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 401 402 --maxIters <number> [default: 50] 403 Maximum number of iterations to perform for each molecule pair during alignment. 404 -m, --mode <OneToOne, AllToAll, FirstToAll> [default: OneToOne] 405 Specify how molecules are handled in reference and probe input files during 406 comparison of shapes between reference and probe molecules. Possible values: 407 OneToOne, AllToAll and AllToFirst. For OneToOne mode, the molecule shapes are 408 calculated for each pair of molecules in the reference and probe file and the shape 409 distances are written to the output file. For AllToAll mode, the shape distances are 410 calculated for each reference molecule against all probe molecules. For FirstToAll mode, 411 however, the shape distances are only calculated for the first reference molecule 412 against all probe molecules. 413 -e, --examples 414 Print examples. 415 -h, --help 416 Print this help message. 417 -p, --probefile <probefile> 418 Probe input file name. 419 -r, --reffile <reffile> 420 Reference input file name. 421 -o, --outfile <outfile> 422 Output file name for writing out shape distances. Supported text file extensions: csv or tsv. 423 --overwrite 424 Overwrite existing files. 425 -w, --workingdir <dir> 426 Location of working directory which defaults to the current directory. 427 428 Examples: 429 To perform shape alignment using Open3A methodology between pair of molecules in 430 reference and probe input in 3D SD files, calculate both Tanimoto and protrude 431 distances, and write out a CSV file containing calculated distance values along with 432 appropriate molecule IDs, type: 433 434 % RDKitCompareMoleculeShapes.py -r Sample3DRef.sdf -p Sample3DProb.sdf 435 -o SampleOut.csv 436 437 To perform shape alignment using Crippen Open3A methodology between all molecules in 438 reference and probe molecules in 3D SD files, calculate only Tanimoto distance, and write 439 out a TSV file containing calculated distance value along with appropriate molecule IDs, type: 440 441 % RDKitCompareMoleculeShapes.py -m AllToAll -a CrippenOpen3A -d Tanimoto 442 -r Sample3DRef.sdf -p Sample3DProb.sdf -o SampleOut.csv 443 444 To perform shape alignment using Open3A methodology between first reference molecule 445 against all probe molecules in 3D SD files without removing hydrogens , calculate both 446 Tanimoto and protrude distances, and write out a CSV file containing calculated distance values along with 447 appropriate molecule IDs, type: 448 449 % RDKitCompareMoleculeShapes.py -m FirstToAll -a Open3A -d Both 450 --infileParams "removeHydrogens,no" -r Sample3DRef.sdf 451 -p Sample3DProb.sdf -o SampleOut.csv 452 453 Author: 454 Manish Sud(msud@san.rr.com) 455 456 See also: 457 RDKitCalculateRMSD.py, RDKitCalculateMolecularDescriptors.py, RDKitConvertFileFormat.py, 458 RDKitGenerateConformers.py, RDKitPerformMinimization.py 459 460 Copyright: 461 Copyright (C) 2026 Manish Sud. All rights reserved. 462 463 The functionality available in this script is implemented using RDKit, an 464 open source toolkit for cheminformatics developed by Greg Landrum. 465 466 This file is part of MayaChemTools. 467 468 MayaChemTools is free software; you can redistribute it and/or modify it under 469 the terms of the GNU Lesser General Public License as published by the Free 470 Software Foundation; either version 3 of the License, or (at your option) any 471 later version. 472 473 """ 474 475 if __name__ == "__main__": 476 main()