#!/bin/env python
#
# File: RDKitPickDiverseMolecules.py
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2025 Manish Sud. All rights reserved.
#
# The functionality available in this script is implemented using RDKit, an
# open source toolkit for cheminformatics developed by Greg Landrum.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability or fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

from __future__ import print_function

# Add local python path to the global path and import standard library modules...
import os
import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
import time
import re

# RDKit imports...
try:
    from rdkit import rdBase
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import rdMolDescriptors
    from rdkit.SimDivFilters import rdSimDivPickers
    from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
    from rdkit.SimDivFilters.rdSimDivPickers import HierarchicalClusterPicker
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
    sys.exit(1)

# MayaChemTools imports...
try:
    from docopt import docopt
    import MiscUtil
    import RDKitUtil
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
    sys.exit(1)

ScriptName = os.path.basename(sys.argv[0])

# Raw docopt result (filled in by RetrieveOptions) and the processed option
# values derived from it (filled in by ProcessOptions and its helpers)...
Options = {}
OptionsInfo = {}

# NOTE(review): the functions below call MiscUtil.PrintError() for fatal
# problems and generally do not guard the code that follows it; this presumes
# PrintError() terminates the script - confirm against MiscUtil.

def main():
    """Start execution of the script.

    Prints a start-up banner, retrieves and processes the command line
    options, picks and writes out the diverse molecules, and finally reports
    the elapsed time.
    """

    MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))

    (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()

    # Retrieve command line arguments and options...
    RetrieveOptions()

    # Process and validate command line arguments and options...
    ProcessOptions()

    # Perform actions required by the script...
    PickDiverseMolecules()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
    MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))

def PickDiverseMolecules():
    """Read molecules, fingerprint them, pick a diverse subset, and write it out."""

    Mols = RetrieveMolecules()
    MolsFingerprints = GenerateFingerprints(Mols)
    DiverseMols = SelectMolecules(Mols, MolsFingerprints)

    WriteMolecules(DiverseMols)

def SelectMolecules(Mols, MolsFingerprints):
    """Select diverse molecules using the methodology named by OptionsInfo["Mode"].

    Arguments:
        Mols: Molecules to pick from.
        MolsFingerprints: Fingerprints for Mols, in the same order.

    Returns:
        list: Picked molecules; empty only when the mode is not recognized
            and MiscUtil.PrintError() returns.
    """

    if OptionsInfo["NumMols"] > len(Mols):
        # Picking every molecule is allowed; only asking for more than are available is an error...
        MiscUtil.PrintError("The number of diverse molecules to pick, %d, specified using \"-n, --numMols\" must not exceed the total number of valid molecules, %d" % (OptionsInfo["NumMols"], len(Mols)))

    Mode = OptionsInfo["Mode"]
    if re.match("^MaxMin$", Mode, re.I):
        return SelectMoleculesUsingMaxMin(Mols, MolsFingerprints)
    if re.match("^HierarchicalClustering$", Mode, re.I):
        return SelectMoleculesUsingHierarchicalClustering(Mols, MolsFingerprints)

    MiscUtil.PrintError("The mode value, %s, is not a valid mode." % Mode)

    return []

def SelectMoleculesUsingMaxMin(Mols, MolsFingerprints):
    """Select diverse molecules using MaxMin methodology.

    The distance between two molecules is 1 minus the value returned by
    OptionsInfo["SimilarityFunction"] for their fingerprints. Distances are
    supplied to the picker through a callback instead of a precomputed matrix.

    Arguments:
        Mols: Molecules to pick from.
        MolsFingerprints: Fingerprints for Mols, in the same order.

    Returns:
        list: OptionsInfo["NumMols"] molecules in the order returned by the picker.
    """

    MiscUtil.PrintInfo("\nSelecting diverse molecules using MaxMin methodology and %s similarity metric..." % OptionsInfo["SimilarityMetric"])

    PoolSize = len(MolsFingerprints)
    PickSize = OptionsInfo["NumMols"]
    SimilarityFunction = OptionsInfo["SimilarityFunction"]

    def PairwiseDistance(Index1, Index2):
        return 1 - SimilarityFunction(MolsFingerprints[Index1], MolsFingerprints[Index2])

    MolIndices = MaxMinPicker().LazyPick(PairwiseDistance, PoolSize, PickSize)

    return [Mols[Index] for Index in MolIndices]

def SelectMoleculesUsingHierarchicalClustering(Mols, MolsFingerprints):
    """Select diverse molecules using hierarchical clustering methodology.

    Arguments:
        Mols: Molecules to pick from.
        MolsFingerprints: Fingerprints for Mols, in the same order.

    Returns:
        list: OptionsInfo["NumMols"] molecules in the order returned by the picker.
    """

    # numpy is needed only for this mode, so it is imported on demand...
    try:
        import numpy
    except ImportError:
        MiscUtil.PrintError("Failed to import numpy python module. This is required for picking diverse molecules using hierarchical clustering.")

    MiscUtil.PrintInfo("\nSelecting diverse molecules using %s hierarchical clustering methodology..." % OptionsInfo["SpecifiedClusteringMethod"])

    PoolSize = len(MolsFingerprints)
    PickSize = OptionsInfo["NumMols"]
    DistanceMatrix = GenerateLowerTriangularDistanceMatrix(MolsFingerprints)

    ClusterPicker = HierarchicalClusterPicker(OptionsInfo["SpecifiedClusteringMethodID"])
    MolIndices = ClusterPicker.Pick(numpy.asarray(DistanceMatrix), PoolSize, PickSize)

    return [Mols[Index] for Index in MolIndices]

def RetrieveMolecules():
    """Read molecules from the input file and report the counts.

    Returns:
        Valid molecules as returned by RDKitUtil.ReadAndValidateMolecules().
    """

    Infile = OptionsInfo["Infile"]

    # Read molecules...
    MiscUtil.PrintInfo("\nReading file %s..." % Infile)

    OptionsInfo["InfileParams"]["AllowEmptyMols"] = False
    ValidMols, MolCount, ValidMolCount = RDKitUtil.ReadAndValidateMolecules(Infile, **OptionsInfo["InfileParams"])

    MiscUtil.PrintInfo("Total number of molecules: %d" % MolCount)
    MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
    MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))

    return ValidMols

def GenerateFingerprints(Mols):
    """Generate fingerprints of the type named by OptionsInfo["SpecifiedFingerprints"].

    Arguments:
        Mols: Molecules to fingerprint.

    Returns:
        list: One fingerprint per molecule, in the same order; empty only when
            the name is not recognized and MiscUtil.PrintError() returns.
    """

    FingerprintsName = OptionsInfo["SpecifiedFingerprints"]

    # Names are matched without regard to case...
    FingerprintsGenerators = {
        "atompairs": GenerateAtomPairsFingerprints,
        "maccs166keys": GenerateMACCS166KeysFingerprints,
        "morgan": GenerateMorganFingerprints,
        "morganfeatures": GenerateMorganFeaturesFingerprints,
        "pathlength": GeneratePathLengthFingerprints,
        "topologicaltorsions": GenerateTopologicalTorsionsFingerprints,
    }

    Generator = FingerprintsGenerators.get(FingerprintsName.lower())
    if Generator is None:
        MiscUtil.PrintError("Fingerprints name, %s, is not a valid name" % FingerprintsName)
        return []

    return Generator(Mols)

def GenerateAtomPairsFingerprints(Mols):
    """Generate AtomPairs fingerprints.

    Hashed bit vectors are generated when OptionsInfo["GenerateBitVectFingerints"]
    is set; otherwise RDKit's default atom pair fingerprints are generated.
    """

    MiscUtil.PrintInfo("\nGenerating AtomPairs fingerprints...")

    Params = OptionsInfo["FingerprintsParams"]["AtomPairs"]
    MinLength = Params["MinLength"]
    MaxLength = Params["MaxLength"]
    UseChirality = Params["UseChirality"]

    if OptionsInfo["GenerateBitVectFingerints"]:
        # Generate ExplicitBitVect fingerprints...
        FPSize = 2048
        BitsPerHash = 4
        return [rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(Mol, minLength = MinLength, maxLength = MaxLength, includeChirality = UseChirality, nBits = FPSize, nBitsPerEntry = BitsPerHash) for Mol in Mols]

    # Generate IntSparseIntVect fingerprints...
    return [rdMolDescriptors.GetAtomPairFingerprint(Mol, minLength = MinLength, maxLength = MaxLength, includeChirality = UseChirality) for Mol in Mols]

def GenerateMACCS166KeysFingerprints(Mols):
    """Generate MACCS166Keys fingerprints."""

    MiscUtil.PrintInfo("\nGenerating MACCS166Keys fingerprints...")

    # Generate ExplicitBitVect fingerprints...
    return [rdMolDescriptors.GetMACCSKeysFingerprint(Mol) for Mol in Mols]

def _GenerateMorganTypeFingerprints(Mols, FingerprintsName, UseFeatures):
    """Generate Morgan style fingerprints using the parameters stored for FingerprintsName.

    Arguments:
        Mols: Molecules to fingerprint.
        FingerprintsName (str): Key into OptionsInfo["FingerprintsParams"]
            ("Morgan" or "MorganFeatures").
        UseFeatures (bool): Value passed to RDKit as useFeatures.

    Returns:
        list: One fingerprint per molecule, in the same order.
    """

    Params = OptionsInfo["FingerprintsParams"][FingerprintsName]
    Radius = Params["Radius"]
    UseChirality = Params["UseChirality"]

    if OptionsInfo["GenerateBitVectFingerints"]:
        # Generate ExplicitBitVect fingerprints...
        FPSize = 2048
        return [rdMolDescriptors.GetMorganFingerprintAsBitVect(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality, nBits = FPSize) for Mol in Mols]

    # Generate UIntSparseIntVect fingerprints...
    return [rdMolDescriptors.GetMorganFingerprint(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality) for Mol in Mols]

def GenerateMorganFingerprints(Mols):
    """Generate Morgan fingerprints."""

    MiscUtil.PrintInfo("\nGenerating Morgan fingerprints...")

    return _GenerateMorganTypeFingerprints(Mols, "Morgan", False)

def GenerateMorganFeaturesFingerprints(Mols):
    """Generate MorganFeatures fingerprints."""

    MiscUtil.PrintInfo("\nGenerating MorganFeatures fingerprints...")

    return _GenerateMorganTypeFingerprints(Mols, "MorganFeatures", True)

def GeneratePathLengthFingerprints(Mols):
    """Generate PathLength fingerprints."""

    MiscUtil.PrintInfo("\nGenerating PathLength fingerprints ...")

    Params = OptionsInfo["FingerprintsParams"]["PathLength"]
    MinPath = Params["MinPath"]
    MaxPath = Params["MaxPath"]
    FPSize = Params["FPSize"]
    BitsPerHash = Params["BitsPerHash"]

    # Not configurable from the command line...
    UseHs = False
    TargetDensity = 0.3
    MinSize = 54

    # Generate ExplicitBitVect fingerprints...
    return [FingerprintMols.FingerprintMol(Mol, minPath = MinPath, maxPath = MaxPath, fpSize = FPSize, bitsPerHash = BitsPerHash, useHs = UseHs, tgtDensity = TargetDensity, minSize = MinSize) for Mol in Mols]

def GenerateTopologicalTorsionsFingerprints(Mols):
    """Generate TopologicalTorsions fingerprints.

    Hashed bit vectors are generated when OptionsInfo["GenerateBitVectFingerints"]
    is set; otherwise RDKit's default topological torsion fingerprints are
    generated.
    """

    MiscUtil.PrintInfo("\nGenerating TopologicalTorsions fingerprints...")

    UseChirality = OptionsInfo["FingerprintsParams"]["TopologicalTorsions"]["UseChirality"]

    if OptionsInfo["GenerateBitVectFingerints"]:
        # Generate ExplicitBitVect fingerprints...
        FPSize = 2048
        BitsPerHash = 4
        return [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(Mol, includeChirality = UseChirality, nBits = FPSize, nBitsPerEntry = BitsPerHash) for Mol in Mols]

    # Generate LongSparseIntVect fingerprints...
    return [rdMolDescriptors.GetTopologicalTorsionFingerprint(Mol, includeChirality = UseChirality) for Mol in Mols]

def GenerateLowerTriangularDistanceMatrix(MolsFingerprints):
    """Generate a lower triangular distance matrix without the diagonal.

    Arguments:
        MolsFingerprints: Fingerprints to compare.

    Returns:
        list: Distances (1 - similarity) in row order: (1,0), (2,0), (2,1),
            (3,0), and so on.
    """

    SimilarityFunction = OptionsInfo["SimilarityFunction"]

    DistanceMatrix = []
    for Index1 in range(len(MolsFingerprints)):
        for Index2 in range(Index1):
            DistanceMatrix.append(1 - SimilarityFunction(MolsFingerprints[Index1], MolsFingerprints[Index2]))

    return DistanceMatrix

def WriteMolecules(Mols):
    """Write out molecules to the output file.

    Arguments:
        Mols: Molecules to write.
    """

    Outfile = OptionsInfo["Outfile"]

    # Set up a molecule writer...
    Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
    if Writer is None:
        MiscUtil.PrintError("Failed to setup a writer for output file %s " % Outfile)
        return
    MiscUtil.PrintInfo("\nGenerating file %s...\n" % Outfile)

    Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
    SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]

    # Write out molecules; the writer is closed even when a write fails...
    try:
        for MolIndex, Mol in enumerate(Mols):
            # Writer properties are set once, from the first molecule, and only on request...
            if MolIndex == 0 and SetSMILESMolProps:
                RDKitUtil.SetWriterMolProps(Writer, Mol)

            if Compute2DCoords:
                AllChem.Compute2DCoords(Mol)
            Writer.write(Mol)
    finally:
        Writer.close()

    MiscUtil.PrintInfo("Total number of diverse molecules selected: %d" % (len(Mols)))

def ProcessFingerprintsParameters():
    """Set up and process fingerprints parameters."""

    SetupFingerprintsNamesAndParameters()
    ProcessSpecifiedFingerprintsName()
    ProcessSpecifiedFingerprintsParameters()

def SetupFingerprintsNamesAndParameters():
    """Set up supported fingerprints names and their default parameters in OptionsInfo."""

    OptionsInfo["FingerprintsNames"] = ["AtomPairs", "MACCS166Keys", "Morgan", "MorganFeatures", "PathLength", "TopologicalTorsions"]

    OptionsInfo["FingerprintsParams"] = {
        "AtomPairs": {"MinLength": 1, "MaxLength": 30, "UseChirality": False},
        "MACCS166Keys": {},
        "Morgan": {"Radius": 2, "UseChirality": False},
        "MorganFeatures": {"Radius": 2, "UseChirality": False},
        "TopologicalTorsions": {"UseChirality": False},
        "PathLength": {"MinPath": 1, "MaxPath": 7, "FPSize": 2048, "BitsPerHash": 2},
    }

def ProcessSpecifiedFingerprintsName():
    """Validate the specified fingerprints name and store its canonical spelling.

    The name in OptionsInfo["Fingerprints"] is matched without regard to case
    and the matching supported name is stored in
    OptionsInfo["SpecifiedFingerprints"].
    """

    # Set up a canonical fingerprints name map...
    CanonicalFingerprintsNamesMap = dict((Name.lower(), Name) for Name in OptionsInfo["FingerprintsNames"])

    # Validate specified fingerprints name...
    CanonicalFingerprintsName = OptionsInfo["Fingerprints"].lower()
    if CanonicalFingerprintsName not in CanonicalFingerprintsNamesMap:
        MiscUtil.PrintError("The fingerprints name, %s, specified using \"-f, --fingerprints\" option is not a valid name." % (OptionsInfo["Fingerprints"]))

    OptionsInfo["SpecifiedFingerprints"] = CanonicalFingerprintsNamesMap[CanonicalFingerprintsName]

def ProcessSpecifiedFingerprintsParameters():
    """Process parameter values specified for the selected fingerprints.

    OptionsInfo["ParamsFingerprints"] is either "auto", which keeps the
    defaults, or a comma delimited list of name and value pairs. Names are
    matched without regard to case against the parameters supported by the
    selected fingerprints. UseChirality accepts Yes, No, True, or False; all
    other parameters must be integers greater than zero.
    """

    if re.match("^auto$", OptionsInfo["ParamsFingerprints"], re.I):
        # Nothing to process...
        return

    SpecifiedFingerprintsName = OptionsInfo["SpecifiedFingerprints"]

    # Parse specified fingerprints parameters...
    ParamsFingerprints = re.sub(" ", "", OptionsInfo["ParamsFingerprints"])
    if not ParamsFingerprints:
        MiscUtil.PrintError("No valid parameter name and value pairs specified using \"-p, --paramsFingerprints\" option corresponding to fingerprints %s." % (SpecifiedFingerprintsName))

    ParamsFingerprintsWords = ParamsFingerprints.split(",")
    if len(ParamsFingerprintsWords) % 2:
        MiscUtil.PrintError("The number of comma delimited parameter names and values, %d, specified using \"-p, --paramsFingerprints\" option must be an even number." % (len(ParamsFingerprintsWords)))

    # Setup canonical parameter names for specified fingerprints...
    FingerprintsParams = OptionsInfo["FingerprintsParams"][SpecifiedFingerprintsName]
    ValidParamNames = sorted(FingerprintsParams)
    CanonicalParamNamesMap = dict((ParamName.lower(), ParamName) for ParamName in ValidParamNames)

    # Validate and set parameter names and values...
    for Index in range(0, len(ParamsFingerprintsWords), 2):
        Name = ParamsFingerprintsWords[Index]
        Value = ParamsFingerprintsWords[Index + 1]

        CanonicalName = Name.lower()
        if CanonicalName not in CanonicalParamNamesMap:
            MiscUtil.PrintError("The parameter name, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid name. Supported parameter names: %s" % (Name, SpecifiedFingerprintsName, " ".join(ValidParamNames)))

        ParamName = CanonicalParamNamesMap[CanonicalName]
        if re.match("^UseChirality$", ParamName, re.I):
            if not re.match("^(Yes|No|True|False)$", Value, re.I):
                MiscUtil.PrintError("The parameter value, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid value. Supported values: Yes No True False" % (Value, SpecifiedFingerprintsName))
            ParamValue = bool(re.match("^(Yes|True)$", Value, re.I))
        else:
            # Report non-integer text through the same error path as out-of-range integers...
            try:
                ParamValue = int(Value)
            except ValueError:
                ParamValue = None
            if ParamValue is None or ParamValue <= 0:
                MiscUtil.PrintError("The parameter value, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid value. Supported values: > 0" % (Value, SpecifiedFingerprintsName))

        # Set value...
        FingerprintsParams[ParamName] = ParamValue

def ProcessSimilarityMetricParameter():
    """Process specified similarity metric value.

    The name in OptionsInfo["SimilarityMetric"] is matched without regard to
    case against the names in DataStructs.similarityFunctions. The matching
    name and function are stored in OptionsInfo["SimilarityMetric"] and
    OptionsInfo["SimilarityFunction"]. OptionsInfo["GenerateBitVectFingerints"]
    is set to True for every metric other than Tanimoto and Dice.
    """

    # Map lower case names to (name, function) pairs...
    SimilarityInfoMap = {}
    for SimilarityFunctionInfo in DataStructs.similarityFunctions:
        Name = SimilarityFunctionInfo[0]
        Function = SimilarityFunctionInfo[1]
        SimilarityInfoMap[Name.lower()] = (Name, Function)

    # Fall back to the name as specified so that it is always bound for the code below...
    SpecifiedName = OptionsInfo["SimilarityMetric"]
    SimilarityName, SimilarityFunction = SimilarityInfoMap.get(SpecifiedName.lower(), (SpecifiedName, None))
    if SimilarityFunction is None:
        MiscUtil.PrintError("Similarity metric name, %s, is not a valid name. " % SpecifiedName)

    OptionsInfo["SimilarityMetric"] = SimilarityName
    OptionsInfo["SimilarityFunction"] = SimilarityFunction

    # RDKit similarity functions, besides Dice and Tanimoto, are not able to handle int bit vectors...
    OptionsInfo["GenerateBitVectFingerints"] = not re.match("^(Tanimoto|Dice)$", SimilarityName, re.I)

def ProcessClusteringMethodParameter():
    """Process specified clustering method parameter.

    Stores the matching method name and ID from rdSimDivPickers.ClusterMethod
    in OptionsInfo["SpecifiedClusteringMethod"] and
    OptionsInfo["SpecifiedClusteringMethodID"]. Both stay empty strings unless
    OptionsInfo["Mode"] is HierarchicalClustering.
    """

    OptionsInfo["SpecifiedClusteringMethod"] = ""
    OptionsInfo["SpecifiedClusteringMethodID"] = ""

    if not re.match("^HierarchicalClustering$", OptionsInfo["Mode"], re.I):
        # Nothing to process...
        return

    # Setup a canonical cluster method name map...
    ClusteringMethodInfoMap = {}
    CanonicalClusteringMethodNameMap = {}
    for Name in sorted(rdSimDivPickers.ClusterMethod.names):
        ClusteringMethodInfoMap[Name] = rdSimDivPickers.ClusterMethod.names[Name]
        CanonicalClusteringMethodNameMap[Name.lower()] = Name

    CanonicalName = OptionsInfo["ClusteringMethod"].lower()
    if CanonicalName not in CanonicalClusteringMethodNameMap:
        MiscUtil.PrintError("The cluster method, %s, specified using \"-c, --clusteringMethod\" option is not a valid name." % (OptionsInfo["ClusteringMethod"]))

    SpecifiedClusteringMethodName = CanonicalClusteringMethodNameMap[CanonicalName]
    OptionsInfo["SpecifiedClusteringMethod"] = SpecifiedClusteringMethodName
    OptionsInfo["SpecifiedClusteringMethodID"] = ClusteringMethodInfoMap[SpecifiedClusteringMethodName]

def ProcessOptions():
    """Process and validate command line arguments and options.

    Copies the validated values from Options into OptionsInfo and derives the
    clustering method, similarity metric, and fingerprints settings from them.
    """

    MiscUtil.PrintInfo("Processing options...")

    # Validate options...
    ValidateOptions()

    OptionsInfo["Mode"] = Options["--mode"]
    OptionsInfo["Fingerprints"] = Options["--fingerprints"]

    # Needs OptionsInfo["Mode"]...
    OptionsInfo["ClusteringMethod"] = Options["--clusteringMethod"]
    ProcessClusteringMethodParameter()

    OptionsInfo["NumMols"] = int(Options["--numMols"])

    OptionsInfo["Infile"] = Options["--infile"]
    OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])

    OptionsInfo["Outfile"] = Options["--outfile"]
    OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])

    OptionsInfo["Overwrite"] = Options["--overwrite"]

    OptionsInfo["SimilarityMetric"] = Options["--similarityMetric"]
    ProcessSimilarityMetricParameter()

    # Needs OptionsInfo["Fingerprints"]...
    OptionsInfo["ParamsFingerprints"] = Options["--paramsFingerprints"]
    ProcessFingerprintsParameters()

def RetrieveOptions():
    """Retrieve command line arguments and options.

    Parses the command line into the global Options, changes to the working
    directory when one is specified, and prints the examples and exits when
    the examples option is specified.
    """

    # Get options...
    global Options
    Options = docopt(_docoptUsage_)

    # Set current working directory to the specified directory...
    WorkingDir = Options["--workingdir"]
    if WorkingDir:
        os.chdir(WorkingDir)

    # Handle examples option...
    if "--examples" in Options and Options["--examples"]:
        MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
        sys.exit(0)

def ValidateOptions():
    """Validate option values."""

    MiscUtil.ValidateOptionTextValue("-c, --clusteringMethod", Options["--clusteringMethod"], "Centroid CLink Gower McQuitty SLink UPGMA Ward")
    MiscUtil.ValidateOptionTextValue("-f, --fingerprints", Options["--fingerprints"], "AtomPairs MACCS166Keys Morgan MorganFeatures PathLength TopologicalTorsions")

    MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "MaxMin HierarchicalClustering")
    MiscUtil.ValidateOptionIntegerValue("-n, --numMols", Options["--numMols"], {">": 0})

    MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
    MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")

    MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
    MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
    MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])

    MiscUtil.ValidateOptionTextValue("-s, --similarityMetric", Options["--similarityMetric"], "BraunBlanquet Cosine Dice Kulczynski RogotGoldberg Russel Sokal Tanimoto")

# Setup a usage string for docopt...
# NOTE(review): docopt reads this text at run time. Option lines keep two spaces
# between the option specification and "[default: ...]", and separator lines
# inside the Options section are kept indented (whitespace-only) rather than
# empty, presumably so that docopt versions which end a section at the first
# empty line still see every option - confirm against the bundled docopt.
_docoptUsage_ = """
RDKitPickDiverseMolecules.py - Pick a diverse subset of molecules

Usage:
    RDKitPickDiverseMolecules.py [--clusteringMethod <Centroid, CLink...>]
                                 [--fingerprints <MACCS166Keys, Morgan, PathLength...>]
                                 [--infileParams <Name,Value,...>] [--mode <MaxMin or HierarchicalClustering>]
                                 [--numMols <number>] [--outfileParams <Name,Value,...>]
                                 [--overwrite] [--paramsFingerprints <Name,Value,...>]
                                 [--similarityMetric <Dice, Tanimoto...>] [-w <dir>] -i <infile> -o <outfile>
    RDKitPickDiverseMolecules.py -h | --help | -e | --examples

Description:
    Pick a subset of diverse molecules based on a variety of 2D fingerprints using
    MaxMin [ Ref 135 ] or an available hierarchical clustering methodology and write
    them to a file.

    The default fingerprints types for various fingerprints are shown below:

        AtomPairs              IntSparseIntVect
        MACCS166Keys           ExplicitBitVect
        Morgan                 UIntSparseIntVect
        MorganFeatures         UIntSparseIntVect
        PathLength             ExplicitBitVect
        TopologicalTorsions    LongSparseIntVect

    The Dice and Tanimoto similarity functions available in RDKit are able to
    handle fingerprints corresponding to both IntVect and BitVect. All other
    similarity functions, however, expect BitVect fingerprints to calculate
    pairwise similarity. Consequently, ExplicitBitVect fingerprints are generated
    for AtomPairs, Morgan, MorganFeatures, and TopologicalTorsions for
    similarity calculations instead of default IntVect fingerprints.

    The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt)

    The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)

Options:
    -c, --clusteringMethod <Centroid, CLink...>  [default: Centroid]
        Clustering method to use for picking a subset of diverse molecules during
        hierarchical clustering. Supported values: Centroid, CLink, Gower,
        McQuitty, SLink, UPGMA, Ward. This option is ignored for 'MaxMin' value
        of '-m, --mode' option. The CLink and SLink correspond to CompleteLink
        and SingleLink cluster methods.
    -f, --fingerprints <MACCS166Keys, Morgan, PathLength...>  [default: Morgan]
        Fingerprints to use for calculating similarity/distance between molecules.
        Supported values: AtomPairs, MACCS166Keys, Morgan, MorganFeatures, PathLength,
        TopologicalTorsions. The PathLength fingerprints are Daylight like fingerprints.
        The Morgan and MorganFeatures fingerprints are circular fingerprints, corresponding
        to Scitegic's Extended Connectivity Fingerprints (ECFP) and Features Connectivity
        Fingerprints (FCFP). The values of default parameters for generating fingerprints
        can be modified using '-p, --paramsFingerprints' option.
    -e, --examples
        Print examples.
    -h, --help
        Print this help message.
    -i, --infile <infile>
        Input file name.
    --infileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for reading
        molecules from files. The supported parameter names for different file
        formats, along with their default values, are shown below:
            
            SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
            SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
                smilesTitleLine,auto,sanitize,yes
            
        Possible values for smilesDelimiter: space, comma or tab.
    -m, --mode <MaxMin or HierarchicalClustering>  [default: MaxMin]
        Pick a diverse subset of molecules using MaxMin or hierarchical clustering
        methodology.
    -n, --numMols <number>  [default: 25]
        Number of diverse molecules to pick.
    -o, --outfile <outfile>
        Output file name.
    --outfileParams <Name,Value,...>  [default: auto]
        A comma delimited list of parameter name and value pairs for writing
        molecules to files. The supported parameter names for different file
        formats, along with their default values, are shown below:
            
            SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
            SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
                smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
            
        Default value for compute2DCoords: yes for SMILES input file; no for all other
        file types.
    --overwrite
        Overwrite existing files.
    -p, --paramsFingerprints <Name,Value,...>  [default: auto]
        Parameter values to use for generating fingerprints. The default values
        are dependent on the value of '-f, --fingerprints' option. In general, it is a
        comma delimited list of parameter name and value pairs for the name of
        the fingerprints specified using '-f, --fingerprints' option. The supported
        parameter names along with their default values for valid fingerprints
        names are shown below:
            
            AtomPairs: minLength,1 ,maxLength,30, useChirality,No
            Morgan: radius,2, useChirality,No
            MorganFeatures: radius,2, useChirality,No
            PathLength: minPath,1, maxPath,7, fpSize, 2048, bitsPerHash,2
            TopologicalTorsions: useChirality,No
            
    -s, --similarityMetric <Dice, Tanimoto...>  [default: Tanimoto]
        Similarity metric to use for calculating similarity/distance between molecules.
        Possible values: BraunBlanquet, Cosine, Dice, Kulczynski, RogotGoldberg,
        Russel, Sokal, Tanimoto.
    -w, --workingdir <dir>
        Location of working directory which defaults to the current directory.

Examples:
    To pick 25 diverse molecules using MaxMin methodology, Tanimoto similarity
    metric corresponding to Morgan fingerprints with radius of 2, and write
    out a SMILES file, type:

        % RDKitPickDiverseMolecules.py -i Sample.smi -o SampleOut.smi

    To pick 50 diverse molecules using MaxMin methodology, Dice similarity metric
    corresponding to PathLength fingerprints with max path length of 6, and write
    out a SD file, type:

        % RDKitPickDiverseMolecules.py -m MaxMin -f PathLength -s Dice -n 50
          -p 'maxPath,6' -i Sample.sdf -o SampleOut.sdf

    To pick 25 diverse molecules using Centroid hierarchical clustering methodology,
    Tanimoto similarity metric corresponding to Morgan fingerprints with radius of 2,
    and write out a SMILES file, type:

        % RDKitPickDiverseMolecules.py -m HierarchicalClustering -i Sample.smi
          -o SampleOut.smi

    To pick 50 diverse molecules using Ward hierarchical clustering methodology,
    Dice similarity metric corresponding to MorganFeatures fingerprints with radius
    of 2 along with deploying chirality, and write out a SD file, type:

        % RDKitPickDiverseMolecules.py -m HierarchicalClustering -c Ward -n 50
          -f MorganFeatures -p 'radius,2,useChirality,Yes' -i Sample.sdf -o
          SampleOut.sdf

    To pick 25 diverse molecules using MaxMin methodology, Tanimoto similarity
    metric corresponding to Morgan fingerprints with radius of 2 from a CSV SMILES
    file, SMILES strings in column 1, name in column 2, and write out a SD file, type:

        % RDKitPickDiverseMolecules.py --infileParams
          "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
          smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
          -i SampleSMILES.csv -o SampleOut.sdf

Author:
    Manish Sud(msud@san.rr.com)

See also:
    RDKitClusterMolecules.py, RDKitConvertFileFormat.py, RDKitSearchFunctionalGroups.py,
    RDKitSearchSMARTS.py

Copyright:
    Copyright (C) 2025 Manish Sud. All rights reserved.

    The functionality available in this script is implemented using RDKit, an
    open source toolkit for cheminformatics developed by Greg Landrum.

    This file is part of MayaChemTools.

    MayaChemTools is free software; you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License as published by the Free
    Software Foundation; either version 3 of the License, or (at your option) any
    later version.

"""

if __name__ == "__main__":
    main()