MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitPickDiverseMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2025 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem
  42     from rdkit import DataStructs
  43     from rdkit.Chem.Fingerprints import FingerprintMols
  44     from rdkit.Chem import rdMolDescriptors
  45     from rdkit.SimDivFilters import rdSimDivPickers
  46     from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
  47     from rdkit.SimDivFilters.rdSimDivPickers import HierarchicalClusterPicker
  48 except ImportError as ErrMsg:
  49     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  50     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  51     sys.exit(1)
  52 
  53 # MayaChemTools imports...
  54 try:
  55     from docopt import docopt
  56     import MiscUtil
  57     import RDKitUtil
  58 except ImportError as ErrMsg:
  59     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  60     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  61     sys.exit(1)
  62 
# Base name of this script as invoked on the command line; used in status messages.
ScriptName = os.path.basename(sys.argv[0])
# Raw command line options; rebound to the docopt result in RetrieveOptions().
Options = {}
# Processed option values and derived settings; filled in by ProcessOptions()
# and the Process*/Setup* functions it calls.
OptionsInfo = {}
  66 
  67 def main():
  68     """Start execution of the script."""
  69     
  70     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  71     
  72     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  73     
  74     # Retrieve command line arguments and options...
  75     RetrieveOptions()
  76     
  77     # Process and validate command line arguments and options...
  78     ProcessOptions()
  79     
  80     # Perform actions required by the script...
  81     PickDiverseMolecules()
  82     
  83     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  84     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  85 
  86 def PickDiverseMolecules():
  87     """Pick diverse molecules."""
  88 
  89     Mols = RetrieveMolecules()
  90     MolsFingerprints = GenerateFingerprints(Mols)
  91     DiverseMols = SelectMolecules(Mols, MolsFingerprints)
  92     
  93     WriteMolecules(DiverseMols)
  94 
  95 def SelectMolecules(Mols, MolsFingerprints):
  96     """Select diverse molecules."""
  97 
  98     if OptionsInfo["NumMols"] > len(Mols):
  99         MiscUtil.PrintError("The number of diverse molecules to pick, %d, specified using \"-n, --numMols\" must be less than total number of valid molecules, %d" % (OptionsInfo["NumMols"], len(Mols)))
 100     
 101     DiverseMols = []
 102     if re.match("^MaxMin$", OptionsInfo["Mode"], re.I):
 103         return SelectMoleculesUsingMaxMin(Mols, MolsFingerprints)
 104     elif re.match("^HierarchicalClustering$", OptionsInfo["Mode"], re.I):
 105         return SelectMoleculesUsingHierarchicalClustering(Mols, MolsFingerprints)
 106     else:
 107         MiscUtil.PrintError("The mode vaue, %s, is not a valid mode." % OptionsInfo["Mode"])
 108     
 109     return DiverseMols
 110 
 111 def SelectMoleculesUsingMaxMin(Mols, MolsFingerprints):
 112     """Select diverse molecules using MaxMin methodology."""
 113 
 114     MiscUtil.PrintInfo("\nSelecting diverse molecules using MaxMin methodology and %s similarity metric..." % OptionsInfo["SimilarityMetric"])
 115     
 116     DiverseMols = []
 117     
 118     PoolSize = len(MolsFingerprints)
 119     PickSize = OptionsInfo["NumMols"]
 120     SimilarityFunction = OptionsInfo["SimilarityFunction"]
 121 
 122     Picker = MaxMinPicker()
 123     PairwiseDistance = lambda i, j: 1 - SimilarityFunction(MolsFingerprints[i], MolsFingerprints[j])
 124 
 125     MolIndices = Picker.LazyPick(PairwiseDistance, PoolSize, PickSize)
 126             
 127     for Index in list(MolIndices):
 128         DiverseMols.append(Mols[Index])
 129     
 130     return DiverseMols
 131 
 132 def SelectMoleculesUsingHierarchicalClustering(Mols, MolsFingerprints):
 133     """Select diverse molecules using hierarchical clustering  methodology."""
 134 
 135     try:
 136         import numpy
 137     except ImportError:
 138         MiscUtil.PrintError("Failed to import numpy python module. This is required for picking diverse molecules using hierarchical for clustering.")
 139     
 140     MiscUtil.PrintInfo("\nSelecting diverse molecules using %s hierarchical clustering methodology..." % OptionsInfo["SpecifiedClusteringMethod"])
 141     
 142     DiverseMols = []
 143     
 144     PoolSize = len(MolsFingerprints)
 145     PickSize = OptionsInfo["NumMols"]
 146     DistanceMatrix = GenerateLowerTriangularDistanceMatrix(MolsFingerprints)
 147     
 148     ClusterPicker = HierarchicalClusterPicker(OptionsInfo["SpecifiedClusteringMethodID"])
 149     MolIndices = ClusterPicker.Pick(numpy.asarray(DistanceMatrix), PoolSize, PickSize)
 150     
 151     for Index in MolIndices:
 152         DiverseMols.append(Mols[Index])
 153     
 154     return DiverseMols
 155 
 156 def RetrieveMolecules():
 157     """Retrieve molecules."""
 158 
 159     Infile = OptionsInfo["Infile"]
 160     
 161     # Read molecules...
 162     MiscUtil.PrintInfo("\nReading file %s..." % Infile)
 163     
 164     OptionsInfo["InfileParams"]["AllowEmptyMols"] = False
 165     ValidMols, MolCount, ValidMolCount  = RDKitUtil.ReadAndValidateMolecules(Infile, **OptionsInfo["InfileParams"])
 166     
 167     MiscUtil.PrintInfo("Total number of molecules: %d" % MolCount)
 168     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 169     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 170 
 171     return ValidMols
 172 
 173 def GenerateFingerprints(Mols):
 174     """Generate fingerprints."""
 175 
 176     FingerprintsName = OptionsInfo["SpecifiedFingerprints"]
 177     
 178     MolsFingerprints = []
 179     if re.match("^AtomPairs$", FingerprintsName, re.I):
 180         return GenerateAtomPairsFingerprints(Mols)
 181     elif re.match("^MACCS166Keys$", FingerprintsName, re.I):
 182         return GenerateMACCS166KeysFingerprints(Mols)
 183     elif re.match("^Morgan$", FingerprintsName, re.I):
 184         return GenerateMorganFingerprints(Mols)
 185     elif re.match("^MorganFeatures$", FingerprintsName, re.I):
 186         return GenerateMorganFeaturesFingerprints(Mols)
 187     elif re.match("^PathLength$", FingerprintsName, re.I):
 188         return GeneratePathLengthFingerprints(Mols)
 189     elif re.match("^TopologicalTorsions$", FingerprintsName, re.I):
 190         return GenerateTopologicalTorsionsFingerprints(Mols)
 191     else:
 192         MiscUtil.PrintError("Fingerprints name, %s, is not a valid name" % FingerprintsName)
 193     
 194     return MolsFingerprints
 195 
 196 def GenerateAtomPairsFingerprints(Mols):
 197     """Generate AtomPairs fingerprints."""
 198 
 199     MiscUtil.PrintInfo("\nGenerating AtomPairs fingerprints...")
 200     
 201     MinLength = OptionsInfo["FingerprintsParams"]["AtomPairs"]["MinLength"]
 202     MaxLength = OptionsInfo["FingerprintsParams"]["AtomPairs"]["MaxLength"]
 203     UseChirality = OptionsInfo["FingerprintsParams"]["AtomPairs"]["UseChirality"]
 204 
 205     if OptionsInfo["GenerateBitVectFingerints"]:
 206         # Generate ExplicitBitVect fingerprints...
 207         FPSize = 2048
 208         BitsPerHash = 4
 209         MolsFingerprints = [rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(Mol, minLength = MinLength, maxLength = MaxLength, includeChirality = UseChirality, nBits = FPSize, nBitsPerEntry = BitsPerHash) for Mol in Mols]
 210     else:
 211         # Generate IntSparseIntVect fingerprints...
 212         MolsFingerprints = [rdMolDescriptors.GetAtomPairFingerprint(Mol, minLength = MinLength, maxLength = MaxLength, includeChirality = UseChirality) for Mol in Mols]
 213 
 214     return MolsFingerprints
 215 
 216 def GenerateMACCS166KeysFingerprints(Mols):
 217     """Generate MACCS166Keys fingerprints."""
 218 
 219     MiscUtil.PrintInfo("\nGenerating MACCS166Keys fingerprints...")
 220 
 221     # Generate ExplicitBitVect fingerprints...
 222     MolsFingerprints = [rdMolDescriptors.GetMACCSKeysFingerprint(Mol) for Mol in Mols]
 223 
 224     return MolsFingerprints
 225 
 226 def GenerateMorganFingerprints(Mols):
 227     """Generate Morgan fingerprints."""
 228 
 229     MiscUtil.PrintInfo("\nGenerating  Morgan fingerprints...")
 230     
 231     Radius = OptionsInfo["FingerprintsParams"]["Morgan"]["Radius"]
 232     UseChirality = OptionsInfo["FingerprintsParams"]["Morgan"]["UseChirality"]
 233     UseFeatures = False
 234 
 235     if OptionsInfo["GenerateBitVectFingerints"]:
 236         # Generate ExplicitBitVect fingerprints...
 237         FPSize = 2048
 238         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprintAsBitVect(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality, nBits = FPSize) for Mol in Mols]
 239     else:
 240         # Generate UIntSparseIntVect fingerprints...
 241         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprint(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality) for Mol in Mols]
 242 
 243     return MolsFingerprints
 244 
 245 def GenerateMorganFeaturesFingerprints(Mols):
 246     """Generate MorganFeatures fingerprints."""
 247 
 248     MiscUtil.PrintInfo("\nGenerating  MorganFeatures fingerprints...")
 249     
 250     # Setup fingerprints parameters...
 251     Radius = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["Radius"]
 252     UseChirality = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["UseChirality"]
 253     UseFeatures = True
 254     
 255     if OptionsInfo["GenerateBitVectFingerints"]:
 256         # Generate ExplicitBitVect fingerprints...
 257         FPSize = 2048
 258         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprintAsBitVect(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality, nBits = FPSize) for Mol in Mols]
 259     else:
 260         # Generate UIntSparseIntVect fingerprints...
 261         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprint(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality) for Mol in Mols]
 262 
 263     return MolsFingerprints
 264 
 265 def GeneratePathLengthFingerprints(Mols):
 266     """Generate PathLength fingerprints."""
 267 
 268     MiscUtil.PrintInfo("\nGenerating PathLength fingerprints ...")
 269     
 270     MinPath = OptionsInfo["FingerprintsParams"]["PathLength"]["MinPath"]
 271     MaxPath = OptionsInfo["FingerprintsParams"]["PathLength"]["MaxPath"]
 272     FPSize = OptionsInfo["FingerprintsParams"]["PathLength"]["FPSize"]
 273     BitsPerHash = OptionsInfo["FingerprintsParams"]["PathLength"]["BitsPerHash"]
 274     UseHs = False
 275     TargetDensity = 0.3
 276     MinSize = 54
 277 
 278     # Generate ExplicitBitVect fingerprints...
 279     MolsFingerprints = [FingerprintMols.FingerprintMol(Mol, minPath = MinPath, maxPath = MaxPath, fpSize = FPSize, bitsPerHash = BitsPerHash, useHs = UseHs, tgtDensity = TargetDensity, minSize = MinSize) for Mol in Mols]
 280 
 281     return MolsFingerprints
 282 
 283 def GenerateTopologicalTorsionsFingerprints(Mols):
 284     """Generate TopologicalTorsions fingerprints."""
 285 
 286     MiscUtil.PrintInfo("\nGenerating TopologicalTorsions fingerprints...")
 287     
 288     UseChirality = OptionsInfo["FingerprintsParams"]["TopologicalTorsions"]["UseChirality"]
 289 
 290     if OptionsInfo["GenerateBitVectFingerints"]:
 291         FPSize = 2048
 292         BitsPerHash = 4
 293         MolsFingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(Mol,  includeChirality = UseChirality, nBits = FPSize, nBitsPerEntry = BitsPerHash) for Mol in Mols]
 294     else:
 295         # Generate LongSparseIntVect fingerprint...
 296         MolsFingerprints = [rdMolDescriptors.GetTopologicalTorsionFingerprint(Mol,  includeChirality = UseChirality) for Mol in Mols]
 297 
 298     return MolsFingerprints
 299 
 300 def GenerateLowerTriangularDistanceMatrix(MolsFingerprints):
 301     """Generate a lower triangular distance matrix without the diagonal."""
 302 
 303     SimilarityFunction = OptionsInfo["SimilarityFunction"]
 304 
 305     DistanceMatrix = []
 306     NumFPs = len(MolsFingerprints)
 307     for Index1 in range(0, NumFPs):
 308         for Index2 in range(0, Index1):
 309             Distance =  1 - SimilarityFunction(MolsFingerprints[Index1], MolsFingerprints[Index2],)
 310             DistanceMatrix.append(Distance)
 311 
 312     return DistanceMatrix
 313 
 314 def WriteMolecules(Mols):
 315     """Write out molecules."""
 316 
 317     Outfile = OptionsInfo["Outfile"]
 318     
 319     # Set up a molecule writer...
 320     Writer = None
 321     Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 322     if Writer is None:
 323         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 324     MiscUtil.PrintInfo("\nGenerating file %s...\n" % Outfile)
 325 
 326     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 327     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 328     
 329     # Write out molecules...
 330     FirstMol = True
 331     for Mol in Mols:
 332         if FirstMol:
 333             FirstMol = False
 334             if SetSMILESMolProps:
 335                 RDKitUtil.SetWriterMolProps(Writer, Mol)
 336                 RDKitUtil.SetWriterMolProps(Writer, Mol)
 337         
 338         if Compute2DCoords:
 339             AllChem.Compute2DCoords(Mol)
 340         Writer.write(Mol)
 341         
 342     if Writer is not None:
 343         Writer.close()
 344         
 345     MiscUtil.PrintInfo("Total number of diverse molecules selected: %d" % (len(Mols)))
 346 
 347 def ProcessFingerprintsParameters():
 348     """Set up and process fingerprints parameters."""
 349 
 350     SetupFingerprintsNamesAndParameters()
 351     ProcessSpecifiedFingerprintsName()
 352     ProcessSpecifiedFingerprintsParameters()
 353 
 354 def SetupFingerprintsNamesAndParameters():
 355     """Set up fingerprints parameters."""
 356     
 357     OptionsInfo["FingerprintsNames"] = ["AtomPairs", "MACCS166Keys", "Morgan", "MorganFeatures", "PathLength", "TopologicalTorsions"]
 358     
 359     OptionsInfo["FingerprintsParams"] = {}
 360     OptionsInfo["FingerprintsParams"]["AtomPairs"] = {"MinLength": 1, "MaxLength": 30, "UseChirality": False}
 361     OptionsInfo["FingerprintsParams"]["MACCS166Keys"] = {}
 362     OptionsInfo["FingerprintsParams"]["Morgan"] = {"Radius": 2, "UseChirality": False}
 363     OptionsInfo["FingerprintsParams"]["MorganFeatures"] = {"Radius": 2, "UseChirality": False}
 364     OptionsInfo["FingerprintsParams"]["TopologicalTorsions"] = {"UseChirality": False}
 365     OptionsInfo["FingerprintsParams"]["PathLength"] = {"MinPath": 1, "MaxPath": 7, "FPSize": 2048, "BitsPerHash": 2}
 366 
 367 def ProcessSpecifiedFingerprintsName():
 368     """Process specified fingerprints name."""
 369 
 370     #  Set up a canonical fingerprints name map...
 371     CanonicalFingerprintsNamesMap = {}
 372     for Name in OptionsInfo["FingerprintsNames"]:
 373         CanonicalName = Name.lower()
 374         CanonicalFingerprintsNamesMap[CanonicalName] = Name
 375 
 376     # Validate specified fingerprints name...
 377     CanonicalFingerprintsName = OptionsInfo["Fingerprints"].lower()
 378     if CanonicalFingerprintsName not in CanonicalFingerprintsNamesMap:
 379         MiscUtil.PrintError("The fingerprints name, %s, specified using \"-f, --fingerprints\" option is not a valid name." % (OptionsInfo["Fingerprints"]))
 380     
 381     OptionsInfo["SpecifiedFingerprints"] = CanonicalFingerprintsNamesMap[CanonicalFingerprintsName]
 382 
 383 def ProcessSpecifiedFingerprintsParameters():
 384     """Process specified fingerprints parameters."""
 385 
 386     if re.match("^auto$", OptionsInfo["ParamsFingerprints"], re.I):
 387         # Nothing to process...
 388         return
 389 
 390     SpecifiedFingerprintsName = OptionsInfo["SpecifiedFingerprints"]
 391     
 392     # Parse specified fingerprints parameters...
 393     ParamsFingerprints = re.sub(" ", "", OptionsInfo["ParamsFingerprints"])
 394     if not ParamsFingerprints:
 395         MiscUtil.PrintError("No valid parameter name and value pairs specified using \"-p, --paramsFingerprints\" option corrresponding to fingerprints %s." % (SpecifiedFingerprintsName))
 396 
 397     ParamsFingerprintsWords = ParamsFingerprints.split(",")
 398     if len(ParamsFingerprintsWords) % 2:
 399         MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"-p, --paramsFingerprints\" option must be an even number." % (len(ParamsFingerprintsWords)))
 400 
 401     # Setup a canonical parameter names for specified fingerprints...
 402     ValidParamNames = []
 403     CanonicalParamNamesMap = {}
 404     for ParamName in sorted(OptionsInfo["FingerprintsParams"][SpecifiedFingerprintsName]):
 405         ValidParamNames.append(ParamName)
 406         CanonicalParamNamesMap[ParamName.lower()] = ParamName
 407 
 408     # Validate and set paramater names and value...
 409     for Index in range(0, len(ParamsFingerprintsWords), 2):
 410         Name = ParamsFingerprintsWords[Index]
 411         Value = ParamsFingerprintsWords[Index + 1]
 412 
 413         CanonicalName = Name.lower()
 414         if  not CanonicalName in CanonicalParamNamesMap:
 415             MiscUtil.PrintError("The parameter name, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid name. Supported parameter names: %s" % (Name, SpecifiedFingerprintsName, " ".join(ValidParamNames)))
 416 
 417         ParamName = CanonicalParamNamesMap[CanonicalName]
 418         if re.match("^UseChirality$", ParamName, re.I):
 419             if not re.match("^(Yes|No|True|False)$", Value, re.I):
 420                 MiscUtil.PrintError("The parameter value, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid value. Supported values: Yes No True False" % (Value, SpecifiedFingerprintsName))
 421             ParamValue = False
 422             if re.match("^(Yes|True)$", Value, re.I):
 423                 ParamValue = True
 424         else:
 425             ParamValue = int(Value)
 426             if ParamValue <= 0:
 427                 MiscUtil.PrintError("The parameter value, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid value. Supported values: > 0" % (Value, SpecifiedFingerprintsName))
 428         
 429         # Set value...
 430         OptionsInfo["FingerprintsParams"][SpecifiedFingerprintsName][ParamName] = ParamValue
 431 
 432 def ProcessSimilarityMetricParameter():
 433     """Process specified similarity metric value."""
 434 
 435     SimilarityInfoMap = {}
 436     CanonicalNameMap = {}
 437     
 438     for SimilarityFunctionInfo in DataStructs.similarityFunctions:
 439         Name = SimilarityFunctionInfo[0]
 440         Function = SimilarityFunctionInfo[1]
 441         
 442         SimilarityInfoMap[Name] = Function
 443         CanonicalName = Name.lower()
 444         CanonicalNameMap[CanonicalName] = Name
 445     
 446     SpecifiedCanonicalName = OptionsInfo["SimilarityMetric"].lower()
 447     SimilarityFunction = None
 448     if  SpecifiedCanonicalName in CanonicalNameMap:
 449         SimilarityName = CanonicalNameMap[SpecifiedCanonicalName]
 450         SimilarityFunction = SimilarityInfoMap[SimilarityName]
 451     else:
 452         MiscUtil.PrintError("Similarity metric name, %s, is not a valid name. " % OptionsInfo["SimilarityMetric"])
 453         
 454     OptionsInfo["SimilarityMetric"] = SimilarityName
 455     OptionsInfo["SimilarityFunction"] = SimilarityFunction
 456 
 457     # RDKit similarity functions, besides Dice and Tanimoto, are not able to handle int bit vectors...
 458     GenerateBitVectFingerints = False
 459     if not re.match("^(Tanimoto|Dice)$", SimilarityName, re.I):
 460         GenerateBitVectFingerints = True
 461     OptionsInfo["GenerateBitVectFingerints"] = GenerateBitVectFingerints
 462     
 463 def ProcessClusteringMethodParameter():
 464     """Process specified clustering method parameter."""
 465 
 466     OptionsInfo["SpecifiedClusteringMethod"] = ""
 467     OptionsInfo["SpecifiedClusteringMethodID"] = ""
 468     
 469     if not re.match("^HierarchicalClustering$", OptionsInfo["Mode"], re.I):
 470         # Nothing to process...
 471         return
 472 
 473     # Setup a canonical cluster method name map..
 474     ClusteringMethodInfoMap = {}
 475     CanonicalClusteringMethodNameMap = {}
 476     for Name in sorted(rdSimDivPickers.ClusterMethod.names):
 477         NameID =  rdSimDivPickers.ClusterMethod.names[Name]
 478         ClusteringMethodInfoMap[Name] = NameID
 479         
 480         CanonicalName = Name.lower()
 481         CanonicalClusteringMethodNameMap[CanonicalName] = Name
 482 
 483     CanonicalName = OptionsInfo["ClusteringMethod"].lower()
 484     if not CanonicalName in CanonicalClusteringMethodNameMap:
 485         MiscUtil.PrintError("The cluster method, %s, specified using \"-c, --clusteringMethod\" option is not a valid name." % (OptionsInfo["ClusteringMethod"]))
 486 
 487     SpecifiedClusteringMethodName = CanonicalClusteringMethodNameMap[CanonicalName]
 488     OptionsInfo["SpecifiedClusteringMethod"] = SpecifiedClusteringMethodName
 489     OptionsInfo["SpecifiedClusteringMethodID"] = ClusteringMethodInfoMap[SpecifiedClusteringMethodName] 
 490     
 491 def ProcessOptions():
 492     """Process and validate command line arguments and options."""
 493     
 494     MiscUtil.PrintInfo("Processing options...")
 495     
 496     # Validate options...
 497     ValidateOptions()
 498     
 499     OptionsInfo["Mode"] = Options["--mode"]
 500     OptionsInfo["Fingerprints"] = Options["--fingerprints"]
 501     
 502     OptionsInfo["ClusteringMethod"] = Options["--clusteringMethod"]
 503     ProcessClusteringMethodParameter()
 504 
 505     OptionsInfo["NumMols"] = int(Options["--numMols"])
 506     
 507     OptionsInfo["Infile"] = Options["--infile"]
 508     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 509     
 510     OptionsInfo["Outfile"] = Options["--outfile"]
 511     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 512     
 513     OptionsInfo["Overwrite"] = Options["--overwrite"]
 514 
 515     OptionsInfo["SimilarityMetric"] = Options["--similarityMetric"]
 516     ProcessSimilarityMetricParameter()
 517     
 518     OptionsInfo["ParamsFingerprints"] = Options["--paramsFingerprints"]
 519     ProcessFingerprintsParameters()
 520     
 521 def RetrieveOptions():
 522     """Retrieve command line arguments and options."""
 523     
 524     # Get options...
 525     global Options
 526     Options = docopt(_docoptUsage_)
 527     
 528     # Set current working directory to the specified directory...
 529     WorkingDir = Options["--workingdir"]
 530     if WorkingDir:
 531         os.chdir(WorkingDir)
 532     
 533     # Handle examples option...
 534     if "--examples" in Options and Options["--examples"]:
 535         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 536         sys.exit(0)
 537 
 538 def ValidateOptions():
 539     """Validate option values."""
 540     
 541     MiscUtil.ValidateOptionTextValue("-c, --clusteringMethod", Options["--clusteringMethod"], "Centroid CLink Gower McQuitty SLink UPGMA Ward")
 542     MiscUtil.ValidateOptionTextValue("-f, --fingerprints", Options["--fingerprints"], "AtomPairs MACCS166Keys Morgan MorganFeatures PathLength TopologicalTorsions")
 543     
 544     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "MaxMin HierarchicalClustering")
 545     MiscUtil.ValidateOptionIntegerValue("-n, --numMols", Options["--numMols"], {">": 0})
 546     
 547     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 548     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 549     
 550     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 551     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 552     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 553         
 554     MiscUtil.ValidateOptionTextValue("-s, --similarityMetric", Options["--similarityMetric"], "BraunBlanquet Cosine Dice Kulczynski RogotGoldberg Russel Sokal Tanimoto")
 555     
 556 # Setup a usage string for docopt...
 557 _docoptUsage_ = """
 558 RDKitPickDiverseMolecules.py - Pick a diverse subset of molecules
 559 
 560 Usage:
 561     RDKitPickDiverseMolecules.py [--clusteringMethod <Centroid, CLink...>]
 562                                  [--fingerprints <MACCS166Keys, Morgan, PathLength...>]
 563                                  [--infileParams <Name,Value,...>] [--mode <MaxMin or HierarchicalClustering>]
 564                                  [--numMols <number>]  [--outfileParams <Name,Value,...>] 
 565                                  [--overwrite] [--paramsFingerprints <Name,Value,...>]
 566                                  [--similarityMetric <Dice, Tanimoto...>] [-w <dir>] -i <infile> -o <outfile> 
 567     RDKitPickDiverseMolecules.py -h | --help | -e | --examples
 568 
 569 Description:
 570     Pick a subset of diverse molecules  based on a variety of 2D fingerprints using
 571     MaxMin [ Ref 135 ] or an available hierarchical clustering methodology and write
 572     them to a file.
 573 
 574     The default fingerprints types for various fingerprints are shown below:
 575 
 576         AtomPairs              IntSparseIntVect
 577         MACCS166Keys           ExplicitBitVect
 578         Morgan                 UIntSparseIntVect
 579         MorganFeatures         UIntSparseIntVect
 580         PathLength             ExplicitBitVect
 581         TopologicalTorsions    LongSparseIntVect
 582  
 583     The Dice and Tanimoto similarity functions available in RDKit are able to
 584     handle fingerprints corresponding to both IntVect and BitVect. All other
 585     similarity functions, however, expect BitVect fingerprints to calculate
 586     pairwise similarity. Consequently, ExplicitBitVect fingerprints are generated
 587     for AtomPairs, Morgan, MorganFeatures, and TopologicalTorsions for
 588     similarity calculations instead of default IntVect fingerprints.
 589 
 590     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt)
 591 
 592     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 593 
 594 Options:
 595     -c, --clusteringMethod <Centroid, CLink...>  [default: Centroid]
 596         Clustering method to use for picking a subset of diverse molecules during
 597         hierarchical clustering. Supported values: Centroid, CLink, Gower,
 598         McQuitty, SLink, UPGMA, Ward. This option is ignored for 'MaxMin' value
 599         of '-m, --mode' option. The Clink and SLink corresponding to CompleteLink
 600         and SingleLink cluster method.
 601     -f, --fingerprints <MACCS166Keys, Morgan, PathLength...>  [default: Morgan]
 602         Fingerprints to use for calculating similarity/distance between molecules.
 603         Supported values: AtomPairs, MACCS166Keys, Morgan, MorganFeatures, PathLength,
 604         TopologicalTorsions. The PathLength fingerprints are Daylight like fingerprints.
 605         The Morgan and MorganFeature fingerprints are circular fingerprints, corresponding
 606         Scitegic's Extended Connectivity Fingerprints (ECFP) and Features Connectivity
 607         Fingerprints (FCFP). The values of default parameters for generating fingerprints
 608         can be modified using '-p, --paramsFingerprints' option.
 609     -e, --examples
 610         Print examples.
 611     -h, --help
 612         Print this help message.
 613     -i, --infile <infile>
 614         Input file name.
 615     --infileParams <Name,Value,...>  [default: auto]
 616         A comma delimited list of parameter name and value pairs for reading
 617         molecules from files. The supported parameter names for different file
 618         formats, along with their default values, are shown below:
 619             
 620             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 621             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 622                 smilesTitleLine,auto,sanitize,yes
 623             
 624         Possible values for smilesDelimiter: space, comma or tab.
 625     -m, --mode <MaxMin or HierarchicalClustering>  [default: MaxMin]
 626         Pick a diverse subset of molecules using MaxMin or hierarchical clustering
 627         methodology.
 628     -n, --numMols <number>  [default: 25]
 629         Number of diverse molecules to pick.
 630     -o, --outfile <outfile>
 631         Output file name.
 632     --outfileParams <Name,Value,...>  [default: auto]
 633         A comma delimited list of parameter name and value pairs for writing
 634         molecules to files. The supported parameter names for different file
 635         formats, along with their default values, are shown below:
 636             
 637             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 638             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 639                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 640             
 641         Default value for compute2DCoords: yes for SMILES input file; no for all other
 642         file types.
 643     --overwrite
 644         Overwrite existing files.
 645     -p, --paramsFingerprints <Name,Value,...>  [default: auto]
 646         Parameter values to use for generating fingerprints. The default values
 647         are dependent on the value of '-f, --fingerprints' option. In general, it is a
 648         comma delimited list of parameter name and value pairs for the name of
 649         the fingerprints specified using '-f, --fingerprints' option. The supported
 650         parameter names along with their default values for valid fingerprints
 651         names are shown below:
 652             
 653             AtomPairs: minLength,1, maxLength,30, useChirality,No
 654             Morgan:   radius,2, useChirality,No
 655             MorganFeatures:   radius,2, useChirality,No
 656             PathLength: minPath,1, maxPath,7, fpSize, 2048, bitsPerHash,2
 657             TopologicalTorsions: useChirality,No
 658             
 659     -s, --similarityMetric <Dice, Tanimoto...>  [default: Tanimoto]
 660         Similarity metric to use for calculating similarity/distance between molecules.
 661         Possible values: BraunBlanquet, Cosine, Dice, Kulczynski, RogotGoldberg,
 662         Russel, Sokal, Tanimoto.
 663     -w, --workingdir <dir>
 664         Location of working directory which defaults to the current directory.
 665 
 666 Examples:
 667     To pick 25 diverse molecules using MaxMin methodology, Tanimoto similarity
 668     metric corresponding to Morgan fingerprints with radius of 2, and write
 669     out a SMILES file, type:
 670 
 671         % RDKitPickDiverseMolecules.py  -i Sample.smi -o SampleOut.smi
 672 
 673     To pick 50 diverse molecules using MaxMin methodology, Dice similarity metric
 674     corresponding to PathLength fingerprints with max path length of 6, and write
 675     out a SD file, type:
 676 
 677         % RDKitPickDiverseMolecules.py  -m MaxMin -f PathLength -s Dice -n 50
 678           -p 'maxPath,6' -i Sample.sdf -o SampleOut.sdf
 679 
 680     To pick 25 diverse molecules using Centroid hierarchical clustering methodology,
 681     Tanimoto similarity metric corresponding to Morgan fingerprints with radius of 2,
 682     and write out a SMILES file, type:
 683 
 684         % RDKitPickDiverseMolecules.py  -m HierarchicalClustering -i Sample.smi
 685           -o SampleOut.smi
 686 
 687     To pick 50 diverse molecules using Ward hierarchical clustering methodology,
 688     Dice similarity metric corresponding to MorganFeatures fingerprints with radius
 689     of 2 along with deploying chirality, and write out a SD file, type:
 690 
 691         % RDKitPickDiverseMolecules.py  -m HierarchicalClustering -c Ward -n 50
 692           -f MorganFeatures -p 'radius,2,useChirality,Yes' -i Sample.sdf -o
 693           SampleOut.sdf
 694 
 695     To pick 25 diverse molecules using MaxMin methodology, Tanimoto similarity
 696     metric corresponding to Morgan fingerprints with radius of 2 from a CSV SMILES
 697     file, SMILES strings in column 1, name in column 2, and write out a SD file, type:
 698 
 699         % RDKitPickDiverseMolecules.py  --infileParams
 700           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 701           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 702           -i SampleSMILES.csv -o SampleOut.sdf
 703 
 704 Author:
 705     Manish Sud(msud@san.rr.com)
 706 
 707 See also:
 708     RDKitClusterMolecules.py, RDKitConvertFileFormat.py, RDKitSearchFunctionalGroups.py,
 709     RDKitSearchSMARTS.py
 710 
 711 Copyright:
 712     Copyright (C) 2025 Manish Sud. All rights reserved.
 713 
 714     The functionality available in this script is implemented using RDKit, an
 715     open source toolkit for cheminformatics developed by Greg Landrum.
 716 
 717     This file is part of MayaChemTools.
 718 
 719     MayaChemTools is free software; you can redistribute it and/or modify it under
 720     the terms of the GNU Lesser General Public License as published by the Free
 721     Software Foundation; either version 3 of the License, or (at your option) any
 722     later version.
 723 
 724 """
 725 
 726 if __name__ == "__main__":  # Entry-point guard: run main() only when executed as a script, not on import.
 727     main()