MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitClusterMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem
  42     from rdkit import DataStructs
  43     from rdkit.Chem.Fingerprints import FingerprintMols
  44     from rdkit.Chem import rdMolDescriptors
  45     from rdkit.ML.Cluster import Butina
  46     from rdkit.SimDivFilters import rdSimDivPickers
  47     from rdkit.SimDivFilters.rdSimDivPickers import HierarchicalClusterPicker
  48 except ImportError as ErrMsg:
  49     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  50     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  51     sys.exit(1)
  52 
  53 # MayaChemTools imports...
  54 try:
  55     from docopt import docopt
  56     import MiscUtil
  57     import RDKitUtil
  58 except ImportError as ErrMsg:
  59     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  60     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  61     sys.exit(1)
  62 
  63 ScriptName = os.path.basename(sys.argv[0])
  64 Options = {}
  65 OptionsInfo = {}
  66 
  67 def main():
  68     """Start execution of the script."""
  69     
  70     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  71     
  72     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  73     
  74     # Retrieve command line arguments and options...
  75     RetrieveOptions()
  76     
  77     # Process and validate command line arguments and options...
  78     ProcessOptions()
  79     
  80     # Perform actions required by the script...
  81     ClusterMolecules()
  82     
  83     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  84     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  85 
  86 def ClusterMolecules():
  87     """Cluster molecules."""
  88 
  89     Mols = RetrieveMolecules()
  90     MolsFingerprints = GenerateFingerprints(Mols)
  91     MolsClusters = PerformClustering(Mols, MolsFingerprints)
  92     
  93     WriteMolecules(MolsClusters)
  94 
  95 def PerformClustering(Mols, MolsFingerprints):
  96     """Perform clustering."""
  97 
  98     ClusteredMols = []
  99     if re.match("^Butina$", OptionsInfo["ClusteringMethod"], re.I):
 100         return PerformButinaClustering(Mols, MolsFingerprints)
 101     else:
 102         return PerformHierarchicalClustering(Mols, MolsFingerprints)
 103     
 104     return ClusteredMols
 105 
 106 def PerformButinaClustering(Mols, MolsFingerprints):
 107     """Perform clustering using Butina methodology."""
 108 
 109     MiscUtil.PrintInfo("\nClustering molecules using Butina methodology and %s similarity metric..." % OptionsInfo["SimilarityMetric"])
 110     
 111     FingerprintsCount = len(MolsFingerprints)
 112     DistanceCutoff =  1 - OptionsInfo["ButinaSimilarityCutoff"]
 113     Reordering = OptionsInfo["ButinaReordering"]
 114     
 115     DistanceMatrix = GenerateLowerTriangularDistanceMatrix(MolsFingerprints)
 116 
 117     ClusteredMolIndices = Butina.ClusterData(DistanceMatrix, FingerprintsCount, DistanceCutoff, reordering = Reordering, isDistData = True)
 118 
 119     MolsClusters = []
 120     for Cluster in ClusteredMolIndices:
 121         MolsCluster = [Mols[MolIndex] for MolIndex in Cluster]
 122         MolsClusters.append(MolsCluster)
 123 
 124     return MolsClusters
 125 
 126 def PerformHierarchicalClustering(Mols, MolsFingerprints):
 127     """Perform hierarchical clustering."""
 128 
 129     try:
 130         import numpy
 131     except ImportError:
 132         MiscUtil.PrintError("Failed to import numpy python module. This is required to cluster molecules using hierarchical clustering methodology.")
 133     
 134     if OptionsInfo["NumClusters"] > len(Mols):
 135         MiscUtil.PrintError("The number of clusters, %d, specified using \"-n, --numClusters\" must be less than total number of valid molecules, %d" % (OptionsInfo["NumClusters"], len(Mols)))
 136     
 137     MiscUtil.PrintInfo("\nCluster molecules using %s hierarchical clustering methodology and %s similarity metric..." % (OptionsInfo["SpecifiedHierarchicalClusteringMethod"], OptionsInfo["SimilarityMetric"]))
 138     
 139     NumFingerprints = len(MolsFingerprints)
 140     NumClusters = OptionsInfo["NumClusters"]
 141     DistanceMatrix = GenerateLowerTriangularDistanceMatrix(MolsFingerprints)
 142     
 143     ClusterPicker = HierarchicalClusterPicker(OptionsInfo["SpecifiedHierarchicalClusteringMethodID"])
 144     ClusteredMolIndices = ClusterPicker.Cluster(numpy.asarray(DistanceMatrix), NumFingerprints, NumClusters)
 145 
 146     MolsClusters = []
 147     for Cluster in ClusteredMolIndices:
 148         MolsCluster = [Mols[MolIndex] for MolIndex in Cluster]
 149         MolsClusters.append(MolsCluster)
 150     
 151     return MolsClusters
 152 
 153 def WriteMolecules(MolsClusters):
 154     """Write out molecules for each cluster along with cluster numbers."""
 155 
 156     ClustersCount = len(MolsClusters)
 157     
 158     SingleOutFileMode = OptionsInfo["SingleOutFileMode"]
 159     TextOutFileMode = OptionsInfo["TextOutFileMode"]
 160     TextOutFileDelim = OptionsInfo["TextOutFileDelim"]
 161 
 162     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 163     
 164     SMILESIsomeric = OptionsInfo["OutfileParams"]["SMILESIsomeric"]
 165     SMILESKekulize = OptionsInfo["OutfileParams"]["SMILESKekulize"]
 166     
 167     # Setup outfile names and writers...
 168     SetupClustersOutFilesNames(len(MolsClusters))
 169     SingleClusterWriter, ClustersOutfilesWriters = SetupMoleculeWriters(ClustersCount)
 170 
 171     MolCount = 0
 172     SingleMolClustersCount = 0
 173     
 174     if SingleOutFileMode:
 175         Writer = SingleClusterWriter
 176     
 177     for ClusterIndex in range(0, ClustersCount):
 178         MolsCluster = MolsClusters[ClusterIndex]
 179         ClusterNum = ClusterIndex + 1
 180 
 181         if len(MolsCluster) == 1:
 182             SingleMolClustersCount += 1
 183         
 184         if not SingleOutFileMode:
 185             Writer = ClustersOutfilesWriters[ClusterIndex]
 186             
 187         for Mol in MolsCluster:
 188             MolCount += 1
 189 
 190             if TextOutFileMode:
 191                 # Write out text file including SMILES file...
 192                 SMILES = Chem.MolToSmiles(Mol, isomericSmiles = SMILESIsomeric, kekuleSmiles = SMILESKekulize)
 193                 MolName = RDKitUtil.GetMolName(Mol, MolCount)
 194                 Line = TextOutFileDelim.join([SMILES, MolName, "%d" % ClusterNum])
 195                 Writer.write("%s\n" % Line)
 196             else:
 197                 # Write out SD file...
 198                 Mol.SetProp("ClusterNumber", "%s" % ClusterNum)
 199                 if Compute2DCoords:
 200                     AllChem.Compute2DCoords(Mol)
 201                 Writer.write(Mol)
 202     
 203     if SingleClusterWriter is not None:
 204         SingleClusterWriter.close()
 205     for ClusterOutfileWriter in ClustersOutfilesWriters:
 206         ClusterOutfileWriter.close()
 207 
 208     MiscUtil.PrintInfo("\nTotal number of clusters: %d" % ClustersCount)
 209 
 210     if ClustersCount > 0:
 211         MiscUtil.PrintInfo("\nNumber of clusters containing only a single molecule: %d" % SingleMolClustersCount)
 212         MiscUtil.PrintInfo("Average number of molecules per cluster: %.1f" % (MolCount/ClustersCount))
 213     
 214         MiscUtil.PrintInfo("\nNumber of molecules in each cluster:")
 215         MiscUtil.PrintInfo("ClusterNumber,MolCount")
 216         ClusterNum = 0
 217         for MolsCluster in MolsClusters:
 218             ClusterNum += 1
 219             MiscUtil.PrintInfo("%d,%d" % (ClusterNum, len(MolsCluster)))
 220 
 221 def RetrieveMolecules():
 222     """Retrieve molecules."""
 223 
 224     Infile = OptionsInfo["Infile"]
 225     
 226     # Read molecules...
 227     MiscUtil.PrintInfo("\nReading file %s..." % Infile)
 228     OptionsInfo["InfileParams"]["AllowEmptyMols"] = False
 229     ValidMols, MolCount, ValidMolCount  = RDKitUtil.ReadAndValidateMolecules(Infile, **OptionsInfo["InfileParams"])
 230     
 231     MiscUtil.PrintInfo("Total number of molecules: %d" % MolCount)
 232     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 233     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 234 
 235     return ValidMols
 236 
 237 def GenerateFingerprints(Mols):
 238     """Generate fingerprints."""
 239 
 240     FingerprintsName = OptionsInfo["SpecifiedFingerprints"]
 241     
 242     MolsFingerprints = []
 243     if re.match("^AtomPairs$", FingerprintsName, re.I):
 244         return GenerateAtomPairsFingerprints(Mols)
 245     elif re.match("^MACCS166Keys$", FingerprintsName, re.I):
 246         return GenerateMACCS166KeysFingerprints(Mols)
 247     elif re.match("^Morgan$", FingerprintsName, re.I):
 248         return GenerateMorganFingerprints(Mols)
 249     elif re.match("^MorganFeatures$", FingerprintsName, re.I):
 250         return GenerateMorganFeaturesFingerprints(Mols)
 251     elif re.match("^PathLength$", FingerprintsName, re.I):
 252         return GeneratePathLengthFingerprints(Mols)
 253     elif re.match("^TopologicalTorsions$", FingerprintsName, re.I):
 254         return GenerateTopologicalTorsionsFingerprints(Mols)
 255     else:
 256         MiscUtil.PrintError("Fingerprints name, %s, is not a valid name" % FingerprintsName)
 257     
 258     return MolsFingerprints
 259 
 260 def GenerateAtomPairsFingerprints(Mols):
 261     """Generate AtomPairs fingerprints."""
 262 
 263     MiscUtil.PrintInfo("\nGenerating AtomPairs %s fingerprints..." % OptionsInfo["SpecifiedFingerprintsType"])
 264     
 265     MinLength = OptionsInfo["FingerprintsParams"]["AtomPairs"]["MinLength"]
 266     MaxLength = OptionsInfo["FingerprintsParams"]["AtomPairs"]["MaxLength"]
 267     UseChirality = OptionsInfo["FingerprintsParams"]["AtomPairs"]["UseChirality"]
 268     FPSize = OptionsInfo["FingerprintsParams"]["AtomPairs"]["FPSize"]
 269     BitsPerHash = OptionsInfo["FingerprintsParams"]["AtomPairs"]["BitsPerHash"]
 270 
 271     if re.match("^BitVect$", OptionsInfo["SpecifiedFingerprintsType"], re.I):
 272         # Generate ExplicitBitVect fingerprints...
 273         MiscUtil.PrintInfo("FPSize: %s; BitsPerHash: %s" % (FPSize, BitsPerHash))
 274         MolsFingerprints = [rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(Mol, minLength = MinLength, maxLength = MaxLength, includeChirality = UseChirality, nBits = FPSize, nBitsPerEntry = BitsPerHash) for Mol in Mols]
 275     else:
 276         # Generate IntSparseIntVect fingerprints...
 277         MolsFingerprints = [rdMolDescriptors.GetAtomPairFingerprint(Mol, minLength = MinLength, maxLength = MaxLength, includeChirality = UseChirality) for Mol in Mols]
 278 
 279     return MolsFingerprints
 280 
 281 def GenerateMACCS166KeysFingerprints(Mols):
 282     """Generate MACCS166Keys fingerprints."""
 283 
 284     MiscUtil.PrintInfo("\nGenerating MACCS166Keys %s fingerprints..." % OptionsInfo["SpecifiedFingerprintsType"])
 285 
 286     # Generate ExplicitBitVect fingerprints...
 287     MolsFingerprints = [rdMolDescriptors.GetMACCSKeysFingerprint(Mol) for Mol in Mols]
 288 
 289     return MolsFingerprints
 290 
 291 def GenerateMorganFingerprints(Mols):
 292     """Generate Morgan fingerprints."""
 293 
 294     MiscUtil.PrintInfo("\nGenerating Morgan %s fingerprints..." % OptionsInfo["SpecifiedFingerprintsType"])
 295     
 296     Radius = OptionsInfo["FingerprintsParams"]["Morgan"]["Radius"]
 297     UseChirality = OptionsInfo["FingerprintsParams"]["Morgan"]["UseChirality"]
 298     FPSize = OptionsInfo["FingerprintsParams"]["Morgan"]["FPSize"]
 299     UseFeatures = False
 300 
 301     if re.match("^BitVect$", OptionsInfo["SpecifiedFingerprintsType"], re.I):
 302         # Generate ExplicitBitVect fingerprints...
 303         MiscUtil.PrintInfo("FPSize: %s" % (FPSize))
 304         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprintAsBitVect(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality, nBits = FPSize) for Mol in Mols]
 305     else:
 306         # Generate UIntSparseIntVect fingerprints...
 307         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprint(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality) for Mol in Mols]
 308 
 309     return MolsFingerprints
 310 
 311 def GenerateMorganFeaturesFingerprints(Mols):
 312     """Generate MorganFeatures fingerprints."""
 313 
 314     MiscUtil.PrintInfo("\nGenerating MorganFeatures %s fingerprints..." % OptionsInfo["SpecifiedFingerprintsType"])
 315     
 316     # Setup fingerprints parameters...
 317     Radius = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["Radius"]
 318     UseChirality = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["UseChirality"]
 319     FPSize = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["FPSize"]
 320     UseFeatures = True
 321     
 322     if re.match("^BitVect$", OptionsInfo["SpecifiedFingerprintsType"], re.I):
 323         # Generate ExplicitBitVect fingerprints...
 324         MiscUtil.PrintInfo("FPSize: %s" % (FPSize))
 325         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprintAsBitVect(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality, nBits = FPSize) for Mol in Mols]
 326     else:
 327         # Generate UIntSparseIntVect fingerprints...
 328         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprint(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality) for Mol in Mols]
 329 
 330     return MolsFingerprints
 331 
 332 def GeneratePathLengthFingerprints(Mols):
 333     """Generate PathLength fingerprints."""
 334 
 335     MiscUtil.PrintInfo("\nGenerating PathLength %s fingerprints..." % OptionsInfo["SpecifiedFingerprintsType"])
 336     
 337     MinPath = OptionsInfo["FingerprintsParams"]["PathLength"]["MinPath"]
 338     MaxPath = OptionsInfo["FingerprintsParams"]["PathLength"]["MaxPath"]
 339     FPSize = OptionsInfo["FingerprintsParams"]["PathLength"]["FPSize"]
 340     BitsPerHash = OptionsInfo["FingerprintsParams"]["PathLength"]["BitsPerHash"]
 341     UseHs = False
 342     TargetDensity = 0.3
 343     MinSize = 54
 344 
 345     # Generate ExplicitBitVect fingerprints...
 346     MiscUtil.PrintInfo("FPSize: %s; BitsPerHash: %s" % (FPSize, BitsPerHash))
 347     MolsFingerprints = [FingerprintMols.FingerprintMol(Mol, minPath = MinPath, maxPath = MaxPath, fpSize = FPSize, bitsPerHash = BitsPerHash, useHs = UseHs, tgtDensity = TargetDensity, minSize = MinSize) for Mol in Mols]
 348 
 349     return MolsFingerprints
 350 
 351 def GenerateTopologicalTorsionsFingerprints(Mols):
 352     """Generate TopologicalTorsions fingerprints."""
 353 
 354     MiscUtil.PrintInfo("\nGenerating TopologicalTorsions %s fingerprints..." % OptionsInfo["SpecifiedFingerprintsType"])
 355     
 356     UseChirality = OptionsInfo["FingerprintsParams"]["TopologicalTorsions"]["UseChirality"]
 357     FPSize = OptionsInfo["FingerprintsParams"]["TopologicalTorsions"]["FPSize"]
 358     BitsPerHash = OptionsInfo["FingerprintsParams"]["TopologicalTorsions"]["BitsPerHash"]
 359 
 360     if re.match("^BitVect$", OptionsInfo["SpecifiedFingerprintsType"], re.I):
 361         # Generate ExplicitBitVect fingerprints...
 362         MiscUtil.PrintInfo("FPSize: %s; BitsPerHash: %s" % (FPSize, BitsPerHash))
 363         MolsFingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(Mol,  includeChirality = UseChirality, nBits = FPSize, nBitsPerEntry = BitsPerHash) for Mol in Mols]
 364     else:
 365         # Generate LongSparseIntVect fingerprint...
 366         MolsFingerprints = [rdMolDescriptors.GetTopologicalTorsionFingerprint(Mol,  includeChirality = UseChirality) for Mol in Mols]
 367 
 368     return MolsFingerprints
 369 
 370 def GenerateLowerTriangularDistanceMatrix(MolsFingerprints):
 371     """Generate a lower triangular distance matrix without the diagonal."""
 372 
 373     SimilarityFunction = OptionsInfo["SimilarityFunction"]
 374 
 375     DistanceMatrix = []
 376     NumFPs = len(MolsFingerprints)
 377     for Index1 in range(0, NumFPs):
 378         for Index2 in range(0, Index1):
 379             Distance =  1 - SimilarityFunction(MolsFingerprints[Index1], MolsFingerprints[Index2],)
 380             DistanceMatrix.append(Distance)
 381 
 382     return DistanceMatrix
 383 
 384 def SetupMoleculeWriters(ClustersCount):
 385     """Set up molecule writers for SD and text files."""
 386     
 387     Writer = None
 388     ClustersOutfilesWriters = []
 389 
 390     TextOutFileMode = OptionsInfo["TextOutFileMode"]
 391     TextOutFileDelim = OptionsInfo["TextOutFileDelim"]
 392     TextOutFileTitleLine = OptionsInfo["TextOutFileTitleLine"]
 393     
 394     if OptionsInfo["SingleOutFileMode"]:
 395         Outfile = OptionsInfo["Outfile"]
 396         if TextOutFileMode:
 397             Writer = open(Outfile, "w")
 398         else:
 399             Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 400         if Writer is None:
 401             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 402         
 403         if TextOutFileMode:
 404             if TextOutFileTitleLine:
 405                 WriteTextFileHeaderLine(Writer, TextOutFileDelim)
 406         
 407         MiscUtil.PrintInfo("Generating file %s..." % Outfile)
 408     else:
 409         for ClusterIndex in range(0, ClustersCount):
 410             Outfile = OptionsInfo["ClustersOutfiles"][ClusterIndex]
 411             if TextOutFileMode:
 412                 ClusterWriter = open(Outfile, "w")
 413             else:
 414                 ClusterWriter = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 415             if ClusterWriter is None:
 416                 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 417             
 418             if TextOutFileMode:
 419                 if TextOutFileTitleLine:
 420                     WriteTextFileHeaderLine(ClusterWriter, TextOutFileDelim)
 421         
 422             ClustersOutfilesWriters.append(ClusterWriter)
 423         
 424         if ClustersCount > 4:
 425             MiscUtil.PrintInfo("Generating %d output files with the following file name format: %s_Cluster<Num>.%s" % (ClustersCount, OptionsInfo["OutfileBasename"], OptionsInfo["OutfileExt"]))
 426         else:
 427             Delmiter = ','
 428             OutfileNames = Delmiter.join(OptionsInfo["ClustersOutfiles"])
 429             MiscUtil.PrintInfo("Generating %d output files: %s..." % (ClustersCount, OutfileNames))
 430         
 431     return (Writer, ClustersOutfilesWriters)
 432 
 433 def WriteTextFileHeaderLine(Writer, TextOutFileDelim):
 434     """Write out a header line for text files including SMILEs file."""
 435     
 436     Line = TextOutFileDelim.join(["SMILES", "Name", "ClusterNumber"])
 437     Writer.write("%s\n" % Line)
 438 
 439 def SetupClustersOutFilesNames(ClustersCount):
 440     """Set up out file names for clusters."""
 441 
 442     OptionsInfo["ClustersOutfiles"] = []
 443     if OptionsInfo["SingleOutFileMode"] or ClustersCount == 0:
 444         # Nothing to do...
 445         return
 446 
 447     OutfileBasename = OptionsInfo["OutfileBasename"]
 448     OutfileExt = OptionsInfo["OutfileExt"]
 449     
 450     ClusterOutfiles = []
 451     for ClusterIndex in range(0, ClustersCount):
 452         ClusterNum = ClusterIndex + 1
 453         ClusterOutfile = "%s_Cluster%d.%s" % (OutfileBasename, ClusterNum, OutfileExt)
 454         ClusterOutfiles.append(ClusterOutfile)
 455     
 456     OptionsInfo["ClustersOutfiles"] = ClusterOutfiles
 457     
 458 def ProcessFingerprintsParameters():
 459     """Set up and process fingerprints parameters."""
 460 
 461     SetupFingerprintsNamesAndParameters()
 462     
 463     ProcessSpecifiedFingerprintsName()
 464     ProcessSpecifiedFingerprintsType()
 465     
 466     ProcessSpecifiedFingerprintsParameters()
 467 
 468 def SetupFingerprintsNamesAndParameters():
 469     """Set up fingerprints parameters."""
 470     
 471     OptionsInfo["FingerprintsNames"] = ["AtomPairs", "MACCS166Keys", "Morgan", "MorganFeatures", "PathLength", "TopologicalTorsions"]
 472     
 473     OptionsInfo["FingerprintsParams"] = {}
 474     OptionsInfo["FingerprintsParams"]["AtomPairs"] = {"MinLength": 1, "MaxLength": 30, "UseChirality": False, "FPSize": 2048, "BitsPerHash": 2}
 475     OptionsInfo["FingerprintsParams"]["MACCS166Keys"] = {}
 476     OptionsInfo["FingerprintsParams"]["Morgan"] = {"Radius": 2, "UseChirality": False, "FPSize": 2048}
 477     OptionsInfo["FingerprintsParams"]["MorganFeatures"] = {"Radius": 2, "UseChirality": False, "FPSize": 2048}
 478     OptionsInfo["FingerprintsParams"]["TopologicalTorsions"] = {"UseChirality": False, "FPSize": 2048, "BitsPerHash": 4}
 479     OptionsInfo["FingerprintsParams"]["PathLength"] = {"MinPath": 1, "MaxPath": 7, "FPSize": 2048, "BitsPerHash": 2}
 480 
 481 def ProcessSpecifiedFingerprintsName():
 482     """Process specified fingerprints name."""
 483 
 484     #  Set up a canonical fingerprints name map...
 485     CanonicalFingerprintsNamesMap = {}
 486     for Name in OptionsInfo["FingerprintsNames"]:
 487         CanonicalName = Name.lower()
 488         CanonicalFingerprintsNamesMap[CanonicalName] = Name
 489 
 490     # Validate specified fingerprints name...
 491     CanonicalFingerprintsName = OptionsInfo["Fingerprints"].lower()
 492     if CanonicalFingerprintsName not in CanonicalFingerprintsNamesMap:
 493         MiscUtil.PrintError("The fingerprints name, %s, specified using \"-f, --fingerprints\" option is not a valid name." % (OptionsInfo["Fingerprints"]))
 494     
 495     OptionsInfo["SpecifiedFingerprints"] = CanonicalFingerprintsNamesMap[CanonicalFingerprintsName]
 496 
 497 def ProcessSpecifiedFingerprintsType():
 498     """Process specified fingerprints type."""
 499 
 500     FingerprintsName = OptionsInfo["SpecifiedFingerprints"]
 501     FingerprintsType = OptionsInfo["FingerprintsType"]
 502     SimilarityName = OptionsInfo["SimilarityMetric"]
 503     
 504     if re.match("^auto$", FingerprintsType, re.I):
 505         if re.match("^(MACCS166Keys|PathLength)$", FingerprintsName, re.I):
 506             SpecifiedFingerprintsType = "BitVect"
 507         else:
 508             if re.match("^(Tanimoto|Dice)$", SimilarityName, re.I):
 509                 SpecifiedFingerprintsType = "IntVect"
 510             else:
 511                 SpecifiedFingerprintsType = "BitVect"
 512     elif re.match("^IntVect$", FingerprintsType, re.I):
 513         SpecifiedFingerprintsType = "IntVect"
 514         
 515         if re.match("^(MACCS166Keys|PathLength)$", FingerprintsName, re.I):
 516             MiscUtil.PrintError("The fingerprints Type, %s, specified using \"--fingerprintsType\" is not allowed for fingerprints %s." % (FingerprintsType, FingerprintsName))
 517             
 518         # RDKit similarity functions, besides Dice and Tanimoto, are not able to handle int bit vectors...
 519         if not re.match("^(Tanimoto|Dice)$", SimilarityName, re.I):
 520             MiscUtil.PrintError("The fingerprints Type, %s, specified using \"--fingerprintsType\" is not allowed for similarity metric %s.\nSupported similarity metrics: Tanimoto or Dice" % (FingerprintsType, SimilarityName))
 521     elif re.match("^BitVect$", FingerprintsType, re.I):
 522         SpecifiedFingerprintsType = "BitVect"
 523     else:
 524         MiscUtil.PrintError("The fingerprints Type, %s, is not supported." % (FingerprintsType))
 525     
 526     OptionsInfo["SpecifiedFingerprintsType"] = SpecifiedFingerprintsType
 527 
 528 def ProcessSpecifiedFingerprintsParameters():
 529     """Process specified fingerprints parameters."""
 530 
 531     if re.match("^auto$", OptionsInfo["ParamsFingerprints"], re.I):
 532         # Nothing to process...
 533         return
 534 
 535     SpecifiedFingerprintsName = OptionsInfo["SpecifiedFingerprints"]
 536     
 537     # Parse specified fingerprints parameters...
 538     ParamsFingerprints = re.sub(" ", "", OptionsInfo["ParamsFingerprints"])
 539     if not ParamsFingerprints:
 540         MiscUtil.PrintError("No valid parameter name and value pairs specified using \"-p, --paramsFingerprints\" option corrresponding to fingerprints %s." % (SpecifiedFingerprintsName))
 541 
 542     ParamsFingerprintsWords = ParamsFingerprints.split(",")
 543     if len(ParamsFingerprintsWords) % 2:
 544         MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"-p, --paramsFingerprints\" option must be an even number." % (len(ParamsFingerprintsWords)))
 545 
 546     # Setup canonical parameter names for specified fingerprints...
 547     ValidParamNames = []
 548     CanonicalParamNamesMap = {}
 549     for ParamName in sorted(OptionsInfo["FingerprintsParams"][SpecifiedFingerprintsName]):
 550         ValidParamNames.append(ParamName)
 551         CanonicalParamNamesMap[ParamName.lower()] = ParamName
 552 
 553     # Validate and set paramater names and value...
 554     for Index in range(0, len(ParamsFingerprintsWords), 2):
 555         Name = ParamsFingerprintsWords[Index]
 556         Value = ParamsFingerprintsWords[Index + 1]
 557 
 558         CanonicalName = Name.lower()
 559         if  not CanonicalName in CanonicalParamNamesMap:
 560             MiscUtil.PrintError("The parameter name, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid name. Supported parameter names: %s" % (Name, SpecifiedFingerprintsName, " ".join(ValidParamNames)))
 561 
 562         ParamName = CanonicalParamNamesMap[CanonicalName]
 563         if re.match("^UseChirality$", ParamName, re.I):
 564             if not re.match("^(Yes|No|True|False)$", Value, re.I):
 565                 MiscUtil.PrintError("The parameter value, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid value. Supported values: Yes No True False" % (Value, SpecifiedFingerprintsName))
 566             ParamValue = False
 567             if re.match("^(Yes|True)$", Value, re.I):
 568                 ParamValue = True
 569         else:
 570             ParamValue = int(Value)
 571             if ParamValue <= 0:
 572                 MiscUtil.PrintError("The parameter value, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid value. Supported values: > 0" % (Value, SpecifiedFingerprintsName))
 573         
 574         # Set value...
 575         OptionsInfo["FingerprintsParams"][SpecifiedFingerprintsName][ParamName] = ParamValue
 576 
 577 def ProcessSimilarityMetricParameter():
 578     """Process specified similarity metric value."""
 579 
 580     SimilarityInfoMap = {}
 581     CanonicalNameMap = {}
 582     
 583     for SimilarityFunctionInfo in DataStructs.similarityFunctions:
 584         Name = SimilarityFunctionInfo[0]
 585         Function = SimilarityFunctionInfo[1]
 586         
 587         SimilarityInfoMap[Name] = Function
 588         CanonicalName = Name.lower()
 589         CanonicalNameMap[CanonicalName] = Name
 590     
 591     SpecifiedCanonicalName = OptionsInfo["SimilarityMetric"].lower()
 592     SimilarityFunction = None
 593     if  SpecifiedCanonicalName in CanonicalNameMap:
 594         SimilarityName = CanonicalNameMap[SpecifiedCanonicalName]
 595         SimilarityFunction = SimilarityInfoMap[SimilarityName]
 596     else:
 597         MiscUtil.PrintError("Similarity metric name, %s, is not a valid name. " % OptionsInfo["SimilarityMetric"])
 598         
 599     OptionsInfo["SimilarityMetric"] = SimilarityName
 600     OptionsInfo["SimilarityFunction"] = SimilarityFunction
 601 
 602 def ProcessClusteringMethodParameter():
 603     """Process specified clustering method parameter."""
 604 
 605     OptionsInfo["SpecifiedHierarchicalClusteringMethod"] = ""
 606     OptionsInfo["SpecifiedHierarchicalClusteringMethodID"] = ""
 607     
 608     if re.match("^Butina$", OptionsInfo["ClusteringMethod"], re.I):
 609         # Nothing to process...
 610         return
 611 
 612     # Setup a canonical cluster method name map..
 613     ClusteringMethodInfoMap = {}
 614     CanonicalClusteringMethodNameMap = {}
 615     for Name in sorted(rdSimDivPickers.ClusterMethod.names):
 616         NameID =  rdSimDivPickers.ClusterMethod.names[Name]
 617         ClusteringMethodInfoMap[Name] = NameID
 618         
 619         CanonicalName = Name.lower()
 620         CanonicalClusteringMethodNameMap[CanonicalName] = Name
 621 
 622     CanonicalName = OptionsInfo["ClusteringMethod"].lower()
 623     if not CanonicalName in CanonicalClusteringMethodNameMap:
 624         MiscUtil.PrintError("The clustering method, %s, specified using \"-c, --clusteringMethod\" option is not a valid name." % (OptionsInfo["ClusteringMethod"]))
 625 
 626     SpecifiedHierarchicalClusteringMethodName = CanonicalClusteringMethodNameMap[CanonicalName]
 627     OptionsInfo["SpecifiedHierarchicalClusteringMethod"] = SpecifiedHierarchicalClusteringMethodName
 628     OptionsInfo["SpecifiedHierarchicalClusteringMethodID"] = ClusteringMethodInfoMap[SpecifiedHierarchicalClusteringMethodName] 
 629     
 630 def ProcessOptions():
 631     """Process and validate command line arguments and options."""
 632     
 633     MiscUtil.PrintInfo("Processing options...")
 634     
 635     # Validate options...
 636     ValidateOptions()
 637     
 638     OptionsInfo["ButinaSimilarityCutoff"] = float(Options["--butinaSimilarityCutoff"])
 639     OptionsInfo["ButinaReordering"] = False
 640     if re.match("^Yes$", Options["--butinaReordering"], re.I):
 641         OptionsInfo["ButinaReordering"] = True
 642     
 643     OptionsInfo["Fingerprints"] = Options["--fingerprints"]
 644     OptionsInfo["FingerprintsType"] = Options["--fingerprintsType"]
 645     
 646     OptionsInfo["ClusteringMethod"] = Options["--clusteringMethod"]
 647     ProcessClusteringMethodParameter()
 648 
 649     OptionsInfo["NumClusters"] = int(Options["--numClusters"])
 650     
 651     OptionsInfo["Infile"] = Options["--infile"]
 652     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 653     
 654     OptionsInfo["Outfile"] = Options["--outfile"]
 655     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 656     
 657     OptionsInfo["Overwrite"] = Options["--overwrite"]
 658 
 659     OptionsInfo["OutFileMode"] = Options["--outfileMode"]
 660     SingleOutFileMode = True
 661     if not re.match("^SingleFile$", Options["--outfileMode"], re.I):
 662         SingleOutFileMode = False
 663     OptionsInfo["SingleOutFileMode"] = SingleOutFileMode
 664     
 665     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 666     OptionsInfo["OutfileBasename"] = FileName
 667     OptionsInfo["OutfileExt"] = FileExt
 668 
 669     TextOutFileMode = False
 670     TextOutFileDelim = ""
 671     TextOutFileTitleLine = True
 672     
 673     if MiscUtil.CheckFileExt(Options["--outfile"], "csv"):
 674         TextOutFileMode = True
 675         TextOutFileDelim = ","
 676     elif MiscUtil.CheckFileExt(Options["--outfile"], "tsv txt"):
 677         TextOutFileMode = True
 678         TextOutFileDelim = "\t"
 679     elif MiscUtil.CheckFileExt(Options["--outfile"], "smi"):
 680         TextOutFileMode = True
 681         TextOutFileDelim = OptionsInfo["OutfileParams"]["SMILESDelimiter"]
 682         TextOutFileTitleLine = OptionsInfo["OutfileParams"]["SMILESTitleLine"]
 683         
 684     OptionsInfo["TextOutFileMode"] = TextOutFileMode
 685     OptionsInfo["TextOutFileDelim"] = TextOutFileDelim
 686     OptionsInfo["TextOutFileTitleLine"] = TextOutFileTitleLine
 687     
 688     OptionsInfo["SimilarityMetric"] = Options["--similarityMetric"]
 689     ProcessSimilarityMetricParameter()
 690 
 691     OptionsInfo["ParamsFingerprints"] = Options["--paramsFingerprints"]
 692     ProcessFingerprintsParameters()
 693     
 694 def RetrieveOptions():
 695     """Retrieve command line arguments and options."""
 696     
 697     # Get options...
 698     global Options
 699     Options = docopt(_docoptUsage_)
 700     
 701     # Set current working directory to the specified directory...
 702     WorkingDir = Options["--workingdir"]
 703     if WorkingDir:
 704         os.chdir(WorkingDir)
 705     
 706     # Handle examples option...
 707     if "--examples" in Options and Options["--examples"]:
 708         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 709         sys.exit(0)
 710 
 711 def ValidateOptions():
 712     """Validate option values."""
 713     
 714     MiscUtil.ValidateOptionFloatValue("-b, --butinaSimilarityCutoff", Options["--butinaSimilarityCutoff"], {">": 0.0, "<=" : 1.0})
 715     MiscUtil.ValidateOptionTextValue("--butinaReordering", Options["--butinaReordering"], "yes no")
 716     
 717     MiscUtil.ValidateOptionTextValue("-c, --clusteringMethod", Options["--clusteringMethod"], "Butina Centroid CLink Gower McQuitty SLink UPGMA Ward")
 718     MiscUtil.ValidateOptionTextValue("-f, --fingerprints", Options["--fingerprints"], "AtomPairs MACCS166Keys Morgan MorganFeatures PathLength TopologicalTorsions")
 719     MiscUtil.ValidateOptionTextValue("--fingerprintsType", Options["--fingerprintsType"], "IntVect BitVect  auto")
 720     
 721     MiscUtil.ValidateOptionIntegerValue("-n, --numClusters", Options["--numClusters"], {">": 0})
 722     
 723     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 724     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi csv tsv txt")
 725     
 726     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi csv tsv txt")
 727     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 728     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 729         
 730     MiscUtil.ValidateOptionTextValue("--outfileMode", Options["--outfileMode"], "SingleFile MultipleFiles")
 731     
 732     MiscUtil.ValidateOptionTextValue("-s, --similarityMetric", Options["--similarityMetric"], "BraunBlanquet Cosine Dice Kulczynski RogotGoldberg Russel Sokal Tanimoto")
 733     
 734 # Setup a usage string for docopt...
 735 _docoptUsage_ = """
 736 RDKitClusterMolecules.py - Cluster molecules using 2D fingerprints
 737 
 738 Usage:
 739     RDKitClusterMolecules.py [--butinaSimilarityCutoff <number>]  [--butinaReordering <yes or no>]
 740                              [--clusteringMethod <Butina, Centroid, CLink...>] [--fingerprints <MACCS166Keys, Morgan, PathLength...> ]
 741                              [--fingerprintsType <IntVect, BitVect, or Auto>] [--infileParams <Name,Value,...>]
 742                              [--numClusters <number>] [--outfileMode <SingleFile or MultipleFiles>]
 743                              [ --outfileParams <Name,Value,...> ] [--overwrite] [--paramsFingerprints <Name,Value,...>]
 744                              [--similarityMetric <Dice, Tanimoto...>] [-w <dir>] -i <infile> -o <outfile> 
 745     RDKitClusterMolecules.py -h | --help | -e | --examples
 746 
 747 Description:
 748     Cluster molecules based on a variety of 2D fingerprints using Butina [ Ref 136 ] or any
 749     other available hierarchical clustering methodology and write them to output file(s).
 750 
 751     The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi,
 752     .txt, .csv, .tsv)
 753 
 754     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi), CSV/TSV
 755     (.csv, .tsv, .txt)
 756 
 757 Options:
 758     -b, --butinaSimilarityCutoff <number>  [default: 0.55]
 759         Similarity cutoff to use during Butina clustering. The molecule pairs with
 760         similarity value greater than specified value or distance less than '1 - specified
 761         value' are considered neighbors. This value is only used during 'Butina' value
 762         of '-c, --clusteringMethod' option and determines the number of clusters
 763         during the clustering of molecules. It is ignored for all other clustering methods.
 764     --butinaReordering <yes or no>  [default: no]
 765         Update number of neighbors for unassigned molecules after creating a new
        cluster in order to ensure that the molecule with the largest number of
 767         unassigned neighbors is selected as the next cluster center.
 768     -c, --clusteringMethod <Butina, Centroid, CLink...>  [default: Butina]
 769         Clustering method to use for clustering molecules. Supported values:
 770         Butina, Centroid, CLink, Gower, McQuitty, SLink, UPGMA, Ward.
 771         Butina is an unsupervised database clustering method to automatically
 772         cluster small and large data sets. All other clustering methods correspond
 773         to hierarchical clustering and require a priori specification of number of
 774         clusters to be generated.
 775     -f, --fingerprints <MACCS166Keys, Morgan, PathLength...>  [default: Morgan]
 776         Fingerprints to use for calculating similarity/distance between molecules.
 777         Supported values: AtomPairs, MACCS166Keys, Morgan, MorganFeatures, PathLength,
 778         TopologicalTorsions. The PathLength fingerprints are Daylight like fingerprints.
        The Morgan and MorganFeatures fingerprints are circular fingerprints, corresponding
        to Scitegic's Extended Connectivity Fingerprints (ECFP) and Features Connectivity
 781         Fingerprints (FCFP). The values of default parameters for generating fingerprints
 782         can be modified using '-p, --paramsFingerprints' option.
 783     --fingerprintsType <IntVect, BitVect, or auto>  [default: auto]
 784         Fingerprints type to generate for calculating similarity. Supported values:
 785         IntVect, BitVect, Auto.
 786         
 787         The following default fingerprints type are automatically generated for
        available fingerprints, based on the value of similarity metric:
 789         
 790         AtomPairs              Tanimoto|Dice: IntVect     All Others: BitVect
 791         MACCS166Keys           All: BitVect
 792         Morgan                 Tanimoto|Dice: IntVect     All Others: BitVect
 793         MorganFeatures         Tanimoto|Dice: IntVect     All Others: BitVect
 794         PathLength             All: BitVect
 795         TopologicalTorsions    Tanimoto|Dice: IntVect     All Others: BitVect
 796  
 797         The Dice and Tanimoto similarity functions available in RDKit are able to
 798         handle fingerprints corresponding to both IntVect and BitVect. All other
 799         similarity functions, however, expect BitVect fingerprints to calculate
 800         pairwise similarity. Consequently, BitVect fingerprints, instead of
 801         default IntVect fingerprints, are generated for AtomPairs, Morgan,
 802         MorganFeatures, and TopologicalTorsions during the calculation
 803         of similarity using all other similarity functions.
 804         
 805         The IntVect fingerprints type is not available for MACCS166Keys and
 806         Pathlength fingerprints. In addition, IntVect fingerprints type is only
        valid for Tanimoto or Dice value of '-s, --similarityMetric' option. The
        BitVect fingerprints type is valid for all values of '-s, --similarityMetric'
 809         option.
 810     -e, --examples
 811         Print examples.
 812     -h, --help
 813         Print this help message.
 814     -i, --infile <infile>
 815         Input file name.
 816     --infileParams <Name,Value,...>  [default: auto]
 817         A comma delimited list of parameter name and value pairs for reading 
 818         molecules from files. The supported parameter names for different file
 819         formats, along with their default values, are shown below:
 820             
 821             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 822             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 823                 smilesTitleLine,auto,sanitize,yes
 824             
 825         Possible values for smilesDelimiter: space, comma or tab.
 826     -n, --numClusters <number>  [default: 10]
 827         Number of clusters to generate during hierarchical clustering. This option is
 828         ignored for 'Butina' value of '-c, --clusteringMethod' option.
 829     -o, --outfile <outfile>
 830         Output file name.
 831     --outfileMode <SingleFile or MultipleFiles>  [default: SingleFile]
 832         Write out a single file containing molecule clusters or generate an individual file
 833         for each cluster. Possible values: SingleFile or MultipleFiles. The molecules are
 834         grouped for each cluster before they are written to output file(s) along with
 835         appropriate cluster numbers. The cluster number is also appended to output
 836         file names during generation of multiple output files.
 837     --outfileParams <Name,Value,...>  [default: auto]
 838         A comma delimited list of parameter name and value pairs for writing
 839         molecules to files. The supported parameter names for different file
 840         formats, along with their default values, are shown below:
 841             
 842             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 843             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 844                 smilesTitleLine,yes
 845             
 846         Default value for compute2DCoords: yes for SMILES input file; no for all other
 847         file types. The kekulize and smilesIsomeric parameters are also used during
 848         generation of SMILES strings for CSV/TSV files.
 849     --overwrite
 850         Overwrite existing files.
 851     -p, --paramsFingerprints <Name,Value,...>  [default: auto]
 852         Parameter values to use for generating fingerprints. The default values
 853         are dependent on the value of '-f, --fingerprints' option. In general, it is a
 854         comma delimited list of parameter name and value pairs for the name of
 855         fingerprints specified using '-f, --fingerprints' option. The supported
 856         parameter names along with their default values for valid fingerprints
 857         names are shown below:
 858             
 859             AtomPairs: minLength,1 ,maxLength,30, useChirality,No,
 860                 fpSize, 2048, bitsPerHash,4
 861             Morgan: radius,2, useChirality,No, fpSize, 2048
 862             MorganFeatures:   radius,2, useChirality,No, fpSize, 2048
 863             PathLength: minPath,1, maxPath,7, fpSize, 2048, bitsPerHash,2
 864             TopologicalTorsions: useChirality,No, fpSize, 2048, bitsPerHash,4
 865             
 866         The fpSize and bitsPerHash are only used for BitVect fingerprints type
 867         specified using '--fingerprintsType' option.
 868     -s, --similarityMetric <Dice, Tanimoto...>  [default: Tanimoto]
 869         Similarity metric to use for calculating similarity/distance between molecules.
 870         Possible values: BraunBlanquet, Cosine, Dice, Kulczynski, RogotGoldberg,
 871         Russel, Sokal, Tanimoto.
 872     -w, --workingdir <dir>
 873         Location of working directory which defaults to the current directory.
 874 
 875 Examples:
 876     To cluster molecules using Butina methodology at a similarity cutoff of 0.55
 877     with automatic determination of number of clusters, Tanimoto similarity
 878     metric corresponding to Morgan fingerprints with radius of 2, and write out
 879     a single SMILES file containing clustered molecules along with cluster number
 880     for each molecule, type:
 881 
 882         % RDKitClusterMolecules.py  -i Sample.smi -o SampleOut.smi
 883 
 884     To cluster molecules using Butina methodology at a similarity cutoff of 0.55
 885     with automatic determination of number of clusters, Tanimoto similarity
 886     metric corresponding to Morgan fingerprints with radius of 2 and type
 887     BitVect, fingerprint BitVect size of 4096, and write out a single SMILES file
 888     containing clustered molecules along with cluster number for each molecule,
 889     type:
 890 
 891         % RDKitClusterMolecules.py  -f Morgan  --fingerprintsType  BitVect
 892           -p "fpSize,4096" -s Tanimoto -i Sample.smi -o SampleOut.smi
 893 
 894     To cluster molecules using Butina methodology at similarity cutoff of 0.45
 895     with automatic determination of number of clusters, Dice similarity metric
 896     corresponding to Morgan fingerprints with radius of 2, and write out multiple
 897     SD files containing clustered molecules for each cluster, type:
 898 
 899         % RDKitClusterMolecules.py  -b 0.45 -s Dice --outfileMode MultipleFiles
 900           -i Sample.smi -o SampleOut.sdf
 901 
 902     To cluster molecules using Ward hierarchical methodology to generate 15
 903     clusters, Dice similarity metric corresponding to Pathlength fingerprints with 
 904     path length between 1 and 7,  and write out a single TSV file for clustered
    molecules along with cluster number for each molecule, type:
 906 
 907         % RDKitClusterMolecules.py  -c Ward -f PathLength -n 15
 908           -p 'minPath,1, maxPath,7' -i Sample.sdf -o SampleOut.tsv
 909 
 910     To cluster molecules using Centroid hierarchical methodology to generate 5
 911     clusters, Dice similarity metric corresponding to MACCS166Keys fingerprints
 912     for molecules in a SMILES CSV file, SMILES strings in column 1, name in
 913     column 2, and write out a single SD file for clustered molecules along with
    cluster number for each molecule, type:
 915 
 916         % RDKitClusterMolecules.py  -c Centroid -f MACCS166Keys --infileParams
 917           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 918           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 919           -i SampleSMILES.csv -o SampleOut.sdf
 920 
 921 Author:
 922     Manish Sud(msud@san.rr.com)
 923 
 924 See also:
 925     RDKitConvertFileFormat.py, RDKitPickDiverseMolecules.py, RDKitSearchFunctionalGroups.py,
 926     RDKitSearchSMARTS.py
 927 
 928 Copyright:
 929     Copyright (C) 2024 Manish Sud. All rights reserved.
 930 
 931     The functionality available in this script is implemented using RDKit, an
 932     open source toolkit for cheminformatics developed by Greg Landrum.
 933 
 934     This file is part of MayaChemTools.
 935 
 936     MayaChemTools is free software; you can redistribute it and/or modify it under
 937     the terms of the GNU Lesser General Public License as published by the Free
 938     Software Foundation; either version 3 of the License, or (at your option) any
 939     later version.
 940 
 941 """
 942 
# Call main() only when this file is run as a script, not when it is imported...
if __name__ == "__main__":
    main()