MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitClusterMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem
  42     from rdkit import DataStructs
  43     from rdkit.Chem.Fingerprints import FingerprintMols
  44     from rdkit.Chem import rdMolDescriptors
  45     from rdkit.ML.Cluster import Butina
  46     from rdkit.SimDivFilters import rdSimDivPickers
  47     from rdkit.SimDivFilters.rdSimDivPickers import HierarchicalClusterPicker
  48 except ImportError as ErrMsg:
  49     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  50     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  51     sys.exit(1)
  52 
  53 # MayaChemTools imports...
  54 try:
  55     from docopt import docopt
  56     import MiscUtil
  57     import RDKitUtil
  58 except ImportError as ErrMsg:
  59     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  60     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  61     sys.exit(1)
  62 
  63 ScriptName = os.path.basename(sys.argv[0])
  64 Options = {}
  65 OptionsInfo = {}
  66 
  67 def main():
  68     """Start execution of the script"""
  69     
  70     MiscUtil.PrintInfo("\n%s (RDK v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, time.asctime()))
  71     
  72     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  73     
  74     # Retrieve command line arguments and options...
  75     RetrieveOptions()
  76     
  77     # Process and validate command line arguments and options...
  78     ProcessOptions()
  79     
  80     # Perform actions required by the script...
  81     ClusterMolecules()
  82     
  83     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  84     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  85 
  86 def ClusterMolecules():
  87     """Cluster molecules."""
  88 
  89     Mols = RetrieveMolecules()
  90     MolsFingerprints = GenerateFingerprints(Mols)
  91     MolsClusters = PerformClustering(Mols, MolsFingerprints)
  92     
  93     WriteMolecules(MolsClusters)
  94 
  95 def PerformClustering(Mols, MolsFingerprints):
  96     """Perform clustering."""
  97 
  98     ClusteredMols = []
  99     if re.match("^Butina$", OptionsInfo["ClusteringMethod"], re.I):
 100         return PerformButinaClustering(Mols, MolsFingerprints)
 101     else:
 102         return PerformHierarchicalClustering(Mols, MolsFingerprints)
 103     
 104     return ClusteredMols
 105 
 106 def PerformButinaClustering(Mols, MolsFingerprints):
 107     """Perform clustering using Butina methodology."""
 108 
 109     MiscUtil.PrintInfo("\nClustering molecules using Butina methodology and %s similarity metric..." % OptionsInfo["SimilarityMetric"])
 110     
 111     FingerprintsCount = len(MolsFingerprints)
 112     DistanceCutoff =  1 - OptionsInfo["ButinaSimilarityCutoff"]
 113     Reordering = OptionsInfo["ButinaReordering"]
 114     
 115     DistanceMatrix = GenerateLowerTriangularDistanceMatrix(MolsFingerprints)
 116 
 117     ClusteredMolIndices = Butina.ClusterData(DistanceMatrix, FingerprintsCount, DistanceCutoff, reordering = Reordering, isDistData = True)
 118 
 119     MolsClusters = []
 120     for Cluster in ClusteredMolIndices:
 121         MolsCluster = [Mols[MolIndex] for MolIndex in Cluster]
 122         MolsClusters.append(MolsCluster)
 123 
 124     return MolsClusters
 125 
 126 def PerformHierarchicalClustering(Mols, MolsFingerprints):
 127     """Perform hierarchical clustering."""
 128 
 129     try:
 130         import numpy
 131     except ImportError:
 132         MiscUtil.PrintError("Failed to import numpy python module. This is required to cluster molecules using hierarchical clustering methodology.")
 133     
 134     if OptionsInfo["NumClusters"] > len(Mols):
 135         MiscUtil.PrintError("The number of clusters, %d, specified using \"-n, --numClusters\" must be less than total number of valid molecules, %d" % (OptionsInfo["NumClusters"], len(Mols)))
 136     
 137     MiscUtil.PrintInfo("\nCluster molecules using %s hierarchical clustering methodology and %s similarity metric......" % (OptionsInfo["SpecifiedHierarchicalClusteringMethod"], OptionsInfo["SimilarityMetric"]))
 138     
 139     NumFingerprints = len(MolsFingerprints)
 140     NumClusters = OptionsInfo["NumClusters"]
 141     DistanceMatrix = GenerateLowerTriangularDistanceMatrix(MolsFingerprints)
 142     
 143     ClusterPicker = HierarchicalClusterPicker(OptionsInfo["SpecifiedHierarchicalClusteringMethodID"])
 144     ClusteredMolIndices = ClusterPicker.Cluster(numpy.asarray(DistanceMatrix), NumFingerprints, NumClusters)
 145 
 146     MolsClusters = []
 147     for Cluster in ClusteredMolIndices:
 148         MolsCluster = [Mols[MolIndex] for MolIndex in Cluster]
 149         MolsClusters.append(MolsCluster)
 150     
 151     return MolsClusters
 152 
 153 def WriteMolecules(MolsClusters):
 154     """Write out molecules for each cluster along with cluster numbers."""
 155 
 156     ClustersCount = len(MolsClusters)
 157     
 158     SingleOutFileMode = OptionsInfo["SingleOutFileMode"]
 159     TextOutFileMode = OptionsInfo["TextOutFileMode"]
 160     TextOutFileDelim = OptionsInfo["TextOutFileDelim"]
 161 
 162     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 163     
 164     SMILESIsomeric = OptionsInfo["OutfileParams"]["SMILESIsomeric"]
 165     SMILESKekulize = OptionsInfo["OutfileParams"]["Kekulize"]
 166     
 167     # Setup outfile names and writers...
 168     SetupClustersOutFilesNames(len(MolsClusters))
 169     SingleClusterWriter, ClustersOutfilesWriters = SetupMoleculeWriters(ClustersCount)
 170 
 171     MolCount = 0
 172     SingleMolClustersCount = 0
 173     
 174     if SingleOutFileMode:
 175         Writer = SingleClusterWriter
 176     
 177     for ClusterIndex in range(0, ClustersCount):
 178         MolsCluster = MolsClusters[ClusterIndex]
 179         ClusterNum = ClusterIndex + 1
 180 
 181         if len(MolsCluster) == 1:
 182             SingleMolClustersCount += 1
 183         
 184         if not SingleOutFileMode:
 185             Writer = ClustersOutfilesWriters[ClusterIndex]
 186             
 187         for Mol in MolsCluster:
 188             MolCount += 1
 189 
 190             if TextOutFileMode:
 191                 # Write out text file including SMILES file...
 192                 SMILES = Chem.MolToSmiles(Mol, isomericSmiles = SMILESIsomeric, kekuleSmiles = SMILESKekulize)
 193                 MolName = RDKitUtil.GetMolName(Mol, MolCount)
 194                 Line = TextOutFileDelim.join([SMILES, MolName, "%d" % ClusterNum])
 195                 Writer.write("%s\n" % Line)
 196             else:
 197                 # Write out SD file...
 198                 Mol.SetProp("ClusterNumber", "%s" % ClusterNum)
 199                 if Compute2DCoords:
 200                     AllChem.Compute2DCoords(Mol)
 201                 Writer.write(Mol)
 202     
 203     if SingleClusterWriter is not None:
 204         SingleClusterWriter.close()
 205     for ClusterOutfileWriter in ClustersOutfilesWriters:
 206         ClusterOutfileWriter.close()
 207 
 208     MiscUtil.PrintInfo("\nTotal number of clusters: %d" % ClustersCount)
 209 
 210     if ClustersCount > 0:
 211         MiscUtil.PrintInfo("\nNumber of clusters containing only a single molecule: %d" % SingleMolClustersCount)
 212         MiscUtil.PrintInfo("Average number of molecules per cluster: %.1f" % (MolCount/ClustersCount))
 213     
 214         MiscUtil.PrintInfo("\nNumber of molecules in each cluster:")
 215         MiscUtil.PrintInfo("ClusterNumber,MolCount")
 216         ClusterNum = 0
 217         for MolsCluster in MolsClusters:
 218             ClusterNum += 1
 219             MiscUtil.PrintInfo("%d,%d" % (ClusterNum, len(MolsCluster)))
 220 
 221 def RetrieveMolecules():
 222     """Retrieve molecules."""
 223 
 224     Infile = OptionsInfo["Infile"]
 225     
 226     # Read molecules...
 227     MiscUtil.PrintInfo("\nReading file %s..." % Infile)
 228     OptionsInfo["InfileParams"]["AllowEmptyMols"] = False
 229     ValidMols, MolCount, ValidMolCount  = RDKitUtil.ReadAndValidateMolecules(Infile, **OptionsInfo["InfileParams"])
 230     
 231     MiscUtil.PrintInfo("Total number of molecules: %d" % MolCount)
 232     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 233     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 234 
 235     return ValidMols
 236 
 237 def GenerateFingerprints(Mols):
 238     """Generate fingerprints."""
 239 
 240     FingerprintsName = OptionsInfo["SpecifiedFingerprints"]
 241     
 242     MolsFingerprints = []
 243     if re.match("^AtomPairs$", FingerprintsName, re.I):
 244         return GenerateAtomPairsFingerprints(Mols)
 245     elif re.match("^MACCS166Keys$", FingerprintsName, re.I):
 246         return GenerateMACCS166KeysFingerprints(Mols)
 247     elif re.match("^Morgan$", FingerprintsName, re.I):
 248         return GenerateMorganFingerprints(Mols)
 249     elif re.match("^MorganFeatures$", FingerprintsName, re.I):
 250         return GenerateMorganFeaturesFingerprints(Mols)
 251     elif re.match("^PathLength$", FingerprintsName, re.I):
 252         return GeneratePathLengthFingerprints(Mols)
 253     elif re.match("^TopologicalTorsions$", FingerprintsName, re.I):
 254         return GenerateTopologicalTorsionsFingerprints(Mols)
 255     else:
 256         MiscUtil.PrintError("Fingerprints name, %s, is not a valid name" % FingerprintsName)
 257     
 258     return MolsFingerprints
 259 
 260 def GenerateAtomPairsFingerprints(Mols):
 261     """Generate AtomPairs fingerprints."""
 262 
 263     MiscUtil.PrintInfo("\nGenerating AtomPairs fingerprints...")
 264     
 265     MinLength = OptionsInfo["FingerprintsParams"]["AtomPairs"]["MinLength"]
 266     MaxLength = OptionsInfo["FingerprintsParams"]["AtomPairs"]["MaxLength"]
 267     UseChirality = OptionsInfo["FingerprintsParams"]["AtomPairs"]["UseChirality"]
 268 
 269     if OptionsInfo["GenerateBitVectFingerints"]:
 270         # Generate ExplicitBitVect fingerprints...
 271         FPSize = 2048
 272         BitsPerHash = 4
 273         MolsFingerprints = [rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(Mol, minLength = MinLength, maxLength = MaxLength, includeChirality = UseChirality, nBits = FPSize, nBitsPerEntry = BitsPerHash) for Mol in Mols]
 274     else:
 275         # Generate IntSparseIntVect fingerprints...
 276         MolsFingerprints = [rdMolDescriptors.GetAtomPairFingerprint(Mol, minLength = MinLength, maxLength = MaxLength, includeChirality = UseChirality) for Mol in Mols]
 277 
 278     return MolsFingerprints
 279 
 280 def GenerateMACCS166KeysFingerprints(Mols):
 281     """Generate MACCS166Keys fingerprints."""
 282 
 283     MiscUtil.PrintInfo("\nGenerating MACCS166Keys fingerprints...")
 284 
 285     # Generate ExplicitBitVect fingerprints...
 286     MolsFingerprints = [rdMolDescriptors.GetMACCSKeysFingerprint(Mol) for Mol in Mols]
 287 
 288     return MolsFingerprints
 289 
 290 def GenerateMorganFingerprints(Mols):
 291     """Generate Morgan fingerprints."""
 292 
 293     MiscUtil.PrintInfo("\nGenerating  Morgan fingerprints...")
 294     
 295     Radius = OptionsInfo["FingerprintsParams"]["Morgan"]["Radius"]
 296     UseChirality = OptionsInfo["FingerprintsParams"]["Morgan"]["UseChirality"]
 297     UseFeatures = False
 298 
 299     if OptionsInfo["GenerateBitVectFingerints"]:
 300         # Generate ExplicitBitVect fingerprints...
 301         FPSize = 2048
 302         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprintAsBitVect(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality, nBits = FPSize) for Mol in Mols]
 303     else:
 304         # Generate UIntSparseIntVect fingerprints...
 305         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprint(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality) for Mol in Mols]
 306 
 307     return MolsFingerprints
 308 
 309 def GenerateMorganFeaturesFingerprints(Mols):
 310     """Generate MorganFeatures fingerprints."""
 311 
 312     MiscUtil.PrintInfo("\nGenerating  MorganFeatures fingerprints...")
 313     
 314     # Setup fingerprints parameters...
 315     Radius = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["Radius"]
 316     UseChirality = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["UseChirality"]
 317     UseFeatures = True
 318     
 319     if OptionsInfo["GenerateBitVectFingerints"]:
 320         # Generate ExplicitBitVect fingerprints...
 321         FPSize = 2048
 322         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprintAsBitVect(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality, nBits = FPSize) for Mol in Mols]
 323     else:
 324         # Generate UIntSparseIntVect fingerprints...
 325         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprint(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality) for Mol in Mols]
 326 
 327     return MolsFingerprints
 328 
 329 def GeneratePathLengthFingerprints(Mols):
 330     """Generate PathLength fingerprints."""
 331 
 332     MiscUtil.PrintInfo("\nGenerating PathLength fingerprints ...")
 333     
 334     MinPath = OptionsInfo["FingerprintsParams"]["PathLength"]["MinPath"]
 335     MaxPath = OptionsInfo["FingerprintsParams"]["PathLength"]["MaxPath"]
 336     FPSize = OptionsInfo["FingerprintsParams"]["PathLength"]["FPSize"]
 337     BitsPerHash = OptionsInfo["FingerprintsParams"]["PathLength"]["BitsPerHash"]
 338     UseHs = False
 339     TargetDensity = 0.3
 340     MinSize = 54
 341 
 342     # Generate ExplicitBitVect fingerprints...
 343     MolsFingerprints = [FingerprintMols.FingerprintMol(Mol, minPath = MinPath, maxPath = MaxPath, fpSize = FPSize, bitsPerHash = BitsPerHash, useHs = UseHs, tgtDensity = TargetDensity, minSize = MinSize) for Mol in Mols]
 344 
 345     return MolsFingerprints
 346 
 347 def GenerateTopologicalTorsionsFingerprints(Mols):
 348     """Generate TopologicalTorsions fingerprints."""
 349 
 350     MiscUtil.PrintInfo("\nGenerating TopologicalTorsions fingerprints...")
 351     
 352     UseChirality = OptionsInfo["FingerprintsParams"]["TopologicalTorsions"]["UseChirality"]
 353 
 354     if OptionsInfo["GenerateBitVectFingerints"]:
 355         FPSize = 2048
 356         BitsPerHash = 4
 357         MolsFingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(Mol,  includeChirality = UseChirality, nBits = FPSize, nBitsPerEntry = BitsPerHash) for Mol in Mols]
 358     else:
 359         # Generate LongSparseIntVect fingerprint...
 360         MolsFingerprints = [rdMolDescriptors.GetTopologicalTorsionFingerprint(Mol,  includeChirality = UseChirality) for Mol in Mols]
 361 
 362     return MolsFingerprints
 363 
 364 def GenerateLowerTriangularDistanceMatrix(MolsFingerprints):
 365     """Generate a lower triangular distance matrix without the diagonal."""
 366 
 367     SimilarityFunction = OptionsInfo["SimilarityFunction"]
 368 
 369     DistanceMatrix = []
 370     NumFPs = len(MolsFingerprints)
 371     for Index1 in range(0, NumFPs):
 372         for Index2 in range(0, Index1):
 373             Distance =  1 - SimilarityFunction(MolsFingerprints[Index1], MolsFingerprints[Index2],)
 374             DistanceMatrix.append(Distance)
 375 
 376     return DistanceMatrix
 377 
 378 def SetupMoleculeWriters(ClustersCount):
 379     """Set up molecule writers for SD and text files."""
 380     
 381     Writer = None
 382     ClustersOutfilesWriters = []
 383 
 384     TextOutFileMode = OptionsInfo["TextOutFileMode"]
 385     TextOutFileDelim = OptionsInfo["TextOutFileDelim"]
 386     TextOutFileTitleLine = OptionsInfo["TextOutFileTitleLine"]
 387     
 388     if OptionsInfo["SingleOutFileMode"]:
 389         Outfile = OptionsInfo["Outfile"]
 390         if TextOutFileMode:
 391             Writer = open(Outfile, "w")
 392         else:
 393             Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 394         if Writer is None:
 395             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 396         
 397         if TextOutFileMode:
 398             if TextOutFileTitleLine:
 399                 WriteTextFileHeaderLine(Writer, TextOutFileDelim)
 400         
 401         MiscUtil.PrintInfo("Generating file %s..." % Outfile)
 402     else:
 403         for ClusterIndex in range(0, ClustersCount):
 404             Outfile = OptionsInfo["ClustersOutfiles"][ClusterIndex]
 405             if TextOutFileMode:
 406                 ClusterWriter = open(Outfile, "w")
 407             else:
 408                 ClusterWriter = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 409             if ClusterWriter is None:
 410                 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 411             
 412             if TextOutFileMode:
 413                 if TextOutFileTitleLine:
 414                     WriteTextFileHeaderLine(ClusterWriter, TextOutFileDelim)
 415         
 416             ClustersOutfilesWriters.append(ClusterWriter)
 417         
 418         if ClustersCount > 4:
 419             MiscUtil.PrintInfo("Generating %d output files with the following file name format: %s_Cluster<Num>.%s" % (ClustersCount, OptionsInfo["OutfileBasename"], OptionsInfo["OutfileExt"]))
 420         else:
 421             Delmiter = ','
 422             OutfileNames = Delmiter.join(OptionsInfo["ClustersOutfiles"])
 423             MiscUtil.PrintInfo("Generating %d output files: %s..." % (ClustersCount, OutfileNames))
 424         
 425     return (Writer, ClustersOutfilesWriters)
 426 
 427 def WriteTextFileHeaderLine(Writer, TextOutFileDelim):
 428     """Write out a header line for text files including SMILEs file."""
 429     
 430     Line = TextOutFileDelim.join(["SMILES", "Name", "ClusterNumber"])
 431     Writer.write("%s\n" % Line)
 432 
 433 def SetupClustersOutFilesNames(ClustersCount):
 434     """Set up out file names for clusters."""
 435 
 436     OptionsInfo["ClustersOutfiles"] = []
 437     if OptionsInfo["SingleOutFileMode"] or ClustersCount == 0:
 438         # Nothing to do...
 439         return
 440 
 441     OutfileBasename = OptionsInfo["OutfileBasename"]
 442     OutfileExt = OptionsInfo["OutfileExt"]
 443     
 444     ClusterOutfiles = []
 445     for ClusterIndex in range(0, ClustersCount):
 446         ClusterNum = ClusterIndex + 1
 447         ClusterOutfile = "%s_Cluster%d.%s" % (OutfileBasename, ClusterNum, OutfileExt)
 448         ClusterOutfiles.append(ClusterOutfile)
 449     
 450     OptionsInfo["ClustersOutfiles"] = ClusterOutfiles
 451     
 452 def ProcessFingerprintsParameters():
 453     """Set up and process fingerprints parameters."""
 454 
 455     SetupFingerprintsNamesAndParameters()
 456     ProcessSpecifiedFingerprintsName()
 457     ProcessSpecifiedFingerprintsParameters()
 458 
 459 def SetupFingerprintsNamesAndParameters():
 460     """Set up fingerprints parameters."""
 461     
 462     OptionsInfo["FingerprintsNames"] = ["AtomPairs", "MACCS166Keys", "Morgan", "MorganFeatures", "PathLength", "TopologicalTorsions"]
 463     
 464     OptionsInfo["FingerprintsParams"] = {}
 465     OptionsInfo["FingerprintsParams"]["AtomPairs"] = {"MinLength": 1, "MaxLength": 30, "UseChirality": False}
 466     OptionsInfo["FingerprintsParams"]["MACCS166Keys"] = {}
 467     OptionsInfo["FingerprintsParams"]["Morgan"] = {"Radius": 2, "UseChirality": False}
 468     OptionsInfo["FingerprintsParams"]["MorganFeatures"] = {"Radius": 2, "UseChirality": False}
 469     OptionsInfo["FingerprintsParams"]["TopologicalTorsions"] = {"UseChirality": False}
 470     OptionsInfo["FingerprintsParams"]["PathLength"] = {"MinPath": 1, "MaxPath": 7, "FPSize": 2048, "BitsPerHash": 2}
 471 
 472 def ProcessSpecifiedFingerprintsName():
 473     """Process specified fingerprints name."""
 474 
 475     #  Set up a canonical fingerprints name map...
 476     CanonicalFingerprintsNamesMap = {}
 477     for Name in OptionsInfo["FingerprintsNames"]:
 478         CanonicalName = Name.lower()
 479         CanonicalFingerprintsNamesMap[CanonicalName] = Name
 480 
 481     # Validate specified fingerprints name...
 482     CanonicalFingerprintsName = OptionsInfo["Fingerprints"].lower()
 483     if CanonicalFingerprintsName not in CanonicalFingerprintsNamesMap:
 484         MiscUtil.PrintError("The fingerprints name, %s, specified using \"-f, --fingerprints\" option is not a valid name." % (OptionsInfo["Fingerprints"]))
 485     
 486     OptionsInfo["SpecifiedFingerprints"] = CanonicalFingerprintsNamesMap[CanonicalFingerprintsName]
 487 
 488 def ProcessSpecifiedFingerprintsParameters():
 489     """Process specified fingerprints parameters."""
 490 
 491     if re.match("^auto$", OptionsInfo["ParamsFingerprints"], re.I):
 492         # Nothing to process...
 493         return
 494 
 495     SpecifiedFingerprintsName = OptionsInfo["SpecifiedFingerprints"]
 496     
 497     # Parse specified fingerprints parameters...
 498     ParamsFingerprints = re.sub(" ", "", OptionsInfo["ParamsFingerprints"])
 499     if not ParamsFingerprints:
 500         MiscUtil.PrintError("No valid parameter name and value pairs specified using \"-p, --paramsFingerprints\" option corrresponding to fingerprints %s." % (SpecifiedFingerprintsName))
 501 
 502     ParamsFingerprintsWords = ParamsFingerprints.split(",")
 503     if len(ParamsFingerprintsWords) % 2:
 504         MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"-p, --paramsFingerprints\" option must be an even number." % (len(ParamsFingerprintsWords)))
 505 
 506     # Setup canonical parameter names for specified fingerprints...
 507     ValidParamNames = []
 508     CanonicalParamNamesMap = {}
 509     for ParamName in sorted(OptionsInfo["FingerprintsParams"][SpecifiedFingerprintsName]):
 510         ValidParamNames.append(ParamName)
 511         CanonicalParamNamesMap[ParamName.lower()] = ParamName
 512 
 513     # Validate and set paramater names and value...
 514     for Index in range(0, len(ParamsFingerprintsWords), 2):
 515         Name = ParamsFingerprintsWords[Index]
 516         Value = ParamsFingerprintsWords[Index + 1]
 517 
 518         CanonicalName = Name.lower()
 519         if  not CanonicalName in CanonicalParamNamesMap:
 520             MiscUtil.PrintError("The parameter name, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid name. Supported parameter names: %s" % (Name, SpecifiedFingerprintsName, " ".join(ValidParamNames)))
 521 
 522         ParamName = CanonicalParamNamesMap[CanonicalName]
 523         if re.match("^UseChirality$", ParamName, re.I):
 524             if not re.match("^(Yes|No|True|False)$", Value, re.I):
 525                 MiscUtil.PrintError("The parameter value, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid value. Supported values: Yes No True False" % (Value, SpecifiedFingerprintsName))
 526             ParamValue = False
 527             if re.match("^(Yes|True)$", Value, re.I):
 528                 ParamValue = True
 529         else:
 530             ParamValue = int(Value)
 531             if ParamValue <= 0:
 532                 MiscUtil.PrintError("The parameter value, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid value. Supported values: > 0" % (Value, SpecifiedFingerprintsName))
 533         
 534         # Set value...
 535         OptionsInfo["FingerprintsParams"][SpecifiedFingerprintsName][ParamName] = ParamValue
 536 
 537 def ProcessSimilarityMetricParameter():
 538     """Process specified similarity metric value."""
 539 
 540     SimilarityInfoMap = {}
 541     CanonicalNameMap = {}
 542     
 543     for SimilarityFunctionInfo in DataStructs.similarityFunctions:
 544         Name = SimilarityFunctionInfo[0]
 545         Function = SimilarityFunctionInfo[1]
 546         
 547         SimilarityInfoMap[Name] = Function
 548         CanonicalName = Name.lower()
 549         CanonicalNameMap[CanonicalName] = Name
 550     
 551     SpecifiedCanonicalName = OptionsInfo["SimilarityMetric"].lower()
 552     SimilarityFunction = None
 553     if  SpecifiedCanonicalName in CanonicalNameMap:
 554         SimilarityName = CanonicalNameMap[SpecifiedCanonicalName]
 555         SimilarityFunction = SimilarityInfoMap[SimilarityName]
 556     else:
 557         MiscUtil.PrintError("Similarity metric name, %s, is not a valid name. " % OptionsInfo["SimilarityMetric"])
 558         
 559     OptionsInfo["SimilarityMetric"] = SimilarityName
 560     OptionsInfo["SimilarityFunction"] = SimilarityFunction
 561 
 562     # RDKit similarity functions, besides Dice and Tanimoto, are not able to handle int bit vectors...
 563     GenerateBitVectFingerints = False
 564     if not re.match("^(Tanimoto|Dice)$", SimilarityName, re.I):
 565         GenerateBitVectFingerints = True
 566     OptionsInfo["GenerateBitVectFingerints"] = GenerateBitVectFingerints
 567     
 568 def ProcessClusteringMethodParameter():
 569     """Process specified clustering method parameter."""
 570 
 571     OptionsInfo["SpecifiedHierarchicalClusteringMethod"] = ""
 572     OptionsInfo["SpecifiedHierarchicalClusteringMethodID"] = ""
 573     
 574     if re.match("^Butina$", OptionsInfo["ClusteringMethod"], re.I):
 575         # Nothing to process...
 576         return
 577 
 578     # Setup a canonical cluster method name map..
 579     ClusteringMethodInfoMap = {}
 580     CanonicalClusteringMethodNameMap = {}
 581     for Name in sorted(rdSimDivPickers.ClusterMethod.names):
 582         NameID =  rdSimDivPickers.ClusterMethod.names[Name]
 583         ClusteringMethodInfoMap[Name] = NameID
 584         
 585         CanonicalName = Name.lower()
 586         CanonicalClusteringMethodNameMap[CanonicalName] = Name
 587 
 588     CanonicalName = OptionsInfo["ClusteringMethod"].lower()
 589     if not CanonicalName in CanonicalClusteringMethodNameMap:
 590         MiscUtil.PrintError("The clustering method, %s, specified using \"-c, --clusteringMethod\" option is not a valid name." % (OptionsInfo["ClusteringMethod"]))
 591 
 592     SpecifiedHierarchicalClusteringMethodName = CanonicalClusteringMethodNameMap[CanonicalName]
 593     OptionsInfo["SpecifiedHierarchicalClusteringMethod"] = SpecifiedHierarchicalClusteringMethodName
 594     OptionsInfo["SpecifiedHierarchicalClusteringMethodID"] = ClusteringMethodInfoMap[SpecifiedHierarchicalClusteringMethodName] 
 595     
 596 def ProcessOptions():
 597     """Process and validate command line arguments and options"""
 598     
 599     MiscUtil.PrintInfo("Processing options...")
 600     
 601     # Validate options...
 602     ValidateOptions()
 603     
 604     OptionsInfo["ButinaSimilarityCutoff"] = float(Options["--butinaSimilarityCutoff"])
 605     OptionsInfo["ButinaReordering"] = False
 606     if re.match("^Yes$", Options["--butinaReordering"], re.I):
 607         OptionsInfo["ButinaReordering"] = True
 608     
 609     OptionsInfo["Fingerprints"] = Options["--fingerprints"]
 610     
 611     OptionsInfo["ClusteringMethod"] = Options["--clusteringMethod"]
 612     ProcessClusteringMethodParameter()
 613 
 614     OptionsInfo["NumClusters"] = int(Options["--numClusters"])
 615     
 616     OptionsInfo["Infile"] = Options["--infile"]
 617     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 618     
 619     OptionsInfo["Outfile"] = Options["--outfile"]
 620     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 621     
 622     OptionsInfo["Overwrite"] = Options["--overwrite"]
 623 
 624     OptionsInfo["OutFileMode"] = Options["--outfileMode"]
 625     SingleOutFileMode = True
 626     if not re.match("^SingleFile$", Options["--outfileMode"], re.I):
 627         SingleOutFileMode = False
 628     OptionsInfo["SingleOutFileMode"] = SingleOutFileMode
 629     
 630     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 631     OptionsInfo["OutfileBasename"] = FileName
 632     OptionsInfo["OutfileExt"] = FileExt
 633 
 634     TextOutFileMode = False
 635     TextOutFileDelim = ""
 636     TextOutFileTitleLine = True
 637     
 638     if MiscUtil.CheckFileExt(Options["--outfile"], "csv"):
 639         TextOutFileMode = True
 640         TextOutFileDelim = ","
 641     elif MiscUtil.CheckFileExt(Options["--outfile"], "tsv txt"):
 642         TextOutFileMode = True
 643         TextOutFileDelim = "\t"
 644     elif MiscUtil.CheckFileExt(Options["--outfile"], "smi"):
 645         TextOutFileMode = True
 646         TextOutFileDelim = OptionsInfo["OutfileParams"]["SMILESDelimiter"]
 647         TextOutFileTitleLine = OptionsInfo["OutfileParams"]["SMILESTitleLine"]
 648         
 649     OptionsInfo["TextOutFileMode"] = TextOutFileMode
 650     OptionsInfo["TextOutFileDelim"] = TextOutFileDelim
 651     OptionsInfo["TextOutFileTitleLine"] = TextOutFileTitleLine
 652     
 653     OptionsInfo["SimilarityMetric"] = Options["--similarityMetric"]
 654     ProcessSimilarityMetricParameter()
 655     
 656     OptionsInfo["ParamsFingerprints"] = Options["--paramsFingerprints"]
 657     ProcessFingerprintsParameters()
 658     
 659 def RetrieveOptions():
 660     """Retrieve command line arguments and options"""
 661     
 662     # Get options...
 663     global Options
 664     Options = docopt(_docoptUsage_)
 665     
 666     # Set current working directory to the specified directory...
 667     WorkingDir = Options["--workingdir"]
 668     if WorkingDir:
 669         os.chdir(WorkingDir)
 670     
 671     # Handle examples option...
 672     if "--examples" in Options and Options["--examples"]:
 673         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 674         sys.exit(0)
 675 
 676 def ValidateOptions():
 677     """Validate option values"""
 678     
 679     MiscUtil.ValidateOptionFloatValue("-b, --butinaSimilarityCutoff", Options["--butinaSimilarityCutoff"], {">": 0.0, "<=" : 1.0})
 680     MiscUtil.ValidateOptionTextValue("--butinaReordering", Options["--butinaReordering"], "yes no")
 681     
 682     MiscUtil.ValidateOptionTextValue("-c, --clusteringMethod", Options["--clusteringMethod"], "Butina Centroid CLink Gower McQuitty SLink UPGMA Ward")
 683     MiscUtil.ValidateOptionTextValue("-f, --fingerprints", Options["--fingerprints"], "AtomPairs MACCS166Keys Morgan MorganFeatures PathLength TopologicalTorsions")
 684     
 685     MiscUtil.ValidateOptionIntegerValue("-n, --numClusters", Options["--numClusters"], {">": 0})
 686     
 687     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 688     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi csv tsv txt")
 689     
 690     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi csv tsv txt")
 691     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 692     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 693         
 694     MiscUtil.ValidateOptionTextValue("--outfileMode", Options["--outfileMode"], "SingleFile MultipleFiles")
 695     
 696     MiscUtil.ValidateOptionTextValue("-s, --similarityMetric", Options["--similarityMetric"], "BraunBlanquet Cosine Dice Kulczynski RogotGoldberg Russel Sokal Tanimoto")
 697     
 698 # Setup a usage string for docopt...
 699 _docoptUsage_ = """
 700 RDKitClusterMolecules.py - Cluster molecules using 2D fingerprints
 701 
 702 Usage:
 703     RDKitClusterMolecules.py [--butinaSimilarityCutoff <number>]  [--butinaReordering <yes or no>]
 704                              [--clusteringMethod <Butina, Centroid, CLink...>]
 705                              [--fingerprints <MACCS166Keys, Morgan, PathLength...> ] [--infileParams <Name,Value,...>]
 706                              [--numClusters <number>] [--outfileMode <SingleFile or MultipleFiles>]
 707                              [ --outfileParams <Name,Value,...> ] [--overwrite] [--paramsFingerprints <Name,Value,...>]
 708                              [--similarityMetric <Dice, Tanimoto...>] [-w <dir>] -i <infile> -o <outfile> 
 709     RDKitClusterMolecules.py -h | --help | -e | --examples
 710 
 711 Description:
 712     Cluster molecules based on a variety of 2D fingerprints using Butina [ Ref 136 ] or any
 713     other available hierarchical clustering methodology and write them to output file(s).
 714 
 715     The default fingerprints types for various fingerprints are shown below:
 716 
 717         AtomPairs              IntSparseIntVect
 718         MACCS166Keys           ExplicitBitVect
 719         Morgan                 UIntSparseIntVect
 720         MorganFeatures         UIntSparseIntVect
 721         PathLength             ExplicitBitVect
 722         TopologicalTorsions    LongSparseIntVect
 723  
 724     The Dice and Tanimoto similarity functions available in RDKit are able to
 725     handle fingerprints corresponding to both IntVect and BitVect. All other
 726     similarity functions, however, expect BitVect fingerprints to calculate
 727     pairwise similarity. Consequently, ExplicitBitVect fingerprints, instead of
 728     default IntVect fingerprints, are generated for AtomPairs, Morgan,
 729     MorganFeatures, and TopologicalTorsions to calculate similarity.
 730 
 731     The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi,
 732     .txt, .csv, .tsv)
 733 
 734     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi), CSV/TSV
 735     (.csv, .tsv, .txt)
 736 
 737 Options:
 738     -b, --butinaSimilarityCutoff <number>  [default: 0.55]
 739         Similarity cutoff to use during Butina clustering. The molecule pairs with
 740         similarity value greater than specified value or distance less than '1 - specified
 741         value' are considered neighbors. This value is only used during 'Butina' value
 742         of '-c, --clusteringMethod' option and determines the number of clusters
 743         during the clustering of molecules. It is ignored for all other clustering methods.
 744     --butinaReordering <yes or no>  [default: no]
 745         Update number of neighbors for unassigned molecules after creating a new
 746         cluster in order to insure that the molecule with the largest number of
 747         unassigned neighbors is selected as the next cluster center.
 748     -c, --clusteringMethod <Butina, Centroid, CLink...>  [default: Butina]
 749         Clustering method to use for clustering molecules. Supported values:
 750         Butina, Centroid, CLink, Gower, McQuitty, SLink, UPGMA, Ward.
 751         Butina is an unsupervised database clustering method to automatically
 752         cluster small and large data sets. All other clustering methods correspond
 753         to hierarchical clustering and require a priori specification of number of
 754         clusters to be generated.
 755     -f, --fingerprints <MACCS166Keys, Morgan, PathLength...>  [default: Morgan]
 756         Fingerprints to use for calculating similarity/distance between molecules.
 757         Supported values: AtomPairs, MACCS166Keys, Morgan, MorganFeatures, PathLength,
 758         TopologicalTorsions. The PathLength fingerprints are Daylight like fingerprints.
 759         The Morgan and MorganFeature fingerprints are circular fingerprints, corresponding
 760         Scitegic's Extended Connectivity Fingerprints (ECFP) and Features Connectivity
 761         Fingerprints (FCFP). The values of default parameters for generating fingerprints
 762         can be modified using '-p, --paramsFingerprints' option.
 763     -e, --examples
 764         Print examples.
 765     -h, --help
 766         Print this help message.
 767     -i, --infile <infile>
 768         Input file name.
 769     --infileParams <Name,Value,...>  [default: auto]
 770         A comma delimited list of parameter name and value pairs for reading 
 771         molecules from files. The supported parameter names for different file
 772         formats, along with their default values, are shown below:
 773             
 774             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 775             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 776                 smilesTitleLine,auto,sanitize,yes
 777             
 778         Possible values for smilesDelimiter: space, comma or tab.
 779     -n, --numClusters <number>  [default: 10]
 780         Number of clusters to generate during hierarchical clustering. This option is
 781         ignored for 'Butina' value of '-c, --clusteringMethod' option.
 782     -o, --outfile <outfile>
 783         Output file name.
 784     --outfileMode <SingleFile or MultipleFiles>  [default: SingleFile]
 785         Write out a single file containing molecule clusters or generate an individual file
 786         for each cluster. Possible values: SingleFile or MultipleFiles. The molecules are
 787         grouped for each cluster before they are written to output file(s) along with
 788         appropriate cluster numbers. The cluster number is also appended to output
 789         file names during generation of multiple output files.
 790     --outfileParams <Name,Value,...>  [default: auto]
 791         A comma delimited list of parameter name and value pairs for writing
 792         molecules to files. The supported parameter names for different file
 793         formats, along with their default values, are shown below:
 794             
 795             SD: compute2DCoords,auto,kekulize,no
 796             SMILES: kekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 797                 smilesTitleLine,yes
 798             
 799         Default value for compute2DCoords: yes for SMILES input file; no for all other
 800         file types. The kekulize and smilesIsomeric parameters are also used during
 801         generation of SMILES strings for CSV/TSV files.
 802     --overwrite
 803         Overwrite existing files.
 804     -p, --paramsFingerprints <Name,Value,...>  [default: auto]
 805         Parameter values to use for generating fingerprints. The default values
 806         are dependent on the value of '-f, --fingerprints' option. In general, it is a
 807         comma delimited list of parameter name and value pairs for the name of
 808         fingerprints specified using '-f, --fingerprints' option. The supported
 809         parameter names along with their default values for valid fingerprints
 810         names are shown below:
 811             
 812             AtomPairs: minLength,1 ,maxLength,30, useChirality,No
 813             Morgan: radius,2, useChirality,No
 814             MorganFeatures:   radius,2, useChirality,No
 815             PathLength: minPath,1, maxPath,7, fpSize, 2048, bitsPerHash,2
 816             TopologicalTorsions: useChirality,No
 817             
 818     -s, --similarityMetric <Dice, Tanimoto...>  [default: Tanimoto]
 819         Similarity metric to use for calculating similarity/distance between molecules.
 820         Possible values: BraunBlanquet, Cosine, Dice, Kulczynski, RogotGoldberg,
 821         Russel, Sokal, Tanimoto.
 822     -w, --workingdir <dir>
 823         Location of working directory which defaults to the current directory.
 824 
 825 Examples:
 826     To cluster molecules using Butina methodology at a similarity cutoff of 0.55
 827     with automatic determination of number of clusters, Tanimoto similarity
 828     metric corresponding to Morgan fingerprints with radius of 2, and write out
 829     a single SMILES file containing clustered molecules along with cluster number
 830     for each molecule, type:
 831 
 832         % RDKitClusterMolecules.py  -i Sample.smi -o SampleOut.smi
 833 
 834     To cluster molecules using Butina methodology at similarity cutoff of 0.45
 835     with automatic determination of number of clusters, Dice similarity metric
 836     corresponding to Morgan fingerprints with radius of 2, and write out multiple
 837     SD files containing clustered molecules for each cluster, type:
 838 
 839         % RDKitClusterMolecules.py  -b 0.45 --outfileMode MultipleFiles
 840           -i Sample.smi -o SampleOut.sdf
 841 
 842     To cluster molecules using Ward hierarchical methodology to generate 15
 843     clusters, Dice similarity metric corresponding to Pathlength fingerprints with 
 844     path length between 1 and 7,  and write out a single TSV file for clustered
 845     molecules along with cluster numner for each molecule, type:
 846 
 847         % RDKitClusterMolecules.py  -c Ward -f PathLength -n 15
 848           -p 'minPath,1, maxPath,7' -i Sample.sdf -o SampleOut.tsv
 849 
 850     To cluster molecules using Centroid hierarchical methodology to generate 5
 851     clusters, Dice similarity metric corresponding to MACCS166Keys fingerprints
 852     for molecules in a SMILES CSV file, SMILES strings in column 1, name in
 853     column 2, and write out a single SD file for clustered molecules along with
 854     cluster numner for each molecule, type:
 855 
 856         % RDKitClusterMolecules.py  -c Centroid -f MACCS166Keys --infileParams
 857           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 858           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 859           -i SampleSMILES.csv -o SampleOut.sdf
 860 
 861 Author:
 862     Manish Sud(msud@san.rr.com)
 863 
 864 See also:
 865     RDKitConvertFileFormat.py, RDKitPickDiverseMolecules.py, RDKitSearchFunctionalGroups.py,
 866     RDKitSearchSMARTS.py
 867 
 868 Copyright:
 869     Copyright (C) 2019 Manish Sud. All rights reserved.
 870 
 871     The functionality available in this script is implemented using RDKit, an
 872     open source toolkit for cheminformatics developed by Greg Landrum.
 873 
 874     This file is part of MayaChemTools.
 875 
 876     MayaChemTools is free software; you can redistribute it and/or modify it under
 877     the terms of the GNU Lesser General Public License as published by the Free
 878     Software Foundation; either version 3 of the License, or (at your option) any
 879     later version.
 880 
 881 """
 882 
 883 if __name__ == "__main__":
 884     main()