MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitPickDiverseMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem
  42     from rdkit import DataStructs
  43     from rdkit.Chem.Fingerprints import FingerprintMols
  44     from rdkit.Chem import rdMolDescriptors
  45     from rdkit.SimDivFilters import rdSimDivPickers
  46     from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
  47     from rdkit.SimDivFilters.rdSimDivPickers import HierarchicalClusterPicker
  48 except ImportError as ErrMsg:
  49     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  50     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  51     sys.exit(1)
  52 
  53 # MayaChemTools imports...
  54 try:
  55     from docopt import docopt
  56     import MiscUtil
  57     import RDKitUtil
  58 except ImportError as ErrMsg:
  59     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  60     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  61     sys.exit(1)
  62 
# Base name of the running script; used as a prefix in start/done messages...
ScriptName = os.path.basename(sys.argv[0])
# Command line options; replaced by the docopt result in RetrieveOptions()...
Options = {}
# Processed option values shared across functions; filled in by ProcessOptions()...
OptionsInfo = {}
  66 
  67 def main():
  68     """Start execution of the script"""
  69     
  70     MiscUtil.PrintInfo("\n%s (RDK v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, time.asctime()))
  71     
  72     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  73     
  74     # Retrieve command line arguments and options...
  75     RetrieveOptions()
  76     
  77     # Process and validate command line arguments and options...
  78     ProcessOptions()
  79     
  80     # Perform actions required by the script...
  81     PickDiverseMolecules()
  82     
  83     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  84     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  85 
  86 def PickDiverseMolecules():
  87     """Pick diverse molecules."""
  88 
  89     Mols = RetrieveMolecules()
  90     MolsFingerprints = GenerateFingerprints(Mols)
  91     DiverseMols = SelectMolecules(Mols, MolsFingerprints)
  92     
  93     WriteMolecules(DiverseMols)
  94 
  95 def SelectMolecules(Mols, MolsFingerprints):
  96     """Select diverse molecules."""
  97 
  98     if OptionsInfo["NumMols"] > len(Mols):
  99         MiscUtil.PrintError("The number of diverse molecules to pick, %d, specified using \"-n, --numMols\" must be less than total number of valid molecules, %d" % (OptionsInfo["NumMols"], len(Mols)))
 100     
 101     DiverseMols = []
 102     if re.match("^MaxMin$", OptionsInfo["Mode"], re.I):
 103         return SelectMoleculesUsingMaxMin(Mols, MolsFingerprints)
 104     elif re.match("^HierarchicalClustering$", OptionsInfo["Mode"], re.I):
 105         return SelectMoleculesUsingHierarchicalClustering(Mols, MolsFingerprints)
 106     else:
 107         MiscUtil.PrintError("The mode vaue, %s, is not a valid mode." % OptionsInfo["Mode"])
 108     
 109     return DiverseMols
 110 
 111 def SelectMoleculesUsingMaxMin(Mols, MolsFingerprints):
 112     """Select diverse molecules using MaxMin methodology."""
 113 
 114     MiscUtil.PrintInfo("\nSelecting diverse molecules using MaxMin methodology and %s similarity metric..." % OptionsInfo["SimilarityMetric"])
 115     
 116     DiverseMols = []
 117     
 118     PoolSize = len(MolsFingerprints)
 119     PickSize = OptionsInfo["NumMols"]
 120     SimilarityFunction = OptionsInfo["SimilarityFunction"]
 121 
 122     Picker = MaxMinPicker()
 123     PairwiseDistance = lambda i, j: 1 - SimilarityFunction(MolsFingerprints[i], MolsFingerprints[j])
 124 
 125     MolIndices = Picker.LazyPick(PairwiseDistance, PoolSize, PickSize)
 126             
 127     for Index in list(MolIndices):
 128         DiverseMols.append(Mols[Index])
 129     
 130     return DiverseMols
 131 
 132 def SelectMoleculesUsingHierarchicalClustering(Mols, MolsFingerprints):
 133     """Select diverse molecules using hierarchical clustering  methodology."""
 134 
 135     try:
 136         import numpy
 137     except ImportError:
 138         MiscUtil.PrintError("Failed to import numpy python module. This is required for picking diverse molecules using hierarchical for clustering.")
 139     
 140     MiscUtil.PrintInfo("\nSelecting diverse molecules using %s hierarchical clustering methodology..." % OptionsInfo["SpecifiedClusteringMethod"])
 141     
 142     DiverseMols = []
 143     
 144     PoolSize = len(MolsFingerprints)
 145     PickSize = OptionsInfo["NumMols"]
 146     DistanceMatrix = GenerateLowerTriangularDistanceMatrix(MolsFingerprints)
 147     
 148     ClusterPicker = HierarchicalClusterPicker(OptionsInfo["SpecifiedClusteringMethodID"])
 149     MolIndices = ClusterPicker.Pick(numpy.asarray(DistanceMatrix), PoolSize, PickSize)
 150     
 151     for Index in MolIndices:
 152         DiverseMols.append(Mols[Index])
 153     
 154     return DiverseMols
 155 
 156 def RetrieveMolecules():
 157     """Retrieve molecules."""
 158 
 159     Infile = OptionsInfo["Infile"]
 160     
 161     # Read molecules...
 162     MiscUtil.PrintInfo("\nReading file %s..." % Infile)
 163     
 164     OptionsInfo["InfileParams"]["AllowEmptyMols"] = False
 165     ValidMols, MolCount, ValidMolCount  = RDKitUtil.ReadAndValidateMolecules(Infile, **OptionsInfo["InfileParams"])
 166     
 167     MiscUtil.PrintInfo("Total number of molecules: %d" % MolCount)
 168     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 169     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 170 
 171     return ValidMols
 172 
 173 def GenerateFingerprints(Mols):
 174     """Generate fingerprints."""
 175 
 176     FingerprintsName = OptionsInfo["SpecifiedFingerprints"]
 177     
 178     MolsFingerprints = []
 179     if re.match("^AtomPairs$", FingerprintsName, re.I):
 180         return GenerateAtomPairsFingerprints(Mols)
 181     elif re.match("^MACCS166Keys$", FingerprintsName, re.I):
 182         return GenerateMACCS166KeysFingerprints(Mols)
 183     elif re.match("^Morgan$", FingerprintsName, re.I):
 184         return GenerateMorganFingerprints(Mols)
 185     elif re.match("^MorganFeatures$", FingerprintsName, re.I):
 186         return GenerateMorganFeaturesFingerprints(Mols)
 187     elif re.match("^PathLength$", FingerprintsName, re.I):
 188         return GeneratePathLengthFingerprints(Mols)
 189     elif re.match("^TopologicalTorsions$", FingerprintsName, re.I):
 190         return GenerateTopologicalTorsionsFingerprints(Mols)
 191     else:
 192         MiscUtil.PrintError("Fingerprints name, %s, is not a valid name" % FingerprintsName)
 193     
 194     return MolsFingerprints
 195 
 196 def GenerateAtomPairsFingerprints(Mols):
 197     """Generate AtomPairs fingerprints."""
 198 
 199     MiscUtil.PrintInfo("\nGenerating AtomPairs fingerprints...")
 200     
 201     MinLength = OptionsInfo["FingerprintsParams"]["AtomPairs"]["MinLength"]
 202     MaxLength = OptionsInfo["FingerprintsParams"]["AtomPairs"]["MaxLength"]
 203     UseChirality = OptionsInfo["FingerprintsParams"]["AtomPairs"]["UseChirality"]
 204 
 205     if OptionsInfo["GenerateBitVectFingerints"]:
 206         # Generate ExplicitBitVect fingerprints...
 207         FPSize = 2048
 208         BitsPerHash = 4
 209         MolsFingerprints = [rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(Mol, minLength = MinLength, maxLength = MaxLength, includeChirality = UseChirality, nBits = FPSize, nBitsPerEntry = BitsPerHash) for Mol in Mols]
 210     else:
 211         # Generate IntSparseIntVect fingerprints...
 212         MolsFingerprints = [rdMolDescriptors.GetAtomPairFingerprint(Mol, minLength = MinLength, maxLength = MaxLength, includeChirality = UseChirality) for Mol in Mols]
 213 
 214     return MolsFingerprints
 215 
 216 def GenerateMACCS166KeysFingerprints(Mols):
 217     """Generate MACCS166Keys fingerprints."""
 218 
 219     MiscUtil.PrintInfo("\nGenerating MACCS166Keys fingerprints...")
 220 
 221     # Generate ExplicitBitVect fingerprints...
 222     MolsFingerprints = [rdMolDescriptors.GetMACCSKeysFingerprint(Mol) for Mol in Mols]
 223 
 224     return MolsFingerprints
 225 
 226 def GenerateMorganFingerprints(Mols):
 227     """Generate Morgan fingerprints."""
 228 
 229     MiscUtil.PrintInfo("\nGenerating  Morgan fingerprints...")
 230     
 231     Radius = OptionsInfo["FingerprintsParams"]["Morgan"]["Radius"]
 232     UseChirality = OptionsInfo["FingerprintsParams"]["Morgan"]["UseChirality"]
 233     UseFeatures = False
 234 
 235     if OptionsInfo["GenerateBitVectFingerints"]:
 236         # Generate ExplicitBitVect fingerprints...
 237         FPSize = 2048
 238         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprintAsBitVect(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality, nBits = FPSize) for Mol in Mols]
 239     else:
 240         # Generate UIntSparseIntVect fingerprints...
 241         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprint(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality) for Mol in Mols]
 242 
 243     return MolsFingerprints
 244 
 245 def GenerateMorganFeaturesFingerprints(Mols):
 246     """Generate MorganFeatures fingerprints."""
 247 
 248     MiscUtil.PrintInfo("\nGenerating  MorganFeatures fingerprints...")
 249     
 250     # Setup fingerprints parameters...
 251     Radius = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["Radius"]
 252     UseChirality = OptionsInfo["FingerprintsParams"]["MorganFeatures"]["UseChirality"]
 253     UseFeatures = True
 254     
 255     if OptionsInfo["GenerateBitVectFingerints"]:
 256         # Generate ExplicitBitVect fingerprints...
 257         FPSize = 2048
 258         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprintAsBitVect(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality, nBits = FPSize) for Mol in Mols]
 259     else:
 260         # Generate UIntSparseIntVect fingerprints...
 261         MolsFingerprints = [rdMolDescriptors.GetMorganFingerprint(Mol, Radius, useFeatures = UseFeatures, useChirality = UseChirality) for Mol in Mols]
 262 
 263     return MolsFingerprints
 264 
 265 def GeneratePathLengthFingerprints(Mols):
 266     """Generate PathLength fingerprints."""
 267 
 268     MiscUtil.PrintInfo("\nGenerating PathLength fingerprints ...")
 269     
 270     MinPath = OptionsInfo["FingerprintsParams"]["PathLength"]["MinPath"]
 271     MaxPath = OptionsInfo["FingerprintsParams"]["PathLength"]["MaxPath"]
 272     FPSize = OptionsInfo["FingerprintsParams"]["PathLength"]["FPSize"]
 273     BitsPerHash = OptionsInfo["FingerprintsParams"]["PathLength"]["BitsPerHash"]
 274     UseHs = False
 275     TargetDensity = 0.3
 276     MinSize = 54
 277 
 278     # Generate ExplicitBitVect fingerprints...
 279     MolsFingerprints = [FingerprintMols.FingerprintMol(Mol, minPath = MinPath, maxPath = MaxPath, fpSize = FPSize, bitsPerHash = BitsPerHash, useHs = UseHs, tgtDensity = TargetDensity, minSize = MinSize) for Mol in Mols]
 280 
 281     return MolsFingerprints
 282 
 283 def GenerateTopologicalTorsionsFingerprints(Mols):
 284     """Generate TopologicalTorsions fingerprints."""
 285 
 286     MiscUtil.PrintInfo("\nGenerating TopologicalTorsions fingerprints...")
 287     
 288     UseChirality = OptionsInfo["FingerprintsParams"]["TopologicalTorsions"]["UseChirality"]
 289 
 290     if OptionsInfo["GenerateBitVectFingerints"]:
 291         FPSize = 2048
 292         BitsPerHash = 4
 293         MolsFingerprints = [rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(Mol,  includeChirality = UseChirality, nBits = FPSize, nBitsPerEntry = BitsPerHash) for Mol in Mols]
 294     else:
 295         # Generate LongSparseIntVect fingerprint...
 296         MolsFingerprints = [rdMolDescriptors.GetTopologicalTorsionFingerprint(Mol,  includeChirality = UseChirality) for Mol in Mols]
 297 
 298     return MolsFingerprints
 299 
 300 def GenerateLowerTriangularDistanceMatrix(MolsFingerprints):
 301     """Generate a lower triangular distance matrix without the diagonal."""
 302 
 303     SimilarityFunction = OptionsInfo["SimilarityFunction"]
 304 
 305     DistanceMatrix = []
 306     NumFPs = len(MolsFingerprints)
 307     for Index1 in range(0, NumFPs):
 308         for Index2 in range(0, Index1):
 309             Distance =  1 - SimilarityFunction(MolsFingerprints[Index1], MolsFingerprints[Index2],)
 310             DistanceMatrix.append(Distance)
 311 
 312     return DistanceMatrix
 313 
 314 def WriteMolecules(Mols):
 315     """Write out molecules."""
 316 
 317     Outfile = OptionsInfo["Outfile"]
 318     
 319     # Set up a molecule writer...
 320     Writer = None
 321     Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 322     if Writer is None:
 323         MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 324     MiscUtil.PrintInfo("\nGenerating file %s...\n" % Outfile)
 325 
 326     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 327     
 328     # Write out molecules...
 329     for Mol in Mols:
 330         if Compute2DCoords:
 331             AllChem.Compute2DCoords(Mol)
 332         Writer.write(Mol)
 333         
 334     if Writer is not None:
 335         Writer.close()
 336         
 337     MiscUtil.PrintInfo("Total number of diverse molecules selected: %d" % (len(Mols)))
 338 
 339 def ProcessFingerprintsParameters():
 340     """Set up and process fingerprints parameters."""
 341 
 342     SetupFingerprintsNamesAndParameters()
 343     ProcessSpecifiedFingerprintsName()
 344     ProcessSpecifiedFingerprintsParameters()
 345 
 346 def SetupFingerprintsNamesAndParameters():
 347     """Set up fingerprints parameters."""
 348     
 349     OptionsInfo["FingerprintsNames"] = ["AtomPairs", "MACCS166Keys", "Morgan", "MorganFeatures", "PathLength", "TopologicalTorsions"]
 350     
 351     OptionsInfo["FingerprintsParams"] = {}
 352     OptionsInfo["FingerprintsParams"]["AtomPairs"] = {"MinLength": 1, "MaxLength": 30, "UseChirality": False}
 353     OptionsInfo["FingerprintsParams"]["MACCS166Keys"] = {}
 354     OptionsInfo["FingerprintsParams"]["Morgan"] = {"Radius": 2, "UseChirality": False}
 355     OptionsInfo["FingerprintsParams"]["MorganFeatures"] = {"Radius": 2, "UseChirality": False}
 356     OptionsInfo["FingerprintsParams"]["TopologicalTorsions"] = {"UseChirality": False}
 357     OptionsInfo["FingerprintsParams"]["PathLength"] = {"MinPath": 1, "MaxPath": 7, "FPSize": 2048, "BitsPerHash": 2}
 358 
 359 def ProcessSpecifiedFingerprintsName():
 360     """Process specified fingerprints name."""
 361 
 362     #  Set up a canonical fingerprints name map...
 363     CanonicalFingerprintsNamesMap = {}
 364     for Name in OptionsInfo["FingerprintsNames"]:
 365         CanonicalName = Name.lower()
 366         CanonicalFingerprintsNamesMap[CanonicalName] = Name
 367 
 368     # Validate specified fingerprints name...
 369     CanonicalFingerprintsName = OptionsInfo["Fingerprints"].lower()
 370     if CanonicalFingerprintsName not in CanonicalFingerprintsNamesMap:
 371         MiscUtil.PrintError("The fingerprints name, %s, specified using \"-f, --fingerprints\" option is not a valid name." % (OptionsInfo["Fingerprints"]))
 372     
 373     OptionsInfo["SpecifiedFingerprints"] = CanonicalFingerprintsNamesMap[CanonicalFingerprintsName]
 374 
 375 def ProcessSpecifiedFingerprintsParameters():
 376     """Process specified fingerprints parameters."""
 377 
 378     if re.match("^auto$", OptionsInfo["ParamsFingerprints"], re.I):
 379         # Nothing to process...
 380         return
 381 
 382     SpecifiedFingerprintsName = OptionsInfo["SpecifiedFingerprints"]
 383     
 384     # Parse specified fingerprints parameters...
 385     ParamsFingerprints = re.sub(" ", "", OptionsInfo["ParamsFingerprints"])
 386     if not ParamsFingerprints:
 387         MiscUtil.PrintError("No valid parameter name and value pairs specified using \"-p, --paramsFingerprints\" option corrresponding to fingerprints %s." % (SpecifiedFingerprintsName))
 388 
 389     ParamsFingerprintsWords = ParamsFingerprints.split(",")
 390     if len(ParamsFingerprintsWords) % 2:
 391         MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"-p, --paramsFingerprints\" option must be an even number." % (len(ParamsFingerprintsWords)))
 392 
 393     # Setup a canonical parameter names for specified fingerprints...
 394     ValidParamNames = []
 395     CanonicalParamNamesMap = {}
 396     for ParamName in sorted(OptionsInfo["FingerprintsParams"][SpecifiedFingerprintsName]):
 397         ValidParamNames.append(ParamName)
 398         CanonicalParamNamesMap[ParamName.lower()] = ParamName
 399 
 400     # Validate and set paramater names and value...
 401     for Index in range(0, len(ParamsFingerprintsWords), 2):
 402         Name = ParamsFingerprintsWords[Index]
 403         Value = ParamsFingerprintsWords[Index + 1]
 404 
 405         CanonicalName = Name.lower()
 406         if  not CanonicalName in CanonicalParamNamesMap:
 407             MiscUtil.PrintError("The parameter name, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid name. Supported parameter names: %s" % (Name, SpecifiedFingerprintsName, " ".join(ValidParamNames)))
 408 
 409         ParamName = CanonicalParamNamesMap[CanonicalName]
 410         if re.match("^UseChirality$", ParamName, re.I):
 411             if not re.match("^(Yes|No|True|False)$", Value, re.I):
 412                 MiscUtil.PrintError("The parameter value, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid value. Supported values: Yes No True False" % (Value, SpecifiedFingerprintsName))
 413             ParamValue = False
 414             if re.match("^(Yes|True)$", Value, re.I):
 415                 ParamValue = True
 416         else:
 417             ParamValue = int(Value)
 418             if ParamValue <= 0:
 419                 MiscUtil.PrintError("The parameter value, %s, specified using \"-p, --paramsFingerprints\" option for fingerprints, %s, is not a valid value. Supported values: > 0" % (Value, SpecifiedFingerprintsName))
 420         
 421         # Set value...
 422         OptionsInfo["FingerprintsParams"][SpecifiedFingerprintsName][ParamName] = ParamValue
 423 
 424 def ProcessSimilarityMetricParameter():
 425     """Process specified similarity metric value."""
 426 
 427     SimilarityInfoMap = {}
 428     CanonicalNameMap = {}
 429     
 430     for SimilarityFunctionInfo in DataStructs.similarityFunctions:
 431         Name = SimilarityFunctionInfo[0]
 432         Function = SimilarityFunctionInfo[1]
 433         
 434         SimilarityInfoMap[Name] = Function
 435         CanonicalName = Name.lower()
 436         CanonicalNameMap[CanonicalName] = Name
 437     
 438     SpecifiedCanonicalName = OptionsInfo["SimilarityMetric"].lower()
 439     SimilarityFunction = None
 440     if  SpecifiedCanonicalName in CanonicalNameMap:
 441         SimilarityName = CanonicalNameMap[SpecifiedCanonicalName]
 442         SimilarityFunction = SimilarityInfoMap[SimilarityName]
 443     else:
 444         MiscUtil.PrintError("Similarity metric name, %s, is not a valid name. " % OptionsInfo["SimilarityMetric"])
 445         
 446     OptionsInfo["SimilarityMetric"] = SimilarityName
 447     OptionsInfo["SimilarityFunction"] = SimilarityFunction
 448 
 449     # RDKit similarity functions, besides Dice and Tanimoto, are not able to handle int bit vectors...
 450     GenerateBitVectFingerints = False
 451     if not re.match("^(Tanimoto|Dice)$", SimilarityName, re.I):
 452         GenerateBitVectFingerints = True
 453     OptionsInfo["GenerateBitVectFingerints"] = GenerateBitVectFingerints
 454     
 455 def ProcessClusteringMethodParameter():
 456     """Process specified clustering method parameter."""
 457 
 458     OptionsInfo["SpecifiedClusteringMethod"] = ""
 459     OptionsInfo["SpecifiedClusteringMethodID"] = ""
 460     
 461     if not re.match("^HierarchicalClustering$", OptionsInfo["Mode"], re.I):
 462         # Nothing to process...
 463         return
 464 
 465     # Setup a canonical cluster method name map..
 466     ClusteringMethodInfoMap = {}
 467     CanonicalClusteringMethodNameMap = {}
 468     for Name in sorted(rdSimDivPickers.ClusterMethod.names):
 469         NameID =  rdSimDivPickers.ClusterMethod.names[Name]
 470         ClusteringMethodInfoMap[Name] = NameID
 471         
 472         CanonicalName = Name.lower()
 473         CanonicalClusteringMethodNameMap[CanonicalName] = Name
 474 
 475     CanonicalName = OptionsInfo["ClusteringMethod"].lower()
 476     if not CanonicalName in CanonicalClusteringMethodNameMap:
 477         MiscUtil.PrintError("The cluster method, %s, specified using \"-c, --clusteringMethod\" option is not a valid name." % (OptionsInfo["ClusteringMethod"]))
 478 
 479     SpecifiedClusteringMethodName = CanonicalClusteringMethodNameMap[CanonicalName]
 480     OptionsInfo["SpecifiedClusteringMethod"] = SpecifiedClusteringMethodName
 481     OptionsInfo["SpecifiedClusteringMethodID"] = ClusteringMethodInfoMap[SpecifiedClusteringMethodName] 
 482     
 483 def ProcessOptions():
 484     """Process and validate command line arguments and options"""
 485     
 486     MiscUtil.PrintInfo("Processing options...")
 487     
 488     # Validate options...
 489     ValidateOptions()
 490     
 491     OptionsInfo["Mode"] = Options["--mode"]
 492     OptionsInfo["Fingerprints"] = Options["--fingerprints"]
 493     
 494     OptionsInfo["ClusteringMethod"] = Options["--clusteringMethod"]
 495     ProcessClusteringMethodParameter()
 496 
 497     OptionsInfo["NumMols"] = int(Options["--numMols"])
 498     
 499     OptionsInfo["Infile"] = Options["--infile"]
 500     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 501     
 502     OptionsInfo["Outfile"] = Options["--outfile"]
 503     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 504     
 505     OptionsInfo["Overwrite"] = Options["--overwrite"]
 506 
 507     OptionsInfo["SimilarityMetric"] = Options["--similarityMetric"]
 508     ProcessSimilarityMetricParameter()
 509     
 510     OptionsInfo["ParamsFingerprints"] = Options["--paramsFingerprints"]
 511     ProcessFingerprintsParameters()
 512     
 513 def RetrieveOptions():
 514     """Retrieve command line arguments and options"""
 515     
 516     # Get options...
 517     global Options
 518     Options = docopt(_docoptUsage_)
 519     
 520     # Set current working directory to the specified directory...
 521     WorkingDir = Options["--workingdir"]
 522     if WorkingDir:
 523         os.chdir(WorkingDir)
 524     
 525     # Handle examples option...
 526     if "--examples" in Options and Options["--examples"]:
 527         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 528         sys.exit(0)
 529 
 530 def ValidateOptions():
 531     """Validate option values"""
 532     
 533     MiscUtil.ValidateOptionTextValue("-c, --clusteringMethod", Options["--clusteringMethod"], "Centroid CLink Gower McQuitty SLink UPGMA Ward")
 534     MiscUtil.ValidateOptionTextValue("-f, --fingerprints", Options["--fingerprints"], "AtomPairs MACCS166Keys Morgan MorganFeatures PathLength TopologicalTorsions")
 535     
 536     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "MaxMin HierarchicalClustering")
 537     MiscUtil.ValidateOptionIntegerValue("-n, --numMols", Options["--numMols"], {">": 0})
 538     
 539     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 540     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 541     
 542     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 543     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 544     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 545         
 546     MiscUtil.ValidateOptionTextValue("-s, --similarityMetric", Options["--similarityMetric"], "BraunBlanquet Cosine Dice Kulczynski RogotGoldberg Russel Sokal Tanimoto")
 547     
 548 # Setup a usage string for docopt...
 549 _docoptUsage_ = """
 550 RDKitPickDiverseMolecules.py - Pick a diverse subset of molecules
 551 
 552 Usage:
 553     RDKitPickDiverseMolecules.py [--clusteringMethod <Centroid, CLink...>]
 554                                  [--fingerprints <MACCS166Keys, Morgan, PathLength...>]
 555                                  [--infileParams <Name,Value,...>] [--mode <MaxMin or HierarchicalClustering>]
 556                                  [--numMols <number>]  [--outfileParams <Name,Value,...>] 
 557                                  [--overwrite] [--paramsFingerprints <Name,Value,...>]
 558                                  [--similarityMetric <Dice, Tanimoto...>] [-w <dir>] -i <infile> -o <outfile> 
 559     RDKitPickDiverseMolecules.py -h | --help | -e | --examples
 560 
 561 Description:
 562     Pick a subset of diverse molecules  based on a variety of 2D fingerprints using
 563     MaxMin [ Ref 135 ] or an available hierarchical clustering methodology and write
 564     them to a file.
 565 
 566     The default fingerprints types for various fingerprints are shown below:
 567 
 568         AtomPairs              IntSparseIntVect
 569         MACCS166Keys           ExplicitBitVect
 570         Morgan                 UIntSparseIntVect
 571         MorganFeatures         UIntSparseIntVect
 572         PathLength             ExplicitBitVect
 573         TopologicalTorsions    LongSparseIntVect
 574  
 575     The Dice and Tanimoto similarity functions available in RDKit are able to
 576     handle fingerprints corresponding to both IntVect and BitVect. All other
 577     similarity functions, however, expect BitVect fingerprints to calculate
 578     pairwise similarity. Consequently, ExplicitBitVect fingerprints are generated
 579     for AtomPairs, Morgan, MorganFeatures, and TopologicalTorsions for
 580     similarity calculations instead of default IntVect fingerprints.
 581 
 582     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt)
 583 
 584     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 585 
 586 Options:
 587     -c, --clusteringMethod <Centroid, CLink...>  [default: Centroid]
 588         Clustering method to use for picking a subset of diverse molecules during
 589         hierarchical clustering. Supported values: Centroid, CLink, Gower,
 590         McQuitty, SLink, UPGMA, Ward. This option is ignored for 'MaxMin' value
        of '-m, --mode' option. The CLink and SLink values correspond to the
        CompleteLink and SingleLink clustering methods.
 593     -f, --fingerprints <MACCS166Keys, Morgan, PathLength...>  [default: Morgan]
 594         Fingerprints to use for calculating similarity/distance between molecules.
 595         Supported values: AtomPairs, MACCS166Keys, Morgan, MorganFeatures, PathLength,
 596         TopologicalTorsions. The PathLength fingerprints are Daylight like fingerprints.
        The Morgan and MorganFeatures fingerprints are circular fingerprints corresponding
        to Scitegic's Extended Connectivity Fingerprints (ECFP) and Features Connectivity
 599         Fingerprints (FCFP). The values of default parameters for generating fingerprints
 600         can be modified using '-p, --paramsFingerprints' option.
 601     -e, --examples
 602         Print examples.
 603     -h, --help
 604         Print this help message.
 605     -i, --infile <infile>
 606         Input file name.
 607     --infileParams <Name,Value,...>  [default: auto]
 608         A comma delimited list of parameter name and value pairs for reading
 609         molecules from files. The supported parameter names for different file
 610         formats, along with their default values, are shown below:
 611             
 612             SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 613             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 614                 smilesTitleLine,auto,sanitize,yes
 615             
 616         Possible values for smilesDelimiter: space, comma or tab.
 617     -m, --mode <MaxMin or HierarchicalClustering>  [default: MaxMin]
 618         Pick a diverse subset of molecules using MaxMin or hierarchical clustering
 619         methodology.
 620     -n, --numMols <number>  [default: 25]
 621         Number of diverse molecules to pick.
 622     -o, --outfile <outfile>
 623         Output file name.
 624     --outfileParams <Name,Value,...>  [default: auto]
 625         A comma delimited list of parameter name and value pairs for writing
 626         molecules to files. The supported parameter names for different file
 627         formats, along with their default values, are shown below:
 628             
 629             SD: compute2DCoords,auto,kekulize,no
 630             SMILES: kekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 631                 smilesTitleLine,yes
 632             
 633         Default value for compute2DCoords: yes for SMILES input file; no for all other
 634         file types.
 635     --overwrite
 636         Overwrite existing files.
 637     -p, --paramsFingerprints <Name,Value,...>  [default: auto]
 638         Parameter values to use for generating fingerprints. The default values
 639         are dependent on the value of '-f, --fingerprints' option. In general, it is a
 640         comma delimited list of parameter name and value pairs for the name of
 641         the fingerprints specified using '-f, --fingerprints' option. The supported
 642         parameter names along with their default values for valid fingerprints
 643         names are shown below:
 644             
 645             AtomPairs: minLength,1 ,maxLength,30, useChirality,No
 646             Morgan:   radius,2, useChirality,No
 647             MorganFeatures:   radius,2, useChirality,No
 648             PathLength: minPath,1, maxPath,7, fpSize, 2048, bitsPerHash,2
 649             TopologicalTorsions: useChirality,No
 650             
 651     -s, --similarityMetric <Dice, Tanimoto...>  [default: Tanimoto]
 652         Similarity metric to use for calculating similarity/distance between molecules.
 653         Possible values: BraunBlanquet, Cosine, Dice, Kulczynski, RogotGoldberg,
 654         Russel, Sokal, Tanimoto.
 655     -w, --workingdir <dir>
 656         Location of working directory which defaults to the current directory.
 657 
 658 Examples:
 659     To pick 25 diverse molecules using MaxMin methodology, Tanimoto similarity
 660     metric corresponding to Morgan fingerprints with radius of 2, and write
 661     out a SMILES file, type:
 662 
 663         % RDKitPickDiverseMolecules.py  -i Sample.smi -o SampleOut.smi
 664 
 665     To pick 50 diverse molecules using MaxMin methodology, Dice similarity metric
 666     corresponding to PathLength fingerprints with max path length of 6, and write
 667     out a SD file, type:
 668 
 669         % RDKitPickDiverseMolecules.py  -m MaxMin -f PathLength -s Dice -n 50
 670           -p 'maxPath,6' -i Sample.sdf -o SampleOut.sdf
 671 
 672     To pick 25 diverse molecules using Centroid hierarchical clustering methodology,
 673     Tanimoto similarity metric corresponding to Morgan fingerprints with radius of 2,
 674     and write out a SMILES file, type:
 675 
 676         % RDKitPickDiverseMolecules.py  -m HierarchicalClustering -i Sample.smi
 677           -o SampleOut.smi
 678 
    To pick 50 diverse molecules using Ward hierarchical clustering methodology,
    Dice similarity metric corresponding to MorganFeatures fingerprints with radius
    of 2 along with deploying chirality, and write out a SD file, type:

        % RDKitPickDiverseMolecules.py  -m HierarchicalClustering -c Ward -n 50
          -s Dice -f MorganFeatures -p 'radius,2,useChirality,Yes' -i Sample.sdf -o
          SampleOut.sdf
 686 
    To pick 25 diverse molecules using MaxMin methodology, Tanimoto similarity
    metric corresponding to Morgan fingerprints with radius of 2 from a CSV SMILES
    file, SMILES strings in column 1, name in column 2, and write out a SD file, type:
 690 
 691         % RDKitPickDiverseMolecules.py  --infileParams
 692           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 693           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 694           -i SampleSMILES.csv -o SampleOut.sdf
 695 
 696 Author:
 697     Manish Sud(msud@san.rr.com)
 698 
 699 See also:
 700     RDKitClusterMolecules.py, RDKitConvertFileFormat.py, RDKitSearchFunctionalGroups.py,
 701     RDKitSearchSMARTS.py
 702 
 703 Copyright:
 704     Copyright (C) 2019 Manish Sud. All rights reserved.
 705 
 706     The functionality available in this script is implemented using RDKit, an
 707     open source toolkit for cheminformatics developed by Greg Landrum.
 708 
 709     This file is part of MayaChemTools.
 710 
 711     MayaChemTools is free software; you can redistribute it and/or modify it under
 712     the terms of the GNU Lesser General Public License as published by the Free
 713     Software Foundation; either version 3 of the License, or (at your option) any
 714     later version.
 715 
 716 """
 717 
# Run the script only when it is executed directly, not when it is imported...
if __name__ == "__main__":
    main()