MayaChemTools

    1 #!/bin/env python
    2 #
    3 # File: VisualizeChemspaceUsingTMAP.py
    4 # Author: Manish Sud <msud@san.rr.com>
    5 #
    6 # Copyright (C) 2024 Manish Sud. All rights reserved.
    7 #
    8 # The functionality available in this script is implemented using TMAP and
    9 # Faerun, open source software packages for visualizing chemspace, and
   10 # RDKit, an open source toolkit for cheminformatics developed by Greg
   11 # Landrum.
   12 #
   13 # This file is part of MayaChemTools.
   14 #
   15 # MayaChemTools is free software; you can redistribute it and/or modify it under
   16 # the terms of the GNU Lesser General Public License as published by the Free
   17 # Software Foundation; either version 3 of the License, or (at your option) any
   18 # later version.
   19 #
   20 # MayaChemTools is distributed in the hope that it will be useful, but without
   21 # any warranty; without even the implied warranty of merchantability of fitness
   22 # for a particular purpose.  See the GNU Lesser General Public License for more
   23 # details.
   24 #
   25 # You should have received a copy of the GNU Lesser General Public License
   26 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
   27 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
   28 # Boston, MA, 02111-1307, USA.
   29 #
   30 
   31 from __future__ import print_function
   32 
   33 # Add local python path to the global path and import standard library modules...
   34 import os
   35 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
   36 import time
   37 import re
   38 import csv
   39 import shutil
   40 import multiprocessing as mp
   41 import pandas as pd
   42 import numpy as np
   43 
   44 # TMAP and Faerun imports...
   45 try:
   46     import tmap as tm
   47     from faerun import Faerun
   48     from mhfp.encoder import MHFPEncoder
   49 except ImportError as ErrMsg:
   50     sys.stderr.write("\nFailed to import TMAP/Faerun module/package: %s\n" % ErrMsg)
   51     sys.stderr.write("Check/update your TMAP environment and try again.\n\n")
   52     sys.exit(1)
   53 
   54 # RDKit imports...
   55 try:
   56     from rdkit import rdBase
   57 except ImportError as ErrMsg:
   58     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
   59     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
   60     sys.exit(1)
   61 
   62 # MayaChemTools imports...
   63 try:
   64     from docopt import docopt
   65     import MiscUtil
   66     import RDKitUtil
   67 except ImportError as ErrMsg:
   68     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
   69     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
   70     sys.exit(1)
   71 
# Base name of the script as invoked; used in the start/done progress messages.
ScriptName = os.path.basename(sys.argv[0])
# Command line options keyed by option name (e.g. "--faerunConfigParams").
# Replaced in each worker process by InitializeWorkerProcess().
Options = {}
# Processed option values and other shared state read by the functions below.
# Replaced in each worker process by InitializeWorkerProcess().
OptionsInfo = {}
   75 
   76 def main():
   77     """Start execution of the script."""
   78     
   79     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
   80     
   81     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
   82     
   83     # Retrieve command line arguments and options...
   84     RetrieveOptions()
   85     
   86     # Process and validate command line arguments and options...
   87     ProcessOptions()
   88     
   89     # Perform actions required by the script...
   90     VisualizeChemspace()
   91     
   92     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
   93     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
   94 
   95 def VisualizeChemspace():
   96     """Visualize chemspace using TMAP."""
   97 
   98     InfileDF = ReadMoleculeData()
   99 
  100     MolCount, ValidMolCount, VisualizationFailedCount = ProcessMolecules(InfileDF)
  101     
  102     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
  103     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
  104     MiscUtil.PrintInfo("Number of molecules failed during chemspace visualization: %d" % VisualizationFailedCount)
  105     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
  106 
  107 def ProcessMolecules(InfileDF):
  108     """Process molecules and generate TMAP."""
  109 
  110     MolCount = len(InfileDF)
  111     (ValidMolCount, VisualizationFailedCount) = [0] * 2
  112     
  113     # Setup parameter values for "auto" options based on the number of molecules...
  114     ProcessMolCountBasedAutoOptions(MolCount)
  115     
  116     # Setup LSH forest...
  117     LSHForest, ValidMolCount, VisualizationFailedCount = SetupLSHForest(InfileDF)
  118     if ValidMolCount == 0:
  119         return (MolCount, ValidMolCount, VisualizationFailedCount)
  120 
  121     SetupTMAPDisplayMessage(MolCount, ValidMolCount)
  122     
  123     # Generate TMAP coordinates... 
  124     PlotCoordsInfo = GenerateTMAPCoordinates(LSHForest)
  125     
  126     # Setup TMAP plot data...
  127     PlotDataInfo = SetupTMAPPlotData(InfileDF)
  128 
  129     # Setup TMAP plot...
  130     GenerateTMAPPlot(InfileDF, PlotCoordsInfo, PlotDataInfo)
  131     
  132     return (MolCount, ValidMolCount, VisualizationFailedCount)
  133 
  134 def SetupLSHForest(InfileDF):
  135     """Setup LSH forest. """
  136 
  137     if OptionsInfo["LSHForestFileRestoreMode"]:
  138         return RestoreLSHForest((InfileDF))
  139     else:
  140         return GenerateLSHForest(InfileDF)
  141             
  142 def RestoreLSHForest(InfileDF):
  143     """Restore LSH forest. """
  144 
  145     (ValidMolCount, VisualizationFailedCount) = [0] * 2
  146 
  147     # Set valid molecule count to number of molecules in input file...
  148     ValidMolCount = len(InfileDF)
  149     
  150     LSHForestFile = OptionsInfo["OutfileLSHForest"]
  151     MiscUtil. PrintInfo("\nRestoring LSH forest from %s..." % LSHForestFile)
  152     if not os.path.isfile(LSHForestFile):
  153         MiscUtil. PrintError("LSH forest file %s is missing. Failed to restore LSH forest...\n" % LSHForestFile)
  154     
  155     LSHForest = InitializeLSHForest()
  156     LSHForest.restore(LSHForestFile)
  157 
  158     if LSHForest.size() != ValidMolCount:
  159         MiscUtil.PrintError("The number of molecules, %s, in input file must match number of nodes, %s, in LSH forest during its restoration from a file using \"--lshForestFileWrite\" option." % (ValidMolCount, LSHForest.size()))
  160     
  161     return (LSHForest, ValidMolCount, VisualizationFailedCount)
  162 
  163 def GenerateLSHForest(InfileDF):
  164     """Generate LSH forest. """
  165     
  166     MinHashFingerprints, ValidMolCount, FingerprintsFailedCount = GenerateMinHashFingerprints(InfileDF)
  167     
  168     MiscUtil. PrintInfo("\nGenerating LSH forest...")
  169     LSHForest = InitializeLSHForest()
  170     
  171     LSHForest.batch_add(MinHashFingerprints)
  172     LSHForest.index()
  173 
  174     # Write out LSH forest...
  175     if OptionsInfo["LSHForestFileWriteMode"]:
  176         OutfileLSHForest = OptionsInfo["OutfileLSHForest"]
  177         if FingerprintsFailedCount > 0:
  178             MiscUtil. PrintWarning("The MinHash fingerprints generation failed for %s molecules. Skipped writing of file %s..." % (FingerprintsFailedCount, OutfileLSHForest))
  179         else:
  180             MiscUtil. PrintInfo("Writing LSH forest file %s..." % OutfileLSHForest)
  181             LSHForest.store(OutfileLSHForest)
  182     
  183     return (LSHForest, ValidMolCount, FingerprintsFailedCount)
  184 
  185 def GenerateMinHashFingerprints(InfileDF):
  186     """Generate MinHash fingerprints."""
  187 
  188     if OptionsInfo["MPMode"]:
  189         return GenerateMinHashFingerprintsUsingMultipleProcesses(InfileDF)
  190     else:
  191         return GenerateMinHashFingerprintsUsingSingleProcess(InfileDF)
  192 
  193 def GenerateMinHashFingerprintsUsingSingleProcess(InfileDF):
  194     """Generate MHFPs using a single processs. """
  195 
  196     MiscUtil. PrintInfo("\nGenerating MinHash fingerprints using a single process...")
  197     
  198     MinHashFingerprintsEncoder = InitializeMinHashFingerprintsEncoder()
  199     
  200     (ValidMolCount, FingerprintsFailedCount) = [0] * 2
  201     MinHashFingerprints = []
  202     FingerprintsFailedRowIndices = []
  203     
  204     SMILESColname = OptionsInfo["SMILESColname"]
  205     for MolIndex, SMILES in enumerate(InfileDF[SMILESColname]):
  206         MinHashFingerprint = GenerateMinHashFingerprintForMolecule(MinHashFingerprintsEncoder, SMILES)
  207         if MinHashFingerprint is None:
  208             FingerprintsFailedCount += 1
  209             FingerprintsFailedRowIndices.append(MolIndex)
  210         else:
  211             ValidMolCount += 1
  212             MinHashFingerprints.append(tm.VectorUint(MinHashFingerprint))
  213 
  214     # Remove failed molecules from the dataframe...
  215     RemoveFingerprintsFailedRows(InfileDF, FingerprintsFailedRowIndices)
  216     
  217     return (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount)
  218 
  219 def GenerateMinHashFingerprintsUsingMultipleProcesses(InfileDF):
  220     """Generate MHFPs using multiprocessing."""
  221 
  222     MiscUtil. PrintInfo("\nGenerating MinHash fingerprints using multiprocessing...")
  223     
  224     MPParams = OptionsInfo["MPParams"]
  225     
  226     # Setup data for initializing a worker process...
  227     InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))
  228 
  229     # Setup SMILES iterator...
  230     SMILESColname = OptionsInfo["SMILESColname"]
  231     WorkerProcessDataIterable = SetupSMILESWithMolIndices(InfileDF[SMILESColname])
  232     
  233     # Setup process pool along with data initialization for each process...
  234     if not OptionsInfo["QuietMode"]:
  235         MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
  236         MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))
  237     
  238     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
  239     
  240     # Start processing...
  241     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
  242         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
  243     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
  244         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
  245     else:
  246         MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))
  247     
  248     (ValidMolCount, FingerprintsFailedCount) = [0] * 2
  249     MinHashFingerprints = []
  250     FingerprintsFailedRowIndices = []
  251     
  252     for Result in Results:
  253         Molndex, MinHashFingerprint = Result
  254         
  255         if MinHashFingerprint is None:
  256             FingerprintsFailedCount += 1
  257             FingerprintsFailedRowIndices.append(Molndex)
  258         else:
  259             ValidMolCount += 1
  260             MinHashFingerprints.append(tm.VectorUint(np.array(MinHashFingerprint)))
  261     
  262     # Remove failed molecules from the dataframe...
  263     RemoveFingerprintsFailedRows(InfileDF, FingerprintsFailedRowIndices)
  264     
  265     return (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount)
  266 
  267 def InitializeWorkerProcess(*EncodedArgs):
  268     """Initialize data for a worker process."""
  269     
  270     global Options, OptionsInfo
  271     
  272     if not OptionsInfo["QuietMode"]:
  273         MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
  274     
  275     # Decode Options and OptionInfo...
  276     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
  277     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
  278 
  279     # Initialize MHFP encoder...
  280     OptionsInfo["MinHashFingerprintsEncoder"] = InitializeMinHashFingerprintsEncoder()
  281 
  282 def WorkerProcess(MolInfo):
  283     """Process data for a worker process."""
  284     
  285     MolIndex, SMILES = MolInfo
  286     
  287     MinHashFingerprint = GenerateMinHashFingerprintForMolecule(OptionsInfo["MinHashFingerprintsEncoder"], SMILES)
  288     if MinHashFingerprint is not None:
  289         MinHashFingerprint = MinHashFingerprint.tolist()
  290 
  291     return (MolIndex, MinHashFingerprint)
  292 
  293 def SetupSMILESWithMolIndices(SMILES):
  294     """Setup an iterator to generate SMILES string along with a molecule index."""
  295 
  296     for MolIndex, MolSMILES in enumerate(SMILES):
  297         yield(MolIndex, MolSMILES)
  298 
  299 def GenerateMinHashFingerprintForMolecule(MinHashFingerprintsEncoder, SMILES):
  300     """Generate MinHash fingerprint for a molecule. """
  301     
  302     MinHashFingerprint = None
  303     try:
  304         MinHashFingerprint = MinHashFingerprintsEncoder.encode(SMILES, radius = OptionsInfo["MinHashFPParams"]["Radius"], rings = OptionsInfo["MinHashFPParams"]["Rings"], kekulize = OptionsInfo["MinHashFPParams"]["Kekulize"], min_radius = OptionsInfo["MinHashFPParams"]["MinRadius"], sanitize = OptionsInfo["MinHashFPParams"]["Sanitize"])
  305     except Exception as ErrMsg:
  306         if not OptionsInfo["QuietMode"]:
  307             MiscUtil.PrintWarning("Failed to generate MinHash fingerprint for SMILES %s:\n%s\n" % (SMILES, ErrMsg))
  308         else:
  309             MiscUtil.PrintInfo("")
  310         MinHashFingerprint = None
  311     
  312     return MinHashFingerprint
  313 
  314 def RemoveFingerprintsFailedRows(InfileDF, FingerprintsFailedRowIndices):
  315     """Remove fingerprints failed rows."""
  316     
  317     if len(FingerprintsFailedRowIndices):
  318         InfileDF.drop(FingerprintsFailedRowIndices, inplace = True)
  319         InfileDF.reset_index(drop = True, inplace = True)
  320     
  321 def GenerateTMAPCoordinates(LSHForest):
  322     """Generate TMAP coordinates. """
  323     
  324     MiscUtil.PrintInfo("\nGenerating TMAP plot coordinates...")
  325     
  326     PlotCoordsInfo = {}
  327     PlotCoordsInfo["NodeXCoords"] = None
  328     PlotCoordsInfo["NodeYCoords"] = None
  329     PlotCoordsInfo["EdgeNodeStartList"] = None
  330     PlotCoordsInfo["EdgeNodeToList"] = None
  331     
  332 
  333     LSHLayoutConfigParams = OptionsInfo["LSHLayoutConfigParams"]
  334     LSHLayoutConfig  = tm.LayoutConfiguration()
  335     
  336     LSHLayoutConfig.k = LSHLayoutConfigParams["K"]
  337     LSHLayoutConfig.kc = LSHLayoutConfigParams["KC"]
  338     LSHLayoutConfig.fme_iterations = LSHLayoutConfigParams["FMEIterations"]
  339     LSHLayoutConfig.fme_randomize = LSHLayoutConfigParams["FMERandomize"]
  340     LSHLayoutConfig.fme_threads = LSHLayoutConfigParams["FMEThreads"]
  341     LSHLayoutConfig.fme_precision = LSHLayoutConfigParams["FMEPrecision"]
  342     LSHLayoutConfig.sl_repeats = LSHLayoutConfigParams["SLRepeats"]
  343     LSHLayoutConfig.sl_extra_scaling_steps = LSHLayoutConfigParams["SLExtraScalingSteps"]
  344     LSHLayoutConfig.sl_scaling_min = LSHLayoutConfigParams["SLScalingMin"]
  345     LSHLayoutConfig.sl_scaling_max = LSHLayoutConfigParams["SLScalingMax"]
  346     LSHLayoutConfig.sl_scaling_type = LSHLayoutConfigParams["SLScalingType"]
  347     LSHLayoutConfig.mmm_repeats = LSHLayoutConfigParams["MMMRepeats"]
  348     LSHLayoutConfig.placer = LSHLayoutConfigParams["Placer"]
  349     LSHLayoutConfig.merger = LSHLayoutConfigParams["Merger"]
  350     LSHLayoutConfig.merger_factor = LSHLayoutConfigParams["MergerFactor"]
  351     LSHLayoutConfig.merger_adjustment = LSHLayoutConfigParams["MergerAdjustment"]
  352     LSHLayoutConfig.node_size = 1.0 / LSHLayoutConfigParams["NodeSizeDenominator"]
  353 
  354     NodeXCoords, NodeYCoords, EdgeNodeStartList, EdgeNodeToList, _ = tm.layout_from_lsh_forest(LSHForest, config = LSHLayoutConfig)
  355 
  356     PlotCoordsInfo["NodeXCoords"] = NodeXCoords
  357     PlotCoordsInfo["NodeYCoords"] = NodeYCoords
  358     PlotCoordsInfo["EdgeNodeStartList"] = EdgeNodeStartList
  359     PlotCoordsInfo["EdgeNodeToList"] = EdgeNodeToList
  360     
  361     return PlotCoordsInfo
  362 
  363 def SetupTMAPPlotData(InfileDF):
  364     """Setup plot data for TMAP plot."""
  365     
  366     MiscUtil.PrintInfo("\nSetting up TMAP plot data...")
  367     
  368     PlotDataInfo = {}
  369     PlotDataInfo["Columns"] = []
  370     PlotDataInfo["Colormaps"] = []
  371     PlotDataInfo["CategoricalStatus"] = []
  372     PlotDataInfo["LegendLabels"] = []
  373     PlotDataInfo["SeriesTitles"] = []
  374     
  375     # Setup categorical data...
  376     if OptionsInfo["CategoricalDataColnames"] is not None:
  377         for ColnameIndex, Colname in enumerate(OptionsInfo["CategoricalDataColnames"]):
  378             CategoryLabels, CategoryData = Faerun.create_categories(InfileDF[Colname])
  379             if len(CategoryLabels) > OptionsInfo["CategoricalDataMaxDisplay"]:
  380                 CategoryLabels, CategoryData = RemapCategoricalPlotData(CategoryLabels, CategoryData)
  381             
  382             PlotDataInfo["Columns"].append(CategoryData)
  383             PlotDataInfo["Colormaps"].append(OptionsInfo["CategoricalDataColormapsList"][ColnameIndex])
  384             PlotDataInfo["CategoricalStatus"].append(True)
  385             PlotDataInfo["LegendLabels"].append(CategoryLabels)
  386             PlotDataInfo["SeriesTitles"].append(Colname)
  387      
  388     # Setup numerical data...
  389     if OptionsInfo["NumericalDataColnames"] is not None:
  390         for ColnameIndex, Colname in enumerate(OptionsInfo["NumericalDataColnames"]):
  391             PlotDataInfo["Columns"].append(InfileDF[Colname])
  392             PlotDataInfo["Colormaps"].append(OptionsInfo["NumericalDataColormapsList"][ColnameIndex])
  393             PlotDataInfo["CategoricalStatus"].append(False)
  394             PlotDataInfo["LegendLabels"].append(None)
  395             PlotDataInfo["SeriesTitles"].append(Colname)
  396 
  397     # Setup structure display data...
  398     FirstCol = True
  399     SMILESSelectedData = []
  400     SMILESSelectedLabels = []
  401     FirstCol = True
  402     for Colname in OptionsInfo["StructureDisplayDataColnames"]:
  403         if FirstCol:
  404             FirstCol = False
  405             SMILESSelectedData = InfileDF[Colname]
  406             SMILESSelectedLabels.append(Colname)
  407         else:
  408             SMILESSelectedData = SMILESSelectedData +  '__'  + InfileDF[Colname].astype(str)
  409             SMILESSelectedLabels.append(Colname)
  410     
  411     PlotDataInfo["SMILESSelectedData"] = SMILESSelectedData
  412     PlotDataInfo["SMILESSelectedLabels"] = SMILESSelectedLabels
  413     
  414     return PlotDataInfo
  415 
  416 def RemapCategoricalPlotData(CategoryLabels, CategoryData):
  417     """Ramap categorical plot data."""
  418     
  419     if len(CategoryLabels) <= OptionsInfo["CategoricalDataMaxDisplay"]:
  420         return (CategoryLabels, CategoryData)
  421     
  422     # Track categories to remap...
  423     CategoryLabelsNew = []
  424     CategoryValuesToRemap = []
  425     LastCategoryValue = 0
  426     
  427     for CategoryLabelIndex, CategoryLabel in enumerate(CategoryLabels):
  428         CategoryValue, CategroyName = CategoryLabel
  429         if CategoryLabelIndex < OptionsInfo["CategoricalDataMaxDisplay"]:
  430             CategoryLabelsNew.append((CategoryValue, CategroyName))
  431             LastCategoryValue = CategoryValue
  432         else:
  433             CategoryValuesToRemap.append(CategoryValue)
  434     
  435     # Set up other category...
  436     OtherCategoryValue = LastCategoryValue + 1
  437     OtherCategoryName = "Other"
  438     CategoryLabelsNew.append((OtherCategoryValue, OtherCategoryName))
  439     
  440     # Update category labels and data...
  441     CategoryLabels = CategoryLabelsNew
  442     for ValueIndex, Value in enumerate(CategoryData):
  443         if Value in CategoryValuesToRemap:
  444             CategoryData[ValueIndex] = OtherCategoryValue
  445     
  446     return (CategoryLabels, CategoryData)
  447     
  448 def GenerateTMAPPlot(InfileDF, PlotCoordsInfo, PlotDataInfo):
  449     """Generate TMAP plot. """
  450     
  451     MiscUtil.PrintInfo("\nGenerating TMAP plot...")
  452     
  453     # Initialize Faerun plot...
  454     FaerunConfigParams = OptionsInfo["FaerunConfigParams"]
  455     ImpressMsg = OptionsInfo["TMAPDisplayMsg"]
  456     TMAPFaerunPlot = Faerun(clear_color = FaerunConfigParams["ClearColor"], view = "front", coords = False, title = "", x_title = "", y_title = "", show_legend = FaerunConfigParams["ShowLegend"],  legend_title = FaerunConfigParams["LegendTitle"], legend_orientation = FaerunConfigParams["LegendOrientation"], legend_number_format = FaerunConfigParams["LegendNumberFormat"], scale = FaerunConfigParams["Scale"], alpha_blending = FaerunConfigParams["AlphaBlending"], anti_aliasing = FaerunConfigParams["AntiAliasing"], thumbnail_width = FaerunConfigParams["ThumbnailWidth"], thumbnail_fixed = FaerunConfigParams["ThumbnailFixed"], impress = ImpressMsg)
  457     
  458     # Setup scatter plot...
  459     ScatterPlotName = "Data"
  460     ScatterTreePlotName = "%s_tree" % ScatterPlotName
  461     FaerunScatterPlotParams = OptionsInfo["FaerunScatterPlotParams"]
  462     TMAPFaerunPlot.add_scatter(ScatterPlotName, {"x": PlotCoordsInfo["NodeXCoords"], "y": PlotCoordsInfo["NodeYCoords"], "c": PlotDataInfo["Columns"], "labels": PlotDataInfo["SMILESSelectedData"]}, colormap = PlotDataInfo["Colormaps"], shader = FaerunScatterPlotParams["Shader"],  point_scale = FaerunScatterPlotParams["PointScale"],  max_point_size = FaerunScatterPlotParams["MaxPointSize"],  fog_intensity = FaerunScatterPlotParams["FogIntensity"], categorical = PlotDataInfo["CategoricalStatus"], interactive = FaerunScatterPlotParams["Interactive"], has_legend = True,  legend_labels = PlotDataInfo["LegendLabels"], series_title = PlotDataInfo["SeriesTitles"], selected_labels = PlotDataInfo["SMILESSelectedLabels"])
  463 
  464     # Add scatter plot to Faerun...
  465     TMAPFaerunPlot.add_tree(ScatterTreePlotName, {"from": PlotCoordsInfo["EdgeNodeStartList"], "to": PlotCoordsInfo["EdgeNodeToList"]}, point_helper = ScatterPlotName)
  466 
  467     # Write out TMAP plot HTML and JS files...
  468     MiscUtil.PrintInfo("Writing TMAP plot files %s and %s..." % (OptionsInfo["Outfile"], OptionsInfo["OutfileJS"]))
  469     TMAPFaerunPlot.plot(OptionsInfo["OutfilePrefix"], template = "smiles")
  470 
  471     if OptionsInfo["MergeHTMLandJSFilesMode"]:
  472         MergeTMAPResultsHTMLAndJSFiles()
  473     
  474 def MergeTMAPResultsHTMLAndJSFiles():
  475     """Merge TMAP HTML and JS files."""
  476     
  477     MiscUtil.PrintInfo("\nMerging TMAP plot file %s into  %s..." % (OptionsInfo["OutfileJS"], OptionsInfo["Outfile"]))
  478     
  479     TMAPResultsHTMLFile = OptionsInfo["Outfile"]
  480     TMAPResultsJSFile = OptionsInfo["OutfileJS"]
  481     
  482     TMAPResultsTMPHTMLFile = "Tmp%s.html" % OptionsInfo["OutfilePrefix"]
  483 
  484     HTMLResultsFH = open(TMAPResultsHTMLFile, "r")
  485     JSResultsFH = open(TMAPResultsJSFile, "r")
  486     
  487     TMPHTMLResultsFH = open(TMAPResultsTMPHTMLFile, "w")
  488 
  489     for HTMLLine in HTMLResultsFH:
  490         HTMLLine = HTMLLine.rstrip()
  491         if re.search("%s" % TMAPResultsJSFile, HTMLLine, re.IGNORECASE):
  492             TMPHTMLResultsFH.write("    <script>\n")
  493 
  494             FirstLine = True
  495             for JSLine in JSResultsFH:
  496                 JSLine = JSLine.rstrip()
  497                 if FirstLine:
  498                     FirstLine = False
  499                     TMPHTMLResultsFH.write("    %s\n" % JSLine)
  500                 else:
  501                     TMPHTMLResultsFH.write("%s\n" % JSLine)
  502             TMPHTMLResultsFH.write("\n    </script>\n")
  503             
  504         else:
  505             TMPHTMLResultsFH.write("%s\n" % HTMLLine)
  506     
  507     HTMLResultsFH.close()
  508     JSResultsFH.close()
  509     TMPHTMLResultsFH.close()
  510 
  511     MiscUtil.PrintInfo("Moving %s to %s..." % (TMAPResultsTMPHTMLFile, OptionsInfo["Outfile"]))
  512     shutil.move(TMAPResultsTMPHTMLFile, TMAPResultsHTMLFile)
  513     
  514     MiscUtil.PrintInfo("Removing %s file..." % (OptionsInfo["OutfileJS"]))
  515     os.remove(TMAPResultsJSFile)
  516     
  517 def InitializeLSHForest():
  518     """Initialize LSH forest. """
  519 
  520     LSHForestParams = OptionsInfo["LSHForestParams"]
  521     LSHForest = tm.LSHForest(LSHForestParams["Dim"], LSHForestParams["NumPrefixTrees"], LSHForestParams["Store"])
  522 
  523     return LSHForest
  524 
  525 def InitializeMinHashFingerprintsEncoder():
  526     """Initialize MinHash fingerprints encoder."""
  527 
  528     MinHashFPParams = OptionsInfo["MinHashFPParams"]
  529     MinHashFingerprintsEncoder = MHFPEncoder(n_permutations = MinHashFPParams["NumPermutations"], seed = MinHashFPParams["Seed"])
  530 
  531     return MinHashFingerprintsEncoder
  532     
  533 def ReadMoleculeData():
  534     """Read molecule data."""
  535 
  536     Infile = OptionsInfo["Infile"]
  537     InfileDelimiter = OptionsInfo["InfileDelimiter"]
  538     
  539     MiscUtil.PrintInfo("\nProcessing file %s..." % Infile)
  540     InfileDF = pd.read_csv(Infile, sep = InfileDelimiter)
  541 
  542     return InfileDF
  543 
  544 def ProcessMolCountBasedAutoOptions(MolCount):
  545     """Process auto option values dependent on number of molecules."""
  546 
  547     #  Process "auto" option for LSHForestParams...
  548     ParamName = "NumPrefixTrees"
  549     ParamValue = "%s" % OptionsInfo["LSHForestParams"][ParamName]
  550     if re.match("^auto$", ParamValue, re.I):
  551         ParamValue = 128 if MolCount <= 10E03 else 8
  552         OptionsInfo["LSHForestParams"][ParamName] = ParamValue
  553     
  554     #  Process "auto" option for FaerunScatterPlotParams...
  555     ParamName = "PointScale"
  556     ParamValue = OptionsInfo["FaerunScatterPlotParams"][ParamName]
  557     ParamValue = "%s" % ParamValue
  558     if re.match("^auto$", ParamValue, re.I):
  559         if MolCount <= 10E03:
  560             ParamValue = 4.0
  561         elif MolCount <= 10E04:
  562             ParamValue = 2.0
  563         else:
  564             ParamValue = 1.0
  565         OptionsInfo["FaerunScatterPlotParams"][ParamName] = ParamValue
  566 
  567     #  Process "auto" option for LSHLayoutConfigParams...
  568     for ParamName in ["K", "KC",  "SLRepeats", "SLExtraScalingSteps", "MMMRepeats", "NodeSizeDenominator"]:
  569         ParamValue = "%s" % OptionsInfo["LSHLayoutConfigParams"][ParamName]
  570         
  571         if not re.match("^auto$", ParamValue, re.I):
  572             continue
  573         
  574         if re.match("^K$", ParamName, re.I):
  575             ParamValue = 75 if MolCount <= 10E03 else 10
  576         elif re.match("^KC$", ParamName, re.I):
  577             ParamValue = 20 if MolCount <= 10E03 else 10
  578         elif re.match("^SLRepeats$", ParamName, re.I):
  579             ParamValue = 2 if MolCount <= 10E03 else 1
  580         elif re.match("^SLExtraScalingSteps$", ParamName, re.I):
  581             ParamValue = 4 if MolCount <= 10E03 else 2
  582         elif re.match("^MMMRepeats$", ParamName, re.I):
  583             ParamValue = 2 if MolCount <= 10E03 else 1
  584         elif re.match("^NodeSizeDenominator$", ParamName, re.I):
  585             ParamValue = 65.0 if MolCount <= 10E03 else 70.0
  586         
  587         OptionsInfo["LSHLayoutConfigParams"][ParamName] = ParamValue
  588     
  589 def SetupTMAPDisplayMessage(MolCount, ValidMolCount):
  590     """Setup TMAP display message."""
  591     
  592     # Setup default TMAP display message using valid molecule count...
  593     if re.match("^auto$", OptionsInfo["TMAPDisplayMsg"], re.I):
  594         if MolCount == ValidMolCount:
  595             OptionsInfo["TMAPDisplayMsg"] = "TMAP chemspace visualization<br/>Input file: %s<br/>Number of molecules: %s" % (OptionsInfo["Infile"], MolCount)
  596         else:
  597             OptionsInfo["TMAPDisplayMsg"] = "TMAP chemspace visualization<br/>Input file: %s<br/>Number of molecules: %s<br/>Number of valid molecules: %s" % (OptionsInfo["Infile"], MolCount, ValidMolCount)
  598 
  599 def ProcessFaerunConfigParametersOption():
  600     """Process option for faerun configuration parameters."""
  601     
  602     ParamsOptionName = "--faerunConfigParams"
  603     ParamsOptionValue = Options[ParamsOptionName]
  604     ParamsDefaultInfo = {"ClearColor": ["str", "#000000"], "ShowLegend": ["bool", True], "LegendTitle": ["str", "Legend"], "LegendOrientation": ["str", "vertical"], "LegendNumberFormat": ["str", "{:.2f}"], "Scale": ["float", 750.0], "AlphaBlending": ["bool", False], "AntiAliasing": ["bool", True], "ThumbnailWidth": ["int", 250], "ThumbnailFixed": ["bool", False]}
  605     
  606     FaerunConfigParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo)
  607     
  608     ParamName = "LegendOrientation"
  609     ParamValue = FaerunConfigParams[ParamName]
  610     if not re.match("^(vertical|horizontal)$", ParamValue, re.I):
  611         MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: vertical or horizontal\n" % (ParamValue, ParamName, ParamsOptionName))
  612     FaerunConfigParams[ParamName] = ParamValue.lower()
  613     
  614     for ParamName in ["Scale", "ThumbnailWidth"]:
  615         ParamValue = FaerunConfigParams[ParamName]
  616         if  ParamValue <= 0:
  617             MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName))
  618 
  619     OptionsInfo["FaerunConfigParams"] = FaerunConfigParams
  620     
  621 def ProcessFaerunScatterPlotParamsOption():
  622     """Process option for faerun scatter plot parameters."""
  623 
  624     ParamsOptionName = "--faerunScatterPlotParams"
  625     ParamsOptionValue = Options[ParamsOptionName]
  626     ParamsDefaultInfo = {"Shader": ["str", "circle"], "PointScale": ["str", "auto"], "MaxPointSize": ["float", 100.0], "FogIntensity": ["float", 0.0], "Interactive": ["bool", True]}
  627     
  628     FaerunScatterPlotParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo)
  629     
  630     ParamName = "PointScale"
  631     ParamValue = FaerunScatterPlotParams[ParamName]
  632     if not re.match("^auto$", ParamValue, re.I):
  633         if not MiscUtil.IsFloat(ParamValue):
  634             MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option must be a float." % (ParamValue, ParamName, ParamsOptionName))
  635         ParamValue = float(ParamValue)
  636         if  ParamValue <= 0:
  637             MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName))
  638         FaerunScatterPlotParams[ParamName] = ParamValue
  639     
  640     ParamName = "MaxPointSize"
  641     ParamValue = FaerunScatterPlotParams[ParamName]
  642     if  ParamValue <= 0:
  643         MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName))
  644     
  645     ParamName = "FogIntensity"
  646     ParamValue = FaerunScatterPlotParams[ParamName]
  647     if  ParamValue < 0:
  648         MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: >= 0\n" % (ParamValue, ParamName, ParamsOptionName))
  649     
  650     OptionsInfo["FaerunScatterPlotParams"] = FaerunScatterPlotParams
  651     
  652 def ProcessLSHForestParamsOption():
  653     """Process option for LSH forest parameters."""
  654 
  655     ParamsOptionName = "--lshForestParams"
  656     ParamsOptionValue = Options[ParamsOptionName]
  657     ParamsDefaultInfo = {"Dim": ["int", 2048], "NumPrefixTrees": ["str", "auto"], "Store": ["bool", True]}
  658     
  659     LSHForestParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo)
  660     
  661     ParamName = "Dim"
  662     ParamValue = LSHForestParams[ParamName]
  663     if  ParamValue <= 0:
  664         MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName))
  665     
  666     ParamName = "NumPrefixTrees"
  667     ParamValue = LSHForestParams[ParamName]
  668     if not re.match("^auto$", ParamValue, re.I):
  669         if not MiscUtil.IsInteger(ParamValue):
  670             MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option must be an integer." % (ParamValue, ParamName, ParamsOptionName))
  671         ParamValue = int(ParamValue)
  672         if  ParamValue <= 0:
  673             MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName))
  674         LSHForestParams[ParamName] = ParamValue
  675     
  676     OptionsInfo["LSHForestParams"] = LSHForestParams
  677     
  678 def ProcessLSHLayoutConfigParamsOption():
  679     """Process option for LSH configuration parameters."""
  680 
  681     ParamsOptionName = "--lshLayoutConfigParams"
  682     ParamsOptionValue = Options[ParamsOptionName]
  683     ParamsDefaultInfo = {"K": ["str", "auto"], "KC": ["str", "auto"], "FMEIterations": ["int", 1000], "FMERandomize": ["bool", False], "FMEThreads": ["int", 4], "FMEPrecision": ["int", 4], "SLRepeats": ["str", "auto"], "SLExtraScalingSteps": ["str", "auto"], "SLScalingMin": ["float", 1.0], "SLScalingMax": ["float", 1.0], "SLScalingType": ["str", "RelativeToDrawing"], "MMMRepeats": ["str", "auto"], "Placer": ["str", "Barycenter"], "Merger": ["str", "LocalBiconnected"], "MergerFactor": ["float", 2.0], "MergerAdjustment": ["int", 0], "NodeSizeDenominator": ["str", "auto"]}
  684     
  685     LSHLayoutConfigParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo)
  686     
  687     for ParamName in ["FMEIterations", "FMEThreads", "FMEPrecision", "SLScalingMin", "SLScalingMax", "MergerFactor", "MergerAdjustment"]:
  688         ParamValue = LSHLayoutConfigParams[ParamName]
  689         if re.match("^%s$" % ParamName, "MergerAdjustment", re.I):
  690             if  ParamValue < 0:
  691                 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: >= 0\n" % (ParamValue, ParamName, ParamsOptionName))
  692         else:
  693             if  ParamValue <= 0:
  694                 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName))
  695 
  696     # Process "auto" values...
  697     for ParamName in ["K", "KC",  "SLRepeats", "SLExtraScalingSteps", "MMMRepeats", "NodeSizeDenominator"]:
  698         ParamValue = LSHLayoutConfigParams[ParamName]
  699         
  700         if not re.match("^auto$", ParamValue, re.I):
  701             if re.match("^NodeSizeDenominator$", ParamName, re.I):
  702                 if not MiscUtil.IsFloat(ParamValue):
  703                     MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option must be a float." % (ParamValue, ParamName, ParamsOptionName))
  704                 ParamValue = float(ParamValue)
  705             else:
  706                 if not MiscUtil.IsInteger(ParamValue):
  707                     MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option must be an integer." % (ParamValue, ParamName, ParamsOptionName))
  708                 ParamValue = int(ParamValue)
  709             
  710             if  ParamValue <= 0:
  711                 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName))
  712             LSHLayoutConfigParams[ParamName] = ParamValue
  713     
  714     # Map SLScalingType to TMAP object...
  715     ParamInfo = {"Absolute": tm.ScalingType.Absolute, "RelativeToAvgLength": tm.ScalingType.RelativeToAvgLength, "RelativeToDesiredLength": tm.ScalingType.RelativeToDesiredLength, "RelativeToDrawing": tm.ScalingType.RelativeToDrawing}
  716     ParamName = "SLScalingType"
  717     MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, ParamName, ParamInfo)
  718     
  719     # Map Placer to TMAP object...
  720     ParamInfo = {"Barycenter": tm.Placer.Barycenter, "Solar": tm.Placer.Solar, "Circle": tm.Placer.Circle, "Median": tm.Placer.Median, "Random": tm.Placer.Random, "Zero": tm.Placer.Zero}
  721     ParamName = "Placer"
  722     MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, ParamName, ParamInfo)
  723     
  724     # Map Merger to TMAP object...
  725     ParamInfo = {"EdgeCover": tm.Merger.EdgeCover, "LocalBiconnected": tm.Merger.LocalBiconnected, "Solar": tm.Merger.Solar, "IndependentSet": tm.Merger.IndependentSet}
  726     ParamName = "Merger"
  727     MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, ParamName, ParamInfo)
  728 
  729     OptionsInfo["LSHLayoutConfigParams"] = LSHLayoutConfigParams
  730 
  731 def MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, ParamName, ParamInfo):
  732     """Map LSH layout configuration patameter valut to TMAP object. """
  733     
  734     ParamValue = LSHLayoutConfigParams[ParamName]
  735     if ParamValue not in ParamInfo:
  736         MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: %s\n" % (ParamValue, ParamName, ParamsOptionName, ", ".join(sorted(ParamInfo.keys()))))
  737     LSHLayoutConfigParams[ParamName] = ParamInfo[ParamValue]
  738     
  739 def ProcessMinHashFPParamsOption():
  740     """Process option for MinHash parameters."""
  741 
  742     ParamsOptionName = "--minHashFPParams"
  743     ParamsOptionValue = Options[ParamsOptionName]
  744     ParamsDefaultInfo = {"Radius": ["int", 3], "Rings": ["bool", True], "Kekulize": ["bool", True], "Sanitize": ["bool", True], "MinRadius": ["int", 1], "NumPermutations": ["int", 2048], "Seed": ["int", 42]}
  745     
  746     MinHashFPParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo)
  747     
  748     for ParamName in ["Radius", "MinRadius", "NumPermutations"]:
  749         ParamValue = MinHashFPParams[ParamName]
  750         if  ParamValue <= 0:
  751             MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName))
  752     
  753     OptionsInfo["MinHashFPParams"] = MinHashFPParams
  754 
  755 def ProcessInfileDelimiterOption():
  756     """Process option infile delimiter."""
  757 
  758     InfileDelim = Options["--infileDelimiter"]
  759     if re.match("^auto$", InfileDelim, re.I):
  760         FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Infile"])
  761         if re.match("^csv$", FileExt, re.I):
  762             InfileDelim = "comma"
  763         elif re.match("^(tsv|txt)$", FileExt, re.I):
  764             InfileDelim = "tab"
  765         elif re.match("^(smi)$", FileExt, re.I):
  766             InfileDelim = "space"
  767         else:
  768             MiscUtil.PrintError("The input file delimiter couldn't be determined from its extension %s. You must explicitly specify an input file delimiter using option\"--infileDelimiter\".\n" % (InfileDelim))
  769 
  770     InfileDelimMap = {"comma": ",", "tab": "\t", "space": " "}
  771     OptionsInfo["InfileDelimiter"] = InfileDelimMap[InfileDelim]
  772 
  773 def ProcessColumnModeOption():
  774     """Process column mode option."""
  775 
  776     CollabelMode, ColnumMode = [False, False]
  777     Colmode = Options["--colmode"]
  778     if re.match("^collabel$", Colmode, re.I):
  779         CollabelMode = True
  780     elif re.match("^colnum$", Colmode, re.I):
  781         ColnumMode = True
  782     else:
  783         MiscUtil.PrintError("The value, %s, specified for option \"-c, --colmode\" is not valid. Supported values: collabel or colnum\n" % (Colmode))
  784 
  785     OptionsInfo["Colmode"] = Colmode
  786     OptionsInfo["CollabelMode"] = CollabelMode
  787     OptionsInfo["ColnumMode"] = ColnumMode
  788 
  789 def RetrieveColumnNames():
  790     """Retrieve column names. """
  791     
  792     Infile = OptionsInfo["Infile"]
  793     
  794     InfileFH = open(Infile, "r")
  795     InfileReader = csv.reader(InfileFH, delimiter = OptionsInfo["InfileDelimiter"], quotechar = '"')
  796     Colnames = next(InfileReader)
  797     InfileFH.close()
  798 
  799     if len(Colnames) == 0:
  800         MiscUtil.PrintError("The first line in input file, %s, is empty. It must contain column names.\n" % Infile)
  801     
  802     ColnameToColnumMap = {}
  803     ColnumToColnameMap = {}
  804     for ColIndex, Colname in enumerate(Colnames):
  805         Colnum = ColIndex + 1
  806         ColnameToColnumMap[Colname] = Colnum
  807         ColnumToColnameMap[Colnum] = Colname
  808 
  809     OptionsInfo["Colnames"] = Colnames
  810     OptionsInfo["ColCount"] = len(Colnames)
  811     OptionsInfo["ColnameToColnumMap"] = ColnameToColnumMap
  812     OptionsInfo["ColnumToColnameMap"] = ColnumToColnameMap
  813     
  814     # Initialize for tracking specified column names...
  815     SpecifiedColsInfo = {}
  816     SpecifiedColsInfo["Colnames"] = []
  817     SpecifiedColsInfo["Colnum"] = {}
  818     SpecifiedColsInfo["OptionName"] = {}
  819     
  820     OptionsInfo["SpecifiedColsInfo"] = SpecifiedColsInfo
  821 
  822 def ProcessSMILESColOption():
  823     """Process SMILES column option."""
  824     
  825     SMILESCol = Options["--colSMILES"]
  826     if re.match("^auto$", SMILESCol, re.I):
  827         Colname = "SMILES"
  828         if Colname not in OptionsInfo["ColnameToColnumMap"]:
  829             MiscUtil.PrintError("The SMILES column name, %s, doen't exist in input file. You must specify a valid SMILES column name or number using \"--colSMILES\" option.\n" % Colname)
  830         
  831         Colnum = OptionsInfo["ColnameToColnumMap"][Colname]
  832         SMILESColspec = Colnum if OptionsInfo["ColnumMode"] else Colname
  833     else:
  834         SMILESColspec = SMILESCol
  835 
  836     SMILESColname, SMILESColnum = ProcessColumnSpecification("--colSMILES", SMILESColspec)
  837     
  838     OptionsInfo["SMILESCol"] = SMILESCol
  839     OptionsInfo["SMILESColname"] = SMILESColname
  840     OptionsInfo["SMILESColnum"] = SMILESColnum
  841 
  842 def ProcessCategoricalDataColsOption():
  843     """Process categorical data columns option."""
  844     
  845     CategoricalDataColnames, CategoricalDataColnums = [None] *2
  846     CategoricalDataCols = Options["--categoricalDataCols"]
  847     if not re.match("^none$", CategoricalDataCols, re.I):
  848         CategoricalDataColnames = []
  849         CategoricalDataColnums = []
  850         for DataCol in CategoricalDataCols.split(","):
  851             DataCol = DataCol.strip()
  852             DataColname, DataColnum = ProcessColumnSpecification("--categoricalDataCols", DataCol)
  853             CategoricalDataColnames.append(DataColname)
  854             CategoricalDataColnums.append(DataColnum)
  855     
  856     OptionsInfo["CategoricalDataCols"] = CategoricalDataCols
  857     OptionsInfo["CategoricalDataColnames"] = CategoricalDataColnames
  858     OptionsInfo["CategoricalDataColnums"] = CategoricalDataColnums
  859 
  860 def ProcessCategoricalDataColormapsOption():
  861     """Process categorical data color maps option. """
  862 
  863     if OptionsInfo["CategoricalDataColnames"] is None:
  864         OptionsInfo["CategoricalDataColormaps"] = Options["--categoricalDataColormaps"]
  865         OptionsInfo["CategoricalDataColormapsList"] = None
  866         return
  867         
  868     CategoricalDataColormapsList = []
  869     CategoricalDataColCount = len(OptionsInfo["CategoricalDataColnames"])
  870     
  871     CategoricalDataColormaps = Options["--categoricalDataColormaps"]
  872     if not re.match("^auto$", CategoricalDataColormaps, re.I):
  873         ColormapsWords = CategoricalDataColormaps.split(",")
  874         if len(ColormapsWords) != CategoricalDataColCount:
  875             MiscUtil.PrintInfo("The number of colormaps, %s, specified using \"--categoricalDataColormaps\" must be equal to the number of columns, %s, specified using \"--categoricalDataCols\" option." % (len(ColormapsWords), CategoricalDataColCount))
  876         for Colormap in ColormapsWords:
  877             Colormap = Colormap.strip()
  878             CategoricalDataColormapsList.append(Colormap)
  879     else:
  880         CategoricalDataColormapsList = ["tab10"] * CategoricalDataColCount
  881 
  882     OptionsInfo["CategoricalDataColormaps"] = CategoricalDataColormaps
  883     OptionsInfo["CategoricalDataColormapsList"] = CategoricalDataColormapsList
  884     
  885 def ProcessNumericalDataColsOption():
  886     """Process numerical data columns option."""
  887     
  888     NumericalDataColnames, NumericalDataColnums = [None] *2
  889     NumericalDataCols = Options["--numericalDataCols"]
  890     if not re.match("^none$", NumericalDataCols, re.I):
  891         NumericalDataColnames = []
  892         NumericalDataColnums = []
  893         for DataCol in NumericalDataCols.split(","):
  894             DataCol = DataCol.strip()
  895             DataColname, DataColnum = ProcessColumnSpecification("--numericalDataCols", DataCol)
  896             NumericalDataColnames.append(DataColname)
  897             NumericalDataColnums.append(DataColnum)
  898     
  899     OptionsInfo["NumericalDataCols"] = NumericalDataCols
  900     OptionsInfo["NumericalDataColnames"] = NumericalDataColnames
  901     OptionsInfo["NumericalDataColnums"] = NumericalDataColnums
  902         
  903 def ProcessNumericalDataColormapsOption():
  904     """Process numerical data color maps option. """
  905 
  906     if OptionsInfo["NumericalDataColnames"] is None:
  907         OptionsInfo["NumericalDataColormaps"] = Options["--numericalDataColormaps"]
  908         OptionsInfo["NumericalDataColormapsList"] = None
  909         return
  910 
  911     NumericalDataColormapsList = []
  912     NumericalDataColCount = len(OptionsInfo["NumericalDataColnames"])
  913     
  914     NumericalDataColormaps = Options["--numericalDataColormaps"]
  915     if not re.match("^auto$", NumericalDataColormaps, re.I):
  916         ColormapsWords = NumericalDataColormaps.split(",")
  917         if len(ColormapsWords) != NumericalDataColCount:
  918             MiscUtil.PrintInfo("The number of colormaps, %s, specified using \"--categoricalDataColormaps\" must be equal to the number of columns, %s, specified using \"--categoricalDataCols\" option." % (len(ColormapsWords), NumericalDataColCount))
  919         for Colormap in ColormapsWords:
  920             Colormap = Colormap.strip()
  921             NumericalDataColormapsList.append(Colormap)
  922     else:
  923         NumericalDataColormapsList = ["viridis"] * NumericalDataColCount
  924 
  925     OptionsInfo["NumericalDataColormaps"] = NumericalDataColormaps
  926     OptionsInfo["NumericalDataColormapsList"] = NumericalDataColormapsList
  927 
  928 def ProcessStructureDisplayDataColsOption():
  929     """Process structure display data columns option."""
  930     
  931     StructureDisplayDataColnames = []
  932     StructureDisplayDataColnums = []
  933 
  934     # Add SMILES column...
  935     StructureDisplayDataColnames.append(OptionsInfo["SMILESColname"])
  936     StructureDisplayDataColnums.append(OptionsInfo["SMILESColnum"])
  937     
  938     # Process specified columns...
  939     OptionName = "--structureDisplayDataCols"
  940     StructureDisplayDataCols = Options[OptionName]
  941     if re.match("^auto$", StructureDisplayDataCols, re.I):
  942         # Automatically add 'Name' column...
  943         Colname = "Name"
  944         if Colname in OptionsInfo["ColnameToColnumMap"]:
  945             Colnum = OptionsInfo["ColnameToColnumMap"][Colname]
  946             StructureDisplayDataColnames.append(Colname)
  947             StructureDisplayDataColnums.append(Colnum)
  948     else:
  949         for DataCol in StructureDisplayDataCols.split(","):
  950             DataCol = DataCol.strip()
  951             if OptionsInfo["ColnumMode"]:
  952                 Colnum = int(DataCol)
  953                 if Colnum not in OptionsInfo["ColnumToColnameMap"]:
  954                     MiscUtil.PrintError("The column number, %s, specified using \"%s\" option doesn't exist in input file. You must specify a valid column number. Valid values: >= 1 and <= %s\n" % (Colnum, OptionName, OptionsInfo["ColCount"]))
  955                 Colname = OptionsInfo["ColnumToColnameMap"][Colnum]
  956             else:
  957                 Colname = DataCol
  958                 if Colname not in OptionsInfo["ColnameToColnumMap"]:
  959                     MiscUtil.PrintError("The column name, %s, specified using \"%s\" option doesn't exist in input file. You must specify a valid column name. Valid values: %s\n" % (Colname, OptionName, " ".join(OptionsInfo["Colnames"])))
  960                 Colnum = OptionsInfo["ColnameToColnumMap"][Colname]
  961                 
  962             if Colname in StructureDisplayDataColnames:
  963                 StructureDisplayDataColnumsStrs = ["%s" % Num for Num in StructureDisplayDataColnums]
  964                 if OptionsInfo["ColnumMode"]:
  965                     MiscUtil.PrintError("The column number, %s, specified using \"%s\" option is a duplicate column number. It has already been used for this option. You must specify a different column number. Used column names: %s; Used column nums: %s\n" % (Colnum, OptionName, " ".join(StructureDisplayDataColnames), " ".join(StructureDisplayDataColnumsStrs)))
  966                 else:
  967                     MiscUtil.PrintError("The column name, %s, specified using \"%s\" option is a duplicate column name. It has already been used for this option. You must specify a different column name. Used column names: %s; Used column nums: %s\n" % (Colname, OptionName, " ".join(StructureDisplayDataColnames), " ".join(StructureDisplayDataColnumsStrs)))
  968                     
  969             StructureDisplayDataColnames.append(Colname)
  970             StructureDisplayDataColnums.append(Colnum)
  971     
  972     OptionsInfo["StructureDisplayDataCols"] = StructureDisplayDataCols
  973     OptionsInfo["StructureDisplayDataColnames"] = StructureDisplayDataColnames
  974     OptionsInfo["StructureDisplayDataColnums"] = StructureDisplayDataColnums
  975 
  976 def ProcessColumnSpecification(OptionName, Colspec):
  977     """Process column specification corresponding to a column name or number."""
  978 
  979     Colname, Colnum = [None, None]
  980     if OptionsInfo["ColnumMode"]:
  981         Colnum = int(Colspec)
  982         if Colnum not in OptionsInfo["ColnumToColnameMap"]:
  983             MiscUtil.PrintError("The column number, %s, specified using \"%s\" option doesn't exist in input file. You must specify a valid column number. Valid values: >= 1 and <= %s\n" % (Colnum, OptionName, OptionsInfo["ColCount"]))
  984         Colname = OptionsInfo["ColnumToColnameMap"][Colnum]
  985     else:
  986         Colname = Colspec
  987         if Colname not in OptionsInfo["ColnameToColnumMap"]:
  988             MiscUtil.PrintError("The column name, %s, specified using \"%s\" option doesn't exist in input file. You must specify a valid column name. Valid values: %s\n" % (Colname, OptionName, " ".join(OptionsInfo["Colnames"])))
  989         Colnum = OptionsInfo["ColnameToColnumMap"][Colname]
  990 
  991     # Track and check for duplicate column specification...
  992     SpecifiedColsInfo = OptionsInfo["SpecifiedColsInfo"] 
  993     if Colname in SpecifiedColsInfo["Colnames"]:
  994         if OptionsInfo["ColnumMode"]:
  995             MiscUtil.PrintError("The column number, %s, specified using \"%s\" option is a duplicate column number. It has already been used for \"%s\" option. You must specify a different column number.\n" % (Colnum, OptionName, SpecifiedColsInfo["OptionName"][Colname]))
  996         else:
  997             MiscUtil.PrintError("The column name, %s, specified using \"%s\" option is a duplicate column name. It has already been used for \"%s\" option. You must specify a different column name.\n" % (Colname, OptionName, SpecifiedColsInfo["OptionName"][Colname]))
  998     else:
  999         SpecifiedColsInfo["Colnames"].append(Colname)
 1000         SpecifiedColsInfo["Colnum"][Colname] = Colnum
 1001         SpecifiedColsInfo["OptionName"][Colname] = OptionName
 1002         
 1003     return (Colname, Colnum)
 1004     
 1005 def ProcessOptions():
 1006     """Process and validate command line arguments and options."""
 1007 
 1008     MiscUtil.PrintInfo("Processing options...")
 1009     
 1010     # Validate options...
 1011     ValidateOptions()
 1012 
 1013     OptionsInfo["Infile"] = Options["--infile"]
 1014     
 1015     Outfile = Options["--outfile"]
 1016     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 1017     OptionsInfo["OutfilePrefix"] = FileName
 1018     OptionsInfo["OutfileExt"] = FileExt
 1019     
 1020     OptionsInfo["Outfile"] = Outfile
 1021     OptionsInfo["OutfileJS"] = "%s.js" % FileName
 1022     OptionsInfo["OutfileLSHForest"] = "%s.dat" % FileName
 1023 
 1024     ProcessInfileDelimiterOption()
 1025     RetrieveColumnNames()
 1026     
 1027     ProcessColumnModeOption()
 1028     ProcessSMILESColOption()
 1029     
 1030     OptionsInfo["CategoricalDataMaxDisplay"] = int(Options["--categoricalDataMaxDisplay"])
 1031     ProcessCategoricalDataColsOption()
 1032     ProcessCategoricalDataColormapsOption()
 1033 
 1034     ProcessNumericalDataColsOption()
 1035     ProcessNumericalDataColormapsOption()
 1036     
 1037     ProcessStructureDisplayDataColsOption()
 1038     
 1039     ProcessFaerunConfigParametersOption()
 1040     ProcessFaerunScatterPlotParamsOption()
 1041     
 1042     OptionsInfo["LSHForestFileWriteMode"] = True if re.match("^yes$", Options["--lshForestFileWrite"], re.I) else False
 1043     OptionsInfo["LSHForestFileRestoreMode"] = True if re.match("^yes$", Options["--lshForestFileRestore"], re.I) else False
 1044     if OptionsInfo["LSHForestFileRestoreMode"]:
 1045         LSHForestFile = OptionsInfo["OutfileLSHForest"]
 1046         if not os.path.isfile(LSHForestFile):
 1047             MiscUtil.PrintError("The LSH forest file, %s, must be present for, %s, value of \"--lshForestFileRestore\" option." % (LSHForestFile, Options["--lshForestFileRestore"]))
 1048     
 1049     ProcessLSHForestParamsOption()
 1050     ProcessLSHLayoutConfigParamsOption()
 1051     
 1052     OptionsInfo["MergeHTMLandJSFilesMode"] = True if re.match("^yes$", Options["--mergeHTMLandJSFiles"], re.I) else False
 1053     
 1054     ProcessMinHashFPParamsOption()
 1055     
 1056     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 1057     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 1058     
 1059     OptionsInfo["Overwrite"] = Options["--overwrite"]
 1060     OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False
 1061     
 1062     OptionsInfo["TMAPDisplayMsg"] = Options["--tmapDisplayMsg"]
 1063 
 1064 def RetrieveOptions():
 1065     """Retrieve command line arguments and options."""
 1066     
 1067     # Get options...
 1068     global Options
 1069     Options = docopt(_docoptUsage_)
 1070     
 1071     # Set current working directory to the specified directory...
 1072     WorkingDir = Options["--workingdir"]
 1073     if WorkingDir:
 1074         os.chdir(WorkingDir)
 1075     
 1076     # Handle examples option...
 1077     if "--examples" in Options and Options["--examples"]:
 1078         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 1079         sys.exit(0)
 1080     
 1081 def ValidateOptions():
 1082     """Validate option values."""
 1083 
 1084     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 1085     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "smi csv tsv txt")
 1086     
 1087     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "html")
 1088     MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 1089     MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 1090 
 1091     MiscUtil.ValidateOptionTextValue("-c, --colmode", Options["--colmode"], "collabel colnum")
 1092 
 1093     if re.match("^none$", Options["--categoricalDataCols"], re.I) and re.match("^none$", Options["--numericalDataCols"], re.I):
 1094         MiscUtil.PrintError("You must specify al least one caetgorical or numerical data column using option \"--categoricalDataCols\" or \"--numericalDataCols\". It is used to color TMAP.")
 1095 
 1096     ColnumMode = True if re.match("^colnum$", Options["--colmode"], re.I) else False
 1097     if ColnumMode and not re.match("^auto$", Options["--colSMILES"], re.I):
 1098         MiscUtil.ValidateOptionIntegerValue("--colSMILES", Options["--colSMILES"], {">": 0})
 1099     
 1100     if ColnumMode and not re.match("^none$", Options["--categoricalDataCols"], re.I):
 1101         MiscUtil.ValidateOptionNumberValues("--categoricalDataCols", Options["--categoricalDataCols"], 0, ",", "integer", {">": 0})
 1102     
 1103     MiscUtil.ValidateOptionIntegerValue("--categoricalDataMaxDisplay", Options["--categoricalDataMaxDisplay"], {">": 0})
 1104     
 1105     if not re.match("^auto$", Options["--categoricalDataColormaps"], re.I):
 1106         ColormapCount = len(Options["--categoricalDataColormaps"].split(","))
 1107         ColCount = len(Options["--categoricalDataCols"].split(","))
 1108         if ColormapCount != ColCount:
 1109             MiscUtil.PrintError("The number of colormaps, %s, specified using option \"--categoricalDataColormaps\" must be equal to number of columns, %s,  specified using option \"-categoricalDataCols\". " % (ColormapCount, ColCount))
 1110     
 1111     if ColnumMode and not re.match("^none$", Options["--numericalDataCols"], re.I):
 1112         MiscUtil.ValidateOptionNumberValues("--numericalDataCols", Options["--numericalDataCols"], 0, ",", "integer", {">": 0})
 1113     
 1114     if not re.match("^auto$", Options["--numericalDataColormaps"], re.I):
 1115         ColormapCount = len(Options["--numericalDataColormaps"].split(","))
 1116         ColCount = len(Options["--numericalDataCols"].split(","))
 1117         if ColormapCount != ColCount:
 1118             MiscUtil.PrintError("The number of colormaps, %s, specified using option \"--numericalDataColormaps\" must be equal to number of columns, %s,  specified using option \"-numericalDataCols\". " % (ColormapCount, ColCount))
 1119     
 1120     if not re.match("^auto$", Options["--structureDisplayDataCols"], re.I):
 1121         if ColnumMode and not re.match("^none$", Options["--structureDisplayDataCols"], re.I):
 1122             MiscUtil.ValidateOptionNumberValues("--structureDisplayDataCols", Options["--structureDisplayDataCols"], 0, ",", "integer", {">": 0})
 1123     
 1124     if not re.match("^auto$", Options["--infileDelimiter"], re.I):
 1125         MiscUtil.ValidateOptionTextValue(" --infileDelimiter", Options["--infileDelimiter"], "comma tab space")
 1126     
 1127     MiscUtil.ValidateOptionTextValue("--lshForestFileWrite", Options["--lshForestFileWrite"], "yes no")
 1128     MiscUtil.ValidateOptionTextValue("--lshForestFileRestore", Options["--lshForestFileRestore"], "yes no")
 1129     MiscUtil.ValidateOptionTextValue("--mergeHTMLandJSFiles", Options["--mergeHTMLandJSFiles"], "yes no")
 1130     
 1131     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 1132 
 1133 
 1134 # Setup a usage string for docopt...
 1135 _docoptUsage_ = """
 1136 VisualizeChemspaceUsingTMAP.py - Visualize chemspace
 1137 
 1138 Usage:
 1139     VisualizeChemspaceUsingTMAP.py [--categoricalDataCols <collabel1,... or colnum1,...>] [--categoricalDataColormaps <Colormap1, Colormap2,...>]
 1140                                    [--categoricalDataMaxDisplay <number>] [--colmode <collabel or colnum>] [--colSMILES <text or number>]
 1141                                    [--faerunConfigParams <Name,Value,...>] [--faerunScatterPlotParams <Name,Value,...>]
 1142                                    [--infileDelimiter <comma, tab, or space>] [--lshForestFileWrite <yes or no>] [--lshForestFileRestore <yes or no>]
 1143                                    [--lshForestParams <Name,Value,...>] [--lshLayoutConfigParams  <Name,Value,...>] [--mergeHTMLandJSFiles <yes or no>]
 1144                                    [--minHashFPParams <Name,Value,...>] [--mp <yes or no>] [--mpParams <Name,Value,...>]
 1145                                    [--numericalDataCols <collabel1,... or colnum1,...>] [--numericalDataColormaps <Colormap1, Colormap2,...>]
 1146                                    [--overwrite] [--quiet <yes or no>] [--structureDisplayDataCols <collabel1,... or colnum1,...> ]
 1147                                    [--tmapDisplayMsg <text>] [-w <dir>] -i <infile> -o <outfile> 
 1148     VisualizeChemspaceUsingTMAP.py -h | --help | -e | --examples
 1149 
 1150 Description:
 1151     Generate an interactive TreeMAP (TMAP) [Ref 171, 172] visualization for molecules
 1152     in a text input file. The text input file must have a column containing SMILES strings.
 1153     In addition, it must contain at least one column corresponding to categorical or
 1154     numerical data for coloring TMAP nodes. You may optionally map multiple categorical
 1155     and numerical data columns on to a TMAP visualization. A HTML file is generated for
 1156     interactive visualization of chemspace in a browser.
 1157 
 1158     The TMAP methodology is able to generate a reasonably interactive visualization
 1159     for relatively large data sets. A brief description of the methodology is as follows.
 1160     A set of MinHash Fingerprints (MHFPs) are calculated for molecules in input file
 1161     followed by the generation of a Locality Sensitivity Hashing (LSH) forest employing
 1162     MHFPs. A c-approximate k-Nearest Neighbor Graph (c-k-NNG) is constructed from
 1163     LSH, which is used to construct a Minimum Spanning Tree (MST) or Forest (MSF).
 1164     The final TMAP visualization is generated by laying out MST and MSF on a plane
 1165     using an algorithm provided by the Open Graph Drawing Framework (OGDF). The
 1166     OGDF provides flexibility to adjust graph layout methodology in terms of not only
 1167     aesthetics but also computational time.
 1168 
 1169     The supported input file formats are: CSV (.csv), TSV (.txt or .tsv),
 1170     and SMILES (.smi).
 1171 
 1172     The supported output file format is: HTML (.html).
 1173 
 1174 Options:
 1175     --categoricalDataCols <collabel1,... or colnum1,...>  [default: none]
 1176         A comma delimited list of column labels or numbers corresponding to
 1177         categorical data to map on a TMAP visualization.
 1178     --categoricalDataColormaps <Colormap1, Colormap2,...>  [default: auto]
 1179         A comma delimited list of color map names corresponding to categorical
 1180         data. The default is to use 'tab10' color map name for mapping categorical
 1181         data on a TMAP. The number of specified color maps must match the number
 1182         of categorical data columns. You must specify valid color map names
 1183         supported by Matplotlib. No validation is performed. Example color map
 1184         names for categorical data: Pastel1, Pastel2, Paired, Accent, Dark2, Set1,
 1185         Set2, Set3, tab10, tab20, tab20b, tab20c.
 1186     --categoricalDataMaxDisplay <number>  [default: 6]
 1187         Maximum number of categories in a category column to display on a TMAP
 1188         visualization. The rest of the categories are aggregated under a new
 1189         category named 'Other' before mapping on to a TMAP visualization.
 1190     -c, --colmode <collabel or colnum>  [default: collabel]
 1191         Use column number or name for the specification of columns in input
 1192         text file containing SMILES strings and molecule names along with any 
 1193         categorical or numerical data.
 1194     --colSMILES <text or number>  [default: auto]
 1195         Column name or number corresponding to SMILES strings. The default value
 1196         is automatically set based on the value of '-c, --colmode': 'SMILES'  for
 1197         'collabel'; SMILES string column number for 'colnum'. SMILES strings must
 1198         be present in input file.
 1199     -e, --examples
 1200         Print examples.
 1201     --faerunConfigParams <Name,Value,...>  [default: auto]
 1202         A comma delimited list of parameter name and value pairs for configuring
 1203         faerun (Ref 172) to generate a TMAP visualization.
 1204         
 1205         The supported parameter names along with their default and possible
 1206         values are shown below:
 1207              
 1208             clearColor, #000000
 1209             showLegend, yes  [ Possible values: yes or no ] 
 1210             legendTitle, Legend
 1211             legendOrientation, vertical  [ Possible values: vertical or
 1212                 horizontal ]
 1213             legendNumberFormat, {:.2f}
 1214             scale, 750.0
 1215             alphaBlending, no  [ Possible values: yes or no ]
 1216             antiAliasing, yes  [Possible values: yes or no]
 1217             thumbnailWidth, 250
 1218             thumbnailFixed, no  [ Possible values: yes or no ]
 1219             
 1220         A brief description of parameters, as available in the code for faerun, is
 1221         provided below:
 1222         
 1223             clearColor: Background color
 1224             showLegend: Show legend at lower right
 1225             legendTitle: Legend title
 1226             legendOrientation: Legend Orientation
 1227             legendNumberFormat: Number string format applied to numbers
 1228                 displayed in legend
 1229             scale: Scaling factor for scaling normalized coordinates
 1230             alphaBlending: Activate alpha blending. It is required for smoothCircle
 1231                 shader.
 1232             antiAliasing: Activate anti-aliasing. It might adversely impact
 1233                 rendering performance.
 1234             thumbnailWidth: Width of thumbnail images for structures
 1235             thumbnailFixed:  Show thumbnail images at a fixed location at the
 1236                 top instead of next to the mouse
 1237             
 1238     --faerunScatterPlotParams <Name,Value,...>  [default: auto]
 1239         A comma delimited list of parameter name and value pairs for generating
 1240         scatter plot representing a TMAP using faerun (Ref 172).
 1241         
 1242         The supported parameter names along with their default and possible
 1243         values are shown below:
 1244              
 1245             shader, circle  [ Possible values: circle, smoothCircle,
 1246                 sphere, or any valid value]
 1247             pointScale, auto  [ 4 if MolCount<=10K; 2 if MolCount<=100K; else 1 ]
 1248             maxPointSize, 100.0
 1249             fogIntensity, 0.0
 1250             interactive, yes  [ Possible values: yes or no ] 
 1251              
 1252         A brief description of parameters is provided below:
 1253         
 1254             shader: Shader to use for visualizing data points
 1255             pointScale: Relative size of data points
 1256             maxPointSize: Maximum size of the data points during zooming
 1257             fogIntensity: Intensity of distance fog
 1258             interactive: Generate interactive scatter plot
 1259             
 1260     -h, --help
 1261         Print this help message.
 1262     -i, --infile <infile>
 1263         Input file name. The SMILES strings must be present in the input file.
 1264         Supported formats: CSV (.csv), TSV (.txt or .tsv), or SMILES (.smi)
 1265     --infileDelimiter <comma, tab, or space>  [default: auto]
 1266         Input file delimiter for processing data. The default value is automatically
 1267         set based on the type of input file: comma - CSV (.csv); tab - TSV (.txt or
 1268         .tsv);  space - SMILES (.smi)
 1269     --lshForestFileWrite <yes or no>  [default: yes]
 1270         Write LSH forest data to a file for subsequent generation of a TMAP visualization.
 1271         Default file name: <OutfileRoot>_LSHForest.dat. The LSH forest data is
 1272         generated using MinHash fingerprints. You may restore LSH forest data
 1273         using '--lshForestFileRestore' option to skip the generation of fingerprints.
 1274     --lshForestFileRestore <yes or no>  [default: no]
 1275         Check and restore LSH forest data from a file for generating a TMAP
 1276         visualization and skip the generation of MinHash fingerprints. Default file
 1277         name: <OutfileRoot>_LSHForest.dat
 1278     --lshForestParams <Name,Value,...>  [default: auto]
 1279         A comma delimited list of parameter name and value pairs for generating
 1280         LSH (Locality Sensitivity Hashing) forest from MinHash fingerprints.
 1281         
 1282         The supported parameter names along with their default and possible
 1283         values are shown below:
 1284              
 1285             dim, 2048
 1286             numPrefixTrees, auto  [ 128 if MolCount <= 10K else 8 ]
 1287             store, yes  [ Possible values: yes or no ]
 1288             
 1289         A brief description of parameters, as available in the code for LSH, is
 1290         provided below:
 1291         
 1292             dim: Dimensionality of MinHashes to be added to LSHForest
 1293             numPrefixTrees: Number of prefix trees to use
 1294             store: store the data for enhanced retrieval
 1295             
 1296     --lshLayoutConfigParams <Name,Value,...>  [default: auto]
 1297         A comma delimited list of parameter name and value pairs for configuring
 1298         LSH (Locality Sensitivity Hashing) layout.
 1299         
 1300         The supported parameter names along with their default and possible
 1301         values are shown below:
 1302             
 1303             k, auto  [ 75 if MolCount <= 10K else 10]
 1304             kc, auto  [ 20 if MolCount <= 10K else 10]
 1305             fmeIterations, 1000
 1306             fmeRandomize, no  [ Possible values: yes or no ]
 1307             fmeThreads, 4
 1308             fmePrecision, 4
 1309             slRepeats, auto  [ 2 if MolCount <= 10K else 1]
 1310             slExtraScalingSteps, auto  [ 4 if MolCount <= 10K else 2 ]
 1311             slScalingMin, 1.0
 1312             slScalingMax, 1.0
 1313             slScalingType, RelativeToDrawing  [ Possible values: Absolute,
 1314                 RelativeToAvgLength, RelativeToDesiredLength, or
 1315                 RelativeToDrawing ]
 1316             mmmRepeats, auto  [ 2 if MolCount <= 10K else 1 ]
 1317             placer, Barycenter  [ Possible values: Barycenter, Solar, Circle,
 1318                 Median, Random, or Zero ]
 1319             merger, LocalBiconnected  [ Possible values: EdgeCover,
 1320                 LocalBiconnected, Solar, or IndependentSet ]
 1321             mergerFactor, 2.0
 1322             mergerAdjustment, 0
 1323             nodeSizeDenominator, auto  [ 65 if MolCount <= 10K else 70.0]
 1324             
 1325         A brief description of parameters, as available in the code for LSH, is
 1326         provided below:
 1327             
 1328             k: Number of nearest neighbors used to create k-nearest neighbor
 1329                 graph
 1330             kc: Scalar by which k is multiplied before querying LSH forest.
 1331                 The results are then sorted in decreasing order based on linear
 1332                 scan distances. 
 1333             fmeIterations: Maximum number of iterations of Fast Multipole
 1334                 Embedder (FME)
 1335             fmeRandomize: Randomize FME layout at the start
 1336             fmeThreads: Number of threads for FME
 1337             fmePrecision: Number of coefficients of multipole expansion
 1338             slRepeats: Number of repeats of scaling layout algorithm
 1339             slExtraScalingSteps: Number of repeats of scaling
 1340             slScalingMin: Minimum scaling factor
 1341             slScalingMax: Maximum scaling factor.
 1342             slScalingType: Scaling type corresponding to relative scale of graph
 1343             mmmRepeats: Number of repeats of layout at each level
 1344             placer: Methodology for defining initial positions of vertices in a
 1345                 graph at each level
 1346             merger: Vertex merging methodology used during coarsening phase
 1347                 of multilevel algorithm
 1348             mergerFactor: Ratio of sizes between two levels up to which merging
 1349                 is performed.  It doesn't apply to all merging methodologies.
 1350             mergerAdjustment: Edge  length  adjustment  for merging methodology.
 1351                 It doesn't apply to all merging methodologies.
 1352             nodeSizeDenominator: Node size denominator affecting the magnitude
 1353                 of repelling force between nodes. Node size corresponds to
 1354                 1.0 / nodeSizeDenominator. You may want to increase the value of
 1355                 nodeSizeDenominator to decrease node size and resolve overlaps
 1356                 in a crowded tree.
 1357             
 1358     --mergeHTMLandJSFiles <yes or no>  [default: yes]
 1359         Merge TMAP JS data file into HTML file and delete JS data file. Default
 1360         file names: <OutfileRoot>.html, <OutfileRoot>.js.
 1361     --minHashFPParams <Name,Value,...>  [default: auto]
 1362         A comma delimited list of parameter name and value pairs for generating
 1363         Min Hash Fingerprints (MHFP).
 1364         
 1365         The supported parameter names along with their default and possible
 1366         values are shown below:
 1367             
 1368             radius, 3
 1369             rings, yes  [ Possible values: yes or no ]
 1370             kekulize, yes  [ Possible values: yes or no ]
 1371             sanitize, yes  [ Possible values: yes or no ]
 1372             minRadius, 1
 1373             numPermutations, 2048
 1374             seed, 42
 1375             
 1376         A brief description of parameters, as available in the code for MHFP,  is
 1377         provided below:
 1378             
 1379             radius:  MHFP radius (A radius of 3 corresponds to MHFP6)
 1380             rings:  Include rings in shingling
 1381             kekulize:  Kekulize SMILES
 1382             sanitize:  Sanitize SMILES
 1383             minRadius: Minimum radius that is used to extract n-grams
 1384             numPermutations: Number of permutations used for hashing
 1385             seed: Random number seed for numpy.random
 1386             
 1387     --mp <yes or no>  [default: no]
 1388         Use multiprocessing for the generation of fingerprints.
 1389          
 1390         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 1391         function employing lazy RDKit data iterable. This allows processing of
 1392         arbitrarily large data sets without any additional memory requirements.
 1393         
 1394         All input data may be optionally loaded into memory by mp.Pool.map()
 1395         before starting worker processes in a process pool by setting the value
 1396         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 1397         
 1398         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 1399         data mode may adversely impact the performance. The '--mpParams' section
 1400         provides additional information to tune the value of 'chunkSize'.
 1401     --mpParams <Name,Value,...>  [default: auto]
 1402         A comma delimited list of parameter name and value pairs to configure
 1403         multiprocessing during the generation of fingerprints.
 1404         
 1405         The supported parameter names along with their default and possible
 1406         values are shown below:
 1407         
 1408             chunkSize, auto
 1409             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 1410             numProcesses, auto   [ Default: mp.cpu_count() ]
 1411         
 1412         These parameters are used by the following functions to configure and
 1413         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 1414         mp.Pool.imap().
 1415         
 1416         The chunkSize determines chunks of input data passed to each worker
 1417         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 1418         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 1419         
 1420         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 1421         automatically converts RDKit data iterable into a list, loads all data into
 1422         memory, and calculates the default chunkSize using the following method
 1423         as shown in its code:
 1424         
 1425             chunkSize, extra = divmod(len(dataIterable), numProcesses * 4)
 1426             if extra: chunkSize += 1
 1427         
 1428         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 1429         and 100 data items.
 1430         
 1431         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 1432         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 1433         data into memory. Consequently, the size of input data is not known a priori.
 1434         It's not possible to estimate an optimal value for the chunkSize. The default 
 1435         chunkSize is set to 1.
 1436         
 1437         The default value for the chunkSize during 'Lazy' data mode may adversely
 1438         impact the performance due to the overhead associated with exchanging
 1439         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 1440         a larger value during 'Lazy' input data mode, based on the size of your input
 1441         data and number of processes in the process pool.
 1442         
 1443         The mp.Pool.map() function waits for all worker processes to process all
 1444         the data and return the results. The mp.Pool.imap() function, however,
 1445         returns the results obtained from worker processes as soon as the
 1446         results become available for specified chunks of data.
 1447         
 1448         The order of data in the results returned by both mp.Pool.map() and 
 1449         mp.Pool.imap() functions always corresponds to the input data.
 1450     --numericalDataCols <collabel1,... or colnum1,...>  [default: none]
 1451         A comma delimited list of column labels or numbers corresponding to
 1452         numerical data to map on a TMAP visualization.
 1453     --numericalDataColormaps <Colormap1, Colormap2,...>  [default: auto]
 1454         A comma delimited list of color map names corresponding to numerical
 1455         data. The default is to use 'viridis' color map name for mapping numerical
 1456         data on a TMAP. The number of specified color maps must match the number
 1457         of numerical data columns. You must specify valid color map names
 1458         supported by Matplotlib. No validation is performed. Example color map
 1459         names for numerical data: viridis, plasma, inferno, magma, cividis.
 1460     -o, --outfile <outfile>
 1461         Output HTML file name for writing out a TMAP visualization.
 1462     --overwrite
 1463         Overwrite existing files.
 1464     -q, --quiet <yes or no>  [default: no]
 1465         Use quiet mode. The warning and information messages will not be printed.
 1466     --structureDisplayDataCols <collabel1,... or colnum1,...>  [default: auto]
 1467         A comma delimited list of column labels or numbers corresponding to data
 1468         to display under a thumbnail image of a structure in a TMAP visualization.
 1469         The default column is set to 'Name' and it is automatically shown. In addition,
 1470         the SMILES string column is always used to display SMILES under the structures.
 1471     -t, --tmapDisplayMsg <text>  [default: auto]
 1472         A brief message to display at the top left in HTML page containing a TMAP
 1473         visualization. You must specify a valid HTML string. No validation is
 1474         performed. Default message: TMAP chemspace visualization<br/>
 1475         Input file: <InfileName><br/>Number of molecules: <Count>
 1476     -w, --workingdir <dir>
 1477         Location of working directory which defaults to the current directory.
 1478 
 1479 Examples:
 1480     To visualize chemspace for SMILES strings present in a column name SMILES in
 1481     input file, mapping a categorical data column on TMAP, writing out LSH forest
 1482     for subsequent use to skip the generation of fingerprints, merging TMAP JS file
 1483     into HTML file, and write out a HTML file containing TMAP visualization, type:
 1484 
 1485         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1486           -i SampleChemspace.csv -o SampleChemspace.html
 1487 
 1488     To run the first example for SMILES strings in column name SMILES in input file
 1489     and write out a HTML file containing TMAP visualization, type:
 1490 
 1491         % VisualizeChemspaceUsingTMAP.py --colSMILES SMILES
 1492           --categoricalDataCols Source
 1493           -i SampleChemspace.csv -o SampleChemspace.html
 1494 
 1495     To run the first example for mapping categorical data in column number 4 in
 1496     input file and write out a HTML file containing TMAP visualization, type:
 1497 
 1498         % VisualizeChemspaceUsingTMAP.py --colmode colnum
 1499           --categoricalDataCols 4
 1500           -i SampleChemspace.csv -o SampleChemspace.html
 1501 
 1502     To run the first example for mapping both categorical and numerical data
 1503     columns and write out a HTML file containing TMAP visualization, type:
 1504 
 1505         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source"
 1506           --numericalDataCols "MolWt,MolLogP"
 1507           -i SampleChemspace.csv -o SampleChemspace.html
 1508 
 1509     To run the first example for mapping both categorical and numerical data
 1510     columns along with specified colormaps and write out a HTML file containing
 1511     TMAP visualization, type:
 1512 
 1513         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source"
 1514           --categoricalDataColormaps "tab10"
 1515           --numericalDataCols "MolWt,MolLogP"
 1516           --numericalDataColormaps "viridis, plasma"
 1517           -i SampleChemspace.csv -o SampleChemspace.html
 1518 
 1519     To run the first example for mapping both categorical and numerical data
 1520     columns along with displaying specific data under the structure display and
 1521     write out a HTML file containing TMAP visualization, type:
 1522 
 1523         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source"
 1524           --numericalDataCols "MolWt,NHOHCount,NOCount,MolLogP,
 1525           NumRotatableBonds,TPSA" --structureDisplayDataCols "Name,ID"
 1526           -i SampleChemspace.csv -o SampleChemspace.html
 1527 
 1528     To run the first example for restoring LSH forest data from a file to skip the
 1529     generation of fingerprints and write out a HTML file containing TMAP
 1530     visualization, type:
 1531 
 1532         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1533            --lshForestFileRestore yes -i SampleChemspace.csv -o SampleChemspace.html
 1534 
 1535     To run the first example in multiprocessing mode on all available CPUs without
 1536     loading all data into memory and write out  a HTML file containing TMAP
 1537     visualization, type:
 1538 
 1539         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1540           --mp yes -i SampleChemspace.csv -o SampleChemspace.html
 1541 
 1542     To run the first example in multiprocessing mode on all available CPUs by
 1543     loading all data into memory and write out  a HTML file containing TMAP
 1544     visualization, type:
 1545 
 1546         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1547           --mp yes --mpParams "inputDataMode,InMemory"
 1548           -i SampleChemspace.csv -o SampleChemspace.html
 1549 
 1550     To run the first example in multiprocessing mode on specific number of CPUs
 1551     and chunk size without loading all data into memory and write out a HTML file
 1552     containing TMAP visualization, type:
 1553 
 1554         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1555           --mp yes --mpParams "inputDataMode,lazy,numProcesses,4,
 1556           chunkSize,50" -i SampleChemspace.csv -o SampleChemspace.html
 1557 
 1558     To run the first example using a set of specified parameters to generate
 1559     fingerprints and LSH forest, configure faerun and scatter plot layout, and
 1560     write out a HTML file containing TMAP visualization, type:
 1561 
 1562         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1563           --minHashFPParams "radius,3,numPermutations,2048"
 1564           --lshForestParams "dim,2048,numPrefixTrees,128"
 1565           --lshLayoutConfigParams "k,75,kc,20,slRepeats,2,
 1566           slExtraScalingSteps,4,mmmRepeats,2" 
 1567           --faerunConfigParams "clearColor, #000000,thumbnailWidth, 250"
 1568           --faerunScatterPlotParams "shader,circle,pointScale,4"
 1569           --tmapDisplayMsg "TMAP Chemspace visualization"
 1570           -i SampleChemspace.csv -o SampleChemspace.html
 1571 
 1572 Author:
 1573     Manish Sud(msud@san.rr.com)
 1574 
 1575 See also:
 1576     RDKitConvertFileFormat.py, RDKitCalculateMolecularDescriptors.py,
 1577     RDKitStandardizeMolecules.py
 1578 
 1579 Copyright:
 1580     Copyright (C) 2024 Manish Sud. All rights reserved.
 1581 
 1582     The functionality available in this script is implemented using TMAP and
 1583     Faerun, open source software packages for visualizing chemspace, and
 1584     RDKit, an open source toolkit for cheminformatics developed by Greg
 1585     Landrum.
 1586 
 1587     This file is part of MayaChemTools.
 1588 
 1589     MayaChemTools is free software; you can redistribute it and/or modify it under
 1590     the terms of the GNU Lesser General Public License as published by the Free
 1591     Software Foundation; either version 3 of the License, or (at your option) any
 1592     later version.
 1593 
 1594 """
 1595 
 1596 if __name__ == "__main__":  # Run main() only when executed as a script, not when imported.
 1597     main()