MayaChemTools

    1 #!/bin/env python
    2 #
    3 # File: VisualizeChemspaceUsingTMAP.py
    4 # Author: Manish Sud <msud@san.rr.com>
    5 #
    6 # Copyright (C) 2026 Manish Sud. All rights reserved.
    7 #
    8 # The functionality available in this script is implemented using TMAP and
    9 # Faerun, open source software packages for visualizing chemspace, and
   10 # RDKit, an open source toolkit for cheminformatics developed by Greg
   11 # Landrum.
   12 #
   13 # This file is part of MayaChemTools.
   14 #
   15 # MayaChemTools is free software; you can redistribute it and/or modify it under
   16 # the terms of the GNU Lesser General Public License as published by the Free
   17 # Software Foundation; either version 3 of the License, or (at your option) any
   18 # later version.
   19 #
   20 # MayaChemTools is distributed in the hope that it will be useful, but without
   21 # any warranty; without even the implied warranty of merchantability or fitness
   22 # for a particular purpose.  See the GNU Lesser General Public License for more
   23 # details.
   24 #
   25 # You should have received a copy of the GNU Lesser General Public License
   26 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
   27 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
   28 # Boston, MA, 02111-1307, USA.
   29 #
   30 
   31 from __future__ import print_function
   32 
   33 import os
   34 import sys
   35 import time
   36 import re
   37 import csv
   38 import shutil
   39 import multiprocessing as mp
   40 import pandas as pd
   41 import numpy as np
   42 
   43 # TMAP and Faerun imports...
   44 try:
   45     import tmap as tm
   46     from faerun import Faerun
   47     from mhfp.encoder import MHFPEncoder
   48 except ImportError as ErrMsg:
   49     sys.stderr.write("\nFailed to import TMAP/Faerun module/package: %s\n" % ErrMsg)
   50     sys.stderr.write("Check/update your TMAP environment and try again.\n\n")
   51     sys.exit(1)
   52 
   53 # RDKit imports...
   54 try:
   55     from rdkit import rdBase
   56 except ImportError as ErrMsg:
   57     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
   58     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
   59     sys.exit(1)
   60 
   61 # MayaChemTools imports...
   62 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
   63 try:
   64     from docopt import docopt
   65     import MiscUtil
   66 except ImportError as ErrMsg:
   67     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
   68     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
   69     sys.exit(1)
   70 
   71 ScriptName = os.path.basename(sys.argv[0])
   72 Options = {}
   73 OptionsInfo = {}
   74 
   75 
   76 def main():
   77     """Start execution of the script."""
   78 
   79     MiscUtil.PrintInfo(
   80         "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
   81         % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
   82     )
   83 
   84     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
   85 
   86     # Retrieve command line arguments and options...
   87     RetrieveOptions()
   88 
   89     # Process and validate command line arguments and options...
   90     ProcessOptions()
   91 
   92     # Perform actions required by the script...
   93     VisualizeChemspace()
   94 
   95     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
   96     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
   97 
   98 
   99 def VisualizeChemspace():
  100     """Visualize chemspace using TMAP."""
  101 
  102     InfileDF = ReadMoleculeData()
  103 
  104     MolCount, ValidMolCount, VisualizationFailedCount = ProcessMolecules(InfileDF)
  105 
  106     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
  107     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
  108     MiscUtil.PrintInfo("Number of molecules failed during chemspace visualization: %d" % VisualizationFailedCount)
  109     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
  110 
  111 
  112 def ProcessMolecules(InfileDF):
  113     """Process molecules and generate TMAP."""
  114 
  115     MolCount = len(InfileDF)
  116     (ValidMolCount, VisualizationFailedCount) = [0] * 2
  117 
  118     # Setup parameter values for "auto" options based on the number of molecules...
  119     ProcessMolCountBasedAutoOptions(MolCount)
  120 
  121     # Setup LSH forest...
  122     LSHForest, ValidMolCount, VisualizationFailedCount = SetupLSHForest(InfileDF)
  123     if ValidMolCount == 0:
  124         return (MolCount, ValidMolCount, VisualizationFailedCount)
  125 
  126     SetupTMAPDisplayMessage(MolCount, ValidMolCount)
  127 
  128     # Generate TMAP coordinates...
  129     PlotCoordsInfo = GenerateTMAPCoordinates(LSHForest)
  130 
  131     # Setup TMAP plot data...
  132     PlotDataInfo = SetupTMAPPlotData(InfileDF)
  133 
  134     # Setup TMAP plot...
  135     GenerateTMAPPlot(InfileDF, PlotCoordsInfo, PlotDataInfo)
  136 
  137     return (MolCount, ValidMolCount, VisualizationFailedCount)
  138 
  139 
  140 def SetupLSHForest(InfileDF):
  141     """Setup LSH forest."""
  142 
  143     if OptionsInfo["LSHForestFileRestoreMode"]:
  144         return RestoreLSHForest((InfileDF))
  145     else:
  146         return GenerateLSHForest(InfileDF)
  147 
  148 
  149 def RestoreLSHForest(InfileDF):
  150     """Restore LSH forest."""
  151 
  152     (ValidMolCount, VisualizationFailedCount) = [0] * 2
  153 
  154     # Set valid molecule count to number of molecules in input file...
  155     ValidMolCount = len(InfileDF)
  156 
  157     LSHForestFile = OptionsInfo["OutfileLSHForest"]
  158     MiscUtil.PrintInfo("\nRestoring LSH forest from %s..." % LSHForestFile)
  159     if not os.path.isfile(LSHForestFile):
  160         MiscUtil.PrintError("LSH forest file %s is missing. Failed to restore LSH forest...\n" % LSHForestFile)
  161 
  162     LSHForest = InitializeLSHForest()
  163     LSHForest.restore(LSHForestFile)
  164 
  165     if LSHForest.size() != ValidMolCount:
  166         MiscUtil.PrintError(
  167             'The number of molecules, %s, in input file must match number of nodes, %s, in LSH forest during its restoration from a file using "--lshForestFileWrite" option.'
  168             % (ValidMolCount, LSHForest.size())
  169         )
  170 
  171     return (LSHForest, ValidMolCount, VisualizationFailedCount)
  172 
  173 
  174 def GenerateLSHForest(InfileDF):
  175     """Generate LSH forest."""
  176 
  177     MinHashFingerprints, ValidMolCount, FingerprintsFailedCount = GenerateMinHashFingerprints(InfileDF)
  178 
  179     MiscUtil.PrintInfo("\nGenerating LSH forest...")
  180     LSHForest = InitializeLSHForest()
  181 
  182     LSHForest.batch_add(MinHashFingerprints)
  183     LSHForest.index()
  184 
  185     # Write out LSH forest...
  186     if OptionsInfo["LSHForestFileWriteMode"]:
  187         OutfileLSHForest = OptionsInfo["OutfileLSHForest"]
  188         if FingerprintsFailedCount > 0:
  189             MiscUtil.PrintWarning(
  190                 "The MinHash fingerprints generation failed for %s molecules. Skipped writing of file %s..."
  191                 % (FingerprintsFailedCount, OutfileLSHForest)
  192             )
  193         else:
  194             MiscUtil.PrintInfo("Writing LSH forest file %s..." % OutfileLSHForest)
  195             LSHForest.store(OutfileLSHForest)
  196 
  197     return (LSHForest, ValidMolCount, FingerprintsFailedCount)
  198 
  199 
  200 def GenerateMinHashFingerprints(InfileDF):
  201     """Generate MinHash fingerprints."""
  202 
  203     if OptionsInfo["MPMode"]:
  204         return GenerateMinHashFingerprintsUsingMultipleProcesses(InfileDF)
  205     else:
  206         return GenerateMinHashFingerprintsUsingSingleProcess(InfileDF)
  207 
  208 
  209 def GenerateMinHashFingerprintsUsingSingleProcess(InfileDF):
  210     """Generate MHFPs using a single processs."""
  211 
  212     MiscUtil.PrintInfo("\nGenerating MinHash fingerprints using a single process...")
  213 
  214     MinHashFingerprintsEncoder = InitializeMinHashFingerprintsEncoder()
  215 
  216     (ValidMolCount, FingerprintsFailedCount) = [0] * 2
  217     MinHashFingerprints = []
  218     FingerprintsFailedRowIndices = []
  219 
  220     SMILESColname = OptionsInfo["SMILESColname"]
  221     for MolIndex, SMILES in enumerate(InfileDF[SMILESColname]):
  222         MinHashFingerprint = GenerateMinHashFingerprintForMolecule(MinHashFingerprintsEncoder, SMILES)
  223         if MinHashFingerprint is None:
  224             FingerprintsFailedCount += 1
  225             FingerprintsFailedRowIndices.append(MolIndex)
  226         else:
  227             ValidMolCount += 1
  228             MinHashFingerprints.append(tm.VectorUint(MinHashFingerprint))
  229 
  230     # Remove failed molecules from the dataframe...
  231     RemoveFingerprintsFailedRows(InfileDF, FingerprintsFailedRowIndices)
  232 
  233     return (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount)
  234 
  235 
  236 def GenerateMinHashFingerprintsUsingMultipleProcesses(InfileDF):
  237     """Generate MHFPs using multiprocessing."""
  238 
  239     MiscUtil.PrintInfo("\nGenerating MinHash fingerprints using multiprocessing...")
  240 
  241     MPParams = OptionsInfo["MPParams"]
  242 
  243     # Setup data for initializing a worker process...
  244     InitializeWorkerProcessArgs = (
  245         MiscUtil.ObjectToBase64EncodedString(Options),
  246         MiscUtil.ObjectToBase64EncodedString(OptionsInfo),
  247     )
  248 
  249     # Setup SMILES iterator...
  250     SMILESColname = OptionsInfo["SMILESColname"]
  251     WorkerProcessDataIterable = SetupSMILESWithMolIndices(InfileDF[SMILESColname])
  252 
  253     # Setup process pool along with data initialization for each process...
  254     if not OptionsInfo["QuietMode"]:
  255         MiscUtil.PrintInfo(
  256             "\nConfiguring multiprocessing using %s method..."
  257             % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")
  258         )
  259         MiscUtil.PrintInfo(
  260             "NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n"
  261             % (
  262                 MPParams["NumProcesses"],
  263                 MPParams["InputDataMode"],
  264                 ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]),
  265             )
  266         )
  267 
  268     ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
  269 
  270     # Start processing...
  271     if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
  272         Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
  273     elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
  274         Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
  275     else:
  276         MiscUtil.PrintError(
  277             'The value, %s, specified for "--inputDataMode" is not supported.' % (MPParams["InputDataMode"])
  278         )
  279 
  280     (ValidMolCount, FingerprintsFailedCount) = [0] * 2
  281     MinHashFingerprints = []
  282     FingerprintsFailedRowIndices = []
  283 
  284     for Result in Results:
  285         Molndex, MinHashFingerprint = Result
  286 
  287         if MinHashFingerprint is None:
  288             FingerprintsFailedCount += 1
  289             FingerprintsFailedRowIndices.append(Molndex)
  290         else:
  291             ValidMolCount += 1
  292             MinHashFingerprints.append(tm.VectorUint(np.array(MinHashFingerprint)))
  293 
  294     # Remove failed molecules from the dataframe...
  295     RemoveFingerprintsFailedRows(InfileDF, FingerprintsFailedRowIndices)
  296 
  297     return (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount)
  298 
  299 
  300 def InitializeWorkerProcess(*EncodedArgs):
  301     """Initialize data for a worker process."""
  302 
  303     global Options, OptionsInfo
  304 
  305     if not OptionsInfo["QuietMode"]:
  306         MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
  307 
  308     # Decode Options and OptionInfo...
  309     Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
  310     OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
  311 
  312     # Initialize MHFP encoder...
  313     OptionsInfo["MinHashFingerprintsEncoder"] = InitializeMinHashFingerprintsEncoder()
  314 
  315 
  316 def WorkerProcess(MolInfo):
  317     """Process data for a worker process."""
  318 
  319     MolIndex, SMILES = MolInfo
  320 
  321     MinHashFingerprint = GenerateMinHashFingerprintForMolecule(OptionsInfo["MinHashFingerprintsEncoder"], SMILES)
  322     if MinHashFingerprint is not None:
  323         MinHashFingerprint = MinHashFingerprint.tolist()
  324 
  325     return (MolIndex, MinHashFingerprint)
  326 
  327 
  328 def SetupSMILESWithMolIndices(SMILES):
  329     """Setup an iterator to generate SMILES string along with a molecule index."""
  330 
  331     for MolIndex, MolSMILES in enumerate(SMILES):
  332         yield (MolIndex, MolSMILES)
  333 
  334 
  335 def GenerateMinHashFingerprintForMolecule(MinHashFingerprintsEncoder, SMILES):
  336     """Generate MinHash fingerprint for a molecule."""
  337 
  338     MinHashFingerprint = None
  339     try:
  340         MinHashFingerprint = MinHashFingerprintsEncoder.encode(
  341             SMILES,
  342             radius=OptionsInfo["MinHashFPParams"]["Radius"],
  343             rings=OptionsInfo["MinHashFPParams"]["Rings"],
  344             kekulize=OptionsInfo["MinHashFPParams"]["Kekulize"],
  345             min_radius=OptionsInfo["MinHashFPParams"]["MinRadius"],
  346             sanitize=OptionsInfo["MinHashFPParams"]["Sanitize"],
  347         )
  348     except Exception as ErrMsg:
  349         if not OptionsInfo["QuietMode"]:
  350             MiscUtil.PrintWarning("Failed to generate MinHash fingerprint for SMILES %s:\n%s\n" % (SMILES, ErrMsg))
  351         else:
  352             MiscUtil.PrintInfo("")
  353         MinHashFingerprint = None
  354 
  355     return MinHashFingerprint
  356 
  357 
  358 def RemoveFingerprintsFailedRows(InfileDF, FingerprintsFailedRowIndices):
  359     """Remove fingerprints failed rows."""
  360 
  361     if len(FingerprintsFailedRowIndices):
  362         InfileDF.drop(FingerprintsFailedRowIndices, inplace=True)
  363         InfileDF.reset_index(drop=True, inplace=True)
  364 
  365 
  366 def GenerateTMAPCoordinates(LSHForest):
  367     """Generate TMAP coordinates."""
  368 
  369     MiscUtil.PrintInfo("\nGenerating TMAP plot coordinates...")
  370 
  371     PlotCoordsInfo = {}
  372     PlotCoordsInfo["NodeXCoords"] = None
  373     PlotCoordsInfo["NodeYCoords"] = None
  374     PlotCoordsInfo["EdgeNodeStartList"] = None
  375     PlotCoordsInfo["EdgeNodeToList"] = None
  376 
  377     LSHLayoutConfigParams = OptionsInfo["LSHLayoutConfigParams"]
  378     LSHLayoutConfig = tm.LayoutConfiguration()
  379 
  380     LSHLayoutConfig.k = LSHLayoutConfigParams["K"]
  381     LSHLayoutConfig.kc = LSHLayoutConfigParams["KC"]
  382     LSHLayoutConfig.fme_iterations = LSHLayoutConfigParams["FMEIterations"]
  383     LSHLayoutConfig.fme_randomize = LSHLayoutConfigParams["FMERandomize"]
  384     LSHLayoutConfig.fme_threads = LSHLayoutConfigParams["FMEThreads"]
  385     LSHLayoutConfig.fme_precision = LSHLayoutConfigParams["FMEPrecision"]
  386     LSHLayoutConfig.sl_repeats = LSHLayoutConfigParams["SLRepeats"]
  387     LSHLayoutConfig.sl_extra_scaling_steps = LSHLayoutConfigParams["SLExtraScalingSteps"]
  388     LSHLayoutConfig.sl_scaling_min = LSHLayoutConfigParams["SLScalingMin"]
  389     LSHLayoutConfig.sl_scaling_max = LSHLayoutConfigParams["SLScalingMax"]
  390     LSHLayoutConfig.sl_scaling_type = LSHLayoutConfigParams["SLScalingType"]
  391     LSHLayoutConfig.mmm_repeats = LSHLayoutConfigParams["MMMRepeats"]
  392     LSHLayoutConfig.placer = LSHLayoutConfigParams["Placer"]
  393     LSHLayoutConfig.merger = LSHLayoutConfigParams["Merger"]
  394     LSHLayoutConfig.merger_factor = LSHLayoutConfigParams["MergerFactor"]
  395     LSHLayoutConfig.merger_adjustment = LSHLayoutConfigParams["MergerAdjustment"]
  396     LSHLayoutConfig.node_size = 1.0 / LSHLayoutConfigParams["NodeSizeDenominator"]
  397 
  398     NodeXCoords, NodeYCoords, EdgeNodeStartList, EdgeNodeToList, _ = tm.layout_from_lsh_forest(
  399         LSHForest, config=LSHLayoutConfig
  400     )
  401 
  402     PlotCoordsInfo["NodeXCoords"] = NodeXCoords
  403     PlotCoordsInfo["NodeYCoords"] = NodeYCoords
  404     PlotCoordsInfo["EdgeNodeStartList"] = EdgeNodeStartList
  405     PlotCoordsInfo["EdgeNodeToList"] = EdgeNodeToList
  406 
  407     return PlotCoordsInfo
  408 
  409 
  410 def SetupTMAPPlotData(InfileDF):
  411     """Setup plot data for TMAP plot."""
  412 
  413     MiscUtil.PrintInfo("\nSetting up TMAP plot data...")
  414 
  415     PlotDataInfo = {}
  416     PlotDataInfo["Columns"] = []
  417     PlotDataInfo["Colormaps"] = []
  418     PlotDataInfo["CategoricalStatus"] = []
  419     PlotDataInfo["LegendLabels"] = []
  420     PlotDataInfo["SeriesTitles"] = []
  421 
  422     # Setup categorical data...
  423     if OptionsInfo["CategoricalDataColnames"] is not None:
  424         for ColnameIndex, Colname in enumerate(OptionsInfo["CategoricalDataColnames"]):
  425             CategoryLabels, CategoryData = Faerun.create_categories(InfileDF[Colname])
  426             if len(CategoryLabels) > OptionsInfo["CategoricalDataMaxDisplay"]:
  427                 CategoryLabels, CategoryData = RemapCategoricalPlotData(CategoryLabels, CategoryData)
  428 
  429             PlotDataInfo["Columns"].append(CategoryData)
  430             PlotDataInfo["Colormaps"].append(OptionsInfo["CategoricalDataColormapsList"][ColnameIndex])
  431             PlotDataInfo["CategoricalStatus"].append(True)
  432             PlotDataInfo["LegendLabels"].append(CategoryLabels)
  433             PlotDataInfo["SeriesTitles"].append(Colname)
  434 
  435     # Setup numerical data...
  436     if OptionsInfo["NumericalDataColnames"] is not None:
  437         for ColnameIndex, Colname in enumerate(OptionsInfo["NumericalDataColnames"]):
  438             PlotDataInfo["Columns"].append(InfileDF[Colname])
  439             PlotDataInfo["Colormaps"].append(OptionsInfo["NumericalDataColormapsList"][ColnameIndex])
  440             PlotDataInfo["CategoricalStatus"].append(False)
  441             PlotDataInfo["LegendLabels"].append(None)
  442             PlotDataInfo["SeriesTitles"].append(Colname)
  443 
  444     # Setup structure display data...
  445     FirstCol = True
  446     SMILESSelectedData = []
  447     SMILESSelectedLabels = []
  448     FirstCol = True
  449     for Colname in OptionsInfo["StructureDisplayDataColnames"]:
  450         if FirstCol:
  451             FirstCol = False
  452             SMILESSelectedData = InfileDF[Colname]
  453             SMILESSelectedLabels.append(Colname)
  454         else:
  455             SMILESSelectedData = SMILESSelectedData + "__" + InfileDF[Colname].astype(str)
  456             SMILESSelectedLabels.append(Colname)
  457 
  458     PlotDataInfo["SMILESSelectedData"] = SMILESSelectedData
  459     PlotDataInfo["SMILESSelectedLabels"] = SMILESSelectedLabels
  460 
  461     return PlotDataInfo
  462 
  463 
  464 def RemapCategoricalPlotData(CategoryLabels, CategoryData):
  465     """Ramap categorical plot data."""
  466 
  467     if len(CategoryLabels) <= OptionsInfo["CategoricalDataMaxDisplay"]:
  468         return (CategoryLabels, CategoryData)
  469 
  470     # Track categories to remap...
  471     CategoryLabelsNew = []
  472     CategoryValuesToRemap = []
  473     LastCategoryValue = 0
  474 
  475     for CategoryLabelIndex, CategoryLabel in enumerate(CategoryLabels):
  476         CategoryValue, CategroyName = CategoryLabel
  477         if CategoryLabelIndex < OptionsInfo["CategoricalDataMaxDisplay"]:
  478             CategoryLabelsNew.append((CategoryValue, CategroyName))
  479             LastCategoryValue = CategoryValue
  480         else:
  481             CategoryValuesToRemap.append(CategoryValue)
  482 
  483     # Set up other category...
  484     OtherCategoryValue = LastCategoryValue + 1
  485     OtherCategoryName = "Other"
  486     CategoryLabelsNew.append((OtherCategoryValue, OtherCategoryName))
  487 
  488     # Update category labels and data...
  489     CategoryLabels = CategoryLabelsNew
  490     for ValueIndex, Value in enumerate(CategoryData):
  491         if Value in CategoryValuesToRemap:
  492             CategoryData[ValueIndex] = OtherCategoryValue
  493 
  494     return (CategoryLabels, CategoryData)
  495 
  496 
  497 def GenerateTMAPPlot(InfileDF, PlotCoordsInfo, PlotDataInfo):
  498     """Generate TMAP plot."""
  499 
  500     MiscUtil.PrintInfo("\nGenerating TMAP plot...")
  501 
  502     # Initialize Faerun plot...
  503     FaerunConfigParams = OptionsInfo["FaerunConfigParams"]
  504     ImpressMsg = OptionsInfo["TMAPDisplayMsg"]
  505     TMAPFaerunPlot = Faerun(
  506         clear_color=FaerunConfigParams["ClearColor"],
  507         view="front",
  508         coords=False,
  509         title="",
  510         x_title="",
  511         y_title="",
  512         show_legend=FaerunConfigParams["ShowLegend"],
  513         legend_title=FaerunConfigParams["LegendTitle"],
  514         legend_orientation=FaerunConfigParams["LegendOrientation"],
  515         legend_number_format=FaerunConfigParams["LegendNumberFormat"],
  516         scale=FaerunConfigParams["Scale"],
  517         alpha_blending=FaerunConfigParams["AlphaBlending"],
  518         anti_aliasing=FaerunConfigParams["AntiAliasing"],
  519         thumbnail_width=FaerunConfigParams["ThumbnailWidth"],
  520         thumbnail_fixed=FaerunConfigParams["ThumbnailFixed"],
  521         impress=ImpressMsg,
  522     )
  523 
  524     # Setup scatter plot...
  525     ScatterPlotName = "Data"
  526     ScatterTreePlotName = "%s_tree" % ScatterPlotName
  527     FaerunScatterPlotParams = OptionsInfo["FaerunScatterPlotParams"]
  528     TMAPFaerunPlot.add_scatter(
  529         ScatterPlotName,
  530         {
  531             "x": PlotCoordsInfo["NodeXCoords"],
  532             "y": PlotCoordsInfo["NodeYCoords"],
  533             "c": PlotDataInfo["Columns"],
  534             "labels": PlotDataInfo["SMILESSelectedData"],
  535         },
  536         colormap=PlotDataInfo["Colormaps"],
  537         shader=FaerunScatterPlotParams["Shader"],
  538         point_scale=FaerunScatterPlotParams["PointScale"],
  539         max_point_size=FaerunScatterPlotParams["MaxPointSize"],
  540         fog_intensity=FaerunScatterPlotParams["FogIntensity"],
  541         categorical=PlotDataInfo["CategoricalStatus"],
  542         interactive=FaerunScatterPlotParams["Interactive"],
  543         has_legend=True,
  544         legend_labels=PlotDataInfo["LegendLabels"],
  545         series_title=PlotDataInfo["SeriesTitles"],
  546         selected_labels=PlotDataInfo["SMILESSelectedLabels"],
  547     )
  548 
  549     # Add scatter plot to Faerun...
  550     TMAPFaerunPlot.add_tree(
  551         ScatterTreePlotName,
  552         {"from": PlotCoordsInfo["EdgeNodeStartList"], "to": PlotCoordsInfo["EdgeNodeToList"]},
  553         point_helper=ScatterPlotName,
  554     )
  555 
  556     # Write out TMAP plot HTML and JS files...
  557     MiscUtil.PrintInfo("Writing TMAP plot files %s and %s..." % (OptionsInfo["Outfile"], OptionsInfo["OutfileJS"]))
  558     TMAPFaerunPlot.plot(OptionsInfo["OutfilePrefix"], template="smiles")
  559 
  560     if OptionsInfo["MergeHTMLandJSFilesMode"]:
  561         MergeTMAPResultsHTMLAndJSFiles()
  562 
  563 
  564 def MergeTMAPResultsHTMLAndJSFiles():
  565     """Merge TMAP HTML and JS files."""
  566 
  567     MiscUtil.PrintInfo("\nMerging TMAP plot file %s into  %s..." % (OptionsInfo["OutfileJS"], OptionsInfo["Outfile"]))
  568 
  569     TMAPResultsHTMLFile = OptionsInfo["Outfile"]
  570     TMAPResultsJSFile = OptionsInfo["OutfileJS"]
  571 
  572     TMAPResultsTMPHTMLFile = "Tmp%s.html" % OptionsInfo["OutfilePrefix"]
  573 
  574     HTMLResultsFH = open(TMAPResultsHTMLFile, "r")
  575     JSResultsFH = open(TMAPResultsJSFile, "r")
  576 
  577     TMPHTMLResultsFH = open(TMAPResultsTMPHTMLFile, "w")
  578 
  579     for HTMLLine in HTMLResultsFH:
  580         HTMLLine = HTMLLine.rstrip()
  581         if re.search("%s" % TMAPResultsJSFile, HTMLLine, re.IGNORECASE):
  582             TMPHTMLResultsFH.write("    <script>\n")
  583 
  584             FirstLine = True
  585             for JSLine in JSResultsFH:
  586                 JSLine = JSLine.rstrip()
  587                 if FirstLine:
  588                     FirstLine = False
  589                     TMPHTMLResultsFH.write("    %s\n" % JSLine)
  590                 else:
  591                     TMPHTMLResultsFH.write("%s\n" % JSLine)
  592             TMPHTMLResultsFH.write("\n    </script>\n")
  593 
  594         else:
  595             TMPHTMLResultsFH.write("%s\n" % HTMLLine)
  596 
  597     HTMLResultsFH.close()
  598     JSResultsFH.close()
  599     TMPHTMLResultsFH.close()
  600 
  601     MiscUtil.PrintInfo("Moving %s to %s..." % (TMAPResultsTMPHTMLFile, OptionsInfo["Outfile"]))
  602     shutil.move(TMAPResultsTMPHTMLFile, TMAPResultsHTMLFile)
  603 
  604     MiscUtil.PrintInfo("Removing %s file..." % (OptionsInfo["OutfileJS"]))
  605     os.remove(TMAPResultsJSFile)
  606 
  607 
  608 def InitializeLSHForest():
  609     """Initialize LSH forest."""
  610 
  611     LSHForestParams = OptionsInfo["LSHForestParams"]
  612     LSHForest = tm.LSHForest(LSHForestParams["Dim"], LSHForestParams["NumPrefixTrees"], LSHForestParams["Store"])
  613 
  614     return LSHForest
  615 
  616 
  617 def InitializeMinHashFingerprintsEncoder():
  618     """Initialize MinHash fingerprints encoder."""
  619 
  620     MinHashFPParams = OptionsInfo["MinHashFPParams"]
  621     MinHashFingerprintsEncoder = MHFPEncoder(
  622         n_permutations=MinHashFPParams["NumPermutations"], seed=MinHashFPParams["Seed"]
  623     )
  624 
  625     return MinHashFingerprintsEncoder
  626 
  627 
  628 def ReadMoleculeData():
  629     """Read molecule data."""
  630 
  631     Infile = OptionsInfo["Infile"]
  632     InfileDelimiter = OptionsInfo["InfileDelimiter"]
  633 
  634     MiscUtil.PrintInfo("\nProcessing file %s..." % Infile)
  635     InfileDF = pd.read_csv(Infile, sep=InfileDelimiter)
  636 
  637     return InfileDF
  638 
  639 
  640 def ProcessMolCountBasedAutoOptions(MolCount):
  641     """Process auto option values dependent on number of molecules."""
  642 
  643     #  Process "auto" option for LSHForestParams...
  644     ParamName = "NumPrefixTrees"
  645     ParamValue = "%s" % OptionsInfo["LSHForestParams"][ParamName]
  646     if re.match("^auto$", ParamValue, re.I):
  647         ParamValue = 128 if MolCount <= 10e03 else 8
  648         OptionsInfo["LSHForestParams"][ParamName] = ParamValue
  649 
  650     #  Process "auto" option for FaerunScatterPlotParams...
  651     ParamName = "PointScale"
  652     ParamValue = OptionsInfo["FaerunScatterPlotParams"][ParamName]
  653     ParamValue = "%s" % ParamValue
  654     if re.match("^auto$", ParamValue, re.I):
  655         if MolCount <= 10e03:
  656             ParamValue = 4.0
  657         elif MolCount <= 10e04:
  658             ParamValue = 2.0
  659         else:
  660             ParamValue = 1.0
  661         OptionsInfo["FaerunScatterPlotParams"][ParamName] = ParamValue
  662 
  663     #  Process "auto" option for LSHLayoutConfigParams...
  664     for ParamName in ["K", "KC", "SLRepeats", "SLExtraScalingSteps", "MMMRepeats", "NodeSizeDenominator"]:
  665         ParamValue = "%s" % OptionsInfo["LSHLayoutConfigParams"][ParamName]
  666 
  667         if not re.match("^auto$", ParamValue, re.I):
  668             continue
  669 
  670         if re.match("^K$", ParamName, re.I):
  671             ParamValue = 75 if MolCount <= 10e03 else 10
  672         elif re.match("^KC$", ParamName, re.I):
  673             ParamValue = 20 if MolCount <= 10e03 else 10
  674         elif re.match("^SLRepeats$", ParamName, re.I):
  675             ParamValue = 2 if MolCount <= 10e03 else 1
  676         elif re.match("^SLExtraScalingSteps$", ParamName, re.I):
  677             ParamValue = 4 if MolCount <= 10e03 else 2
  678         elif re.match("^MMMRepeats$", ParamName, re.I):
  679             ParamValue = 2 if MolCount <= 10e03 else 1
  680         elif re.match("^NodeSizeDenominator$", ParamName, re.I):
  681             ParamValue = 65.0 if MolCount <= 10e03 else 70.0
  682 
  683         OptionsInfo["LSHLayoutConfigParams"][ParamName] = ParamValue
  684 
  685 
  686 def SetupTMAPDisplayMessage(MolCount, ValidMolCount):
  687     """Setup TMAP display message."""
  688 
  689     # Setup default TMAP display message using valid molecule count...
  690     if re.match("^auto$", OptionsInfo["TMAPDisplayMsg"], re.I):
  691         if MolCount == ValidMolCount:
  692             OptionsInfo["TMAPDisplayMsg"] = (
  693                 "TMAP chemspace visualization<br/>Input file: %s<br/>Number of molecules: %s"
  694                 % (OptionsInfo["Infile"], MolCount)
  695             )
  696         else:
  697             OptionsInfo["TMAPDisplayMsg"] = (
  698                 "TMAP chemspace visualization<br/>Input file: %s<br/>Number of molecules: %s<br/>Number of valid molecules: %s"
  699                 % (OptionsInfo["Infile"], MolCount, ValidMolCount)
  700             )
  701 
  702 
  703 def ProcessFaerunConfigParametersOption():
  704     """Process option for faerun configuration parameters."""
  705 
  706     ParamsOptionName = "--faerunConfigParams"
  707     ParamsOptionValue = Options[ParamsOptionName]
  708     ParamsDefaultInfo = {
  709         "ClearColor": ["str", "#000000"],
  710         "ShowLegend": ["bool", True],
  711         "LegendTitle": ["str", "Legend"],
  712         "LegendOrientation": ["str", "vertical"],
  713         "LegendNumberFormat": ["str", "{:.2f}"],
  714         "Scale": ["float", 750.0],
  715         "AlphaBlending": ["bool", False],
  716         "AntiAliasing": ["bool", True],
  717         "ThumbnailWidth": ["int", 250],
  718         "ThumbnailFixed": ["bool", False],
  719     }
  720 
  721     FaerunConfigParams = MiscUtil.ProcessOptionNameValuePairParameters(
  722         ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo
  723     )
  724 
  725     ParamName = "LegendOrientation"
  726     ParamValue = FaerunConfigParams[ParamName]
  727     if not re.match("^(vertical|horizontal)$", ParamValue, re.I):
  728         MiscUtil.PrintError(
  729             'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: vertical or horizontal\n'
  730             % (ParamValue, ParamName, ParamsOptionName)
  731         )
  732     FaerunConfigParams[ParamName] = ParamValue.lower()
  733 
  734     for ParamName in ["Scale", "ThumbnailWidth"]:
  735         ParamValue = FaerunConfigParams[ParamName]
  736         if ParamValue <= 0:
  737             MiscUtil.PrintError(
  738                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
  739                 % (ParamValue, ParamName, ParamsOptionName)
  740             )
  741 
  742     OptionsInfo["FaerunConfigParams"] = FaerunConfigParams
  743 
  744 
  745 def ProcessFaerunScatterPlotParamsOption():
  746     """Process option for faerun scatter plot parameters."""
  747 
  748     ParamsOptionName = "--faerunScatterPlotParams"
  749     ParamsOptionValue = Options[ParamsOptionName]
  750     ParamsDefaultInfo = {
  751         "Shader": ["str", "circle"],
  752         "PointScale": ["str", "auto"],
  753         "MaxPointSize": ["float", 100.0],
  754         "FogIntensity": ["float", 0.0],
  755         "Interactive": ["bool", True],
  756     }
  757 
  758     FaerunScatterPlotParams = MiscUtil.ProcessOptionNameValuePairParameters(
  759         ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo
  760     )
  761 
  762     ParamName = "PointScale"
  763     ParamValue = FaerunScatterPlotParams[ParamName]
  764     if not re.match("^auto$", ParamValue, re.I):
  765         if not MiscUtil.IsFloat(ParamValue):
  766             MiscUtil.PrintError(
  767                 'The parameter value, %s, specified for parameter name, %s, using "%s" option must be a float.'
  768                 % (ParamValue, ParamName, ParamsOptionName)
  769             )
  770         ParamValue = float(ParamValue)
  771         if ParamValue <= 0:
  772             MiscUtil.PrintError(
  773                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
  774                 % (ParamValue, ParamName, ParamsOptionName)
  775             )
  776         FaerunScatterPlotParams[ParamName] = ParamValue
  777 
  778     ParamName = "MaxPointSize"
  779     ParamValue = FaerunScatterPlotParams[ParamName]
  780     if ParamValue <= 0:
  781         MiscUtil.PrintError(
  782             'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
  783             % (ParamValue, ParamName, ParamsOptionName)
  784         )
  785 
  786     ParamName = "FogIntensity"
  787     ParamValue = FaerunScatterPlotParams[ParamName]
  788     if ParamValue < 0:
  789         MiscUtil.PrintError(
  790             'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: >= 0\n'
  791             % (ParamValue, ParamName, ParamsOptionName)
  792         )
  793 
  794     OptionsInfo["FaerunScatterPlotParams"] = FaerunScatterPlotParams
  795 
  796 
  797 def ProcessLSHForestParamsOption():
  798     """Process option for LSH forest parameters."""
  799 
  800     ParamsOptionName = "--lshForestParams"
  801     ParamsOptionValue = Options[ParamsOptionName]
  802     ParamsDefaultInfo = {"Dim": ["int", 2048], "NumPrefixTrees": ["str", "auto"], "Store": ["bool", True]}
  803 
  804     LSHForestParams = MiscUtil.ProcessOptionNameValuePairParameters(
  805         ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo
  806     )
  807 
  808     ParamName = "Dim"
  809     ParamValue = LSHForestParams[ParamName]
  810     if ParamValue <= 0:
  811         MiscUtil.PrintError(
  812             'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
  813             % (ParamValue, ParamName, ParamsOptionName)
  814         )
  815 
  816     ParamName = "NumPrefixTrees"
  817     ParamValue = LSHForestParams[ParamName]
  818     if not re.match("^auto$", ParamValue, re.I):
  819         if not MiscUtil.IsInteger(ParamValue):
  820             MiscUtil.PrintError(
  821                 'The parameter value, %s, specified for parameter name, %s, using "%s" option must be an integer.'
  822                 % (ParamValue, ParamName, ParamsOptionName)
  823             )
  824         ParamValue = int(ParamValue)
  825         if ParamValue <= 0:
  826             MiscUtil.PrintError(
  827                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
  828                 % (ParamValue, ParamName, ParamsOptionName)
  829             )
  830         LSHForestParams[ParamName] = ParamValue
  831 
  832     OptionsInfo["LSHForestParams"] = LSHForestParams
  833 
  834 
  835 def ProcessLSHLayoutConfigParamsOption():
  836     """Process option for LSH configuration parameters."""
  837 
  838     ParamsOptionName = "--lshLayoutConfigParams"
  839     ParamsOptionValue = Options[ParamsOptionName]
  840     ParamsDefaultInfo = {
  841         "K": ["str", "auto"],
  842         "KC": ["str", "auto"],
  843         "FMEIterations": ["int", 1000],
  844         "FMERandomize": ["bool", False],
  845         "FMEThreads": ["int", 4],
  846         "FMEPrecision": ["int", 4],
  847         "SLRepeats": ["str", "auto"],
  848         "SLExtraScalingSteps": ["str", "auto"],
  849         "SLScalingMin": ["float", 1.0],
  850         "SLScalingMax": ["float", 1.0],
  851         "SLScalingType": ["str", "RelativeToDrawing"],
  852         "MMMRepeats": ["str", "auto"],
  853         "Placer": ["str", "Barycenter"],
  854         "Merger": ["str", "LocalBiconnected"],
  855         "MergerFactor": ["float", 2.0],
  856         "MergerAdjustment": ["int", 0],
  857         "NodeSizeDenominator": ["str", "auto"],
  858     }
  859 
  860     LSHLayoutConfigParams = MiscUtil.ProcessOptionNameValuePairParameters(
  861         ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo
  862     )
  863 
  864     for ParamName in [
  865         "FMEIterations",
  866         "FMEThreads",
  867         "FMEPrecision",
  868         "SLScalingMin",
  869         "SLScalingMax",
  870         "MergerFactor",
  871         "MergerAdjustment",
  872     ]:
  873         ParamValue = LSHLayoutConfigParams[ParamName]
  874         if re.match("^%s$" % ParamName, "MergerAdjustment", re.I):
  875             if ParamValue < 0:
  876                 MiscUtil.PrintError(
  877                     'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: >= 0\n'
  878                     % (ParamValue, ParamName, ParamsOptionName)
  879                 )
  880         else:
  881             if ParamValue <= 0:
  882                 MiscUtil.PrintError(
  883                     'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
  884                     % (ParamValue, ParamName, ParamsOptionName)
  885                 )
  886 
  887     # Process "auto" values...
  888     for ParamName in ["K", "KC", "SLRepeats", "SLExtraScalingSteps", "MMMRepeats", "NodeSizeDenominator"]:
  889         ParamValue = LSHLayoutConfigParams[ParamName]
  890 
  891         if not re.match("^auto$", ParamValue, re.I):
  892             if re.match("^NodeSizeDenominator$", ParamName, re.I):
  893                 if not MiscUtil.IsFloat(ParamValue):
  894                     MiscUtil.PrintError(
  895                         'The parameter value, %s, specified for parameter name, %s, using "%s" option must be a float.'
  896                         % (ParamValue, ParamName, ParamsOptionName)
  897                     )
  898                 ParamValue = float(ParamValue)
  899             else:
  900                 if not MiscUtil.IsInteger(ParamValue):
  901                     MiscUtil.PrintError(
  902                         'The parameter value, %s, specified for parameter name, %s, using "%s" option must be an integer.'
  903                         % (ParamValue, ParamName, ParamsOptionName)
  904                     )
  905                 ParamValue = int(ParamValue)
  906 
  907             if ParamValue <= 0:
  908                 MiscUtil.PrintError(
  909                     'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
  910                     % (ParamValue, ParamName, ParamsOptionName)
  911                 )
  912             LSHLayoutConfigParams[ParamName] = ParamValue
  913 
  914     # Map SLScalingType to TMAP object...
  915     ParamInfo = {
  916         "Absolute": tm.ScalingType.Absolute,
  917         "RelativeToAvgLength": tm.ScalingType.RelativeToAvgLength,
  918         "RelativeToDesiredLength": tm.ScalingType.RelativeToDesiredLength,
  919         "RelativeToDrawing": tm.ScalingType.RelativeToDrawing,
  920     }
  921     ParamName = "SLScalingType"
  922     MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, ParamName, ParamInfo)
  923 
  924     # Map Placer to TMAP object...
  925     ParamInfo = {
  926         "Barycenter": tm.Placer.Barycenter,
  927         "Solar": tm.Placer.Solar,
  928         "Circle": tm.Placer.Circle,
  929         "Median": tm.Placer.Median,
  930         "Random": tm.Placer.Random,
  931         "Zero": tm.Placer.Zero,
  932     }
  933     ParamName = "Placer"
  934     MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, ParamName, ParamInfo)
  935 
  936     # Map Merger to TMAP object...
  937     ParamInfo = {
  938         "EdgeCover": tm.Merger.EdgeCover,
  939         "LocalBiconnected": tm.Merger.LocalBiconnected,
  940         "Solar": tm.Merger.Solar,
  941         "IndependentSet": tm.Merger.IndependentSet,
  942     }
  943     ParamName = "Merger"
  944     MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, ParamName, ParamInfo)
  945 
  946     OptionsInfo["LSHLayoutConfigParams"] = LSHLayoutConfigParams
  947 
  948 
  949 def MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, ParamName, ParamInfo):
  950     """Map LSH layout configuration patameter valut to TMAP object."""
  951 
  952     ParamValue = LSHLayoutConfigParams[ParamName]
  953     if ParamValue not in ParamInfo:
  954         MiscUtil.PrintError(
  955             'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: %s\n'
  956             % (ParamValue, ParamName, ParamsOptionName, ", ".join(sorted(ParamInfo.keys())))
  957         )
  958     LSHLayoutConfigParams[ParamName] = ParamInfo[ParamValue]
  959 
  960 
  961 def ProcessMinHashFPParamsOption():
  962     """Process option for MinHash parameters."""
  963 
  964     ParamsOptionName = "--minHashFPParams"
  965     ParamsOptionValue = Options[ParamsOptionName]
  966     ParamsDefaultInfo = {
  967         "Radius": ["int", 3],
  968         "Rings": ["bool", True],
  969         "Kekulize": ["bool", True],
  970         "Sanitize": ["bool", True],
  971         "MinRadius": ["int", 1],
  972         "NumPermutations": ["int", 2048],
  973         "Seed": ["int", 42],
  974     }
  975 
  976     MinHashFPParams = MiscUtil.ProcessOptionNameValuePairParameters(
  977         ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo
  978     )
  979 
  980     for ParamName in ["Radius", "MinRadius", "NumPermutations"]:
  981         ParamValue = MinHashFPParams[ParamName]
  982         if ParamValue <= 0:
  983             MiscUtil.PrintError(
  984                 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
  985                 % (ParamValue, ParamName, ParamsOptionName)
  986             )
  987 
  988     OptionsInfo["MinHashFPParams"] = MinHashFPParams
  989 
  990 
  991 def ProcessInfileDelimiterOption():
  992     """Process option infile delimiter."""
  993 
  994     InfileDelim = Options["--infileDelimiter"]
  995     if re.match("^auto$", InfileDelim, re.I):
  996         FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Infile"])
  997         if re.match("^csv$", FileExt, re.I):
  998             InfileDelim = "comma"
  999         elif re.match("^(tsv|txt)$", FileExt, re.I):
 1000             InfileDelim = "tab"
 1001         elif re.match("^(smi)$", FileExt, re.I):
 1002             InfileDelim = "space"
 1003         else:
 1004             MiscUtil.PrintError(
 1005                 'The input file delimiter couldn\'t be determined from its extension %s. You must explicitly specify an input file delimiter using option"--infileDelimiter".\n'
 1006                 % (InfileDelim)
 1007             )
 1008 
 1009     InfileDelimMap = {"comma": ",", "tab": "\t", "space": " "}
 1010     OptionsInfo["InfileDelimiter"] = InfileDelimMap[InfileDelim]
 1011 
 1012 
 1013 def ProcessColumnModeOption():
 1014     """Process column mode option."""
 1015 
 1016     CollabelMode, ColnumMode = [False, False]
 1017     Colmode = Options["--colmode"]
 1018     if re.match("^collabel$", Colmode, re.I):
 1019         CollabelMode = True
 1020     elif re.match("^colnum$", Colmode, re.I):
 1021         ColnumMode = True
 1022     else:
 1023         MiscUtil.PrintError(
 1024             'The value, %s, specified for option "-c, --colmode" is not valid. Supported values: collabel or colnum\n'
 1025             % (Colmode)
 1026         )
 1027 
 1028     OptionsInfo["Colmode"] = Colmode
 1029     OptionsInfo["CollabelMode"] = CollabelMode
 1030     OptionsInfo["ColnumMode"] = ColnumMode
 1031 
 1032 
 1033 def RetrieveColumnNames():
 1034     """Retrieve column names."""
 1035 
 1036     Infile = OptionsInfo["Infile"]
 1037 
 1038     InfileFH = open(Infile, "r")
 1039     InfileReader = csv.reader(InfileFH, delimiter=OptionsInfo["InfileDelimiter"], quotechar='"')
 1040     Colnames = next(InfileReader)
 1041     InfileFH.close()
 1042 
 1043     if len(Colnames) == 0:
 1044         MiscUtil.PrintError("The first line in input file, %s, is empty. It must contain column names.\n" % Infile)
 1045 
 1046     ColnameToColnumMap = {}
 1047     ColnumToColnameMap = {}
 1048     for ColIndex, Colname in enumerate(Colnames):
 1049         Colnum = ColIndex + 1
 1050         ColnameToColnumMap[Colname] = Colnum
 1051         ColnumToColnameMap[Colnum] = Colname
 1052 
 1053     OptionsInfo["Colnames"] = Colnames
 1054     OptionsInfo["ColCount"] = len(Colnames)
 1055     OptionsInfo["ColnameToColnumMap"] = ColnameToColnumMap
 1056     OptionsInfo["ColnumToColnameMap"] = ColnumToColnameMap
 1057 
 1058     # Initialize for tracking specified column names...
 1059     SpecifiedColsInfo = {}
 1060     SpecifiedColsInfo["Colnames"] = []
 1061     SpecifiedColsInfo["Colnum"] = {}
 1062     SpecifiedColsInfo["OptionName"] = {}
 1063 
 1064     OptionsInfo["SpecifiedColsInfo"] = SpecifiedColsInfo
 1065 
 1066 
 1067 def ProcessSMILESColOption():
 1068     """Process SMILES column option."""
 1069 
 1070     SMILESCol = Options["--colSMILES"]
 1071     if re.match("^auto$", SMILESCol, re.I):
 1072         Colname = "SMILES"
 1073         if Colname not in OptionsInfo["ColnameToColnumMap"]:
 1074             MiscUtil.PrintError(
 1075                 'The SMILES column name, %s, doen\'t exist in input file. You must specify a valid SMILES column name or number using "--colSMILES" option.\n'
 1076                 % Colname
 1077             )
 1078 
 1079         Colnum = OptionsInfo["ColnameToColnumMap"][Colname]
 1080         SMILESColspec = Colnum if OptionsInfo["ColnumMode"] else Colname
 1081     else:
 1082         SMILESColspec = SMILESCol
 1083 
 1084     SMILESColname, SMILESColnum = ProcessColumnSpecification("--colSMILES", SMILESColspec)
 1085 
 1086     OptionsInfo["SMILESCol"] = SMILESCol
 1087     OptionsInfo["SMILESColname"] = SMILESColname
 1088     OptionsInfo["SMILESColnum"] = SMILESColnum
 1089 
 1090 
 1091 def ProcessCategoricalDataColsOption():
 1092     """Process categorical data columns option."""
 1093 
 1094     CategoricalDataColnames, CategoricalDataColnums = [None] * 2
 1095     CategoricalDataCols = Options["--categoricalDataCols"]
 1096     if not re.match("^none$", CategoricalDataCols, re.I):
 1097         CategoricalDataColnames = []
 1098         CategoricalDataColnums = []
 1099         for DataCol in CategoricalDataCols.split(","):
 1100             DataCol = DataCol.strip()
 1101             DataColname, DataColnum = ProcessColumnSpecification("--categoricalDataCols", DataCol)
 1102             CategoricalDataColnames.append(DataColname)
 1103             CategoricalDataColnums.append(DataColnum)
 1104 
 1105     OptionsInfo["CategoricalDataCols"] = CategoricalDataCols
 1106     OptionsInfo["CategoricalDataColnames"] = CategoricalDataColnames
 1107     OptionsInfo["CategoricalDataColnums"] = CategoricalDataColnums
 1108 
 1109 
 1110 def ProcessCategoricalDataColormapsOption():
 1111     """Process categorical data color maps option."""
 1112 
 1113     if OptionsInfo["CategoricalDataColnames"] is None:
 1114         OptionsInfo["CategoricalDataColormaps"] = Options["--categoricalDataColormaps"]
 1115         OptionsInfo["CategoricalDataColormapsList"] = None
 1116         return
 1117 
 1118     CategoricalDataColormapsList = []
 1119     CategoricalDataColCount = len(OptionsInfo["CategoricalDataColnames"])
 1120 
 1121     CategoricalDataColormaps = Options["--categoricalDataColormaps"]
 1122     if not re.match("^auto$", CategoricalDataColormaps, re.I):
 1123         ColormapsWords = CategoricalDataColormaps.split(",")
 1124         if len(ColormapsWords) != CategoricalDataColCount:
 1125             MiscUtil.PrintInfo(
 1126                 'The number of colormaps, %s, specified using "--categoricalDataColormaps" must be equal to the number of columns, %s, specified using "--categoricalDataCols" option.'
 1127                 % (len(ColormapsWords), CategoricalDataColCount)
 1128             )
 1129         for Colormap in ColormapsWords:
 1130             Colormap = Colormap.strip()
 1131             CategoricalDataColormapsList.append(Colormap)
 1132     else:
 1133         CategoricalDataColormapsList = ["tab10"] * CategoricalDataColCount
 1134 
 1135     OptionsInfo["CategoricalDataColormaps"] = CategoricalDataColormaps
 1136     OptionsInfo["CategoricalDataColormapsList"] = CategoricalDataColormapsList
 1137 
 1138 
 1139 def ProcessNumericalDataColsOption():
 1140     """Process numerical data columns option."""
 1141 
 1142     NumericalDataColnames, NumericalDataColnums = [None] * 2
 1143     NumericalDataCols = Options["--numericalDataCols"]
 1144     if not re.match("^none$", NumericalDataCols, re.I):
 1145         NumericalDataColnames = []
 1146         NumericalDataColnums = []
 1147         for DataCol in NumericalDataCols.split(","):
 1148             DataCol = DataCol.strip()
 1149             DataColname, DataColnum = ProcessColumnSpecification("--numericalDataCols", DataCol)
 1150             NumericalDataColnames.append(DataColname)
 1151             NumericalDataColnums.append(DataColnum)
 1152 
 1153     OptionsInfo["NumericalDataCols"] = NumericalDataCols
 1154     OptionsInfo["NumericalDataColnames"] = NumericalDataColnames
 1155     OptionsInfo["NumericalDataColnums"] = NumericalDataColnums
 1156 
 1157 
 1158 def ProcessNumericalDataColormapsOption():
 1159     """Process numerical data color maps option."""
 1160 
 1161     if OptionsInfo["NumericalDataColnames"] is None:
 1162         OptionsInfo["NumericalDataColormaps"] = Options["--numericalDataColormaps"]
 1163         OptionsInfo["NumericalDataColormapsList"] = None
 1164         return
 1165 
 1166     NumericalDataColormapsList = []
 1167     NumericalDataColCount = len(OptionsInfo["NumericalDataColnames"])
 1168 
 1169     NumericalDataColormaps = Options["--numericalDataColormaps"]
 1170     if not re.match("^auto$", NumericalDataColormaps, re.I):
 1171         ColormapsWords = NumericalDataColormaps.split(",")
 1172         if len(ColormapsWords) != NumericalDataColCount:
 1173             MiscUtil.PrintInfo(
 1174                 'The number of colormaps, %s, specified using "--categoricalDataColormaps" must be equal to the number of columns, %s, specified using "--categoricalDataCols" option.'
 1175                 % (len(ColormapsWords), NumericalDataColCount)
 1176             )
 1177         for Colormap in ColormapsWords:
 1178             Colormap = Colormap.strip()
 1179             NumericalDataColormapsList.append(Colormap)
 1180     else:
 1181         NumericalDataColormapsList = ["viridis"] * NumericalDataColCount
 1182 
 1183     OptionsInfo["NumericalDataColormaps"] = NumericalDataColormaps
 1184     OptionsInfo["NumericalDataColormapsList"] = NumericalDataColormapsList
 1185 
 1186 
 1187 def ProcessStructureDisplayDataColsOption():
 1188     """Process structure display data columns option."""
 1189 
 1190     StructureDisplayDataColnames = []
 1191     StructureDisplayDataColnums = []
 1192 
 1193     # Add SMILES column...
 1194     StructureDisplayDataColnames.append(OptionsInfo["SMILESColname"])
 1195     StructureDisplayDataColnums.append(OptionsInfo["SMILESColnum"])
 1196 
 1197     # Process specified columns...
 1198     OptionName = "--structureDisplayDataCols"
 1199     StructureDisplayDataCols = Options[OptionName]
 1200     if re.match("^auto$", StructureDisplayDataCols, re.I):
 1201         # Automatically add 'Name' column...
 1202         Colname = "Name"
 1203         if Colname in OptionsInfo["ColnameToColnumMap"]:
 1204             Colnum = OptionsInfo["ColnameToColnumMap"][Colname]
 1205             StructureDisplayDataColnames.append(Colname)
 1206             StructureDisplayDataColnums.append(Colnum)
 1207     else:
 1208         for DataCol in StructureDisplayDataCols.split(","):
 1209             DataCol = DataCol.strip()
 1210             if OptionsInfo["ColnumMode"]:
 1211                 Colnum = int(DataCol)
 1212                 if Colnum not in OptionsInfo["ColnumToColnameMap"]:
 1213                     MiscUtil.PrintError(
 1214                         'The column number, %s, specified using "%s" option doesn\'t exist in input file. You must specify a valid column number. Valid values: >= 1 and <= %s\n'
 1215                         % (Colnum, OptionName, OptionsInfo["ColCount"])
 1216                     )
 1217                 Colname = OptionsInfo["ColnumToColnameMap"][Colnum]
 1218             else:
 1219                 Colname = DataCol
 1220                 if Colname not in OptionsInfo["ColnameToColnumMap"]:
 1221                     MiscUtil.PrintError(
 1222                         'The column name, %s, specified using "%s" option doesn\'t exist in input file. You must specify a valid column name. Valid values: %s\n'
 1223                         % (Colname, OptionName, " ".join(OptionsInfo["Colnames"]))
 1224                     )
 1225                 Colnum = OptionsInfo["ColnameToColnumMap"][Colname]
 1226 
 1227             if Colname in StructureDisplayDataColnames:
 1228                 StructureDisplayDataColnumsStrs = ["%s" % Num for Num in StructureDisplayDataColnums]
 1229                 if OptionsInfo["ColnumMode"]:
 1230                     MiscUtil.PrintError(
 1231                         'The column number, %s, specified using "%s" option is a duplicate column number. It has already been used for this option. You must specify a different column number. Used column names: %s; Used column nums: %s\n'
 1232                         % (
 1233                             Colnum,
 1234                             OptionName,
 1235                             " ".join(StructureDisplayDataColnames),
 1236                             " ".join(StructureDisplayDataColnumsStrs),
 1237                         )
 1238                     )
 1239                 else:
 1240                     MiscUtil.PrintError(
 1241                         'The column name, %s, specified using "%s" option is a duplicate column name. It has already been used for this option. You must specify a different column name. Used column names: %s; Used column nums: %s\n'
 1242                         % (
 1243                             Colname,
 1244                             OptionName,
 1245                             " ".join(StructureDisplayDataColnames),
 1246                             " ".join(StructureDisplayDataColnumsStrs),
 1247                         )
 1248                     )
 1249 
 1250             StructureDisplayDataColnames.append(Colname)
 1251             StructureDisplayDataColnums.append(Colnum)
 1252 
 1253     OptionsInfo["StructureDisplayDataCols"] = StructureDisplayDataCols
 1254     OptionsInfo["StructureDisplayDataColnames"] = StructureDisplayDataColnames
 1255     OptionsInfo["StructureDisplayDataColnums"] = StructureDisplayDataColnums
 1256 
 1257 
 1258 def ProcessColumnSpecification(OptionName, Colspec):
 1259     """Process column specification corresponding to a column name or number."""
 1260 
 1261     Colname, Colnum = [None, None]
 1262     if OptionsInfo["ColnumMode"]:
 1263         Colnum = int(Colspec)
 1264         if Colnum not in OptionsInfo["ColnumToColnameMap"]:
 1265             MiscUtil.PrintError(
 1266                 'The column number, %s, specified using "%s" option doesn\'t exist in input file. You must specify a valid column number. Valid values: >= 1 and <= %s\n'
 1267                 % (Colnum, OptionName, OptionsInfo["ColCount"])
 1268             )
 1269         Colname = OptionsInfo["ColnumToColnameMap"][Colnum]
 1270     else:
 1271         Colname = Colspec
 1272         if Colname not in OptionsInfo["ColnameToColnumMap"]:
 1273             MiscUtil.PrintError(
 1274                 'The column name, %s, specified using "%s" option doesn\'t exist in input file. You must specify a valid column name. Valid values: %s\n'
 1275                 % (Colname, OptionName, " ".join(OptionsInfo["Colnames"]))
 1276             )
 1277         Colnum = OptionsInfo["ColnameToColnumMap"][Colname]
 1278 
 1279     # Track and check for duplicate column specification...
 1280     SpecifiedColsInfo = OptionsInfo["SpecifiedColsInfo"]
 1281     if Colname in SpecifiedColsInfo["Colnames"]:
 1282         if OptionsInfo["ColnumMode"]:
 1283             MiscUtil.PrintError(
 1284                 'The column number, %s, specified using "%s" option is a duplicate column number. It has already been used for "%s" option. You must specify a different column number.\n'
 1285                 % (Colnum, OptionName, SpecifiedColsInfo["OptionName"][Colname])
 1286             )
 1287         else:
 1288             MiscUtil.PrintError(
 1289                 'The column name, %s, specified using "%s" option is a duplicate column name. It has already been used for "%s" option. You must specify a different column name.\n'
 1290                 % (Colname, OptionName, SpecifiedColsInfo["OptionName"][Colname])
 1291             )
 1292     else:
 1293         SpecifiedColsInfo["Colnames"].append(Colname)
 1294         SpecifiedColsInfo["Colnum"][Colname] = Colnum
 1295         SpecifiedColsInfo["OptionName"][Colname] = OptionName
 1296 
 1297     return (Colname, Colnum)
 1298 
 1299 
 1300 def ProcessOptions():
 1301     """Process and validate command line arguments and options."""
 1302 
 1303     MiscUtil.PrintInfo("Processing options...")
 1304 
 1305     # Validate options...
 1306     ValidateOptions()
 1307 
 1308     OptionsInfo["Infile"] = Options["--infile"]
 1309 
 1310     Outfile = Options["--outfile"]
 1311     FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
 1312     OptionsInfo["OutfilePrefix"] = FileName
 1313     OptionsInfo["OutfileExt"] = FileExt
 1314 
 1315     OptionsInfo["Outfile"] = Outfile
 1316     OptionsInfo["OutfileJS"] = "%s.js" % FileName
 1317     OptionsInfo["OutfileLSHForest"] = "%s.dat" % FileName
 1318 
 1319     ProcessInfileDelimiterOption()
 1320     RetrieveColumnNames()
 1321 
 1322     ProcessColumnModeOption()
 1323     ProcessSMILESColOption()
 1324 
 1325     OptionsInfo["CategoricalDataMaxDisplay"] = int(Options["--categoricalDataMaxDisplay"])
 1326     ProcessCategoricalDataColsOption()
 1327     ProcessCategoricalDataColormapsOption()
 1328 
 1329     ProcessNumericalDataColsOption()
 1330     ProcessNumericalDataColormapsOption()
 1331 
 1332     ProcessStructureDisplayDataColsOption()
 1333 
 1334     ProcessFaerunConfigParametersOption()
 1335     ProcessFaerunScatterPlotParamsOption()
 1336 
 1337     OptionsInfo["LSHForestFileWriteMode"] = True if re.match("^yes$", Options["--lshForestFileWrite"], re.I) else False
 1338     OptionsInfo["LSHForestFileRestoreMode"] = (
 1339         True if re.match("^yes$", Options["--lshForestFileRestore"], re.I) else False
 1340     )
 1341     if OptionsInfo["LSHForestFileRestoreMode"]:
 1342         LSHForestFile = OptionsInfo["OutfileLSHForest"]
 1343         if not os.path.isfile(LSHForestFile):
 1344             MiscUtil.PrintError(
 1345                 'The LSH forest file, %s, must be present for, %s, value of "--lshForestFileRestore" option.'
 1346                 % (LSHForestFile, Options["--lshForestFileRestore"])
 1347             )
 1348 
 1349     ProcessLSHForestParamsOption()
 1350     ProcessLSHLayoutConfigParamsOption()
 1351 
 1352     OptionsInfo["MergeHTMLandJSFilesMode"] = (
 1353         True if re.match("^yes$", Options["--mergeHTMLandJSFiles"], re.I) else False
 1354     )
 1355 
 1356     ProcessMinHashFPParamsOption()
 1357 
 1358     OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
 1359     OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
 1360 
 1361     OptionsInfo["Overwrite"] = Options["--overwrite"]
 1362     OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False
 1363 
 1364     OptionsInfo["TMAPDisplayMsg"] = Options["--tmapDisplayMsg"]
 1365 
 1366 
 1367 def RetrieveOptions():
 1368     """Retrieve command line arguments and options."""
 1369 
 1370     # Get options...
 1371     global Options
 1372     Options = docopt(_docoptUsage_)
 1373 
 1374     # Set current working directory to the specified directory...
 1375     WorkingDir = Options["--workingdir"]
 1376     if WorkingDir:
 1377         os.chdir(WorkingDir)
 1378 
 1379     # Handle examples option...
 1380     if "--examples" in Options and Options["--examples"]:
 1381         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 1382         sys.exit(0)
 1383 
 1384 
 1385 def ValidateOptions():
 1386     """Validate option values."""
 1387 
 1388     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 1389     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "smi csv tsv txt")
 1390 
 1391     MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "html")
 1392     MiscUtil.ValidateOptionsOutputFileOverwrite(
 1393         "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
 1394     )
 1395     MiscUtil.ValidateOptionsDistinctFileNames(
 1396         "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]
 1397     )
 1398 
 1399     MiscUtil.ValidateOptionTextValue("-c, --colmode", Options["--colmode"], "collabel colnum")
 1400 
 1401     if re.match("^none$", Options["--categoricalDataCols"], re.I) and re.match(
 1402         "^none$", Options["--numericalDataCols"], re.I
 1403     ):
 1404         MiscUtil.PrintError(
 1405             'You must specify al least one caetgorical or numerical data column using option "--categoricalDataCols" or "--numericalDataCols". It is used to color TMAP.'
 1406         )
 1407 
 1408     ColnumMode = True if re.match("^colnum$", Options["--colmode"], re.I) else False
 1409     if ColnumMode and not re.match("^auto$", Options["--colSMILES"], re.I):
 1410         MiscUtil.ValidateOptionIntegerValue("--colSMILES", Options["--colSMILES"], {">": 0})
 1411 
 1412     if ColnumMode and not re.match("^none$", Options["--categoricalDataCols"], re.I):
 1413         MiscUtil.ValidateOptionNumberValues(
 1414             "--categoricalDataCols", Options["--categoricalDataCols"], 0, ",", "integer", {">": 0}
 1415         )
 1416 
 1417     MiscUtil.ValidateOptionIntegerValue("--categoricalDataMaxDisplay", Options["--categoricalDataMaxDisplay"], {">": 0})
 1418 
 1419     if not re.match("^auto$", Options["--categoricalDataColormaps"], re.I):
 1420         ColormapCount = len(Options["--categoricalDataColormaps"].split(","))
 1421         ColCount = len(Options["--categoricalDataCols"].split(","))
 1422         if ColormapCount != ColCount:
 1423             MiscUtil.PrintError(
 1424                 'The number of colormaps, %s, specified using option "--categoricalDataColormaps" must be equal to number of columns, %s,  specified using option "-categoricalDataCols". '
 1425                 % (ColormapCount, ColCount)
 1426             )
 1427 
 1428     if ColnumMode and not re.match("^none$", Options["--numericalDataCols"], re.I):
 1429         MiscUtil.ValidateOptionNumberValues(
 1430             "--numericalDataCols", Options["--numericalDataCols"], 0, ",", "integer", {">": 0}
 1431         )
 1432 
 1433     if not re.match("^auto$", Options["--numericalDataColormaps"], re.I):
 1434         ColormapCount = len(Options["--numericalDataColormaps"].split(","))
 1435         ColCount = len(Options["--numericalDataCols"].split(","))
 1436         if ColormapCount != ColCount:
 1437             MiscUtil.PrintError(
 1438                 'The number of colormaps, %s, specified using option "--numericalDataColormaps" must be equal to number of columns, %s,  specified using option "-numericalDataCols". '
 1439                 % (ColormapCount, ColCount)
 1440             )
 1441 
 1442     if not re.match("^auto$", Options["--structureDisplayDataCols"], re.I):
 1443         if ColnumMode and not re.match("^none$", Options["--structureDisplayDataCols"], re.I):
 1444             MiscUtil.ValidateOptionNumberValues(
 1445                 "--structureDisplayDataCols", Options["--structureDisplayDataCols"], 0, ",", "integer", {">": 0}
 1446             )
 1447 
 1448     if not re.match("^auto$", Options["--infileDelimiter"], re.I):
 1449         MiscUtil.ValidateOptionTextValue(" --infileDelimiter", Options["--infileDelimiter"], "comma tab space")
 1450 
 1451     MiscUtil.ValidateOptionTextValue("--lshForestFileWrite", Options["--lshForestFileWrite"], "yes no")
 1452     MiscUtil.ValidateOptionTextValue("--lshForestFileRestore", Options["--lshForestFileRestore"], "yes no")
 1453     MiscUtil.ValidateOptionTextValue("--mergeHTMLandJSFiles", Options["--mergeHTMLandJSFiles"], "yes no")
 1454 
 1455     MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
 1456 
 1457 
 1458 # Setup a usage string for docopt...
 1459 _docoptUsage_ = """
 1460 VisualizeChemspaceUsingTMAP.py - Visualize chemspace
 1461 
 1462 Usage:
 1463     VisualizeChemspaceUsingTMAP.py [--categoricalDataCols <collabel1,... or colnum1,...>] [--categoricalDataColormaps <Colormap1, Colormap2,...>]
 1464                                    [--categoricalDataMaxDisplay <number>] [--colmode <collabel or colnum>] [--colSMILES <text or number>]
 1465                                    [--faerunConfigParams <Name,Value,...>] [--faerunScatterPlotParams <Name,Value,...>]
 1466                                    [--infileDelimiter <comma, tab, or space>] [--lshForestFileWrite <yes or no>] [--lshForestFileRestore <yes or no>]
 1467                                    [--lshForestParams <Name,Value,...>] [--lshLayoutConfigParams  <Name,Value,...>] [--mergeHTMLandJSFiles <yes or no>]
 1468                                    [--minHashFPParams <Name,Value,...>] [--mp <yes or no>] [--mpParams <Name,Value,...>]
 1469                                    [--numericalDataCols <collabel1,... or colnum1,...>] [--numericalDataColormaps <Colormap1, Colormap2,...>]
 1470                                    [--overwrite] [--quiet <yes or no>] [--structureDisplayDataCols <collabel1,... or colnum1,...> ]
 1471                                    [--tmapDisplayMsg <text>] [-w <dir>] -i <infile> -o <outfile> 
 1472     VisualizeChemspaceUsingTMAP.py -h | --help | -e | --examples
 1473 
 1474 Description:
 1475     Generate an interactive TreeMAP (TMAP) [Ref 171, 172] visualization for molecules
 1476     in a text input file. The text input file must have a column containing SMILES strings.
 1477     In addition, it must contain at least one column corresponding to categorical or
 1478     numerical data for coloring TMAP nodes. You may optionally map multiple categorical
 1479     and numerical data columns on to a TMAP visualization. A HTML file is generated for
 1480     interactive visualization of chemspace in a browser.
 1481 
 1482     The TMAP methodology is able to generate a reasonably interactive visualization
 1483     for relatively large data sets. A brief description of the methodology is as follows.
 1484     A set of MinHash Fingerprints (MHFPs) are calculated for molecules in input file
 1485     followed by the generation of a Locality Sensitive Hashing (LSH) forest employing
 1486     MHFPs. A c-approximate k-Nearest Neighbor Graph (c-k-NNG) is constructed from
 1487     LSH, which is used to construct a Minimum Spanning Tree (MST) or Forest (MSF).
 1488     The final TMAP visualization is generated by laying out MST and MSF on a plane
 1489     using an algorithm provided by the Open Graph Drawing Framework (OGDF). The
 1490     OGDF provides flexibility to adjust graph layout methodology in terms of not only
 1491     aesthetics but also computational time.
 1492 
 1493     The supported input file formats are: CSV (.csv), TSV (.txt or .tsv),
 1494     and SMILES (.smi).
 1495 
 1496     The supported output file format is: HTML (.html).
 1497 
 1498 Options:
 1499     --categoricalDataCols <collabel1,... or colnum1,...>  [default: none]
 1500         A comma delimited list of column labels or numbers corresponding to
 1501         categorical data to map on a TMAP visualization.
 1502     --categoricalDataColormaps <Colormap1, Colormap2,...>  [default: auto]
 1503         A comma delimited list of color map names corresponding to categorical
 1504         data. The default is to use 'tab10' color map name for mapping categorical
 1505         data on a TMAP. The number of specified color maps must match the number
 1506         of categorical data columns. You must specify valid color map names
 1507         supported by Matplotlib. No validation is performed. Example color map
 1508         names for categorical data: Pastel1, Pastel2, Paired, Accent, Dark2, Set1,
 1509         Set2, Set3, tab10, tab20, tab20b, tab20c.
 1510     --categoricalDataMaxDisplay <number>  [default: 6]
 1511         Maximum number of categories in a category column to display on a TMAP
 1512         visualization. The rest of the categories are aggregated under a new
 1513         category named 'Other' before mapping on to a TMAP visualization.
 1514     -c, --colmode <collabel or colnum>  [default: collabel]
 1515         Use column number or name for the specification of columns in input
 1516         text file containing SMILES strings and molecule names along with any 
 1517         categorical or numerical data.
 1518     --colSMILES <text or number>  [default: auto]
 1519         Column name or number corresponding to SMILES strings. The default value
 1520         is automatically set based on the value of '-c, --colmode': 'SMILES'  for
 1521         'collabel'; SMILES string column number for 'colnum'. SMILES strings must
 1522         be present in input file.
 1523     -e, --examples
 1524         Print examples.
 1525     --faerunConfigParams <Name,Value,...>  [default: auto]
 1526         A comma delimited list of parameter name and value pairs for configuring
 1527         faerun (Ref 172) to generate a TMAP visualization.
 1528         
 1529         The supported parameter names along with their default and possible
 1530         values are shown below:
 1531              
 1532             clearColor, #000000
 1533             showLegend, yes  [ Possible values: yes or no ] 
 1534             legendTitle, Legend
 1535             legendOrientation, vertical  [ Possible values: vertical or
 1536                 horizontal ]
 1537             legendNumberFormat, {:.2f}
 1538             scale, 750.0
 1539             alphaBlending, no  [ Possible values: yes or no ]
 1540             antiAliasing, yes  [Possible values: yes or no]
 1541             thumbnailWidth, 250
 1542             thumbnailFixed, no  [ Possible values: yes or no ]
 1543             
 1544         A brief description of parameters, as available in the code for faerun, is
 1545         provided below:
 1546         
 1547             clearColor: Background color
 1548             showLegend: Show legend at lower right
 1549             legendTitle: Legend title
 1550             legendOrientation: Legend Orientation
 1551             legendNumberFormat: Number string format applied to numbers
 1552                 displayed in legend
 1553             scale: Scaling factor for scaling normalized coordinates
 1554             alphaBlending: Activate alpha blending. It is required for smoothCircle
 1555                 shader.
 1556             antiAliasing: Activate anti-aliasing. It might adversely impact
 1557                 rendering performance.
 1558             thumbnailWidth: Width of thumbnail images for structures
 1559             thumbnailFixed:  Show thumbnail images at a fixed location at the
 1560                 top instead of next to the mouse
 1561             
 1562     --faerunScatterPlotParams <Name,Value,...>  [default: auto]
 1563         A comma delimited list of parameter name and value pairs for generating
 1564         scatter plot representing a TMAP using faerun (Ref 172).
 1565         
 1566         The supported parameter names along with their default and possible
 1567         values are shown below:
 1568              
 1569             shader, circle  [ Possible values: circle, smoothCircle,
 1570                 sphere, or any valid value]
 1571             pointScale, auto  [ 4 if MolCount<=10K; 2 if MolCount<=100K; else 1 ]
 1572             maxPointSize, 100.0
 1573             fogIntensity, 0.0
 1574             interactive, yes  [ Possible values: yes or no ] 
 1575              
 1576         A brief description of parameters is provided below:
 1577         
 1578             shader: Shader to use for visualizing data points
 1579             pointScale: Relative size of data points
 1580             maxPointSize: Maximum size of the data points during zooming
 1581             fogIntensity: Intensity of distance fog
 1582             interactive: Generate interactive scatter plot
 1583             
 1584     -h, --help
 1585         Print this help message.
 1586     -i, --infile <infile>
 1587         Input file name. The SMILES strings must be present in the input file.
 1588         Supported formats: CSV (.csv) TSV (.txt or .tsv), or SMILES (.smi)
 1589     --infileDelimiter <comma, tab, or space>  [default: auto]
 1590         Input file delimiter for processing data. The default value is automatically
 1591         set based on the type of input file: comma - CSV (.csv); tab - TSV (.txt or
 1592         .tsv);  space - SMILES (.smi)
 1593     --lshForestFileWrite <yes or no>  [default: yes]
 1594         Write LSH forest data to a file for subsequent generation of a TMAP visualization.
 1595         Default file name: <OutfileRoot>_LSHForest.dat. The LSH forest data is
 1596         generated using MinHash fingerprints. You may restore LSH forest data
 1597         using '--lshForestFileRestore' option to skip the generation of fingerprints.
 1598     --lshForestFileRestore <yes or no>  [default: no]
 1599         Check and restore LSH forest data from a file for generating a TMAP
 1600         visualization and skip the generation of MinHash fingerprints. Default file
 1601         name: <OutfileRoot>_LSHForest.dat
 1602     --lshForestParams <Name,Value,...>  [default: auto]
 1603         A comma delimited list of parameter name and value pairs for generating
 1604         LSH (Locality Sensitive Hashing) forest from MinHash fingerprints.
 1605         
 1606         The supported parameter names along with their default and possible
 1607         values are shown below:
 1608              
 1609             dim, 2048
 1610             numPrefixTrees, auto  [ 128 if MolCount <= 10K else 8 ]
 1611             store, yes  [ Possible values: yes or no ]
 1612             
 1613         A brief description of parameters, as available in the code for LSH, is
 1614         provided below:
 1615         
 1616             dim: Dimensionality of MinHashes to be added to LSHForest
 1617             numPrefixTrees: Number of prefix trees to use
 1618             store: store the data for enhanced retrieval
 1619             
 1620     --lshLayoutConfigParams <Name,Value,...>  [default: auto]
 1621         A comma delimited list of parameter name and value pairs for configuring
 1622         LSH (Locality Sensitive Hashing) layout.
 1623         
 1624         The supported parameter names along with their default and possible
 1625         values are shown below:
 1626             
 1627             k, auto  [ 75 if MolCount <= 10K else 10]
 1628             kc, auto  [ 20 if MolCount <= 10K else 10]
 1629             fmeIterations, 1000
 1630             fmeRandomize, no  [ Possible values: yes or no ]
 1631             fmeThreads, 4
 1632             fmePrecision, 4
 1633             slRepeats, auto  [ 2 if MolCount <= 10K else 1]
 1634             slExtraScalingSteps, auto  [ 4 if MolCount <= 10K else 2 ]
 1635             slScalingMin, 1.0
 1636             slScalingMax, 1.0
 1637             slScalingType, RelativeToDrawing  [ Possible values: Absolute,
 1638                 RelativeToAvgLength, RelativeToDesiredLength, or
 1639                 RelativeToDrawing ]
 1640             mmmRepeats, auto  [ 2 if MolCount <= 10K else 1 ]
 1641             placer, Barycenter  [ Possible values: Barycenter, Solar, Circle,
 1642                 Median, Random, or Zero ]
 1643             merger, LocalBiconnected  [ Possible values: EdgeCover,
 1644                 LocalBiconnected, Solar, or IndependentSet ]
 1645             mergerFactor, 2.0
 1646             mergerAdjustment, 0
 1647             nodeSizeDenominator, auto  [ 65 if MolCount <= 10K else 70.0]
 1648             
 1649         A brief description of parameters, as available in the code for LSH, is
 1650         provided below:
 1651             
 1652             k: Number of nearest neighbors used to create k-nearest neighbor
 1653                 graph
 1654             kc: Scalar by which k is multiplied before querying LSH forest.
 1655                 The results are then sorted in decreasing order based on linear
 1656                 scan distances. 
 1657             fmeIterations: Maximum number of iterations of Fast Multipole
 1658                 Embedder (FME)
 1659             fmeRandomize: Randomize FME layout at the start
 1660             fmeThreads: Number of threads for FME
 1661             fmePrecision: Number of coefficients of multipole expansion
 1662             slRepeats: Number of repeats of scaling layout algorithm
 1663             slExtraScalingSteps: Number of repeats of scaling
 1664             slScalingMin: Minimum scaling factor
 1665             slScalingMax: Maximum scaling factor.
 1666             slScalingType: Scaling type corresponding to relative scale of graph
 1667             mmmRepeats: Number of repeats of layout at each level
 1668             placer: Methodology for defining initial positions of vertices in a
 1669                 graph at each level
 1670             merger: Vertex merging methodology used during coarsening phase
 1671                 of multilevel algorithm
 1672             mergerFactor: Ratio of sizes between two levels up to which merging
 1673                 is performed.  It doesn't apply to all merging methodologies.
 1674             mergerAdjustment: Edge  length  adjustment  for merging methodology.
 1675                 It doesn't apply to all merging methodologies.
 1676             nodeSizeDenominator: Node size denominator affecting the magnitude
 1677                 of repelling force between nodes. Node size corresponds to
 1678                 1.0 / nodeSizeDenominator. You may want to increase the value
 1679                 nodeSizeDenominator to decrease node size and resolve overlaps
 1680                 in  a crowded tree.
 1681             
 1682     --mergeHTMLandJSFiles <yes or no>  [default: yes]
 1683         Merge TMAP JS data file into HTML file and delete JS data file. Default
 1684         file names: <OutfileRoot>.html, <OutfileRoot>.js.
 1685     --minHashFPParams <Name,Value,...>  [default: auto]
 1686         A comma delimited list of parameter name and value pairs for generating
 1687         Min Hash Fingerprints (MHFP).
 1688         
 1689         The supported parameter names along with their default and possible
 1690         values are shown below:
 1691             
 1692             radius, 3
 1693             rings, yes  [ Possible values: yes or no ]
 1694             kekulize, yes  [ Possible values: yes or no ]
 1695             sanitize, yes  [ Possible values: yes or no ]
 1696             minRadius, 1
 1697             numPermutations, 2048
 1698             seed, 42
 1699             
 1700         A brief description of parameters, as available in the code for MHFP,  is
 1701         provided below:
 1702             
 1703             radius:  MHFP radius (A radius of 3 corresponds to MHFP6)
 1704             rings:  Include rings in shingling
 1705             kekulize:  Kekulize SMILES
 1706             sanitize:  Sanitize SMILES
 1707             minRadius: Minimum radius that is used to extract n-grams
 1708             numPermutations: Number of permutations used for hashing
 1709             seed: Random number seed for numpy.random
 1710             
 1711     --mp <yes or no>  [default: no]
 1712         Use multiprocessing for the generation of fingerprints.
 1713          
 1714         By default, input data is retrieved in a lazy manner via mp.Pool.imap()
 1715         function employing lazy RDKit data iterable. This allows processing of
 1716         arbitrarily large data sets without any additional memory requirements.
 1717         
 1718         All input data may be optionally loaded into memory by mp.Pool.map()
 1719         before starting worker processes in a process pool by setting the value
 1720         of 'inputDataMode' to 'InMemory' in '--mpParams' option.
 1721         
 1722         A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
 1723         data mode may adversely impact the performance. The '--mpParams' section
 1724         provides additional information to tune the value of 'chunkSize'.
 1725     --mpParams <Name,Value,...>  [default: auto]
 1726         A comma delimited list of parameter name and value pairs to configure
 1727         multiprocessing during the generation of fingerprints.
 1728         
 1729         The supported parameter names along with their default and possible
 1730         values are shown below:
 1731         
 1732             chunkSize, auto
 1733             inputDataMode, Lazy   [ Possible values: InMemory or Lazy ]
 1734             numProcesses, auto   [ Default: mp.cpu_count() ]
 1735         
 1736         These parameters are used by the following functions to configure and
 1737         control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
 1738         mp.Pool.imap().
 1739         
 1740         The chunkSize determines chunks of input data passed to each worker
 1741         process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
 1742         The default value of chunkSize is dependent on the value of 'inputDataMode'.
 1743         
 1744         The mp.Pool.map() function, invoked during 'InMemory' input data mode,
 1745         automatically converts RDKit data iterable into a list, loads all data into
 1746         memory, and calculates the default chunkSize using the following method
 1747         as shown in its code:
 1748         
 1749             chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
 1750             if extra: chunkSize += 1
 1751         
 1752         For example, the default chunkSize will be 7 for a pool of 4 worker processes
 1753         and 100 data items.
 1754         
 1755         The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
 1756         'lazy' RDKit data iterable to retrieve data as needed, without loading all the
 1757         data into memory. Consequently, the size of input data is not known a priori.
 1758         It's not possible to estimate an optimal value for the chunkSize. The default 
 1759         chunkSize is set to 1.
 1760         
 1761         The default value for the chunkSize during 'Lazy' data mode may adversely
 1762         impact the performance due to the overhead associated with exchanging
 1763         small chunks of data. It is generally a good idea to explicitly set chunkSize to
 1764         a larger value during 'Lazy' input data mode, based on the size of your input
 1765         data and number of processes in the process pool.
 1766         
 1767         The mp.Pool.map() function waits for all worker processes to process all
 1768         the data and return the results. The mp.Pool.imap() function, however,
 1769         returns the results obtained from worker processes as soon as the
 1770         results become available for specified chunks of data.
 1771         
 1772         The order of data in the results returned by both mp.Pool.map() and 
 1773         mp.Pool.imap() functions always corresponds to the input data.
 1774     --numericalDataCols <collabel1,... or colnum1,...>  [default: none]
 1775         A comma delimited list of column labels or numbers corresponding to
 1776         numerical data to map on a TMAP visualization.
 1777     --numericalDataColormaps <Colormap1, Colormap2,...>  [default: auto]
 1778         A comma delimited list of color map names corresponding to numerical
 1779         data. The default is to use 'viridis' color map name for mapping numerical
 1780         data on a TMAP. The number of specified color maps must match the number
 1781         of numerical data columns. You must specify valid color map names
 1782         supported by Matplotlib. No validation is performed. Example color map
 1783         names for numerical data: viridis, plasma, inferno, magma, cividis.
 1784     -o, --outfile <outfile>
 1785         Output HTML file name for writing out a TMAP visualization.
 1786     --overwrite
 1787         Overwrite existing files.
 1788     -q, --quiet <yes or no>  [default: no]
 1789         Use quiet mode. The warning and information messages will not be printed.
 1790     --structureDisplayDataCols <collabel1,... or colnum1,...>  [default: auto]
 1791         A comma delimited list of column labels or numbers corresponding to data
 1792         to display under a thumbnail image of a structure in a TMAP visualization.
 1793         The default column is set to 'Name' and it is automatically shown. In addition,
 1794         the SMILES string column is always used to display SMILES under the structures.
 1795     -t, --tmapDisplayMsg <text>  [default: auto]
 1796         A brief message to display at the top left in HTML page containing a TMAP
 1797         visualization. You must specify a valid HTML string. No validation is
 1798         performed. Default message: TMAP chemspace visualization<br/>
 1799         Input file: <InfileName><br/>Number of molecules: <Count>
 1800     -w, --workingdir <dir>
 1801         Location of working directory which defaults to the current directory.
 1802 
 1803 Examples:
 1804     To visualize chemspace for SMILES strings present in a column name SMILES in
 1805     input file, mapping a categorical data column on TMAP, writing out LSH forest
 1806     for subsequent use to skip the generation of fingerprints, merging TMAP JS file
 1807     into HTML file, and write out a HTML file containing TMAP visualization, type:
 1808 
 1809         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1810           -i SampleChemspace.csv -o SampleChemspace.html
 1811 
 1812     To run the first example for SMILES strings in column name SMILES in input file
 1813     and write out a HTML file containing TMAP visualization, type:
 1814 
 1815         % VisualizeChemspaceUsingTMAP.py --colSMILES SMILES
 1816           --categoricalDataCols Source
 1817           -i SampleChemspace.csv -o SampleChemspace.html
 1818 
 1819     To run the first example for mapping categorical data in column number 4 in
 1820     input file and write out a HTML file containing TMAP visualization, type:
 1821 
 1822         % VisualizeChemspaceUsingTMAP.py --colmode colnum
 1823           --categoricalDataCols 4
 1824           -i SampleChemspace.csv -o SampleChemspace.html
 1825 
 1826     To run the first example for mapping both categorical and numerical data
 1827     columns and write out a HTML file containing TMAP visualization, type:
 1828 
 1829         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source"
 1830           --numericalDataCols "MolWt,MolLogP"
 1831           -i SampleChemspace.csv -o SampleChemspace.html
 1832 
 1833     To run the first example for mapping both categorical and numerical data
 1834     columns along with specified colormaps and write out a HTML file containing
 1835     TMAP visualization, type:
 1836 
 1837         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source"
 1838           --categoricalDataColormaps "tab10"
 1839           --numericalDataCols "MolWt,MolLogP"
 1840           --numericalDataColormaps "viridis, plasma"
 1841           -i SampleChemspace.csv -o SampleChemspace.html
 1842 
 1843     To run the first example for mapping both categorical and numerical data
 1844     columns along with displaying specific data under the structure display and
 1845     write out a HTML file containing TMAP visualization, type:
 1846 
 1847         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source"
 1848           --numericalDataCols "MolWt,NHOHCount,NOCount,MolLogP,
 1849           NumRotatableBonds,TPSA" --structureDisplayDataCols "Name,ID"
 1850           -i SampleChemspace.csv -o SampleChemspace.html
 1851 
 1852     To run the first example for restoring LSH forest data from a file to skip the
 1853     generation of fingerprints and write out a HTML file containing TMAP
 1854     visualization, type:
 1855 
 1856         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1857            --lshForestFileRestore yes -i SampleChemspace.csv -o SampleChemspace.html
 1858 
 1859     To run the first example in multiprocessing mode on all available CPUs without
 1860     loading all data into memory and write out  a HTML file containing TMAP
 1861     visualization, type:
 1862 
 1863         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1864           --mp yes -i SampleChemspace.csv -o SampleChemspace.html
 1865 
 1866     To run the first example in multiprocessing mode on all available CPUs by
 1867     loading all data into memory and write out  a HTML file containing TMAP
 1868     visualization, type:
 1869 
 1870         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1871           --mp yes --mpParams "inputDataMode,InMemory"
 1872           -i SampleChemspace.csv -o SampleChemspace.html
 1873 
 1874     To run the first example in multiprocessing mode on specific number of CPUs
 1875     and chunk size without loading all data into memory and write out a HTML file
 1876     containing TMAP visualization, type:
 1877 
 1878         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1879           --mp yes --mpParams "inputDataMode,lazy,numProcesses,4,
 1880           chunkSize,50" -i SampleChemspace.csv -o SampleChemspace.html
 1881 
 1882     To run the first example using a set of specified parameters to generate
 1883     fingerprints and LSH forest, configure faerun and scatter plot layout, and
 1884     write out a HTML file containing TMAP visualization, type:
 1885 
 1886         % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
 1887           --minHashFPParams "radius,3,numPermutations,2048"
 1888           --lshForestParams "dim,2048,numPrefixTrees,128"
 1889           --lshLayoutConfigParams "k,75,kc,20,slRepeats,2,
 1890           slExtraScalingSteps,4,mmmRepeats,2" 
 1891           --faerunConfigParams "clearColor, #000000,thumbnailWidth, 250"
 1892           --faerunScatterPlotParams "shader,circle,pointScale,4"
 1893           --tmapDisplayMsg "TMAP Chemspace visualization"
 1894           -i SampleChemspace.csv -o SampleChemspace.html
 1895 
 1896 Author:
 1897     Manish Sud(msud@san.rr.com)
 1898 
 1899 See also:
 1900     RDKitConvertFileFormat.py, RDKitCalculateMolecularDescriptors.py,
 1901     RDKitStandardizeMolecules.py
 1902 
 1903 Copyright:
 1904     Copyright (C) 2026 Manish Sud. All rights reserved.
 1905 
 1906     The functionality available in this script is implemented using TMAP and
 1907     Faerun, open source software packages for visualizing chemspace, and
 1908     RDKit, an open source toolkit for cheminformatics developed by Greg
 1909     Landrum.
 1910 
 1911     This file is part of MayaChemTools.
 1912 
 1913     MayaChemTools is free software; you can redistribute it and/or modify it under
 1914     the terms of the GNU Lesser General Public License as published by the Free
 1915     Software Foundation; either version 3 of the License, or (at your option) any
 1916     later version.
 1917 
 1918 """
 1919 
# Invoke main() only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()