1 #!/bin/env python 2 # 3 # File: VisualizeChemspaceUsingTMAP.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using TMAP and 9 # Faerun, open source software packages for visualizing chemspace, and 10 # RDKit, an open source toolkit for cheminformatics developed by Greg 11 # Landrum. 12 # 13 # This file is part of MayaChemTools. 14 # 15 # MayaChemTools is free software; you can redistribute it and/or modify it under 16 # the terms of the GNU Lesser General Public License as published by the Free 17 # Software Foundation; either version 3 of the License, or (at your option) any 18 # later version. 19 # 20 # MayaChemTools is distributed in the hope that it will be useful, but without 21 # any warranty; without even the implied warranty of merchantability of fitness 22 # for a particular purpose. See the GNU Lesser General Public License for more 23 # details. 24 # 25 # You should have received a copy of the GNU Lesser General Public License 26 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 27 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 28 # Boston, MA, 02111-1307, USA. 29 # 30 31 from __future__ import print_function 32 33 # Add local python path to the global path and import standard library modules... 34 import os 35 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 36 import time 37 import re 38 import csv 39 import shutil 40 import multiprocessing as mp 41 import pandas as pd 42 import numpy as np 43 44 # TMAP and Faerun imports... 
try:
    import tmap as tm
    from faerun import Faerun
    from mhfp.encoder import MHFPEncoder
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import TMAP/Faerun module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your TMAP environment and try again.\n\n")
    sys.exit(1)

# RDKit imports...
try:
    from rdkit import rdBase
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
    sys.exit(1)

# MayaChemTools imports...
try:
    from docopt import docopt
    import MiscUtil
    import RDKitUtil
except ImportError as ErrMsg:
    sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
    sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
    sys.exit(1)

ScriptName = os.path.basename(sys.argv[0])

# Raw command line options and their processed counterparts. Both are module
# level dictionaries which are read by most functions in this script...
Options = {}
OptionsInfo = {}

def main():
    """Start execution of the script.

    Prints a start banner, retrieves and processes command line options,
    runs the chemspace visualization, and reports the elapsed time.
    """

    MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))

    (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()

    # Retrieve command line arguments and options...
    RetrieveOptions()

    # Process and validate command line arguments and options...
    ProcessOptions()

    # Perform actions required by the script...
    VisualizeChemspace()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
    MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))

def VisualizeChemspace():
    """Visualize chemspace using TMAP.

    Reads the input file into a dataframe, generates the TMAP plot, and
    prints a summary of molecule counts.
    """

    InfileDF = ReadMoleculeData()

    MolCount, ValidMolCount, VisualizationFailedCount = ProcessMolecules(InfileDF)

    MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
    MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
    MiscUtil.PrintInfo("Number of molecules failed during chemspace visualization: %d" % VisualizationFailedCount)
    MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))

def ProcessMolecules(InfileDF):
    """Process molecules and generate TMAP.

    Arguments:
        InfileDF: Dataframe containing the input file data. Rows for which
            fingerprint generation fails are removed from it in place by
            the LSH forest generation step.

    Returns:
        tuple: (MolCount, ValidMolCount, VisualizationFailedCount). MolCount
            is the number of rows before any rows are removed.
    """

    # Capture the count before failed rows are dropped from the dataframe...
    MolCount = len(InfileDF)

    # Setup parameter values for "auto" options based on the number of molecules...
    ProcessMolCountBasedAutoOptions(MolCount)

    # Setup LSH forest...
    LSHForest, ValidMolCount, VisualizationFailedCount = SetupLSHForest(InfileDF)
    if ValidMolCount == 0:
        # Nothing to plot...
        return (MolCount, ValidMolCount, VisualizationFailedCount)

    SetupTMAPDisplayMessage(MolCount, ValidMolCount)

    # Generate TMAP coordinates...
    PlotCoordsInfo = GenerateTMAPCoordinates(LSHForest)

    # Setup TMAP plot data...
    PlotDataInfo = SetupTMAPPlotData(InfileDF)

    # Setup TMAP plot...
    GenerateTMAPPlot(InfileDF, PlotCoordsInfo, PlotDataInfo)

    return (MolCount, ValidMolCount, VisualizationFailedCount)

def SetupLSHForest(InfileDF):
    """Setup LSH forest.

    Restores the forest from a file when OptionsInfo["LSHForestFileRestoreMode"]
    is set; otherwise, generates it from the molecules in the dataframe.

    Returns:
        tuple: (LSHForest, ValidMolCount, FailedCount).
    """

    if OptionsInfo["LSHForestFileRestoreMode"]:
        return RestoreLSHForest(InfileDF)

    return GenerateLSHForest(InfileDF)

def RestoreLSHForest(InfileDF):
    """Restore LSH forest from the file named by OptionsInfo["OutfileLSHForest"].

    An error is reported when the file is missing or when the number of
    nodes in the restored forest doesn't match the number of rows in the
    dataframe.

    Returns:
        tuple: (LSHForest, ValidMolCount, VisualizationFailedCount). The
            valid count is the number of rows in the dataframe and the
            failed count is always zero, as no fingerprints are generated.
    """

    VisualizationFailedCount = 0

    # Set valid molecule count to number of molecules in input file...
    ValidMolCount = len(InfileDF)

    LSHForestFile = OptionsInfo["OutfileLSHForest"]
    MiscUtil.PrintInfo("\nRestoring LSH forest from %s..." % LSHForestFile)
    if not os.path.isfile(LSHForestFile):
        MiscUtil.PrintError("LSH forest file %s is missing. Failed to restore LSH forest...\n" % LSHForestFile)

    LSHForest = InitializeLSHForest()
    LSHForest.restore(LSHForestFile)

    if LSHForest.size() != ValidMolCount:
        MiscUtil.PrintError("The number of molecules, %s, in input file must match number of nodes, %s, in LSH forest during its restoration from a file using \"--lshForestFileWrite\" option." % (ValidMolCount, LSHForest.size()))

    return (LSHForest, ValidMolCount, VisualizationFailedCount)

def GenerateLSHForest(InfileDF):
    """Generate LSH forest from MinHash fingerprints of the input molecules.

    The forest is optionally written to OptionsInfo["OutfileLSHForest"].
    Writing is skipped when any fingerprint generation failed.

    Returns:
        tuple: (LSHForest, ValidMolCount, FingerprintsFailedCount).
    """

    MinHashFingerprints, ValidMolCount, FingerprintsFailedCount = GenerateMinHashFingerprints(InfileDF)

    MiscUtil.PrintInfo("\nGenerating LSH forest...")
    LSHForest = InitializeLSHForest()

    LSHForest.batch_add(MinHashFingerprints)
    LSHForest.index()

    # Write out LSH forest...
    if OptionsInfo["LSHForestFileWriteMode"]:
        OutfileLSHForest = OptionsInfo["OutfileLSHForest"]
        if FingerprintsFailedCount > 0:
            MiscUtil.PrintWarning("The MinHash fingerprints generation failed for %s molecules. Skipped writing of file %s..." % (FingerprintsFailedCount, OutfileLSHForest))
        else:
            MiscUtil.PrintInfo("Writing LSH forest file %s..." % OutfileLSHForest)
            LSHForest.store(OutfileLSHForest)

    return (LSHForest, ValidMolCount, FingerprintsFailedCount)

def GenerateMinHashFingerprints(InfileDF):
    """Generate MinHash fingerprints using single or multiple processes.

    Returns:
        tuple: (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount).
    """

    if OptionsInfo["MPMode"]:
        return GenerateMinHashFingerprintsUsingMultipleProcesses(InfileDF)

    return GenerateMinHashFingerprintsUsingSingleProcess(InfileDF)

def GenerateMinHashFingerprintsUsingSingleProcess(InfileDF):
    """Generate MinHash fingerprints using a single process.

    Rows for which fingerprint generation fails are removed from the
    dataframe in place.

    Returns:
        tuple: (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount).
    """

    MiscUtil.PrintInfo("\nGenerating MinHash fingerprints using a single process...")

    MinHashFingerprintsEncoder = InitializeMinHashFingerprintsEncoder()

    (ValidMolCount, FingerprintsFailedCount) = [0] * 2
    MinHashFingerprints = []
    FingerprintsFailedRowIndices = []

    SMILESColname = OptionsInfo["SMILESColname"]
    for MolIndex, SMILES in enumerate(InfileDF[SMILESColname]):
        MinHashFingerprint = GenerateMinHashFingerprintForMolecule(MinHashFingerprintsEncoder, SMILES)
        if MinHashFingerprint is None:
            FingerprintsFailedCount += 1
            FingerprintsFailedRowIndices.append(MolIndex)
        else:
            ValidMolCount += 1
            MinHashFingerprints.append(tm.VectorUint(MinHashFingerprint))

    # Remove failed molecules from the dataframe...
    RemoveFingerprintsFailedRows(InfileDF, FingerprintsFailedRowIndices)

    return (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount)

def GenerateMinHashFingerprintsUsingMultipleProcesses(InfileDF):
    """Generate MinHash fingerprints using multiprocessing.

    A process pool is configured using OptionsInfo["MPParams"]. Rows for
    which fingerprint generation fails are removed from the dataframe in
    place.

    Returns:
        tuple: (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount).
    """

    MiscUtil.PrintInfo("\nGenerating MinHash fingerprints using multiprocessing...")

    MPParams = OptionsInfo["MPParams"]

    # Setup data for initializing a worker process...
    InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo))

    # Setup SMILES iterator...
    SMILESColname = OptionsInfo["SMILESColname"]
    WorkerProcessDataIterable = SetupSMILESWithMolIndices(InfileDF[SMILESColname])

    # Setup process pool along with data initialization for each process...
    if not OptionsInfo["QuietMode"]:
        MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()"))
        MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"])))

    ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)

    (ValidMolCount, FingerprintsFailedCount) = [0] * 2
    MinHashFingerprints = []
    FingerprintsFailedRowIndices = []

    # The results must be consumed before the pool is shut down, as imap()
    # delivers them lazily...
    try:
        # Start processing...
        if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
            Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
        elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
            Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
        else:
            MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"]))

        for MolIndex, MinHashFingerprint in Results:
            if MinHashFingerprint is None:
                FingerprintsFailedCount += 1
                FingerprintsFailedRowIndices.append(MolIndex)
            else:
                ValidMolCount += 1
                # Workers return plain lists; convert back to an array...
                MinHashFingerprints.append(tm.VectorUint(np.array(MinHashFingerprint)))
    finally:
        # Release worker processes instead of leaving them around until
        # the interpreter exits...
        ProcessPool.close()
        ProcessPool.join()

    # Remove failed molecules from the dataframe...
    RemoveFingerprintsFailedRows(InfileDF, FingerprintsFailedRowIndices)

    return (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount)

def InitializeWorkerProcess(*EncodedArgs):
    """Initialize data for a worker process.

    Arguments:
        EncodedArgs: Base64 encoded Options and OptionsInfo dictionaries,
            in that order.
    """

    global Options, OptionsInfo

    # Decode Options and OptionInfo before using them. A worker started
    # using the "spawn" method imports this module afresh, so OptionsInfo
    # is still the empty dictionary defined at module level and reading
    # "QuietMode" from it before decoding would raise a KeyError...
    Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
    OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])

    if not OptionsInfo["QuietMode"]:
        MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())

    # Initialize MHFP encoder...
    OptionsInfo["MinHashFingerprintsEncoder"] = InitializeMinHashFingerprintsEncoder()

def WorkerProcess(MolInfo):
    """Process data for a worker process.

    Arguments:
        MolInfo: A (MolIndex, SMILES) pair.

    Returns:
        tuple: (MolIndex, MinHashFingerprint). The fingerprint is a list,
            or None when its generation failed.
    """

    MolIndex, SMILES = MolInfo

    MinHashFingerprint = GenerateMinHashFingerprintForMolecule(OptionsInfo["MinHashFingerprintsEncoder"], SMILES)
    if MinHashFingerprint is not None:
        # Convert to a list before handing it back to the parent process...
        MinHashFingerprint = MinHashFingerprint.tolist()

    return (MolIndex, MinHashFingerprint)

def SetupSMILESWithMolIndices(SMILES):
    """Setup an iterator to generate SMILES string along with a molecule index.

    Yields:
        tuple: (MolIndex, MolSMILES), with indices starting from zero.
    """

    for MolIndex, MolSMILES in enumerate(SMILES):
        yield (MolIndex, MolSMILES)

def GenerateMinHashFingerprintForMolecule(MinHashFingerprintsEncoder, SMILES):
    """Generate MinHash fingerprint for a molecule.

    Arguments:
        MinHashFingerprintsEncoder: Encoder object providing an encode() method.
        SMILES: SMILES string for the molecule.

    Returns:
        The fingerprint returned by the encoder, or None when encoding
        raised an exception.
    """

    MinHashFPParams = OptionsInfo["MinHashFPParams"]

    MinHashFingerprint = None
    try:
        MinHashFingerprint = MinHashFingerprintsEncoder.encode(SMILES, radius = MinHashFPParams["Radius"], rings = MinHashFPParams["Rings"], kekulize = MinHashFPParams["Kekulize"], min_radius = MinHashFPParams["MinRadius"], sanitize = MinHashFPParams["Sanitize"])
    except Exception as ErrMsg:
        # A failure for one molecule is deliberately not fatal; the caller
        # counts it and drops the corresponding row...
        if not OptionsInfo["QuietMode"]:
            MiscUtil.PrintWarning("Failed to generate MinHash fingerprint for SMILES %s:\n%s\n" % (SMILES, ErrMsg))
        else:
            MiscUtil.PrintInfo("")
        MinHashFingerprint = None

    return MinHashFingerprint

def RemoveFingerprintsFailedRows(InfileDF, FingerprintsFailedRowIndices):
    """Remove fingerprints failed rows.

    The rows are dropped from the dataframe in place and its index is
    reset afterwards. Nothing is done for an empty list of indices.
    """

    if FingerprintsFailedRowIndices:
        InfileDF.drop(FingerprintsFailedRowIndices, inplace = True)
        InfileDF.reset_index(drop = True, inplace = True)

def GenerateTMAPCoordinates(LSHForest):
    """Generate TMAP coordinates.

    The layout is configured using OptionsInfo["LSHLayoutConfigParams"].

    Returns:
        dict: Layout results stored under the keys NodeXCoords,
            NodeYCoords, EdgeNodeStartList, and EdgeNodeToList.
    """

    MiscUtil.PrintInfo("\nGenerating TMAP plot coordinates...")

    LSHLayoutConfigParams = OptionsInfo["LSHLayoutConfigParams"]
    LSHLayoutConfig = tm.LayoutConfiguration()

    LSHLayoutConfig.k = LSHLayoutConfigParams["K"]
    LSHLayoutConfig.kc = LSHLayoutConfigParams["KC"]
    LSHLayoutConfig.fme_iterations = LSHLayoutConfigParams["FMEIterations"]
    LSHLayoutConfig.fme_randomize = LSHLayoutConfigParams["FMERandomize"]
    LSHLayoutConfig.fme_threads = LSHLayoutConfigParams["FMEThreads"]
    LSHLayoutConfig.fme_precision = LSHLayoutConfigParams["FMEPrecision"]
    LSHLayoutConfig.sl_repeats = LSHLayoutConfigParams["SLRepeats"]
    LSHLayoutConfig.sl_extra_scaling_steps = LSHLayoutConfigParams["SLExtraScalingSteps"]
    LSHLayoutConfig.sl_scaling_min = LSHLayoutConfigParams["SLScalingMin"]
    LSHLayoutConfig.sl_scaling_max = LSHLayoutConfigParams["SLScalingMax"]
    LSHLayoutConfig.sl_scaling_type = LSHLayoutConfigParams["SLScalingType"]
    LSHLayoutConfig.mmm_repeats = LSHLayoutConfigParams["MMMRepeats"]
    LSHLayoutConfig.placer = LSHLayoutConfigParams["Placer"]
    LSHLayoutConfig.merger = LSHLayoutConfigParams["Merger"]
    LSHLayoutConfig.merger_factor = LSHLayoutConfigParams["MergerFactor"]
    LSHLayoutConfig.merger_adjustment = LSHLayoutConfigParams["MergerAdjustment"]
    LSHLayoutConfig.node_size = 1.0 / LSHLayoutConfigParams["NodeSizeDenominator"]

    # The fifth value returned by the layout function is not used...
    NodeXCoords, NodeYCoords, EdgeNodeStartList, EdgeNodeToList, _ = tm.layout_from_lsh_forest(LSHForest, config = LSHLayoutConfig)

    PlotCoordsInfo = {}
    PlotCoordsInfo["NodeXCoords"] = NodeXCoords
    PlotCoordsInfo["NodeYCoords"] = NodeYCoords
    PlotCoordsInfo["EdgeNodeStartList"] = EdgeNodeStartList
    PlotCoordsInfo["EdgeNodeToList"] = EdgeNodeToList

    return PlotCoordsInfo

def SetupTMAPPlotData(InfileDF): 364 """Setup plot data for TMAP plot.""" 365 366 MiscUtil.PrintInfo("\nSetting up TMAP plot data...") 367 368 PlotDataInfo = {} 369 PlotDataInfo["Columns"] = [] 370 PlotDataInfo["Colormaps"] = [] 371 PlotDataInfo["CategoricalStatus"] = [] 372 PlotDataInfo["LegendLabels"] = [] 373 PlotDataInfo["SeriesTitles"] = [] 374 375 # Setup categorical data... 376 if OptionsInfo["CategoricalDataColnames"] is not None: 377 for ColnameIndex, Colname in enumerate(OptionsInfo["CategoricalDataColnames"]): 378 CategoryLabels, CategoryData = Faerun.create_categories(InfileDF[Colname]) 379 if len(CategoryLabels) > OptionsInfo["CategoricalDataMaxDisplay"]: 380 CategoryLabels, CategoryData = RemapCategoricalPlotData(CategoryLabels, CategoryData) 381 382 PlotDataInfo["Columns"].append(CategoryData) 383 PlotDataInfo["Colormaps"].append(OptionsInfo["CategoricalDataColormapsList"][ColnameIndex]) 384 PlotDataInfo["CategoricalStatus"].append(True) 385 PlotDataInfo["LegendLabels"].append(CategoryLabels) 386 PlotDataInfo["SeriesTitles"].append(Colname) 387 388 # Setup numerical data... 389 if OptionsInfo["NumericalDataColnames"] is not None: 390 for ColnameIndex, Colname in enumerate(OptionsInfo["NumericalDataColnames"]): 391 PlotDataInfo["Columns"].append(InfileDF[Colname]) 392 PlotDataInfo["Colormaps"].append(OptionsInfo["NumericalDataColormapsList"][ColnameIndex]) 393 PlotDataInfo["CategoricalStatus"].append(False) 394 PlotDataInfo["LegendLabels"].append(None) 395 PlotDataInfo["SeriesTitles"].append(Colname) 396 397 # Setup structure display data... 
398 FirstCol = True 399 SMILESSelectedData = [] 400 SMILESSelectedLabels = [] 401 FirstCol = True 402 for Colname in OptionsInfo["StructureDisplayDataColnames"]: 403 if FirstCol: 404 FirstCol = False 405 SMILESSelectedData = InfileDF[Colname] 406 SMILESSelectedLabels.append(Colname) 407 else: 408 SMILESSelectedData = SMILESSelectedData + '__' + InfileDF[Colname].astype(str) 409 SMILESSelectedLabels.append(Colname) 410 411 PlotDataInfo["SMILESSelectedData"] = SMILESSelectedData 412 PlotDataInfo["SMILESSelectedLabels"] = SMILESSelectedLabels 413 414 return PlotDataInfo 415 416 def RemapCategoricalPlotData(CategoryLabels, CategoryData): 417 """Ramap categorical plot data.""" 418 419 if len(CategoryLabels) <= OptionsInfo["CategoricalDataMaxDisplay"]: 420 return (CategoryLabels, CategoryData) 421 422 # Track categories to remap... 423 CategoryLabelsNew = [] 424 CategoryValuesToRemap = [] 425 LastCategoryValue = 0 426 427 for CategoryLabelIndex, CategoryLabel in enumerate(CategoryLabels): 428 CategoryValue, CategroyName = CategoryLabel 429 if CategoryLabelIndex < OptionsInfo["CategoricalDataMaxDisplay"]: 430 CategoryLabelsNew.append((CategoryValue, CategroyName)) 431 LastCategoryValue = CategoryValue 432 else: 433 CategoryValuesToRemap.append(CategoryValue) 434 435 # Set up other category... 436 OtherCategoryValue = LastCategoryValue + 1 437 OtherCategoryName = "Other" 438 CategoryLabelsNew.append((OtherCategoryValue, OtherCategoryName)) 439 440 # Update category labels and data... 441 CategoryLabels = CategoryLabelsNew 442 for ValueIndex, Value in enumerate(CategoryData): 443 if Value in CategoryValuesToRemap: 444 CategoryData[ValueIndex] = OtherCategoryValue 445 446 return (CategoryLabels, CategoryData) 447 448 def GenerateTMAPPlot(InfileDF, PlotCoordsInfo, PlotDataInfo): 449 """Generate TMAP plot. """ 450 451 MiscUtil.PrintInfo("\nGenerating TMAP plot...") 452 453 # Initialize Faerun plot... 
454 FaerunConfigParams = OptionsInfo["FaerunConfigParams"] 455 ImpressMsg = OptionsInfo["TMAPDisplayMsg"] 456 TMAPFaerunPlot = Faerun(clear_color = FaerunConfigParams["ClearColor"], view = "front", coords = False, title = "", x_title = "", y_title = "", show_legend = FaerunConfigParams["ShowLegend"], legend_title = FaerunConfigParams["LegendTitle"], legend_orientation = FaerunConfigParams["LegendOrientation"], legend_number_format = FaerunConfigParams["LegendNumberFormat"], scale = FaerunConfigParams["Scale"], alpha_blending = FaerunConfigParams["AlphaBlending"], anti_aliasing = FaerunConfigParams["AntiAliasing"], thumbnail_width = FaerunConfigParams["ThumbnailWidth"], thumbnail_fixed = FaerunConfigParams["ThumbnailFixed"], impress = ImpressMsg) 457 458 # Setup scatter plot... 459 ScatterPlotName = "Data" 460 ScatterTreePlotName = "%s_tree" % ScatterPlotName 461 FaerunScatterPlotParams = OptionsInfo["FaerunScatterPlotParams"] 462 TMAPFaerunPlot.add_scatter(ScatterPlotName, {"x": PlotCoordsInfo["NodeXCoords"], "y": PlotCoordsInfo["NodeYCoords"], "c": PlotDataInfo["Columns"], "labels": PlotDataInfo["SMILESSelectedData"]}, colormap = PlotDataInfo["Colormaps"], shader = FaerunScatterPlotParams["Shader"], point_scale = FaerunScatterPlotParams["PointScale"], max_point_size = FaerunScatterPlotParams["MaxPointSize"], fog_intensity = FaerunScatterPlotParams["FogIntensity"], categorical = PlotDataInfo["CategoricalStatus"], interactive = FaerunScatterPlotParams["Interactive"], has_legend = True, legend_labels = PlotDataInfo["LegendLabels"], series_title = PlotDataInfo["SeriesTitles"], selected_labels = PlotDataInfo["SMILESSelectedLabels"]) 463 464 # Add scatter plot to Faerun... 465 TMAPFaerunPlot.add_tree(ScatterTreePlotName, {"from": PlotCoordsInfo["EdgeNodeStartList"], "to": PlotCoordsInfo["EdgeNodeToList"]}, point_helper = ScatterPlotName) 466 467 # Write out TMAP plot HTML and JS files... 468 MiscUtil.PrintInfo("Writing TMAP plot files %s and %s..." 
% (OptionsInfo["Outfile"], OptionsInfo["OutfileJS"])) 469 TMAPFaerunPlot.plot(OptionsInfo["OutfilePrefix"], template = "smiles") 470 471 if OptionsInfo["MergeHTMLandJSFilesMode"]: 472 MergeTMAPResultsHTMLAndJSFiles() 473 474 def MergeTMAPResultsHTMLAndJSFiles(): 475 """Merge TMAP HTML and JS files.""" 476 477 MiscUtil.PrintInfo("\nMerging TMAP plot file %s into %s..." % (OptionsInfo["OutfileJS"], OptionsInfo["Outfile"])) 478 479 TMAPResultsHTMLFile = OptionsInfo["Outfile"] 480 TMAPResultsJSFile = OptionsInfo["OutfileJS"] 481 482 TMAPResultsTMPHTMLFile = "Tmp%s.html" % OptionsInfo["OutfilePrefix"] 483 484 HTMLResultsFH = open(TMAPResultsHTMLFile, "r") 485 JSResultsFH = open(TMAPResultsJSFile, "r") 486 487 TMPHTMLResultsFH = open(TMAPResultsTMPHTMLFile, "w") 488 489 for HTMLLine in HTMLResultsFH: 490 HTMLLine = HTMLLine.rstrip() 491 if re.search("%s" % TMAPResultsJSFile, HTMLLine, re.IGNORECASE): 492 TMPHTMLResultsFH.write(" <script>\n") 493 494 FirstLine = True 495 for JSLine in JSResultsFH: 496 JSLine = JSLine.rstrip() 497 if FirstLine: 498 FirstLine = False 499 TMPHTMLResultsFH.write(" %s\n" % JSLine) 500 else: 501 TMPHTMLResultsFH.write("%s\n" % JSLine) 502 TMPHTMLResultsFH.write("\n </script>\n") 503 504 else: 505 TMPHTMLResultsFH.write("%s\n" % HTMLLine) 506 507 HTMLResultsFH.close() 508 JSResultsFH.close() 509 TMPHTMLResultsFH.close() 510 511 MiscUtil.PrintInfo("Moving %s to %s..." % (TMAPResultsTMPHTMLFile, OptionsInfo["Outfile"])) 512 shutil.move(TMAPResultsTMPHTMLFile, TMAPResultsHTMLFile) 513 514 MiscUtil.PrintInfo("Removing %s file..." % (OptionsInfo["OutfileJS"])) 515 os.remove(TMAPResultsJSFile) 516 517 def InitializeLSHForest(): 518 """Initialize LSH forest. 
""" 519 520 LSHForestParams = OptionsInfo["LSHForestParams"] 521 LSHForest = tm.LSHForest(LSHForestParams["Dim"], LSHForestParams["NumPrefixTrees"], LSHForestParams["Store"]) 522 523 return LSHForest 524 525 def InitializeMinHashFingerprintsEncoder(): 526 """Initialize MinHash fingerprints encoder.""" 527 528 MinHashFPParams = OptionsInfo["MinHashFPParams"] 529 MinHashFingerprintsEncoder = MHFPEncoder(n_permutations = MinHashFPParams["NumPermutations"], seed = MinHashFPParams["Seed"]) 530 531 return MinHashFingerprintsEncoder 532 533 def ReadMoleculeData(): 534 """Read molecule data.""" 535 536 Infile = OptionsInfo["Infile"] 537 InfileDelimiter = OptionsInfo["InfileDelimiter"] 538 539 MiscUtil.PrintInfo("\nProcessing file %s..." % Infile) 540 InfileDF = pd.read_csv(Infile, sep = InfileDelimiter) 541 542 return InfileDF 543 544 def ProcessMolCountBasedAutoOptions(MolCount): 545 """Process auto option values dependent on number of molecules.""" 546 547 # Process "auto" option for LSHForestParams... 548 ParamName = "NumPrefixTrees" 549 ParamValue = "%s" % OptionsInfo["LSHForestParams"][ParamName] 550 if re.match("^auto$", ParamValue, re.I): 551 ParamValue = 128 if MolCount <= 10E03 else 8 552 OptionsInfo["LSHForestParams"][ParamName] = ParamValue 553 554 # Process "auto" option for FaerunScatterPlotParams... 555 ParamName = "PointScale" 556 ParamValue = OptionsInfo["FaerunScatterPlotParams"][ParamName] 557 ParamValue = "%s" % ParamValue 558 if re.match("^auto$", ParamValue, re.I): 559 if MolCount <= 10E03: 560 ParamValue = 4.0 561 elif MolCount <= 10E04: 562 ParamValue = 2.0 563 else: 564 ParamValue = 1.0 565 OptionsInfo["FaerunScatterPlotParams"][ParamName] = ParamValue 566 567 # Process "auto" option for LSHLayoutConfigParams... 
568 for ParamName in ["K", "KC", "SLRepeats", "SLExtraScalingSteps", "MMMRepeats", "NodeSizeDenominator"]: 569 ParamValue = "%s" % OptionsInfo["LSHLayoutConfigParams"][ParamName] 570 571 if not re.match("^auto$", ParamValue, re.I): 572 continue 573 574 if re.match("^K$", ParamName, re.I): 575 ParamValue = 75 if MolCount <= 10E03 else 10 576 elif re.match("^KC$", ParamName, re.I): 577 ParamValue = 20 if MolCount <= 10E03 else 10 578 elif re.match("^SLRepeats$", ParamName, re.I): 579 ParamValue = 2 if MolCount <= 10E03 else 1 580 elif re.match("^SLExtraScalingSteps$", ParamName, re.I): 581 ParamValue = 4 if MolCount <= 10E03 else 2 582 elif re.match("^MMMRepeats$", ParamName, re.I): 583 ParamValue = 2 if MolCount <= 10E03 else 1 584 elif re.match("^NodeSizeDenominator$", ParamName, re.I): 585 ParamValue = 65.0 if MolCount <= 10E03 else 70.0 586 587 OptionsInfo["LSHLayoutConfigParams"][ParamName] = ParamValue 588 589 def SetupTMAPDisplayMessage(MolCount, ValidMolCount): 590 """Setup TMAP display message.""" 591 592 # Setup default TMAP display message using valid molecule count... 
593 if re.match("^auto$", OptionsInfo["TMAPDisplayMsg"], re.I): 594 if MolCount == ValidMolCount: 595 OptionsInfo["TMAPDisplayMsg"] = "TMAP chemspace visualization<br/>Input file: %s<br/>Number of molecules: %s" % (OptionsInfo["Infile"], MolCount) 596 else: 597 OptionsInfo["TMAPDisplayMsg"] = "TMAP chemspace visualization<br/>Input file: %s<br/>Number of molecules: %s<br/>Number of valid molecules: %s" % (OptionsInfo["Infile"], MolCount, ValidMolCount) 598 599 def ProcessFaerunConfigParametersOption(): 600 """Process option for faerun configuration parameters.""" 601 602 ParamsOptionName = "--faerunConfigParams" 603 ParamsOptionValue = Options[ParamsOptionName] 604 ParamsDefaultInfo = {"ClearColor": ["str", "#000000"], "ShowLegend": ["bool", True], "LegendTitle": ["str", "Legend"], "LegendOrientation": ["str", "vertical"], "LegendNumberFormat": ["str", "{:.2f}"], "Scale": ["float", 750.0], "AlphaBlending": ["bool", False], "AntiAliasing": ["bool", True], "ThumbnailWidth": ["int", 250], "ThumbnailFixed": ["bool", False]} 605 606 FaerunConfigParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo) 607 608 ParamName = "LegendOrientation" 609 ParamValue = FaerunConfigParams[ParamName] 610 if not re.match("^(vertical|horizontal)$", ParamValue, re.I): 611 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: vertical or horizontal\n" % (ParamValue, ParamName, ParamsOptionName)) 612 FaerunConfigParams[ParamName] = ParamValue.lower() 613 614 for ParamName in ["Scale", "ThumbnailWidth"]: 615 ParamValue = FaerunConfigParams[ParamName] 616 if ParamValue <= 0: 617 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. 
Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName)) 618 619 OptionsInfo["FaerunConfigParams"] = FaerunConfigParams 620 621 def ProcessFaerunScatterPlotParamsOption(): 622 """Process option for faerun scatter plot parameters.""" 623 624 ParamsOptionName = "--faerunScatterPlotParams" 625 ParamsOptionValue = Options[ParamsOptionName] 626 ParamsDefaultInfo = {"Shader": ["str", "circle"], "PointScale": ["str", "auto"], "MaxPointSize": ["float", 100.0], "FogIntensity": ["float", 0.0], "Interactive": ["bool", True]} 627 628 FaerunScatterPlotParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo) 629 630 ParamName = "PointScale" 631 ParamValue = FaerunScatterPlotParams[ParamName] 632 if not re.match("^auto$", ParamValue, re.I): 633 if not MiscUtil.IsFloat(ParamValue): 634 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option must be a float." % (ParamValue, ParamName, ParamsOptionName)) 635 ParamValue = float(ParamValue) 636 if ParamValue <= 0: 637 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName)) 638 FaerunScatterPlotParams[ParamName] = ParamValue 639 640 ParamName = "MaxPointSize" 641 ParamValue = FaerunScatterPlotParams[ParamName] 642 if ParamValue <= 0: 643 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName)) 644 645 ParamName = "FogIntensity" 646 ParamValue = FaerunScatterPlotParams[ParamName] 647 if ParamValue < 0: 648 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. 
Supported values: >= 0\n" % (ParamValue, ParamName, ParamsOptionName)) 649 650 OptionsInfo["FaerunScatterPlotParams"] = FaerunScatterPlotParams 651 652 def ProcessLSHForestParamsOption(): 653 """Process option for LSH forest parameters.""" 654 655 ParamsOptionName = "--lshForestParams" 656 ParamsOptionValue = Options[ParamsOptionName] 657 ParamsDefaultInfo = {"Dim": ["int", 2048], "NumPrefixTrees": ["str", "auto"], "Store": ["bool", True]} 658 659 LSHForestParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo) 660 661 ParamName = "Dim" 662 ParamValue = LSHForestParams[ParamName] 663 if ParamValue <= 0: 664 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName)) 665 666 ParamName = "NumPrefixTrees" 667 ParamValue = LSHForestParams[ParamName] 668 if not re.match("^auto$", ParamValue, re.I): 669 if not MiscUtil.IsInteger(ParamValue): 670 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option must be an integer." % (ParamValue, ParamName, ParamsOptionName)) 671 ParamValue = int(ParamValue) 672 if ParamValue <= 0: 673 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. 
def ProcessLSHLayoutConfigParamsOption():
    """Process option for LSH layout configuration parameters.

    Reads the name/value pairs specified using "--lshLayoutConfigParams",
    validates the numerical values, converts explicitly specified values of
    the "auto" capable parameters to numbers, maps SLScalingType, Placer, and
    Merger names to TMAP objects, and stores the resulting dictionary in
    OptionsInfo["LSHLayoutConfigParams"]. Invalid values are reported using
    MiscUtil.PrintError.
    """

    ParamsOptionName = "--lshLayoutConfigParams"
    ParamsOptionValue = Options[ParamsOptionName]
    ParamsDefaultInfo = {
        "K": ["str", "auto"],
        "KC": ["str", "auto"],
        "FMEIterations": ["int", 1000],
        "FMERandomize": ["bool", False],
        "FMEThreads": ["int", 4],
        "FMEPrecision": ["int", 4],
        "SLRepeats": ["str", "auto"],
        "SLExtraScalingSteps": ["str", "auto"],
        "SLScalingMin": ["float", 1.0],
        "SLScalingMax": ["float", 1.0],
        "SLScalingType": ["str", "RelativeToDrawing"],
        "MMMRepeats": ["str", "auto"],
        "Placer": ["str", "Barycenter"],
        "Merger": ["str", "LocalBiconnected"],
        "MergerFactor": ["float", 2.0],
        "MergerAdjustment": ["int", 0],
        "NodeSizeDenominator": ["str", "auto"],
    }

    LSHLayoutConfigParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo)

    InvalidValueMsg = "The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: %s\n"

    # MergerAdjustment may be zero; all other numerical parameters must be positive...
    for ParamName in ["FMEIterations", "FMEThreads", "FMEPrecision", "SLScalingMin", "SLScalingMax", "MergerFactor", "MergerAdjustment"]:
        ParamValue = LSHLayoutConfigParams[ParamName]
        if ParamName == "MergerAdjustment":
            if ParamValue < 0:
                MiscUtil.PrintError(InvalidValueMsg % (ParamValue, ParamName, ParamsOptionName, ">= 0"))
        elif ParamValue <= 0:
            MiscUtil.PrintError(InvalidValueMsg % (ParamValue, ParamName, ParamsOptionName, "> 0"))

    # Process "auto" values. A value of "auto" is left as is; any other value
    # must be a positive number...
    for ParamName in ["K", "KC", "SLRepeats", "SLExtraScalingSteps", "MMMRepeats", "NodeSizeDenominator"]:
        ParamValue = LSHLayoutConfigParams[ParamName]
        if re.match("^auto$", ParamValue, re.I):
            continue

        if ParamName == "NodeSizeDenominator":
            # The only "auto" capable parameter corresponding to a float...
            if not MiscUtil.IsFloat(ParamValue):
                MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option must be a float." % (ParamValue, ParamName, ParamsOptionName))
            ParamValue = float(ParamValue)
        else:
            if not MiscUtil.IsInteger(ParamValue):
                MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option must be an integer." % (ParamValue, ParamName, ParamsOptionName))
            ParamValue = int(ParamValue)

        if ParamValue <= 0:
            MiscUtil.PrintError(InvalidValueMsg % (ParamValue, ParamName, ParamsOptionName, "> 0"))
        LSHLayoutConfigParams[ParamName] = ParamValue

    # Map SLScalingType to TMAP object...
    ParamInfo = {"Absolute": tm.ScalingType.Absolute, "RelativeToAvgLength": tm.ScalingType.RelativeToAvgLength, "RelativeToDesiredLength": tm.ScalingType.RelativeToDesiredLength, "RelativeToDrawing": tm.ScalingType.RelativeToDrawing}
    MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, "SLScalingType", ParamInfo)

    # Map Placer to TMAP object...
    ParamInfo = {"Barycenter": tm.Placer.Barycenter, "Solar": tm.Placer.Solar, "Circle": tm.Placer.Circle, "Median": tm.Placer.Median, "Random": tm.Placer.Random, "Zero": tm.Placer.Zero}
    MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, "Placer", ParamInfo)

    # Map Merger to TMAP object...
    ParamInfo = {"EdgeCover": tm.Merger.EdgeCover, "LocalBiconnected": tm.Merger.LocalBiconnected, "Solar": tm.Merger.Solar, "IndependentSet": tm.Merger.IndependentSet}
    MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, "Merger", ParamInfo)

    OptionsInfo["LSHLayoutConfigParams"] = LSHLayoutConfigParams

def MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, ParamName, ParamInfo):
    """Map LSH layout configuration parameter value to TMAP object.

    Arguments:
        LSHLayoutConfigParams (dict): Parameter name and value pairs. The
            value for ParamName is replaced in place by the mapped object.
        ParamsOptionName (str): Command line option name used in the error
            message.
        ParamName (str): Name of the parameter to map.
        ParamInfo (dict): Valid value names and the objects they map to.

    An exact match of the value name is tried first, followed by a match
    ignoring case. An unknown value is reported using MiscUtil.PrintError.
    """

    ParamValue = LSHLayoutConfigParams[ParamName]

    ValueName = None
    if ParamValue in ParamInfo:
        ValueName = ParamValue
    else:
        # Fall back to matching value names without regard to case...
        LowerCaseValue = ("%s" % ParamValue).lower()
        for ValidName in ParamInfo:
            if ValidName.lower() == LowerCaseValue:
                ValueName = ValidName
                break

    if ValueName is None:
        MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: %s\n" % (ParamValue, ParamName, ParamsOptionName, ", ".join(sorted(ParamInfo.keys()))))

    LSHLayoutConfigParams[ParamName] = ParamInfo[ValueName]

def ProcessMinHashFPParamsOption():
    """Process option for MinHash fingerprints parameters.

    Reads the name/value pairs specified using "--minHashFPParams", checks
    that Radius, MinRadius, and NumPermutations are positive, and stores the
    resulting dictionary in OptionsInfo["MinHashFPParams"].
    """

    ParamsOptionName = "--minHashFPParams"
    ParamsOptionValue = Options[ParamsOptionName]
    ParamsDefaultInfo = {
        "Radius": ["int", 3],
        "Rings": ["bool", True],
        "Kekulize": ["bool", True],
        "Sanitize": ["bool", True],
        "MinRadius": ["int", 1],
        "NumPermutations": ["int", 2048],
        "Seed": ["int", 42],
    }

    MinHashFPParams = MiscUtil.ProcessOptionNameValuePairParameters(ParamsOptionName, ParamsOptionValue, ParamsDefaultInfo)

    for ParamName in ["Radius", "MinRadius", "NumPermutations"]:
        ParamValue = MinHashFPParams[ParamName]
        if ParamValue <= 0:
            MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: > 0\n" % (ParamValue, ParamName, ParamsOptionName))

    OptionsInfo["MinHashFPParams"] = MinHashFPParams
def ProcessInfileDelimiterOption():
    """Process option infile delimiter.

    Resolves "--infileDelimiter" to a delimiter character and stores it in
    OptionsInfo["InfileDelimiter"]. For a value of "auto", the delimiter name
    is chosen from the extension of OptionsInfo["Infile"]: comma for csv, tab
    for tsv or txt, and space for smi. An unrecognized extension is reported
    using MiscUtil.PrintError.
    """

    InfileDelim = Options["--infileDelimiter"]
    if re.match("^auto$", InfileDelim, re.I):
        FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Infile"])
        if re.match("^csv$", FileExt, re.I):
            InfileDelim = "comma"
        elif re.match("^(tsv|txt)$", FileExt, re.I):
            InfileDelim = "tab"
        elif re.match("^(smi)$", FileExt, re.I):
            InfileDelim = "space"
        else:
            # Report the offending extension rather than the option value "auto"...
            MiscUtil.PrintError("The input file delimiter couldn't be determined from its extension %s. You must explicitly specify an input file delimiter using option \"--infileDelimiter\".\n" % (FileExt))

    # Delimiter names are looked up without regard to case...
    InfileDelimMap = {"comma": ",", "tab": "\t", "space": " "}
    OptionsInfo["InfileDelimiter"] = InfileDelimMap[InfileDelim.lower()]

def ProcessColumnModeOption():
    """Process column mode option.

    Stores the value of "--colmode" in OptionsInfo["Colmode"] along with the
    boolean flags OptionsInfo["CollabelMode"] and OptionsInfo["ColnumMode"].
    Any value other than collabel or colnum is reported using
    MiscUtil.PrintError.
    """

    Colmode = Options["--colmode"]
    CollabelMode = bool(re.match("^collabel$", Colmode, re.I))
    ColnumMode = bool(re.match("^colnum$", Colmode, re.I))

    if not (CollabelMode or ColnumMode):
        MiscUtil.PrintError("The value, %s, specified for option \"-c, --colmode\" is not valid. Supported values: collabel or colnum\n" % (Colmode))

    OptionsInfo["Colmode"] = Colmode
    OptionsInfo["CollabelMode"] = CollabelMode
    OptionsInfo["ColnumMode"] = ColnumMode

def RetrieveColumnNames():
    """Retrieve column names.

    Reads the first line of OptionsInfo["Infile"] using the delimiter in
    OptionsInfo["InfileDelimiter"] and stores the column names, column count,
    and name <-> number maps (column numbers start from 1) in OptionsInfo. It
    also initializes OptionsInfo["SpecifiedColsInfo"] for tracking columns
    specified using various options. An empty file or an empty first line is
    reported using MiscUtil.PrintError.
    """

    Infile = OptionsInfo["Infile"]

    # Close the file even on a read error and treat an empty file like an
    # empty first line...
    with open(Infile, "r") as InfileFH:
        InfileReader = csv.reader(InfileFH, delimiter = OptionsInfo["InfileDelimiter"], quotechar = '"')
        Colnames = next(InfileReader, [])

    if len(Colnames) == 0:
        MiscUtil.PrintError("The first line in input file, %s, is empty. It must contain column names.\n" % Infile)

    ColnameToColnumMap = {}
    ColnumToColnameMap = {}
    for Colnum, Colname in enumerate(Colnames, 1):
        ColnameToColnumMap[Colname] = Colnum
        ColnumToColnameMap[Colnum] = Colname

    OptionsInfo["Colnames"] = Colnames
    OptionsInfo["ColCount"] = len(Colnames)
    OptionsInfo["ColnameToColnumMap"] = ColnameToColnumMap
    OptionsInfo["ColnumToColnameMap"] = ColnumToColnameMap

    # Initialize for tracking specified column names...
    OptionsInfo["SpecifiedColsInfo"] = {"Colnames": [], "Colnum": {}, "OptionName": {}}
def ProcessSMILESColOption():
    """Process SMILES column option.

    Resolves "--colSMILES" to a column name and number and stores them in
    OptionsInfo["SMILESColname"] and OptionsInfo["SMILESColnum"], along with
    the specified option value in OptionsInfo["SMILESCol"]. For a value of
    "auto", the column named "SMILES" is used; its absence is reported using
    MiscUtil.PrintError.
    """

    SMILESCol = Options["--colSMILES"]
    if re.match("^auto$", SMILESCol, re.I):
        Colname = "SMILES"
        if Colname not in OptionsInfo["ColnameToColnumMap"]:
            MiscUtil.PrintError("The SMILES column name, %s, doesn't exist in input file. You must specify a valid SMILES column name or number using \"--colSMILES\" option.\n" % Colname)

        # Hand over a specification matching the column mode...
        if OptionsInfo["ColnumMode"]:
            SMILESColspec = OptionsInfo["ColnameToColnumMap"][Colname]
        else:
            SMILESColspec = Colname
    else:
        SMILESColspec = SMILESCol

    SMILESColname, SMILESColnum = ProcessColumnSpecification("--colSMILES", SMILESColspec)

    OptionsInfo["SMILESCol"] = SMILESCol
    OptionsInfo["SMILESColname"] = SMILESColname
    OptionsInfo["SMILESColnum"] = SMILESColnum
def ProcessCategoricalDataColsOption():
    """Process categorical data columns option.

    Resolves each entry in the comma delimited value of
    "--categoricalDataCols" using ProcessColumnSpecification and stores the
    column names and numbers in OptionsInfo["CategoricalDataColnames"] and
    OptionsInfo["CategoricalDataColnums"]. Both are None for a value of
    "none".
    """

    CategoricalDataColnames, CategoricalDataColnums = None, None

    CategoricalDataCols = Options["--categoricalDataCols"]
    if not re.match("^none$", CategoricalDataCols, re.I):
        CategoricalDataColnames, CategoricalDataColnums = [], []
        for DataCol in CategoricalDataCols.split(","):
            DataColname, DataColnum = ProcessColumnSpecification("--categoricalDataCols", DataCol.strip())
            CategoricalDataColnames.append(DataColname)
            CategoricalDataColnums.append(DataColnum)

    OptionsInfo["CategoricalDataCols"] = CategoricalDataCols
    OptionsInfo["CategoricalDataColnames"] = CategoricalDataColnames
    OptionsInfo["CategoricalDataColnums"] = CategoricalDataColnums

def ProcessCategoricalDataColormapsOption():
    """Process categorical data color maps option.

    Stores the value of "--categoricalDataColormaps" in
    OptionsInfo["CategoricalDataColormaps"] and a list containing one color
    map name for each categorical data column in
    OptionsInfo["CategoricalDataColormapsList"]. The list is None without any
    categorical data columns and contains 'tab10' for each column for a value
    of "auto". A mismatch between the number of color maps and columns is
    reported using MiscUtil.PrintError.
    """

    CategoricalDataColormaps = Options["--categoricalDataColormaps"]

    if OptionsInfo["CategoricalDataColnames"] is None:
        OptionsInfo["CategoricalDataColormaps"] = CategoricalDataColormaps
        OptionsInfo["CategoricalDataColormapsList"] = None
        return

    CategoricalDataColCount = len(OptionsInfo["CategoricalDataColnames"])

    if re.match("^auto$", CategoricalDataColormaps, re.I):
        CategoricalDataColormapsList = ["tab10"] * CategoricalDataColCount
    else:
        ColormapsWords = CategoricalDataColormaps.split(",")
        if len(ColormapsWords) != CategoricalDataColCount:
            # The counts must match; report it as an error instead of an informational message...
            MiscUtil.PrintError("The number of colormaps, %s, specified using \"--categoricalDataColormaps\" must be equal to the number of columns, %s, specified using \"--categoricalDataCols\" option." % (len(ColormapsWords), CategoricalDataColCount))
        CategoricalDataColormapsList = [Colormap.strip() for Colormap in ColormapsWords]

    OptionsInfo["CategoricalDataColormaps"] = CategoricalDataColormaps
    OptionsInfo["CategoricalDataColormapsList"] = CategoricalDataColormapsList

def ProcessNumericalDataColsOption():
    """Process numerical data columns option.

    Resolves each entry in the comma delimited value of "--numericalDataCols"
    using ProcessColumnSpecification and stores the column names and numbers
    in OptionsInfo["NumericalDataColnames"] and
    OptionsInfo["NumericalDataColnums"]. Both are None for a value of "none".
    """

    NumericalDataColnames, NumericalDataColnums = None, None

    NumericalDataCols = Options["--numericalDataCols"]
    if not re.match("^none$", NumericalDataCols, re.I):
        NumericalDataColnames, NumericalDataColnums = [], []
        for DataCol in NumericalDataCols.split(","):
            DataColname, DataColnum = ProcessColumnSpecification("--numericalDataCols", DataCol.strip())
            NumericalDataColnames.append(DataColname)
            NumericalDataColnums.append(DataColnum)

    OptionsInfo["NumericalDataCols"] = NumericalDataCols
    OptionsInfo["NumericalDataColnames"] = NumericalDataColnames
    OptionsInfo["NumericalDataColnums"] = NumericalDataColnums
def ProcessNumericalDataColormapsOption():
    """Process numerical data color maps option.

    Stores the value of "--numericalDataColormaps" in
    OptionsInfo["NumericalDataColormaps"] and a list containing one color map
    name for each numerical data column in
    OptionsInfo["NumericalDataColormapsList"]. The list is None without any
    numerical data columns and contains 'viridis' for each column for a value
    of "auto". A mismatch between the number of color maps and columns is
    reported using MiscUtil.PrintError.
    """

    NumericalDataColormaps = Options["--numericalDataColormaps"]

    if OptionsInfo["NumericalDataColnames"] is None:
        OptionsInfo["NumericalDataColormaps"] = NumericalDataColormaps
        OptionsInfo["NumericalDataColormapsList"] = None
        return

    NumericalDataColCount = len(OptionsInfo["NumericalDataColnames"])

    if re.match("^auto$", NumericalDataColormaps, re.I):
        NumericalDataColormapsList = ["viridis"] * NumericalDataColCount
    else:
        ColormapsWords = NumericalDataColormaps.split(",")
        if len(ColormapsWords) != NumericalDataColCount:
            # Report it as an error and name the numerical data options...
            MiscUtil.PrintError("The number of colormaps, %s, specified using \"--numericalDataColormaps\" must be equal to the number of columns, %s, specified using \"--numericalDataCols\" option." % (len(ColormapsWords), NumericalDataColCount))
        NumericalDataColormapsList = [Colormap.strip() for Colormap in ColormapsWords]

    OptionsInfo["NumericalDataColormaps"] = NumericalDataColormaps
    OptionsInfo["NumericalDataColormapsList"] = NumericalDataColormapsList

def ProcessStructureDisplayDataColsOption():
    """Process structure display data columns option.

    Collects the columns to display along with structures. The SMILES column
    is always the first entry. For a value of "auto", the column named 'Name'
    is added when it exists in the input file; otherwise, each entry in the
    comma delimited value of "--structureDisplayDataCols" is resolved as a
    column number or name depending on the column mode. Unknown and duplicate
    columns are reported using MiscUtil.PrintError.

    The results are stored in OptionsInfo["StructureDisplayDataCols"],
    OptionsInfo["StructureDisplayDataColnames"], and
    OptionsInfo["StructureDisplayDataColnums"].
    """

    OptionName = "--structureDisplayDataCols"
    StructureDisplayDataCols = Options[OptionName]

    # Start with SMILES column...
    StructureDisplayDataColnames = [OptionsInfo["SMILESColname"]]
    StructureDisplayDataColnums = [OptionsInfo["SMILESColnum"]]

    if re.match("^auto$", StructureDisplayDataCols, re.I):
        # Automatically add 'Name' column...
        Colname = "Name"
        if Colname in OptionsInfo["ColnameToColnumMap"]:
            StructureDisplayDataColnames.append(Colname)
            StructureDisplayDataColnums.append(OptionsInfo["ColnameToColnumMap"][Colname])
    else:
        ColnumMode = OptionsInfo["ColnumMode"]
        for DataCol in StructureDisplayDataCols.split(","):
            DataCol = DataCol.strip()

            # Resolve column name and number...
            if ColnumMode:
                Colnum = int(DataCol)
                if Colnum not in OptionsInfo["ColnumToColnameMap"]:
                    MiscUtil.PrintError("The column number, %s, specified using \"%s\" option doesn't exist in input file. You must specify a valid column number. Valid values: >= 1 and <= %s\n" % (Colnum, OptionName, OptionsInfo["ColCount"]))
                Colname = OptionsInfo["ColnumToColnameMap"][Colnum]
            else:
                Colname = DataCol
                if Colname not in OptionsInfo["ColnameToColnumMap"]:
                    MiscUtil.PrintError("The column name, %s, specified using \"%s\" option doesn't exist in input file. You must specify a valid column name. Valid values: %s\n" % (Colname, OptionName, " ".join(OptionsInfo["Colnames"])))
                Colnum = OptionsInfo["ColnameToColnumMap"][Colname]

            # Check for duplicates including SMILES column...
            if Colname in StructureDisplayDataColnames:
                UsedColnames = " ".join(StructureDisplayDataColnames)
                UsedColnums = " ".join(["%s" % Num for Num in StructureDisplayDataColnums])
                if ColnumMode:
                    MiscUtil.PrintError("The column number, %s, specified using \"%s\" option is a duplicate column number. It has already been used for this option. You must specify a different column number. Used column names: %s; Used column nums: %s\n" % (Colnum, OptionName, UsedColnames, UsedColnums))
                else:
                    MiscUtil.PrintError("The column name, %s, specified using \"%s\" option is a duplicate column name. It has already been used for this option. You must specify a different column name. Used column names: %s; Used column nums: %s\n" % (Colname, OptionName, UsedColnames, UsedColnums))

            StructureDisplayDataColnames.append(Colname)
            StructureDisplayDataColnums.append(Colnum)

    OptionsInfo["StructureDisplayDataCols"] = StructureDisplayDataCols
    OptionsInfo["StructureDisplayDataColnames"] = StructureDisplayDataColnames
    OptionsInfo["StructureDisplayDataColnums"] = StructureDisplayDataColnums
def ProcessColumnSpecification(OptionName, Colspec):
    """Process column specification corresponding to a column name or number.

    Arguments:
        OptionName (str): Command line option name used for specifying the
            column. It is used in error messages and for tracking.
        Colspec (str or int): Column number during column number mode;
            otherwise, column name.

    Returns:
        tuple: Column name and column number.

    A column missing in the input file, or already specified using an option
    processed by this function, is reported using MiscUtil.PrintError. A new
    column is recorded in OptionsInfo["SpecifiedColsInfo"].
    """

    ColnumMode = OptionsInfo["ColnumMode"]

    if not ColnumMode:
        Colname = Colspec
        if Colname not in OptionsInfo["ColnameToColnumMap"]:
            MiscUtil.PrintError("The column name, %s, specified using \"%s\" option doesn't exist in input file. You must specify a valid column name. Valid values: %s\n" % (Colname, OptionName, " ".join(OptionsInfo["Colnames"])))
        Colnum = OptionsInfo["ColnameToColnumMap"][Colname]
    else:
        Colnum = int(Colspec)
        if Colnum not in OptionsInfo["ColnumToColnameMap"]:
            MiscUtil.PrintError("The column number, %s, specified using \"%s\" option doesn't exist in input file. You must specify a valid column number. Valid values: >= 1 and <= %s\n" % (Colnum, OptionName, OptionsInfo["ColCount"]))
        Colname = OptionsInfo["ColnumToColnameMap"][Colnum]

    # Track and check for duplicate column specification...
    Tracker = OptionsInfo["SpecifiedColsInfo"]
    if Colname not in Tracker["Colnames"]:
        Tracker["Colnames"].append(Colname)
        Tracker["Colnum"][Colname] = Colnum
        Tracker["OptionName"][Colname] = OptionName
    elif ColnumMode:
        MiscUtil.PrintError("The column number, %s, specified using \"%s\" option is a duplicate column number. It has already been used for \"%s\" option. You must specify a different column number.\n" % (Colnum, OptionName, Tracker["OptionName"][Colname]))
    else:
        MiscUtil.PrintError("The column name, %s, specified using \"%s\" option is a duplicate column name. It has already been used for \"%s\" option. You must specify a different column name.\n" % (Colname, OptionName, Tracker["OptionName"][Colname]))

    return (Colname, Colnum)
def ProcessOptions():
    """Process and validate command line arguments and options.

    Validates the command line options and populates OptionsInfo with the
    processed values. The order of processing matters: the input file
    delimiter is needed to retrieve column names, and the column names and
    column mode are needed to process column options.
    """

    def IsYes(Name):
        """Check whether value of an option is yes, ignoring case."""
        return bool(re.match("^yes$", Options[Name], re.I))

    MiscUtil.PrintInfo("Processing options...")

    # Validate options...
    ValidateOptions()

    OptionsInfo["Infile"] = Options["--infile"]

    # Setup output file names from root of the output file name...
    Outfile = Options["--outfile"]
    _, OutfileRoot, OutfileExt = MiscUtil.ParseFileName(Outfile)
    OptionsInfo["OutfilePrefix"] = OutfileRoot
    OptionsInfo["OutfileExt"] = OutfileExt

    OptionsInfo["Outfile"] = Outfile
    OptionsInfo["OutfileJS"] = "%s.js" % OutfileRoot
    # NOTE(review): The usage text for '--lshForestFileWrite' mentions
    # <OutfileRoot>_LSHForest.dat as the default file name - confirm which
    # name is intended.
    OptionsInfo["OutfileLSHForest"] = "%s.dat" % OutfileRoot

    ProcessInfileDelimiterOption()
    RetrieveColumnNames()

    ProcessColumnModeOption()
    ProcessSMILESColOption()

    OptionsInfo["CategoricalDataMaxDisplay"] = int(Options["--categoricalDataMaxDisplay"])
    ProcessCategoricalDataColsOption()
    ProcessCategoricalDataColormapsOption()

    ProcessNumericalDataColsOption()
    ProcessNumericalDataColormapsOption()

    ProcessStructureDisplayDataColsOption()

    ProcessFaerunConfigParametersOption()
    ProcessFaerunScatterPlotParamsOption()

    OptionsInfo["LSHForestFileWriteMode"] = IsYes("--lshForestFileWrite")
    OptionsInfo["LSHForestFileRestoreMode"] = IsYes("--lshForestFileRestore")
    if OptionsInfo["LSHForestFileRestoreMode"]:
        # Nothing to restore without an existing LSH forest file...
        LSHForestFile = OptionsInfo["OutfileLSHForest"]
        if not os.path.isfile(LSHForestFile):
            MiscUtil.PrintError("The LSH forest file, %s, must be present for, %s, value of \"--lshForestFileRestore\" option." % (LSHForestFile, Options["--lshForestFileRestore"]))

    ProcessLSHForestParamsOption()
    ProcessLSHLayoutConfigParamsOption()

    OptionsInfo["MergeHTMLandJSFilesMode"] = IsYes("--mergeHTMLandJSFiles")

    ProcessMinHashFPParamsOption()

    OptionsInfo["MPMode"] = IsYes("--mp")
    OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])

    OptionsInfo["Overwrite"] = Options["--overwrite"]
    OptionsInfo["QuietMode"] = IsYes("--quiet")

    OptionsInfo["TMAPDisplayMsg"] = Options["--tmapDisplayMsg"]
def RetrieveOptions():
    """Retrieve command line arguments and options.

    Parses the command line using docopt and the usage text in
    _docoptUsage_, stores the result in the module level Options, changes
    the current working directory to the directory specified using
    "--workingdir", and, for the examples option, prints the examples text
    and exits with status 0.
    """

    global Options

    # Get options...
    Options = docopt(_docoptUsage_)

    # Set current working directory to the specified directory...
    WorkingDir = Options["--workingdir"]
    if WorkingDir:
        os.chdir(WorkingDir)

    # Handle examples option...
    ExamplesRequested = "--examples" in Options and Options["--examples"]
    if not ExamplesRequested:
        return

    MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
    sys.exit(0)
def ValidateOptions():
    """Validate option values.

    Checks input and output file options, column mode, column specifications
    during column number mode, color map counts, and all yes/no options using
    the validation functions available in MiscUtil. At least one categorical
    or numerical data column must be specified. Problems detected here are
    reported using MiscUtil.PrintError.
    """

    MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
    MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "smi csv tsv txt")

    MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "html")
    MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
    MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])

    MiscUtil.ValidateOptionTextValue("-c, --colmode", Options["--colmode"], "collabel colnum")

    # A TMAP can't be colored without any data column...
    if re.match("^none$", Options["--categoricalDataCols"], re.I) and re.match("^none$", Options["--numericalDataCols"], re.I):
        MiscUtil.PrintError("You must specify at least one categorical or numerical data column using option \"--categoricalDataCols\" or \"--numericalDataCols\". It is used to color TMAP.")

    # Column specifications must be positive integers during column number mode...
    ColnumMode = True if re.match("^colnum$", Options["--colmode"], re.I) else False
    if ColnumMode and not re.match("^auto$", Options["--colSMILES"], re.I):
        MiscUtil.ValidateOptionIntegerValue("--colSMILES", Options["--colSMILES"], {">": 0})

    if ColnumMode and not re.match("^none$", Options["--categoricalDataCols"], re.I):
        MiscUtil.ValidateOptionNumberValues("--categoricalDataCols", Options["--categoricalDataCols"], 0, ",", "integer", {">": 0})

    MiscUtil.ValidateOptionIntegerValue("--categoricalDataMaxDisplay", Options["--categoricalDataMaxDisplay"], {">": 0})

    if not re.match("^auto$", Options["--categoricalDataColormaps"], re.I):
        ColormapCount = len(Options["--categoricalDataColormaps"].split(","))
        ColCount = len(Options["--categoricalDataCols"].split(","))
        if ColormapCount != ColCount:
            MiscUtil.PrintError("The number of colormaps, %s, specified using option \"--categoricalDataColormaps\" must be equal to number of columns, %s, specified using option \"--categoricalDataCols\". " % (ColormapCount, ColCount))

    if ColnumMode and not re.match("^none$", Options["--numericalDataCols"], re.I):
        MiscUtil.ValidateOptionNumberValues("--numericalDataCols", Options["--numericalDataCols"], 0, ",", "integer", {">": 0})

    if not re.match("^auto$", Options["--numericalDataColormaps"], re.I):
        ColormapCount = len(Options["--numericalDataColormaps"].split(","))
        ColCount = len(Options["--numericalDataCols"].split(","))
        if ColormapCount != ColCount:
            MiscUtil.PrintError("The number of colormaps, %s, specified using option \"--numericalDataColormaps\" must be equal to number of columns, %s, specified using option \"--numericalDataCols\". " % (ColormapCount, ColCount))

    if not re.match("^auto$", Options["--structureDisplayDataCols"], re.I):
        if ColnumMode and not re.match("^none$", Options["--structureDisplayDataCols"], re.I):
            MiscUtil.ValidateOptionNumberValues("--structureDisplayDataCols", Options["--structureDisplayDataCols"], 0, ",", "integer", {">": 0})

    if not re.match("^auto$", Options["--infileDelimiter"], re.I):
        MiscUtil.ValidateOptionTextValue("--infileDelimiter", Options["--infileDelimiter"], "comma tab space")

    MiscUtil.ValidateOptionTextValue("--lshForestFileWrite", Options["--lshForestFileWrite"], "yes no")
    MiscUtil.ValidateOptionTextValue("--lshForestFileRestore", Options["--lshForestFileRestore"], "yes no")
    MiscUtil.ValidateOptionTextValue("--mergeHTMLandJSFiles", Options["--mergeHTMLandJSFiles"], "yes no")

    MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")

    # The value of quiet option is processed as yes or no like other
    # options; validate it as well...
    MiscUtil.ValidateOptionTextValue("--quiet", Options["--quiet"], "yes no")
" % (ColormapCount, ColCount)) 1119 1120 if not re.match("^auto$", Options["--structureDisplayDataCols"], re.I): 1121 if ColnumMode and not re.match("^none$", Options["--structureDisplayDataCols"], re.I): 1122 MiscUtil.ValidateOptionNumberValues("--structureDisplayDataCols", Options["--structureDisplayDataCols"], 0, ",", "integer", {">": 0}) 1123 1124 if not re.match("^auto$", Options["--infileDelimiter"], re.I): 1125 MiscUtil.ValidateOptionTextValue(" --infileDelimiter", Options["--infileDelimiter"], "comma tab space") 1126 1127 MiscUtil.ValidateOptionTextValue("--lshForestFileWrite", Options["--lshForestFileWrite"], "yes no") 1128 MiscUtil.ValidateOptionTextValue("--lshForestFileRestore", Options["--lshForestFileRestore"], "yes no") 1129 MiscUtil.ValidateOptionTextValue("--mergeHTMLandJSFiles", Options["--mergeHTMLandJSFiles"], "yes no") 1130 1131 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 1132 1133 1134 # Setup a usage string for docopt... 1135 _docoptUsage_ = """ 1136 VisualizeChemspaceUsingTMAP.py - Visualize chemspace 1137 1138 Usage: 1139 VisualizeChemspaceUsingTMAP.py [--categoricalDataCols <collabel1,... or colnum1,...>] [--categoricalDataColormaps <Colormap1, Colormap2,...>] 1140 [--categoricalDataMaxDisplay <number>] [--colmode <collabel or colnum>] [--colSMILES <text or number>] 1141 [--faerunConfigParams <Name,Value,...>] [--faerunScatterPlotParams <Name,Value,...>] 1142 [--infileDelimiter <comma, tab, or space>] [--lshForestFileWrite <yes or no>] [--lshForestFileRestore <yes or no>] 1143 [--lshForestParams <Name,Value,...>] [--lshLayoutConfigParams <Name,Value,...>] [--mergeHTMLandJSFiles <yes or no>] 1144 [--minHashFPParams <Name,Value,...>] [--mp <yes or no>] [--mpParams <Name,Value,...>] 1145 [--numericalDataCols <collabel1,... or colnum1,...>] [--numericalDataColormaps <Colormap1, Colormap2,...>] 1146 [--overwrite] [--quiet <yes or no>] [--structureDisplayDataCols <collabel1,... 
or colnum1,...> ] 1147 [--tmapDisplayMsg <text>] [-w <dir>] -i <infile> -o <outfile> 1148 VisualizeChemspaceUsingTMAP.py -h | --help | -e | --examples 1149 1150 Description: 1151 Generate an interactive TreeMAP (TMAP) [Ref 171, 172] visualization for molecules 1152 in a text input file. The text input file must have a column containing SMILES strings. 1153 In addition, it must contain at least one column corresponding to categorical or 1154 numerical data for coloring TMAP nodes. You may optionally map multiple categorical 1155 and numerical data columns on to a TMAP visualization. A HTML file is generated for 1156 interactive visualization of chemspace in a browser. 1157 1158 The TMAP methodology is able to generate a reasonably interactive visualization 1159 for relatively large data sets. A brief description of the methodology is as follows. 1160 A set of MinHash Fingerprints (MHFPs) are calculated for molecules in input file 1161 followed by the generation of a Locality Sensitivity Hashing (LSH) forest employing 1162 MHFPs. A c-approximate k-Nearest Neighbor Graph (c-k-NNG) is constructed from 1163 LSH, which is used to construct a Minimum Spanning Tree (MST) or Forest (MSF). 1164 The final TMAP visualization is generated by laying out MST and MSF on a plane 1165 using an algorithm provided by the Open Graph Drawing Framework (OGDF). The 1166 OGDF provides flexibility to adjust graph layout methodology in terms of not only 1167 aesthetics but also computational time. 1168 1169 The supported input file formats are: CSV (.csv) TSV (.txt or .tsv), 1170 SMILES (.smi) 1171 1172 The supported output file format is: HTML (.html). 1173 1174 Options: 1175 --categoricalDataCols <collabel1,... or colnum1,...> [default: none] 1176 A comma delimited list of column labels or numbers corresponding to 1177 categorical data to map on a TMAP visualization. 
1178 --categoricalDataColormaps <Colormap1, Colormap2,...> [default: auto] 1179 A comma delimited list of color map names corresponding to categorical 1180 data. The default is to use 'tab10' color map name for mapping categorical 1181 data on a TMAP. The number of specified color maps must match the number 1182 of categorical data columns. You must specify valid color map names 1183 supported by Matplotlib. No validation is performed. Example color map 1184 names for categorical data: Pastel1, Pastel2, Paired, Accent, Dark2, Set1, 1185 Set2, Set3, tab10, tab20, tab20b, tab20c. 1186 --categoricalDataMaxDisplay <number> [default: 6] 1187 Maximum number of categories in a category column to display on a TMAP 1188 visualization. The rest of the categories are aggregated under a new 1189 category named 'Other' before mapping on to a TMAP visualization. 1190 -c, --colmode <collabel or colnum> [default: collabel] 1191 Use column number or name for the specification of columns in input 1192 text file containing SMILES strings and molecule names along with any 1193 categorical or numerical data. 1194 --colSMILES <text or number> [default: auto] 1195 Column name or number corresponding to SMILES strings. The default value 1196 is automatically set based on the value of '-c, --colmode': 'SMILES' for 1197 'collabel'; SMILES string column number for 'colnum'. SMILES strings must 1198 be present in input file. 1199 -e, --examples 1200 Print examples. 1201 --faerunConfigParams <Name,Value,...> [default: auto] 1202 A comma delimited list of parameter name and value pairs for configuring 1203 faerun (Ref 172) to generate a TMAP visualization. 
1204 1205 The supported parameter names along with their default and possible 1206 values are shown below: 1207 1208 clearColor, #000000 1209 showLegend, yes [ Possible values: yes or no ] 1210 legendTitle, Legend 1211 legendOrientation, vertical [ Possible values: vertical or 1212 horizontal ] 1213 legendNumberFormat, {:.2f} 1214 scale, 750.0 1215 alphaBlending, no [ Possible values: yes or no ] 1216 antiAliasing, yes [Possible values: yes or no] 1217 thumbnailWidth, 250 1218 thumbnailFixed, no [ Possible values: yes or no ] 1219 1220 A brief description of parameters, as available in the code for faerun, is 1221 provided below: 1222 1223 clearColor: Background color 1224 showLegend: Show legend at lower right 1225 legendTitle: Legend title 1226 legendOrientation: Legend Orientation 1227 legendNumberFormat: Number string format applied to numbers 1228 displayed in legend 1229 scale: Scaling factor for scaling normalized coordinates 1230 AlphaBlending: Activate alpha blending. It is required for smoothCircle 1231 shader. 1232 antiAliasing: Activate anti-aliasing. It might adversly impact 1233 rendering performance. 1234 thumbnailWidth: Width of thumbnail images for structures 1235 thumbnailFixed: Show thumbnail images at a fixed location at the 1236 top instead of next to the mouse 1237 1238 --faerunScatterPlotParams <Name,Value,...> [default: auto] 1239 A comma delimited list of parameter name and value pairs for generating 1240 scatter plot representing a TMAP using faerun (Ref 172). 
1241 1242 The supported parameter names along with their default and possible 1243 values are shown below: 1244 1245 shader, circle [ Possible values: circle, smoothCircle, 1246 sphere, or any valid value] 1247 pointScale, auto [ 4 if MolCout<=10K; 2 if MolCount<=100K; else 1 ] 1248 maxPointSize, 100.0 1249 fogIntensity, 0.0 1250 interactive, yes [ Possible values: yes or no ] 1251 1252 A brief description of parameters is provided below: 1253 1254 shader: Shader to use for visualizating data points 1255 pointScale: Relative size of data points 1256 maxPointSize: Maximum size of the data points during zooming 1257 fogIntensity: Intensity of distance fog 1258 interactive: Generate interactive scatter plot 1259 1260 -h, --help 1261 Print this help message. 1262 -i, --infile <infile> 1263 Input file name. The SMILES strings must be present in the input file. 1264 Supported formats: CSV (.csv) TSV (.txt or .tsv), or SMILES (.smi) 1265 --infileDelimiter <comma, tab, or space> [default: auto] 1266 Input file delimiter for processing data. The default value is automatically 1267 set based on the type of input file: comma - CSV (.csv); tab - TSV (.txt or 1268 .tsv); space - SMILES (.smi) 1269 --lshForestFileWrite <yes or no> [default: yes] 1270 Write LSH forest data a file for subsequent generation of a TMAP visualization. 1271 Default file name: <OutfileRoot>_LSHForest.dat. The LSH forest data is 1272 generated using MinHash fingerprints. You may restore LSH forest data 1273 using '--lshForestFileRestore' option to skip the generation of fingerprints. 1274 --lshForestFileRestore <yes or no> [default: no] 1275 Check and restore LSH forest data from a file for generating a TMAP 1276 visualization and skip the generation of MinHash fingerprints. 
Default file 1277 name: <OutfileRoot>_LSHForest.dat 1278 --lshForestParams <Name,Value,...> [default: auto] 1279 A comma delimited list of parameter name and value pairs for generating 1280 LSH (Locality Sensitivity Hashing) forest from MinHash fingerprints. 1281 1282 The supported parameter names along with their default and possible 1283 values are shown below: 1284 1285 dim, 2048 1286 numPrefixTrees, auto [ 128 if MolCount <= 10K else 8 ] 1287 store, yes [ Possible values: yes or no ] 1288 1289 A brief description of parameters, as available in the code for LSH, is 1290 provided below: 1291 1292 dim: Dimensionality of MinHashes to be added to LSHForest 1293 numPrefixTrees: Number of prefix trees to use 1294 store: store the data for enhanced retrieval 1295 1296 --lshLayoutConfigParams <Name,Value,...> [default: auto] 1297 A comma delimited list of parameter name and value pairs for configuring 1298 LSH (Locality Sensitivity Hashing) layout. 1299 1300 The supported parameter names along with their default and possible 1301 values are shown below: 1302 1303 k, auto [ 75 if MolCount <= 10K else 10] 1304 kc, auto [ 20 if MolCount <= 10K else 10] 1305 fmeIterations, 1000 1306 fmeRandomize, no [ Possible values: yes or no ] 1307 fmeThreads, 4 1308 fmePrecision, 4 1309 slRrepeats, auto [ 2 if MolCount <= 10K else 1] 1310 slExtraScalingSteps, auto [ 4 if MolCount <= 10K else 2 ] 1311 slScalingMin, 1.0 1312 slScalingMax, 1.0 1313 slScalingType, RelativeToDrawing [ Possible values: Absolute, 1314 RelativeToAvgLength, RelativeToDesiredLength, or 1315 RelativeToDrawing ] 1316 mmmRepeats, auto [ 2 MolCount <= 10K else 1 ] 1317 placer, Barycenter [ Possible valeues: Barycenter, Solar, Circle, 1318 Median, Random, or Zero ] 1319 merger, LocalBiconnected [ Possible values: EdgeCover, 1320 LocalBiconnected, Solar, or IndependentSet ] 1321 mergerFactor, 2.0 1322 mergerAdjustment, 0 1323 nodeSizeDenominator, auto [ 65 if MolCout <= 10K else 70.0] 1324 1325 A brief description of 
parameters, as available in the code for LSH, is 1326 provided below: 1327 1328 k: Number of nearest neighbors used to create k-nearest neighbor 1329 graph 1330 kc: Scalar by which k is multiplied before querying LSH forest. 1331 The results are then sorted in decreasing order based on linear 1332 scan distances. 1333 fmeIterations: Maximum number of iterations of Fast Multipole 1334 Embedder (FME) 1335 fmeRandomize: Randomize FME layout at the start 1336 fmeThreads: Number of threads for FME 1337 fmePrecision: Number of coefficients of multipole expansion 1338 slRepeats: Number of repeats of scaling layout algorithm 1339 slExtraScalingSteps: Number of repeats of scaling 1340 slScalingMin: Minimum scaling factor 1341 slScalingMax: Maximum scaling factor. 1342 slScalingType: Scaling type corresponding to relative scale of graph 1343 mmmRepeats, Number of repeats of layout at each level 1344 placer: Methodology for defining initial positions of vertices in a 1345 graph at each level 1346 merger: Vertex merging methodology used during coarsening phase 1347 of multilevel algorithm 1348 mergerFactor: Ratio of sizes between two levels up to which merging 1349 is performed. It doesn't apply to all merging methodologies. 1350 mergerAdjustment: Edge length adjustment for merging methodology. 1351 It doesn't apply to all merging methodologies. 1352 nodeSizeDenominator: Node size denominator affecting the magnitude 1353 of repelling force between nodes. Node size corresponds to 1354 1.0 / nodeSizeDenominator. You may want to increase the value 1355 nodeSizeDenominator to decrease node size and resolve overlaps 1356 in a crowded tree. 1357 1358 --mergeHTMLandJSFiles <yes or no> [default: yes] 1359 Merge TMAP JS data file into HTML file and delete JS data file. Default 1360 file names: <OutfileRoot>.html, <OutfileRoot>.js. 
1361 --minHashFPParams <Name,Value,...> [default: auto] 1362 A comma delimited list of parameter name and value pairs for generating 1363 Min Hash Fingerprints (MHFP). 1364 1365 The supported parameter names along with their default and possible 1366 values are shown below: 1367 1368 radius, 3 1369 rings, yes [ Possible values: yes or no ] 1370 kekulize, yes [ Possible values: yes or no ] 1371 sanitize, yes [ Possible values: yes or no ] 1372 minRadius, 1 1373 numPermutations, 2048 1374 seed, 42 1375 1376 A brief description of parameters, as available in the code for MHFP, is 1377 provided below: 1378 1379 radius: MHFP radius (A radius of 3 corresponds to MHFP6) 1380 rings: Include rings in shingling 1381 kekulize: Kekulize SMILES 1382 sanitize: Sanitize SMILES 1383 minRadius: Minimum radius that is used to extract n-grams 1384 numPermutations: Number of permutations used for hashing 1385 seed: Random number seed for numpy.random 1386 1387 --mp <yes or no> [default: no] 1388 Use multiprocessing for the generation of fingerprints. 1389 1390 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 1391 function employing lazy RDKit data iterable. This allows processing of 1392 arbitrarily large data sets without any additional memory requirements. 1393 1394 All input data may be optionally loaded into memory by mp.Pool.map() 1395 before starting worker processes in a process pool by setting the value 1396 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 1397 1398 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 1399 data mode may adversely impact the performance. The '--mpParams' section 1400 provides additional information to tune the value of 'chunkSize'. 1401 --mpParams <Name,Value,...> [default: auto] 1402 A comma delimited list of parameter name and value pairs to configure 1403 multiprocessing during the generation of fingerprints.
1404 1405 The supported parameter names along with their default and possible 1406 values are shown below: 1407 1408 chunkSize, auto 1409 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 1410 numProcesses, auto [ Default: mp.cpu_count() ] 1411 1412 These parameters are used by the following functions to configure and 1413 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 1414 mp.Pool.imap(). 1415 1416 The chunkSize determines chunks of input data passed to each worker 1417 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 1418 The default value of chunkSize is dependent on the value of 'inputDataMode'. 1419 1420 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 1421 automatically converts RDKit data iterable into a list, loads all data into 1422 memory, and calculates the default chunkSize using the following method 1423 as shown in its code: 1424 1425 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 1426 if extra: chunkSize += 1 1427 1428 For example, the default chunkSize will be 7 for a pool of 4 worker processes 1429 and 100 data items. 1430 1431 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 1432 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 1433 data into memory. Consequently, the size of input data is not known a priori. 1434 It's not possible to estimate an optimal value for the chunkSize. The default 1435 chunkSize is set to 1. 1436 1437 The default value for the chunkSize during 'Lazy' data mode may adversely 1438 impact the performance due to the overhead associated with exchanging 1439 small chunks of data. It is generally a good idea to explicitly set chunkSize to 1440 a larger value during 'Lazy' input data mode, based on the size of your input 1441 data and number of processes in the process pool. 
1442 1443 The mp.Pool.map() function waits for all worker processes to process all 1444 the data and return the results. The mp.Pool.imap() function, however, 1445 returns the results obtained from worker processes as soon as the 1446 results become available for specified chunks of data. 1447 1448 The order of data in the results returned by both mp.Pool.map() and 1449 mp.Pool.imap() functions always corresponds to the input data. 1450 --numericalDataCols <collabel1,... or colnum1,...> [default: none] 1451 A comma delimited list of column labels or numbers corresponding to 1452 numerical data to map on a TMAP visualization. 1453 --numericalDataColormaps <Colormap1, Colormap2,...> [default: auto] 1454 A comma delimited list of color map names corresponding to numerical 1455 data. The default is to use 'viridis' color map name for mapping numerical 1456 data on a TMAP. The number of specified color maps must match the number 1457 of numerical data columns. You must specify valid color map names 1458 supported by Matplotlib. No validation is performed. Example color map 1459 names for numerical data: viridis, plasma, inferno, magma, cividis. 1460 -o, --outfile <outfile> 1461 Output HTML file name for writing out a TMAP visualization. 1462 --overwrite 1463 Overwrite existing files. 1464 -q, --quiet <yes or no> [default: no] 1465 Use quiet mode. The warning and information messages will not be printed. 1466 --structureDisplayDataCols <collabel1,... or colnum1,...> [default: auto] 1467 A comma delimited list of column labels or numbers corresponding to data 1468 to display under a thumbnail image of a structure in a TMAP visualization. 1469 The default column is set to 'Name' and it is automatically shown. In addition, 1470 the SMILES string column is always used to display SMILES under the structures. 1471 -t, --tmapDisplayMsg <text> [default: auto] 1472 A brief message to display at the top left in HTML page containing a TMAP 1473 visualization.
You must specify a valid HTML string. No validation is 1474 performed. Default message: TMAP chemspace visualization<br/> 1475 Input file: <InfileName><br/>Number of molecules: <Count> 1476 -w, --workingdir <dir> 1477 Location of working directory which defaults to the current directory. 1478 1479 Examples: 1480 To visualize chemspace for SMILES strings present in a column name SMILES in 1481 input file, mapping a categorical data column on TMAP, writing out LSH forest 1482 for subsequent use to skip the generation of fingerprints, merging TMAP JS file 1483 into HTML file, and write out a HTML file containing TMAP visualization, type: 1484 1485 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source 1486 -i SampleChemspace.csv -o SampleChemspace.html 1487 1488 To run the first example for SMILES strings in column name SMILES in input file 1489 and write out a HTML file containing TMAP visualization, type: 1490 1491 % VisualizeChemspaceUsingTMAP.py --colSMILES SMILES 1492 --categoricalDataCols Source 1493 -i SampleChemspace.csv -o SampleChemspace.html 1494 1495 To run the first example for mapping categorical data in column number 4 in 1496 input file and write out a HTML file containing TMAP visualization, type: 1497 1498 % VisualizeChemspaceUsingTMAP.py --colmode colnum 1499 --categoricalDataCols 4 1500 -i SampleChemspace.csv -o SampleChemspace.html 1501 1502 To run the first example for mapping both categorical and numerical data 1503 columns and write out a HTML file containing TMAP visualization, type: 1504 1505 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source" 1506 --numericalDataCols "MolWt,MolLogP" 1507 -i SampleChemspace.csv -o SampleChemspace.html 1508 1509 To run the first example for mapping both categorical and numerical data 1510 columns along with specified colormaps and write out a HTML file containing 1511 TMAP visualization, type: 1512 1513 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source" 1514 
--categoricalDataColormaps "tab10" 1515 --numericalDataCols "MolWt,MolLogP" 1516 --numericalDataColormaps "viridis, plasma" 1517 -i SampleChemspace.csv -o SampleChemspace.html 1518 1519 To run the first example for mapping both categorical and numerical data 1520 columns along with displaying specific data under the structure display and 1521 write out a HTML file containing TMAP visualization, type: 1522 1523 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source" 1524 --numericalDataCols "MolWt,NHOHCount,NOCount,MolLogP, 1525 NumRotatableBonds,TPSA" --structureDisplayDataCols "Name,ID" 1526 -i SampleChemspace.csv -o SampleChemspace.html 1527 1528 To run the first example for restoring LSH forest data from a file to skip the 1529 generation of fingerprints and write out a HTML file containing TMAP 1530 visualization, type: 1531 1532 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source 1533 --lshForestFileRestore yes -i SampleChemspace.csv -o SampleChemspace.html 1534 1535 To run the first example in multiprocessing mode on all available CPUs without 1536 loading all data into memory and write out a HTML file containing TMAP 1537 visualization, type: 1538 1539 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source 1540 --mp yes -i SampleChemspace.csv -o SampleChemspace.html 1541 1542 To run the first example in multiprocessing mode on all available CPUs by 1543 loading all data into memory and write out a HTML file containing TMAP 1544 visualization, type: 1545 1546 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source 1547 --mp yes --mpParams "inputDataMode,InMemory" 1548 -i SampleChemspace.csv -o SampleChemspace.html 1549 1550 To run the first example in multiprocessing mode on specific number of CPUs 1551 and chunk size without loading all data into memory and write out a HTML file 1552 containing TMAP visualization, type: 1553 1554 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source 1555 --mp yes --mpParams 
"inputDataMode,lazy,numProcesses,4, 1556 chunkSize,50" -i SampleChemspace.csv -o SampleChemspace.html 1557 1558 To run the first example using a set of specified parameters to generate 1559 fingerprints and LSH forest, configure faerun and scatter plot layout, and 1560 write out a HTML file containing TMAP visualization, type: 1561 1562 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source 1563 --minHashFPParams "radius,3,numPermutations,2048" 1564 --lshForestParams "dim,2048,numPrefixTrees,128" 1565 --lshLayoutConfigParams "k,75,kc,20,slRepeats,2, 1566 slExtraScalingSteps,4,mmmRepeats,2" 1567 --faerunConfigParams "clearColor, #000000,thumbnailWidth, 250" 1568 --faerunScatterPlotParams "shader,circle,pointScale,4" 1569 --tmapDisplayMsg "TMAP Chemspace visualization" 1570 -i SampleChemspace.csv -o SampleChemspace.html 1571 1572 Author: 1573 Manish Sud(msud@san.rr.com) 1574 1575 See also: 1576 RDKitConvertFileFormat.py, RDKitCalculateMolecularDescriptors.py, 1577 RDKitStandardizeMolecules.py 1578 1579 Copyright: 1580 Copyright (C) 2024 Manish Sud. All rights reserved. 1581 1582 The functionality available in this script is implemented using TMAP and 1583 Faerun, open source software packages for visualizing chemspace, and 1584 RDKit, an open source toolkit for cheminformatics developed by Greg 1585 Landrum. 1586 1587 This file is part of MayaChemTools. 1588 1589 MayaChemTools is free software; you can redistribute it and/or modify it under 1590 the terms of the GNU Lesser General Public License as published by the Free 1591 Software Foundation; either version 3 of the License, or (at your option) any 1592 later version. 1593 1594 """ 1595 1596 if __name__ == "__main__": 1597 main()