1 #!/bin/env python
2 #
3 # File: VisualizeChemspaceUsingTMAP.py
4 # Author: Manish Sud <msud@san.rr.com>
5 #
6 # Copyright (C) 2026 Manish Sud. All rights reserved.
7 #
8 # The functionality available in this script is implemented using TMAP and
9 # Faerun, open source software packages for visualizing chemspace, and
10 # RDKit, an open source toolkit for cheminformatics developed by Greg
11 # Landrum.
12 #
13 # This file is part of MayaChemTools.
14 #
15 # MayaChemTools is free software; you can redistribute it and/or modify it under
16 # the terms of the GNU Lesser General Public License as published by the Free
17 # Software Foundation; either version 3 of the License, or (at your option) any
18 # later version.
19 #
20 # MayaChemTools is distributed in the hope that it will be useful, but without
21 # any warranty; without even the implied warranty of merchantability of fitness
22 # for a particular purpose. See the GNU Lesser General Public License for more
23 # details.
24 #
25 # You should have received a copy of the GNU Lesser General Public License
26 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
27 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
28 # Boston, MA, 02111-1307, USA.
29 #
30
31 from __future__ import print_function
32
33 import os
34 import sys
35 import time
36 import re
37 import csv
38 import shutil
39 import multiprocessing as mp
40 import pandas as pd
41 import numpy as np
42
43 # TMAP and Faerun imports...
44 try:
45 import tmap as tm
46 from faerun import Faerun
47 from mhfp.encoder import MHFPEncoder
48 except ImportError as ErrMsg:
49 sys.stderr.write("\nFailed to import TMAP/Faerun module/package: %s\n" % ErrMsg)
50 sys.stderr.write("Check/update your TMAP environment and try again.\n\n")
51 sys.exit(1)
52
53 # RDKit imports...
54 try:
55 from rdkit import rdBase
56 except ImportError as ErrMsg:
57 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
58 sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
59 sys.exit(1)
60
61 # MayaChemTools imports...
62 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
63 try:
64 from docopt import docopt
65 import MiscUtil
66 except ImportError as ErrMsg:
67 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
68 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
69 sys.exit(1)
70
# Base name of the script as invoked; used in the start and done messages.
ScriptName = os.path.basename(sys.argv[0])
# Command line options keyed by option name, e.g. "--faerunConfigParams".
Options = {}
# Processed and validated option values shared by the functions in this file;
# re-populated inside each worker process during multiprocessing.
OptionsInfo = {}
74
75
def main():
    """Run the script from start to finish.

    Prints a start-up banner, records the starting wall clock and processor
    times, retrieves and processes the command line options, performs the
    chemspace visualization, and reports the total elapsed time.
    """

    StartupBanner = "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (
        ScriptName,
        rdBase.rdkitVersion,
        MiscUtil.GetMayaChemToolsVersion(),
        time.asctime(),
    )
    MiscUtil.PrintInfo(StartupBanner)

    # Capture the starting times before any real work so that the elapsed
    # time reported at the end covers option processing as well...
    StartWallClockTime, StartProcessorTime = MiscUtil.GetWallClockAndProcessorTime()

    # Command line handling: retrieval followed by processing/validation...
    RetrieveOptions()
    ProcessOptions()

    # Core work of the script...
    VisualizeChemspace()

    MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)

    ElapsedTime = MiscUtil.GetFormattedElapsedTime(StartWallClockTime, StartProcessorTime)
    MiscUtil.PrintInfo("Total time: %s" % ElapsedTime)
97
98
def VisualizeChemspace():
    """Read the input molecules, generate the TMAP plot, and report counts.

    The number of ignored molecules is reported as the difference between
    the total and the valid molecule counts.
    """

    MolCount, ValidMolCount, VisualizationFailedCount = ProcessMolecules(ReadMoleculeData())

    # Each entry pairs a message template with the count it reports...
    SummaryLines = (
        ("\nTotal number of molecules: %d", MolCount),
        ("Number of valid molecules: %d", ValidMolCount),
        ("Number of molecules failed during chemspace visualization: %d", VisualizationFailedCount),
        ("Number of ignored molecules: %d", MolCount - ValidMolCount),
    )
    for Template, Count in SummaryLines:
        MiscUtil.PrintInfo(Template % Count)
110
111
def ProcessMolecules(InfileDF):
    """Build the LSH forest for the input molecules and generate the TMAP plot.

    Arguments:
        InfileDF: Dataframe containing the input molecules. Rows for which
            fingerprint generation fails are removed from it in place while
            the LSH forest is generated.

    Returns:
        tuple: (MolCount, ValidMolCount, VisualizationFailedCount). No plot
        is generated when ValidMolCount is zero.
    """

    MolCount = len(InfileDF)

    # Resolve "auto" parameter values that depend on the dataset size...
    ProcessMolCountBasedAutoOptions(MolCount)

    LSHForest, ValidMolCount, VisualizationFailedCount = SetupLSHForest(InfileDF)
    Counts = (MolCount, ValidMolCount, VisualizationFailedCount)

    if ValidMolCount != 0:
        SetupTMAPDisplayMessage(MolCount, ValidMolCount)

        # Layout coordinates first, then the per-molecule plot data, and
        # finally the plot itself...
        TMAPCoordsInfo = GenerateTMAPCoordinates(LSHForest)
        TMAPDataInfo = SetupTMAPPlotData(InfileDF)
        GenerateTMAPPlot(InfileDF, TMAPCoordsInfo, TMAPDataInfo)

    return Counts
138
139
def SetupLSHForest(InfileDF):
    """Restore the LSH forest from a file or generate it from the molecules.

    The choice is driven by OptionsInfo["LSHForestFileRestoreMode"]. Returns
    whatever the selected function returns: a tuple containing the LSH
    forest, the valid molecule count, and the failed molecule count.
    """

    SetupFunction = RestoreLSHForest if OptionsInfo["LSHForestFileRestoreMode"] else GenerateLSHForest

    return SetupFunction(InfileDF)
147
148
def RestoreLSHForest(InfileDF):
    """Restore a previously stored LSH forest from OptionsInfo["OutfileLSHForest"].

    Arguments:
        InfileDF: Dataframe containing the input molecules. Every row is
            counted as a valid molecule; no fingerprints are generated.

    Returns:
        tuple: (LSHForest, ValidMolCount, VisualizationFailedCount) where the
        failed count is always zero.

    An error is reported through MiscUtil.PrintError when the forest file is
    missing or when its size does not match the number of input molecules.
    """

    ForestFile = OptionsInfo["OutfileLSHForest"]
    MiscUtil.PrintInfo("\nRestoring LSH forest from %s..." % ForestFile)

    ForestFileExists = os.path.isfile(ForestFile)
    if not ForestFileExists:
        MiscUtil.PrintError("LSH forest file %s is missing. Failed to restore LSH forest...\n" % ForestFile)

    RestoredForest = InitializeLSHForest()
    RestoredForest.restore(ForestFile)

    # The restored forest must describe exactly the molecules in the input
    # file, so every row is treated as valid...
    InputMolCount = len(InfileDF)
    if RestoredForest.size() != InputMolCount:
        MiscUtil.PrintError(
            'The number of molecules, %s, in input file must match number of nodes, %s, in LSH forest during its restoration from a file using "--lshForestFileWrite" option.'
            % (InputMolCount, RestoredForest.size())
        )

    return (RestoredForest, InputMolCount, 0)
172
173
def GenerateLSHForest(InfileDF):
    """Generate and index an LSH forest from MinHash fingerprints.

    Arguments:
        InfileDF: Dataframe containing the input molecules.

    Returns:
        tuple: (LSHForest, ValidMolCount, FingerprintsFailedCount).

    When OptionsInfo["LSHForestFileWriteMode"] is set, the forest is stored
    to OptionsInfo["OutfileLSHForest"], but only if fingerprints were
    generated for every molecule; otherwise a warning is issued instead.
    """

    Fingerprints, ValidMolCount, FailedCount = GenerateMinHashFingerprints(InfileDF)

    MiscUtil.PrintInfo("\nGenerating LSH forest...")
    Forest = InitializeLSHForest()
    Forest.batch_add(Fingerprints)
    Forest.index()

    Results = (Forest, ValidMolCount, FailedCount)
    if not OptionsInfo["LSHForestFileWriteMode"]:
        return Results

    ForestFile = OptionsInfo["OutfileLSHForest"]
    if FailedCount > 0:
        # Skip the write as the forest does not cover all input molecules...
        MiscUtil.PrintWarning(
            "The MinHash fingerprints generation failed for %s molecules. Skipped writing of file %s..."
            % (FailedCount, ForestFile)
        )
    else:
        MiscUtil.PrintInfo("Writing LSH forest file %s..." % ForestFile)
        Forest.store(ForestFile)

    return Results
198
199
def GenerateMinHashFingerprints(InfileDF):
    """Generate MinHash fingerprints using one or multiple processes.

    OptionsInfo["MPMode"] selects the multiprocessing implementation. The
    return value of the selected implementation is passed through: a tuple
    of fingerprints, valid molecule count, and failed molecule count.
    """

    Generator = (
        GenerateMinHashFingerprintsUsingMultipleProcesses
        if OptionsInfo["MPMode"]
        else GenerateMinHashFingerprintsUsingSingleProcess
    )

    return Generator(InfileDF)
207
208
def GenerateMinHashFingerprintsUsingSingleProcess(InfileDF):
    """Generate MinHash fingerprints for all molecules in the current process.

    Arguments:
        InfileDF: Dataframe containing the SMILES column named by
            OptionsInfo["SMILESColname"]. Rows whose fingerprint generation
            fails are removed from it in place.

    Returns:
        tuple: (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount)
        where the fingerprints are a list of tm.VectorUint objects.
    """

    MiscUtil.PrintInfo("\nGenerating MinHash fingerprints using a single process...")

    Encoder = InitializeMinHashFingerprintsEncoder()

    Fingerprints = []
    FailedRowIndices = []

    for RowIndex, MolSMILES in enumerate(InfileDF[OptionsInfo["SMILESColname"]]):
        Fingerprint = GenerateMinHashFingerprintForMolecule(Encoder, MolSMILES)
        if Fingerprint is not None:
            Fingerprints.append(tm.VectorUint(Fingerprint))
            continue
        FailedRowIndices.append(RowIndex)

    # Keep the dataframe rows aligned with the generated fingerprints...
    RemoveFingerprintsFailedRows(InfileDF, FailedRowIndices)

    # One entry is collected per molecule in exactly one of the two lists,
    # so the list lengths are the valid and failed counts...
    return (Fingerprints, len(Fingerprints), len(FailedRowIndices))
234
235
def GenerateMinHashFingerprintsUsingMultipleProcesses(InfileDF):
    """Generate MinHash fingerprints for all molecules using a process pool.

    Arguments:
        InfileDF: Dataframe containing the SMILES column named by
            OptionsInfo["SMILESColname"]. Rows whose fingerprint generation
            fails are removed from it in place.

    Returns:
        tuple: (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount)
        where the fingerprints are a list of tm.VectorUint objects.

    The pool is configured from OptionsInfo["MPParams"]. "Lazy" input data
    mode uses Pool.imap() and "InMemory" mode uses Pool.map(); any other
    value is reported through MiscUtil.PrintError. The pool is always shut
    down before returning: closed after a successful run and terminated
    when an error interrupts processing.
    """

    MiscUtil.PrintInfo("\nGenerating MinHash fingerprints using multiprocessing...")

    MPParams = OptionsInfo["MPParams"]
    LazyMode = re.match("^Lazy$", MPParams["InputDataMode"], re.I)
    InMemoryMode = re.match("^InMemory$", MPParams["InputDataMode"], re.I)

    # Setup data for initializing a worker process...
    InitializeWorkerProcessArgs = (
        MiscUtil.ObjectToBase64EncodedString(Options),
        MiscUtil.ObjectToBase64EncodedString(OptionsInfo),
    )

    # Setup SMILES iterator...
    SMILESColname = OptionsInfo["SMILESColname"]
    WorkerProcessDataIterable = SetupSMILESWithMolIndices(InfileDF[SMILESColname])

    # Setup process pool along with data initialization for each process...
    if not OptionsInfo["QuietMode"]:
        MiscUtil.PrintInfo(
            "\nConfiguring multiprocessing using %s method..."
            % ("mp.Pool.imap()" if LazyMode else "mp.Pool.map()")
        )
        MiscUtil.PrintInfo(
            "NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n"
            % (
                MPParams["NumProcesses"],
                MPParams["InputDataMode"],
                ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]),
            )
        )

    (ValidMolCount, FingerprintsFailedCount) = [0] * 2
    MinHashFingerprints = []
    FingerprintsFailedRowIndices = []

    ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
    ProcessingCompleted = False
    try:
        # Start processing...
        if LazyMode:
            Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
        elif InMemoryMode:
            Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
        else:
            MiscUtil.PrintError(
                'The value, %s, specified for "--inputDataMode" is not supported.' % (MPParams["InputDataMode"])
            )

        # Results must be consumed while the pool is still alive because
        # imap() delivers them lazily. Both map() and imap() preserve the
        # input order, so fingerprints stay aligned with dataframe rows...
        for MolIndex, MinHashFingerprint in Results:
            if MinHashFingerprint is None:
                FingerprintsFailedCount += 1
                FingerprintsFailedRowIndices.append(MolIndex)
            else:
                ValidMolCount += 1
                # Workers send fingerprints back as plain lists...
                MinHashFingerprints.append(tm.VectorUint(np.array(MinHashFingerprint)))

        ProcessingCompleted = True
    finally:
        # Release the worker processes: a normal close after success, a
        # hard stop when processing was interrupted by an error...
        if ProcessingCompleted:
            ProcessPool.close()
        else:
            ProcessPool.terminate()
        ProcessPool.join()

    # Remove failed molecules from the dataframe...
    RemoveFingerprintsFailedRows(InfileDF, FingerprintsFailedRowIndices)

    return (MinHashFingerprints, ValidMolCount, FingerprintsFailedCount)
298
299
def InitializeWorkerProcess(*EncodedArgs):
    """Initialize data for a worker process.

    Arguments:
        EncodedArgs: Two base64 encoded strings containing, in order, the
            Options and OptionsInfo dictionaries of the parent process.

    The decoded dictionaries replace the module level Options and
    OptionsInfo in the worker, and a MinHash fingerprints encoder is stored
    in OptionsInfo["MinHashFingerprintsEncoder"] for use by WorkerProcess.
    """

    global Options, OptionsInfo

    # Decode Options and OptionInfo before anything reads them: a worker
    # that does not inherit the parent's globals (e.g. "spawn" start
    # method) begins with the empty module level dictionaries, so looking
    # up "QuietMode" first would raise a KeyError...
    Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
    OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])

    if not OptionsInfo["QuietMode"]:
        MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())

    # Initialize MHFP encoder...
    OptionsInfo["MinHashFingerprintsEncoder"] = InitializeMinHashFingerprintsEncoder()
314
315
def WorkerProcess(MolInfo):
    """Generate the MinHash fingerprint for one molecule in a worker process.

    Arguments:
        MolInfo: Tuple containing the molecule index and its SMILES string.

    Returns:
        tuple: (MolIndex, MinHashFingerprint) where the fingerprint is the
        result of calling tolist() on the generated fingerprint, or None
        when fingerprint generation failed.
    """

    RowIndex, MolSMILES = MolInfo

    Encoder = OptionsInfo["MinHashFingerprintsEncoder"]
    Fingerprint = GenerateMinHashFingerprintForMolecule(Encoder, MolSMILES)

    if Fingerprint is None:
        return (RowIndex, None)

    return (RowIndex, Fingerprint.tolist())
326
327
def SetupSMILESWithMolIndices(SMILES):
    """Lazily pair each SMILES string with its zero-based position.

    Arguments:
        SMILES: Iterable of SMILES strings.

    Yields:
        tuple: (MolIndex, MolSMILES) for each entry, in iteration order.
    """

    Position = 0
    for MolSMILES in SMILES:
        yield (Position, MolSMILES)
        Position += 1
333
334
def GenerateMinHashFingerprintForMolecule(MinHashFingerprintsEncoder, SMILES):
    """Encode a SMILES string into a MinHash fingerprint.

    Arguments:
        MinHashFingerprintsEncoder: Encoder object providing an encode()
            method.
        SMILES: SMILES string of the molecule.

    Returns:
        The fingerprint returned by the encoder, or None when any exception
        is raised during encoding. A failure is reported as a warning unless
        OptionsInfo["QuietMode"] is set, in which case an empty line is
        printed instead.
    """

    try:
        FPParams = OptionsInfo["MinHashFPParams"]
        EncodeOptions = {
            "radius": FPParams["Radius"],
            "rings": FPParams["Rings"],
            "kekulize": FPParams["Kekulize"],
            "min_radius": FPParams["MinRadius"],
            "sanitize": FPParams["Sanitize"],
        }
        return MinHashFingerprintsEncoder.encode(SMILES, **EncodeOptions)
    except Exception as ErrMsg:
        # Best effort: a molecule that can't be encoded is skipped by the
        # caller rather than aborting the whole run...
        if OptionsInfo["QuietMode"]:
            MiscUtil.PrintInfo("")
        else:
            MiscUtil.PrintWarning("Failed to generate MinHash fingerprint for SMILES %s:\n%s\n" % (SMILES, ErrMsg))

    return None
356
357
def RemoveFingerprintsFailedRows(InfileDF, FingerprintsFailedRowIndices):
    """Drop the rows of failed molecules from the dataframe in place.

    Arguments:
        InfileDF: Dataframe to modify.
        FingerprintsFailedRowIndices: Index labels of the rows to drop.

    The dataframe index is renumbered after the rows are dropped. Nothing
    is done when the list of failed rows is empty.
    """

    if len(FingerprintsFailedRowIndices) == 0:
        return

    InfileDF.drop(FingerprintsFailedRowIndices, inplace=True)
    InfileDF.reset_index(drop=True, inplace=True)
364
365
def GenerateTMAPCoordinates(LSHForest):
    """Lay out the LSH forest and collect node coordinates and tree edges.

    Arguments:
        LSHForest: LSH forest to lay out.

    Returns:
        dict: The first four values returned by tm.layout_from_lsh_forest()
        stored under the keys NodeXCoords, NodeYCoords, EdgeNodeStartList,
        and EdgeNodeToList. The fifth returned value is discarded.

    The layout configuration is populated from
    OptionsInfo["LSHLayoutConfigParams"].
    """

    MiscUtil.PrintInfo("\nGenerating TMAP plot coordinates...")

    LayoutParams = OptionsInfo["LSHLayoutConfigParams"]
    LayoutConfig = tm.LayoutConfiguration()

    # Layout configuration attributes copied verbatim from the parameters...
    AttributeNameToParamName = (
        ("k", "K"),
        ("kc", "KC"),
        ("fme_iterations", "FMEIterations"),
        ("fme_randomize", "FMERandomize"),
        ("fme_threads", "FMEThreads"),
        ("fme_precision", "FMEPrecision"),
        ("sl_repeats", "SLRepeats"),
        ("sl_extra_scaling_steps", "SLExtraScalingSteps"),
        ("sl_scaling_min", "SLScalingMin"),
        ("sl_scaling_max", "SLScalingMax"),
        ("sl_scaling_type", "SLScalingType"),
        ("mmm_repeats", "MMMRepeats"),
        ("placer", "Placer"),
        ("merger", "Merger"),
        ("merger_factor", "MergerFactor"),
        ("merger_adjustment", "MergerAdjustment"),
    )
    for AttributeName, ParamName in AttributeNameToParamName:
        setattr(LayoutConfig, AttributeName, LayoutParams[ParamName])

    # The node size is specified by the user as a denominator...
    LayoutConfig.node_size = 1.0 / LayoutParams["NodeSizeDenominator"]

    XCoords, YCoords, EdgeStarts, EdgeEnds, _ = tm.layout_from_lsh_forest(LSHForest, config=LayoutConfig)

    return {
        "NodeXCoords": XCoords,
        "NodeYCoords": YCoords,
        "EdgeNodeStartList": EdgeStarts,
        "EdgeNodeToList": EdgeEnds,
    }
408
409
def SetupTMAPPlotData(InfileDF):
    """Collect the data series, colormaps, and labels for the TMAP plot.

    Arguments:
        InfileDF: Dataframe containing the molecules and their data columns.

    Returns:
        dict: Parallel lists under the keys Columns, Colormaps,
        CategoricalStatus, LegendLabels, and SeriesTitles, with categorical
        series listed before numerical ones, along with SMILESSelectedData
        and SMILESSelectedLabels describing the structure display data.
    """

    MiscUtil.PrintInfo("\nSetting up TMAP plot data...")

    Columns = []
    Colormaps = []
    CategoricalStatus = []
    LegendLabels = []
    SeriesTitles = []

    # Categorical data series...
    CategoricalColnames = OptionsInfo["CategoricalDataColnames"]
    if CategoricalColnames is not None:
        for Position, Colname in enumerate(CategoricalColnames):
            Labels, Data = Faerun.create_categories(InfileDF[Colname])
            # Collapse the categories beyond the display limit...
            if len(Labels) > OptionsInfo["CategoricalDataMaxDisplay"]:
                Labels, Data = RemapCategoricalPlotData(Labels, Data)

            Columns.append(Data)
            Colormaps.append(OptionsInfo["CategoricalDataColormapsList"][Position])
            CategoricalStatus.append(True)
            LegendLabels.append(Labels)
            SeriesTitles.append(Colname)

    # Numerical data series...
    NumericalColnames = OptionsInfo["NumericalDataColnames"]
    if NumericalColnames is not None:
        for Position, Colname in enumerate(NumericalColnames):
            Columns.append(InfileDF[Colname])
            Colormaps.append(OptionsInfo["NumericalDataColormapsList"][Position])
            CategoricalStatus.append(False)
            LegendLabels.append(None)
            SeriesTitles.append(Colname)

    # Structure display data: the first column is used as is and each
    # additional column is appended as a string after a "__" separator...
    SelectedData = []
    SelectedLabels = []
    for Position, Colname in enumerate(OptionsInfo["StructureDisplayDataColnames"]):
        if Position == 0:
            SelectedData = InfileDF[Colname]
        else:
            SelectedData = SelectedData + "__" + InfileDF[Colname].astype(str)
        SelectedLabels.append(Colname)

    return {
        "Columns": Columns,
        "Colormaps": Colormaps,
        "CategoricalStatus": CategoricalStatus,
        "LegendLabels": LegendLabels,
        "SeriesTitles": SeriesTitles,
        "SMILESSelectedData": SelectedData,
        "SMILESSelectedLabels": SelectedLabels,
    }
462
463
def RemapCategoricalPlotData(CategoryLabels, CategoryData):
    """Remap categorical plot data to limit the number of displayed categories.

    Arguments:
        CategoryLabels: Sequence of (CategoryValue, CategoryName) pairs.
        CategoryData: Mutable sequence containing a category value for each
            data point. It is updated in place.

    Returns:
        tuple: (CategoryLabels, CategoryData). The inputs are returned as is
        when the number of labels does not exceed
        OptionsInfo["CategoricalDataMaxDisplay"]. Otherwise, only that many
        leading labels are kept, followed by an "Other" label whose value is
        one more than the value of the last kept label, and all data points
        of the dropped categories are set to the "Other" value.
    """

    MaxDisplay = OptionsInfo["CategoricalDataMaxDisplay"]
    if len(CategoryLabels) <= MaxDisplay:
        return (CategoryLabels, CategoryData)

    # Track categories to remap. A set keeps the membership test in the
    # data loop below constant time regardless of the number of dropped
    # categories...
    CategoryLabelsNew = []
    CategoryValuesToRemap = set()
    LastCategoryValue = 0

    for CategoryLabelIndex, CategoryLabel in enumerate(CategoryLabels):
        CategoryValue, CategoryName = CategoryLabel
        if CategoryLabelIndex < MaxDisplay:
            CategoryLabelsNew.append((CategoryValue, CategoryName))
            LastCategoryValue = CategoryValue
        else:
            CategoryValuesToRemap.add(CategoryValue)

    # Set up other category...
    OtherCategoryValue = LastCategoryValue + 1
    OtherCategoryName = "Other"
    CategoryLabelsNew.append((OtherCategoryValue, OtherCategoryName))

    # Update category data in place...
    for ValueIndex, Value in enumerate(CategoryData):
        if Value in CategoryValuesToRemap:
            CategoryData[ValueIndex] = OtherCategoryValue

    return (CategoryLabelsNew, CategoryData)
495
496
def GenerateTMAPPlot(InfileDF, PlotCoordsInfo, PlotDataInfo):
    """Create the Faerun plot and write out the TMAP HTML and JS files.

    Arguments:
        InfileDF: Dataframe containing the molecules. It is not used by
            this function.
        PlotCoordsInfo: Dictionary containing node coordinates and tree
            edges under the keys NodeXCoords, NodeYCoords,
            EdgeNodeStartList, and EdgeNodeToList.
        PlotDataInfo: Dictionary containing the data series, colormaps,
            legend labels, and structure display data for the plot.

    The plot is written using OptionsInfo["OutfilePrefix"]. The HTML and JS
    files are merged afterwards when OptionsInfo["MergeHTMLandJSFilesMode"]
    is set.
    """

    MiscUtil.PrintInfo("\nGenerating TMAP plot...")

    # Faerun plot initialization...
    ConfigParams = OptionsInfo["FaerunConfigParams"]
    FaerunArgs = {
        "clear_color": ConfigParams["ClearColor"],
        "view": "front",
        "coords": False,
        "title": "",
        "x_title": "",
        "y_title": "",
        "show_legend": ConfigParams["ShowLegend"],
        "legend_title": ConfigParams["LegendTitle"],
        "legend_orientation": ConfigParams["LegendOrientation"],
        "legend_number_format": ConfigParams["LegendNumberFormat"],
        "scale": ConfigParams["Scale"],
        "alpha_blending": ConfigParams["AlphaBlending"],
        "anti_aliasing": ConfigParams["AntiAliasing"],
        "thumbnail_width": ConfigParams["ThumbnailWidth"],
        "thumbnail_fixed": ConfigParams["ThumbnailFixed"],
        "impress": OptionsInfo["TMAPDisplayMsg"],
    }
    Plot = Faerun(**FaerunArgs)

    # Scatter layer: one point per molecule...
    ScatterName = "Data"
    TreeName = "%s_tree" % ScatterName
    ScatterParams = OptionsInfo["FaerunScatterPlotParams"]
    ScatterData = {
        "x": PlotCoordsInfo["NodeXCoords"],
        "y": PlotCoordsInfo["NodeYCoords"],
        "c": PlotDataInfo["Columns"],
        "labels": PlotDataInfo["SMILESSelectedData"],
    }
    Plot.add_scatter(
        ScatterName,
        ScatterData,
        colormap=PlotDataInfo["Colormaps"],
        shader=ScatterParams["Shader"],
        point_scale=ScatterParams["PointScale"],
        max_point_size=ScatterParams["MaxPointSize"],
        fog_intensity=ScatterParams["FogIntensity"],
        categorical=PlotDataInfo["CategoricalStatus"],
        interactive=ScatterParams["Interactive"],
        has_legend=True,
        legend_labels=PlotDataInfo["LegendLabels"],
        series_title=PlotDataInfo["SeriesTitles"],
        selected_labels=PlotDataInfo["SMILESSelectedLabels"],
    )

    # Tree layer: edges connecting the points of the scatter layer...
    TreeEdges = {"from": PlotCoordsInfo["EdgeNodeStartList"], "to": PlotCoordsInfo["EdgeNodeToList"]}
    Plot.add_tree(TreeName, TreeEdges, point_helper=ScatterName)

    # Write out TMAP plot HTML and JS files...
    OutfileNames = (OptionsInfo["Outfile"], OptionsInfo["OutfileJS"])
    MiscUtil.PrintInfo("Writing TMAP plot files %s and %s..." % OutfileNames)
    Plot.plot(OptionsInfo["OutfilePrefix"], template="smiles")

    if OptionsInfo["MergeHTMLandJSFilesMode"]:
        MergeTMAPResultsHTMLAndJSFiles()
562
563
def MergeTMAPResultsHTMLAndJSFiles():
    """Merge the TMAP JS file into the TMAP HTML file.

    Each HTML line mentioning the JS file name is replaced by an inline
    script element holding the JS file contents. The merged output is
    written to a temporary file, which is then moved over the HTML file
    named by OptionsInfo["Outfile"]. The JS file named by
    OptionsInfo["OutfileJS"] is removed afterwards.
    """

    MiscUtil.PrintInfo("\nMerging TMAP plot file %s into %s..." % (OptionsInfo["OutfileJS"], OptionsInfo["Outfile"]))

    TMAPResultsHTMLFile = OptionsInfo["Outfile"]
    TMAPResultsJSFile = OptionsInfo["OutfileJS"]

    TMAPResultsTMPHTMLFile = "Tmp%s.html" % OptionsInfo["OutfilePrefix"]

    # Match the JS file name literally: characters such as "." or "+" in a
    # file name must not be interpreted as regular expression syntax...
    JSFileNamePattern = re.compile(re.escape(TMAPResultsJSFile), re.IGNORECASE)

    # Context managers ensure that all three files are closed even when
    # reading or writing fails part way through...
    with open(TMAPResultsHTMLFile, "r") as HTMLResultsFH:
        with open(TMAPResultsJSFile, "r") as JSResultsFH:
            with open(TMAPResultsTMPHTMLFile, "w") as TMPHTMLResultsFH:
                for HTMLLine in HTMLResultsFH:
                    HTMLLine = HTMLLine.rstrip()
                    if not JSFileNamePattern.search(HTMLLine):
                        TMPHTMLResultsFH.write("%s\n" % HTMLLine)
                        continue

                    TMPHTMLResultsFH.write(" <script>\n")

                    # Only the first JS line is written with a leading
                    # space; the remaining lines are written as is...
                    FirstLine = True
                    for JSLine in JSResultsFH:
                        JSLine = JSLine.rstrip()
                        if FirstLine:
                            FirstLine = False
                            TMPHTMLResultsFH.write(" %s\n" % JSLine)
                        else:
                            TMPHTMLResultsFH.write("%s\n" % JSLine)
                    TMPHTMLResultsFH.write("\n </script>\n")

    MiscUtil.PrintInfo("Moving %s to %s..." % (TMAPResultsTMPHTMLFile, OptionsInfo["Outfile"]))
    shutil.move(TMAPResultsTMPHTMLFile, TMAPResultsHTMLFile)

    MiscUtil.PrintInfo("Removing %s file..." % (OptionsInfo["OutfileJS"]))
    os.remove(TMAPResultsJSFile)
606
607
def InitializeLSHForest():
    """Create an empty LSH forest configured from OptionsInfo["LSHForestParams"].

    Returns:
        A tm.LSHForest object created using the Dim, NumPrefixTrees, and
        Store parameter values, passed positionally in that order.
    """

    ForestParams = OptionsInfo["LSHForestParams"]

    Dim = ForestParams["Dim"]
    NumPrefixTrees = ForestParams["NumPrefixTrees"]
    Store = ForestParams["Store"]

    return tm.LSHForest(Dim, NumPrefixTrees, Store)
615
616
def InitializeMinHashFingerprintsEncoder():
    """Create a MinHash fingerprints encoder from OptionsInfo["MinHashFPParams"].

    Returns:
        A MHFPEncoder object created using the NumPermutations and Seed
        parameter values.
    """

    FPParams = OptionsInfo["MinHashFPParams"]
    NumPermutations = FPParams["NumPermutations"]
    Seed = FPParams["Seed"]

    return MHFPEncoder(n_permutations=NumPermutations, seed=Seed)
626
627
def ReadMoleculeData():
    """Read the input file into a dataframe.

    The file named by OptionsInfo["Infile"] is read using
    OptionsInfo["InfileDelimiter"] as the column separator.

    Returns:
        pandas.DataFrame: Contents of the input file.
    """

    InputFile, Separator = OptionsInfo["Infile"], OptionsInfo["InfileDelimiter"]

    MiscUtil.PrintInfo("\nProcessing file %s..." % InputFile)

    return pd.read_csv(InputFile, sep=Separator)
638
639
def ProcessMolCountBasedAutoOptions(MolCount):
    """Replace "auto" parameter values with values based on molecule count.

    Arguments:
        MolCount: Number of molecules in the input file.

    The following entries of OptionsInfo are updated in place when their
    current value is "auto" (case insensitive): NumPrefixTrees in
    LSHForestParams, PointScale in FaerunScatterPlotParams, and K, KC,
    SLRepeats, SLExtraScalingSteps, MMMRepeats, and NodeSizeDenominator in
    LSHLayoutConfigParams. Explicitly specified values are left untouched.
    """

    def IsAutoValue(Value):
        # Values may already be numbers; compare their string form...
        return re.match("^auto$", "%s" % Value, re.I) is not None

    # Datasets of up to 10,000 molecules get the more expensive settings...
    IsSmallDataset = MolCount <= 10e03

    # LSH forest parameters...
    ForestParams = OptionsInfo["LSHForestParams"]
    if IsAutoValue(ForestParams["NumPrefixTrees"]):
        ForestParams["NumPrefixTrees"] = 128 if IsSmallDataset else 8

    # Faerun scatter plot parameters: the point scale shrinks in two steps
    # as the dataset grows...
    ScatterParams = OptionsInfo["FaerunScatterPlotParams"]
    if IsAutoValue(ScatterParams["PointScale"]):
        if IsSmallDataset:
            PointScale = 4.0
        elif MolCount <= 10e04:
            PointScale = 2.0
        else:
            PointScale = 1.0
        ScatterParams["PointScale"] = PointScale

    # LSH layout parameters: (name, small dataset value, large dataset value)...
    LayoutAutoValues = (
        ("K", 75, 10),
        ("KC", 20, 10),
        ("SLRepeats", 2, 1),
        ("SLExtraScalingSteps", 4, 2),
        ("MMMRepeats", 2, 1),
        ("NodeSizeDenominator", 65.0, 70.0),
    )
    LayoutParams = OptionsInfo["LSHLayoutConfigParams"]
    for ParamName, SmallValue, LargeValue in LayoutAutoValues:
        if IsAutoValue(LayoutParams[ParamName]):
            LayoutParams[ParamName] = SmallValue if IsSmallDataset else LargeValue
684
685
def SetupTMAPDisplayMessage(MolCount, ValidMolCount):
    """Set the default TMAP display message when it is specified as "auto".

    Arguments:
        MolCount: Total number of molecules in the input file.
        ValidMolCount: Number of molecules used for the visualization.

    OptionsInfo["TMAPDisplayMsg"] is replaced only when its current value is
    "auto" (case insensitive). The valid molecule count is included in the
    message only when it differs from the total count.
    """

    if re.match("^auto$", OptionsInfo["TMAPDisplayMsg"], re.I) is None:
        # An explicitly specified message is used as is...
        return

    if MolCount != ValidMolCount:
        DisplayMsg = (
            "TMAP chemspace visualization<br/>Input file: %s<br/>Number of molecules: %s<br/>Number of valid molecules: %s"
            % (OptionsInfo["Infile"], MolCount, ValidMolCount)
        )
    else:
        DisplayMsg = (
            "TMAP chemspace visualization<br/>Input file: %s<br/>Number of molecules: %s"
            % (OptionsInfo["Infile"], MolCount)
        )

    OptionsInfo["TMAPDisplayMsg"] = DisplayMsg
701
702
def ProcessFaerunConfigParametersOption():
    """Process and validate the "--faerunConfigParams" option.

    The name and value pairs specified for the option are combined with
    the default values and stored in OptionsInfo["FaerunConfigParams"].
    LegendOrientation must be vertical or horizontal (case insensitive) and
    is stored in lower case. Scale and ThumbnailWidth must be greater than
    zero. Invalid values are reported through MiscUtil.PrintError.
    """

    OptionName = "--faerunConfigParams"
    OptionValue = Options[OptionName]
    DefaultInfo = {
        "ClearColor": ["str", "#000000"],
        "ShowLegend": ["bool", True],
        "LegendTitle": ["str", "Legend"],
        "LegendOrientation": ["str", "vertical"],
        "LegendNumberFormat": ["str", "{:.2f}"],
        "Scale": ["float", 750.0],
        "AlphaBlending": ["bool", False],
        "AntiAliasing": ["bool", True],
        "ThumbnailWidth": ["int", 250],
        "ThumbnailFixed": ["bool", False],
    }

    ConfigParams = MiscUtil.ProcessOptionNameValuePairParameters(OptionName, OptionValue, DefaultInfo)

    # Legend orientation is validated ignoring case and then normalized...
    Orientation = ConfigParams["LegendOrientation"]
    if re.match("^(vertical|horizontal)$", Orientation, re.I) is None:
        MiscUtil.PrintError(
            'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: vertical or horizontal\n'
            % (Orientation, "LegendOrientation", OptionName)
        )
    ConfigParams["LegendOrientation"] = Orientation.lower()

    # Parameters required to be strictly positive...
    for PositiveParamName in ("Scale", "ThumbnailWidth"):
        PositiveParamValue = ConfigParams[PositiveParamName]
        if PositiveParamValue > 0:
            continue
        MiscUtil.PrintError(
            'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
            % (PositiveParamValue, PositiveParamName, OptionName)
        )

    OptionsInfo["FaerunConfigParams"] = ConfigParams
743
744
def ProcessFaerunScatterPlotParamsOption():
    """Process and validate the "--faerunScatterPlotParams" option.

    The name and value pairs specified for the option are combined with
    the default values and stored in OptionsInfo["FaerunScatterPlotParams"].
    PointScale is either kept as "auto" or converted to a float which must
    be greater than zero. MaxPointSize must be greater than zero and
    FogIntensity must not be negative. Invalid values are reported through
    MiscUtil.PrintError.
    """

    OptionName = "--faerunScatterPlotParams"
    OptionValue = Options[OptionName]
    DefaultInfo = {
        "Shader": ["str", "circle"],
        "PointScale": ["str", "auto"],
        "MaxPointSize": ["float", 100.0],
        "FogIntensity": ["float", 0.0],
        "Interactive": ["bool", True],
    }

    ScatterParams = MiscUtil.ProcessOptionNameValuePairParameters(OptionName, OptionValue, DefaultInfo)

    # An "auto" point scale is left as is here; an explicit point scale is
    # converted to a positive float...
    PointScale = ScatterParams["PointScale"]
    if re.match("^auto$", PointScale, re.I) is None:
        if not MiscUtil.IsFloat(PointScale):
            MiscUtil.PrintError(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option must be a float.'
                % (PointScale, "PointScale", OptionName)
            )
        PointScale = float(PointScale)
        if PointScale <= 0:
            MiscUtil.PrintError(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
                % (PointScale, "PointScale", OptionName)
            )
        ScatterParams["PointScale"] = PointScale

    MaxPointSize = ScatterParams["MaxPointSize"]
    if MaxPointSize <= 0:
        MiscUtil.PrintError(
            'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
            % (MaxPointSize, "MaxPointSize", OptionName)
        )

    FogIntensity = ScatterParams["FogIntensity"]
    if FogIntensity < 0:
        MiscUtil.PrintError(
            'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: >= 0\n'
            % (FogIntensity, "FogIntensity", OptionName)
        )

    OptionsInfo["FaerunScatterPlotParams"] = ScatterParams
795
796
def ProcessLSHForestParamsOption():
    """Process and validate the "--lshForestParams" option.

    The name and value pairs specified for the option are combined with
    the default values and stored in OptionsInfo["LSHForestParams"]. Dim
    must be greater than zero. NumPrefixTrees is either kept as "auto" or
    converted to an integer which must be greater than zero. Invalid values
    are reported through MiscUtil.PrintError.
    """

    OptionName = "--lshForestParams"
    OptionValue = Options[OptionName]
    DefaultInfo = {
        "Dim": ["int", 2048],
        "NumPrefixTrees": ["str", "auto"],
        "Store": ["bool", True],
    }

    ForestParams = MiscUtil.ProcessOptionNameValuePairParameters(OptionName, OptionValue, DefaultInfo)

    Dim = ForestParams["Dim"]
    if Dim <= 0:
        MiscUtil.PrintError(
            'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
            % (Dim, "Dim", OptionName)
        )

    # An "auto" value is left as is here; an explicit number of prefix
    # trees is converted to a positive integer...
    NumPrefixTrees = ForestParams["NumPrefixTrees"]
    if re.match("^auto$", NumPrefixTrees, re.I) is None:
        if not MiscUtil.IsInteger(NumPrefixTrees):
            MiscUtil.PrintError(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option must be an integer.'
                % (NumPrefixTrees, "NumPrefixTrees", OptionName)
            )
        NumPrefixTrees = int(NumPrefixTrees)
        if NumPrefixTrees <= 0:
            MiscUtil.PrintError(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
                % (NumPrefixTrees, "NumPrefixTrees", OptionName)
            )
        ForestParams["NumPrefixTrees"] = NumPrefixTrees

    OptionsInfo["LSHForestParams"] = ForestParams
833
834
def ProcessLSHLayoutConfigParamsOption():
    """Process the "--lshLayoutConfigParams" option.

    The name and value pairs specified for the option are parsed against a
    table of supported parameter names, types, and default values. Typed
    numerical values are range checked, explicit values specified in place of
    "auto" are converted to numbers, and the names specified for SLScalingType,
    Placer, and Merger are replaced by the corresponding TMAP objects. The
    processed parameters are stored in OptionsInfo["LSHLayoutConfigParams"].

    Invalid values are reported using MiscUtil.PrintError.
    """

    OptionName = "--lshLayoutConfigParams"
    OptionValue = Options[OptionName]

    # Supported parameter names along with their types and default values...
    DefaultsInfo = {
        "K": ["str", "auto"],
        "KC": ["str", "auto"],
        "FMEIterations": ["int", 1000],
        "FMERandomize": ["bool", False],
        "FMEThreads": ["int", 4],
        "FMEPrecision": ["int", 4],
        "SLRepeats": ["str", "auto"],
        "SLExtraScalingSteps": ["str", "auto"],
        "SLScalingMin": ["float", 1.0],
        "SLScalingMax": ["float", 1.0],
        "SLScalingType": ["str", "RelativeToDrawing"],
        "MMMRepeats": ["str", "auto"],
        "Placer": ["str", "Barycenter"],
        "Merger": ["str", "LocalBiconnected"],
        "MergerFactor": ["float", 2.0],
        "MergerAdjustment": ["int", 0],
        "NodeSizeDenominator": ["str", "auto"],
    }

    LayoutParams = MiscUtil.ProcessOptionNameValuePairParameters(OptionName, OptionValue, DefaultsInfo)

    # Message formats used during validation...
    MustBePositiveMsg = 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
    MustBeNonNegativeMsg = 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: >= 0\n'
    MustBeFloatMsg = 'The parameter value, %s, specified for parameter name, %s, using "%s" option must be a float.'
    MustBeIntegerMsg = (
        'The parameter value, %s, specified for parameter name, %s, using "%s" option must be an integer.'
    )

    # Range check typed numerical values. MergerAdjustment may be zero; all
    # other values must be positive...
    NumericalNames = (
        "FMEIterations",
        "FMEThreads",
        "FMEPrecision",
        "SLScalingMin",
        "SLScalingMax",
        "MergerFactor",
        "MergerAdjustment",
    )
    for Name in NumericalNames:
        Value = LayoutParams[Name]
        if re.match("^%s$" % Name, "MergerAdjustment", re.I):
            OutOfRange, MsgFormat = Value < 0, MustBeNonNegativeMsg
        else:
            OutOfRange, MsgFormat = Value <= 0, MustBePositiveMsg
        if OutOfRange:
            MiscUtil.PrintError(MsgFormat % (Value, Name, OptionName))

    # Convert explicit values specified in place of "auto" to numbers. The
    # "auto" values are left as is...
    for Name in ("K", "KC", "SLRepeats", "SLExtraScalingSteps", "MMMRepeats", "NodeSizeDenominator"):
        Value = LayoutParams[Name]
        if re.match("^auto$", Value, re.I):
            continue

        # NodeSizeDenominator is a float; the rest are integers...
        if re.match("^NodeSizeDenominator$", Name, re.I):
            IsValidNumber, ConvertToNumber, MsgFormat = MiscUtil.IsFloat, float, MustBeFloatMsg
        else:
            IsValidNumber, ConvertToNumber, MsgFormat = MiscUtil.IsInteger, int, MustBeIntegerMsg

        if not IsValidNumber(Value):
            MiscUtil.PrintError(MsgFormat % (Value, Name, OptionName))
        Value = ConvertToNumber(Value)

        if Value <= 0:
            MiscUtil.PrintError(MustBePositiveMsg % (Value, Name, OptionName))
        LayoutParams[Name] = Value

    # Replace SLScalingType name by TMAP object...
    MapLSHLayoutConfigParamToTMAPObject(
        LayoutParams,
        OptionName,
        "SLScalingType",
        {
            "Absolute": tm.ScalingType.Absolute,
            "RelativeToAvgLength": tm.ScalingType.RelativeToAvgLength,
            "RelativeToDesiredLength": tm.ScalingType.RelativeToDesiredLength,
            "RelativeToDrawing": tm.ScalingType.RelativeToDrawing,
        },
    )

    # Replace Placer name by TMAP object...
    MapLSHLayoutConfigParamToTMAPObject(
        LayoutParams,
        OptionName,
        "Placer",
        {
            "Barycenter": tm.Placer.Barycenter,
            "Solar": tm.Placer.Solar,
            "Circle": tm.Placer.Circle,
            "Median": tm.Placer.Median,
            "Random": tm.Placer.Random,
            "Zero": tm.Placer.Zero,
        },
    )

    # Replace Merger name by TMAP object...
    MapLSHLayoutConfigParamToTMAPObject(
        LayoutParams,
        OptionName,
        "Merger",
        {
            "EdgeCover": tm.Merger.EdgeCover,
            "LocalBiconnected": tm.Merger.LocalBiconnected,
            "Solar": tm.Merger.Solar,
            "IndependentSet": tm.Merger.IndependentSet,
        },
    )

    OptionsInfo["LSHLayoutConfigParams"] = LayoutParams
947
948
def MapLSHLayoutConfigParamToTMAPObject(LSHLayoutConfigParams, ParamsOptionName, ParamName, ParamInfo):
    """Replace a LSH layout configuration parameter value by its TMAP object.

    Arguments:
        LSHLayoutConfigParams (dict): Parameter names and values. The entry
            for ParamName is updated in place.
        ParamsOptionName (str): Option name used in the error message.
        ParamName (str): Name of the parameter to map.
        ParamInfo (dict): Supported parameter values and corresponding
            objects. The lookup is case sensitive.

    Returns:
        None

    An unsupported value is reported using MiscUtil.PrintError along with a
    sorted list of supported values.
    """

    SpecifiedValue = LSHLayoutConfigParams[ParamName]
    if SpecifiedValue not in ParamInfo:
        SupportedValues = ", ".join(sorted(ParamInfo.keys()))
        MiscUtil.PrintError(
            'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: %s\n'
            % (SpecifiedValue, ParamName, ParamsOptionName, SupportedValues)
        )

    LSHLayoutConfigParams[ParamName] = ParamInfo[SpecifiedValue]
959
960
def ProcessMinHashFPParamsOption():
    """Process the "--minHashFPParams" option.

    The name and value pairs specified for the option are parsed against a
    table of supported parameter names, types, and default values. The values
    of Radius, MinRadius, and NumPermutations must be greater than zero;
    violations are reported using MiscUtil.PrintError. The processed
    parameters are stored in OptionsInfo["MinHashFPParams"].
    """

    OptionName = "--minHashFPParams"
    OptionValue = Options[OptionName]

    # Supported parameter names along with their types and default values...
    DefaultsInfo = {
        "Radius": ["int", 3],
        "Rings": ["bool", True],
        "Kekulize": ["bool", True],
        "Sanitize": ["bool", True],
        "MinRadius": ["int", 1],
        "NumPermutations": ["int", 2048],
        "Seed": ["int", 42],
    }

    FPParams = MiscUtil.ProcessOptionNameValuePairParameters(OptionName, OptionValue, DefaultsInfo)

    # Only these parameters are range checked...
    PositiveOnlyNames = ("Radius", "MinRadius", "NumPermutations")
    for Name in PositiveOnlyNames:
        Value = FPParams[Name]
        if Value <= 0:
            MiscUtil.PrintError(
                'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: > 0\n'
                % (Value, Name, OptionName)
            )

    OptionsInfo["MinHashFPParams"] = FPParams
989
990
def ProcessInfileDelimiterOption():
    """Process the "--infileDelimiter" option.

    For the value "auto", the delimiter is chosen from the extension of
    OptionsInfo["Infile"]: comma for csv; tab for tsv or txt; space for smi.
    Otherwise, the specified value is used. The delimiter names are matched
    ignoring case, in line with the handling of other option values in this
    script. The delimiter character is stored in OptionsInfo["InfileDelimiter"].

    An unrecognized file extension or delimiter name is reported using
    MiscUtil.PrintError.
    """

    InfileDelim = Options["--infileDelimiter"]
    if re.match("^auto$", InfileDelim, re.I):
        FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Infile"])
        if re.match("^csv$", FileExt, re.I):
            InfileDelim = "comma"
        elif re.match("^(tsv|txt)$", FileExt, re.I):
            InfileDelim = "tab"
        elif re.match("^(smi)$", FileExt, re.I):
            InfileDelim = "space"
        else:
            # Report the file extension instead of the "auto" option value...
            MiscUtil.PrintError(
                'The input file delimiter couldn\'t be determined from its extension, %s. You must explicitly specify an input file delimiter using option "--infileDelimiter".\n'
                % (FileExt)
            )

    InfileDelimMap = {"comma": ",", "tab": "\t", "space": " "}

    # Lower case the name to avoid a KeyError for values such as "Comma"...
    InfileDelimName = InfileDelim.lower()
    if InfileDelimName not in InfileDelimMap:
        MiscUtil.PrintError(
            'The value, %s, specified for option "--infileDelimiter" is not valid. Supported values: comma, tab, or space\n'
            % (InfileDelim)
        )

    OptionsInfo["InfileDelimiter"] = InfileDelimMap[InfileDelimName]
1011
1012
def ProcessColumnModeOption():
    """Process the "-c, --colmode" option.

    The specified value is matched, ignoring case, against "collabel" and
    "colnum". The value along with two mode flags is stored in OptionsInfo
    under the keys "Colmode", "CollabelMode", and "ColnumMode". Any other
    value is reported using MiscUtil.PrintError.
    """

    Colmode = Options["--colmode"]

    UseCollabels = bool(re.match("^collabel$", Colmode, re.I))
    UseColnums = (not UseCollabels) and bool(re.match("^colnum$", Colmode, re.I))

    if not (UseCollabels or UseColnums):
        MiscUtil.PrintError(
            'The value, %s, specified for option "-c, --colmode" is not valid. Supported values: collabel or colnum\n'
            % (Colmode)
        )

    OptionsInfo["Colmode"] = Colmode
    OptionsInfo["CollabelMode"] = UseCollabels
    OptionsInfo["ColnumMode"] = UseColnums
1031
1032
def RetrieveColumnNames():
    """Retrieve column names from the first line of the input file.

    The first line of OptionsInfo["Infile"] is read using csv.reader along
    with the delimiter available in OptionsInfo["InfileDelimiter"] and a
    double quote as the quote character. The following keys are set in
    OptionsInfo:

        Colnames: List of column names
        ColCount: Number of columns
        ColnameToColnumMap: Column name to column number map
        ColnumToColnameMap: Column number to column name map
        SpecifiedColsInfo: Empty data structure for tracking column names
            specified using various options

    The column numbers start from 1. For duplicate column names, the last
    column number is retained in ColnameToColnumMap.

    A missing or empty first line is reported using MiscUtil.PrintError.
    """

    Infile = OptionsInfo["Infile"]

    # Use a context manager to close the file even after a read failure. The
    # default value for next() handles a file without any lines...
    with open(Infile, "r") as InfileFH:
        InfileReader = csv.reader(InfileFH, delimiter=OptionsInfo["InfileDelimiter"], quotechar='"')
        Colnames = next(InfileReader, [])

    if not Colnames:
        MiscUtil.PrintError("The first line in input file, %s, is empty. It must contain column names.\n" % Infile)

    ColnameToColnumMap = {}
    ColnumToColnameMap = {}
    for Colnum, Colname in enumerate(Colnames, 1):
        ColnameToColnumMap[Colname] = Colnum
        ColnumToColnameMap[Colnum] = Colname

    OptionsInfo["Colnames"] = Colnames
    OptionsInfo["ColCount"] = len(Colnames)
    OptionsInfo["ColnameToColnumMap"] = ColnameToColnumMap
    OptionsInfo["ColnumToColnameMap"] = ColnumToColnameMap

    # Initialize for tracking specified column names...
    OptionsInfo["SpecifiedColsInfo"] = {"Colnames": [], "Colnum": {}, "OptionName": {}}
1065
1066
def ProcessSMILESColOption():
    """Process the "--colSMILES" option.

    For the value "auto", a column named "SMILES" must be present in the input
    file. Its name or number, depending on OptionsInfo["ColnumMode"], is used
    as the column specification. Otherwise, the specified value is used. The
    column specification is resolved using ProcessColumnSpecification and the
    results are stored in OptionsInfo under the keys "SMILESCol",
    "SMILESColname", and "SMILESColnum".
    """

    SMILESCol = Options["--colSMILES"]

    # Default to the specified value; override it for "auto"...
    Colspec = SMILESCol
    if re.match("^auto$", SMILESCol, re.I):
        AutoColname = "SMILES"
        ColnameToColnumMap = OptionsInfo["ColnameToColnumMap"]
        if AutoColname not in ColnameToColnumMap:
            MiscUtil.PrintError(
                'The SMILES column name, %s, doen\'t exist in input file. You must specify a valid SMILES column name or number using "--colSMILES" option.\n'
                % AutoColname
            )

        AutoColnum = ColnameToColnumMap[AutoColname]
        if OptionsInfo["ColnumMode"]:
            Colspec = AutoColnum
        else:
            Colspec = AutoColname

    ResolvedColname, ResolvedColnum = ProcessColumnSpecification("--colSMILES", Colspec)

    OptionsInfo["SMILESCol"] = SMILESCol
    OptionsInfo["SMILESColname"] = ResolvedColname
    OptionsInfo["SMILESColnum"] = ResolvedColnum
1089
1090
def ProcessCategoricalDataColsOption():
    """Process the "--categoricalDataCols" option.

    For the value "none", the column names and numbers are set to None.
    Otherwise, each entry in the comma delimited list is stripped and resolved
    using ProcessColumnSpecification. The results are stored in OptionsInfo
    under the keys "CategoricalDataCols", "CategoricalDataColnames", and
    "CategoricalDataColnums".
    """

    CategoricalDataCols = Options["--categoricalDataCols"]

    Colnames = None
    Colnums = None
    if not re.match("^none$", CategoricalDataCols, re.I):
        Colnames, Colnums = [], []
        for Colspec in CategoricalDataCols.split(","):
            Colname, Colnum = ProcessColumnSpecification("--categoricalDataCols", Colspec.strip())
            Colnames.append(Colname)
            Colnums.append(Colnum)

    OptionsInfo["CategoricalDataCols"] = CategoricalDataCols
    OptionsInfo["CategoricalDataColnames"] = Colnames
    OptionsInfo["CategoricalDataColnums"] = Colnums
1108
1109
def ProcessCategoricalDataColormapsOption():
    """Process the "--categoricalDataColormaps" option.

    No color map list is generated without categorical data columns: the
    option value is stored as is and the list is set to None. Otherwise, the
    list contains one color map name for each categorical data column. For the
    value "auto", the color map name "tab10" is used for all columns. The color
    map names are not validated.

    The results are stored in OptionsInfo under the keys
    "CategoricalDataColormaps" and "CategoricalDataColormapsList".

    A mismatch between the number of color maps and columns is reported using
    MiscUtil.PrintError, in line with the check performed in ValidateOptions,
    instead of an informational message.
    """

    CategoricalDataColormaps = Options["--categoricalDataColormaps"]

    if OptionsInfo["CategoricalDataColnames"] is None:
        OptionsInfo["CategoricalDataColormaps"] = CategoricalDataColormaps
        OptionsInfo["CategoricalDataColormapsList"] = None
        return

    CategoricalDataColCount = len(OptionsInfo["CategoricalDataColnames"])

    if re.match("^auto$", CategoricalDataColormaps, re.I):
        CategoricalDataColormapsList = ["tab10"] * CategoricalDataColCount
    else:
        CategoricalDataColormapsList = [Colormap.strip() for Colormap in CategoricalDataColormaps.split(",")]
        if len(CategoricalDataColormapsList) != CategoricalDataColCount:
            MiscUtil.PrintError(
                'The number of colormaps, %s, specified using "--categoricalDataColormaps" must be equal to the number of columns, %s, specified using "--categoricalDataCols" option.'
                % (len(CategoricalDataColormapsList), CategoricalDataColCount)
            )

    OptionsInfo["CategoricalDataColormaps"] = CategoricalDataColormaps
    OptionsInfo["CategoricalDataColormapsList"] = CategoricalDataColormapsList
1137
1138
def ProcessNumericalDataColsOption():
    """Process the "--numericalDataCols" option.

    For the value "none", the column names and numbers are set to None.
    Otherwise, each entry in the comma delimited list is stripped and resolved
    using ProcessColumnSpecification. The results are stored in OptionsInfo
    under the keys "NumericalDataCols", "NumericalDataColnames", and
    "NumericalDataColnums".
    """

    NumericalDataCols = Options["--numericalDataCols"]

    Colnames = None
    Colnums = None
    if not re.match("^none$", NumericalDataCols, re.I):
        Colnames, Colnums = [], []
        for Colspec in NumericalDataCols.split(","):
            Colname, Colnum = ProcessColumnSpecification("--numericalDataCols", Colspec.strip())
            Colnames.append(Colname)
            Colnums.append(Colnum)

    OptionsInfo["NumericalDataCols"] = NumericalDataCols
    OptionsInfo["NumericalDataColnames"] = Colnames
    OptionsInfo["NumericalDataColnums"] = Colnums
1156
1157
def ProcessNumericalDataColormapsOption():
    """Process the "--numericalDataColormaps" option.

    No color map list is generated without numerical data columns: the option
    value is stored as is and the list is set to None. Otherwise, the list
    contains one color map name for each numerical data column. For the value
    "auto", the color map name "viridis" is used for all columns. The color
    map names are not validated.

    The results are stored in OptionsInfo under the keys
    "NumericalDataColormaps" and "NumericalDataColormapsList".

    A mismatch between the number of color maps and columns is reported using
    MiscUtil.PrintError, in line with the check performed in ValidateOptions.
    The message refers to the numerical options instead of the categorical
    options.
    """

    NumericalDataColormaps = Options["--numericalDataColormaps"]

    if OptionsInfo["NumericalDataColnames"] is None:
        OptionsInfo["NumericalDataColormaps"] = NumericalDataColormaps
        OptionsInfo["NumericalDataColormapsList"] = None
        return

    NumericalDataColCount = len(OptionsInfo["NumericalDataColnames"])

    if re.match("^auto$", NumericalDataColormaps, re.I):
        NumericalDataColormapsList = ["viridis"] * NumericalDataColCount
    else:
        NumericalDataColormapsList = [Colormap.strip() for Colormap in NumericalDataColormaps.split(",")]
        if len(NumericalDataColormapsList) != NumericalDataColCount:
            MiscUtil.PrintError(
                'The number of colormaps, %s, specified using "--numericalDataColormaps" must be equal to the number of columns, %s, specified using "--numericalDataCols" option.'
                % (len(NumericalDataColormapsList), NumericalDataColCount)
            )

    OptionsInfo["NumericalDataColormaps"] = NumericalDataColormaps
    OptionsInfo["NumericalDataColormapsList"] = NumericalDataColormapsList
1185
1186
def ProcessStructureDisplayDataColsOption():
    """Process the "--structureDisplayDataCols" option.

    The SMILES column is always the first structure display data column. For
    the value "auto", the column named "Name" is added when it is present in
    the input file. Otherwise, each entry in the comma delimited list is
    resolved to a column name and number based on OptionsInfo["ColnumMode"].

    The columns are not tracked in OptionsInfo["SpecifiedColsInfo"]. They are
    only checked for duplicates within this option, including the SMILES
    column.

    The results are stored in OptionsInfo under the keys
    "StructureDisplayDataCols", "StructureDisplayDataColnames", and
    "StructureDisplayDataColnums".

    Invalid, missing, or duplicate columns are reported using
    MiscUtil.PrintError.
    """

    OptionName = "--structureDisplayDataCols"
    StructureDisplayDataCols = Options[OptionName]

    # Add SMILES column...
    StructureDisplayDataColnames = [OptionsInfo["SMILESColname"]]
    StructureDisplayDataColnums = [OptionsInfo["SMILESColnum"]]

    # Process specified columns...
    if re.match("^auto$", StructureDisplayDataCols, re.I):
        # Automatically add 'Name' column...
        Colname = "Name"
        if Colname in OptionsInfo["ColnameToColnumMap"]:
            StructureDisplayDataColnames.append(Colname)
            StructureDisplayDataColnums.append(OptionsInfo["ColnameToColnumMap"][Colname])
    else:
        for DataCol in StructureDisplayDataCols.split(","):
            DataCol = DataCol.strip()
            if OptionsInfo["ColnumMode"]:
                # ValidateOptions skips the integer check for the value
                # "none". Report it here instead of a ValueError from int()...
                if not MiscUtil.IsInteger(DataCol):
                    MiscUtil.PrintError(
                        'The column number, %s, specified using "%s" option must be an integer.\n'
                        % (DataCol, OptionName)
                    )
                Colnum = int(DataCol)
                if Colnum not in OptionsInfo["ColnumToColnameMap"]:
                    MiscUtil.PrintError(
                        'The column number, %s, specified using "%s" option doesn\'t exist in input file. You must specify a valid column number. Valid values: >= 1 and <= %s\n'
                        % (Colnum, OptionName, OptionsInfo["ColCount"])
                    )
                Colname = OptionsInfo["ColnumToColnameMap"][Colnum]
            else:
                Colname = DataCol
                if Colname not in OptionsInfo["ColnameToColnumMap"]:
                    MiscUtil.PrintError(
                        'The column name, %s, specified using "%s" option doesn\'t exist in input file. You must specify a valid column name. Valid values: %s\n'
                        % (Colname, OptionName, " ".join(OptionsInfo["Colnames"]))
                    )
                Colnum = OptionsInfo["ColnameToColnumMap"][Colname]

            if Colname in StructureDisplayDataColnames:
                StructureDisplayDataColnumsStrs = ["%s" % Num for Num in StructureDisplayDataColnums]
                if OptionsInfo["ColnumMode"]:
                    MiscUtil.PrintError(
                        'The column number, %s, specified using "%s" option is a duplicate column number. It has already been used for this option. You must specify a different column number. Used column names: %s; Used column nums: %s\n'
                        % (
                            Colnum,
                            OptionName,
                            " ".join(StructureDisplayDataColnames),
                            " ".join(StructureDisplayDataColnumsStrs),
                        )
                    )
                else:
                    MiscUtil.PrintError(
                        'The column name, %s, specified using "%s" option is a duplicate column name. It has already been used for this option. You must specify a different column name. Used column names: %s; Used column nums: %s\n'
                        % (
                            Colname,
                            OptionName,
                            " ".join(StructureDisplayDataColnames),
                            " ".join(StructureDisplayDataColnumsStrs),
                        )
                    )

            StructureDisplayDataColnames.append(Colname)
            StructureDisplayDataColnums.append(Colnum)

    OptionsInfo["StructureDisplayDataCols"] = StructureDisplayDataCols
    OptionsInfo["StructureDisplayDataColnames"] = StructureDisplayDataColnames
    OptionsInfo["StructureDisplayDataColnums"] = StructureDisplayDataColnums
1256
1257
def ProcessColumnSpecification(OptionName, Colspec):
    """Resolve a column specification to a column name and number.

    Arguments:
        OptionName (str): Name of the option used to specify the column. It is
            used in error messages and for tracking the column.
        Colspec (str or int): Column number for OptionsInfo["ColnumMode"];
            otherwise, a column name.

    Returns:
        tuple: Column name and column number.

    The column must be present in the input file and must not have been
    resolved earlier by this function for any option. The resolved columns are
    tracked in OptionsInfo["SpecifiedColsInfo"]. Violations are reported using
    MiscUtil.PrintError.
    """

    ColnumMode = OptionsInfo["ColnumMode"]

    if ColnumMode:
        Colnum = int(Colspec)
        ColnumToColnameMap = OptionsInfo["ColnumToColnameMap"]
        if Colnum not in ColnumToColnameMap:
            MiscUtil.PrintError(
                'The column number, %s, specified using "%s" option doesn\'t exist in input file. You must specify a valid column number. Valid values: >= 1 and <= %s\n'
                % (Colnum, OptionName, OptionsInfo["ColCount"])
            )
        Colname = ColnumToColnameMap[Colnum]
    else:
        Colname = Colspec
        ColnameToColnumMap = OptionsInfo["ColnameToColnumMap"]
        if Colname not in ColnameToColnumMap:
            MiscUtil.PrintError(
                'The column name, %s, specified using "%s" option doesn\'t exist in input file. You must specify a valid column name. Valid values: %s\n'
                % (Colname, OptionName, " ".join(OptionsInfo["Colnames"]))
            )
        Colnum = ColnameToColnumMap[Colname]

    # Track a new column and return...
    TrackedColsInfo = OptionsInfo["SpecifiedColsInfo"]
    if Colname not in TrackedColsInfo["Colnames"]:
        TrackedColsInfo["Colnames"].append(Colname)
        TrackedColsInfo["Colnum"][Colname] = Colnum
        TrackedColsInfo["OptionName"][Colname] = OptionName
        return (Colname, Colnum)

    # Report duplicate column along with the option which used it earlier...
    EarlierOptionName = TrackedColsInfo["OptionName"][Colname]
    if ColnumMode:
        MiscUtil.PrintError(
            'The column number, %s, specified using "%s" option is a duplicate column number. It has already been used for "%s" option. You must specify a different column number.\n'
            % (Colnum, OptionName, EarlierOptionName)
        )
    else:
        MiscUtil.PrintError(
            'The column name, %s, specified using "%s" option is a duplicate column name. It has already been used for "%s" option. You must specify a different column name.\n'
            % (Colname, OptionName, EarlierOptionName)
        )

    return (Colname, Colnum)
1298
1299
def ProcessOptions():
    """Process command line options and set up OptionsInfo.

    The options are validated using ValidateOptions before the processing of
    individual options. The order of the processing steps matters: the input
    file delimiter is required for retrieving column names, and the column
    names and column mode are required for processing column options.
    """

    def IsYes(OptionName):
        """Check whether the option value matches "yes" ignoring case."""
        return bool(re.match("^yes$", Options[OptionName], re.I))

    MiscUtil.PrintInfo("Processing options...")

    # Validate options...
    ValidateOptions()

    OptionsInfo["Infile"] = Options["--infile"]

    # Set up output file names using output file root...
    Outfile = Options["--outfile"]
    _, OutfileRoot, OutfileExt = MiscUtil.ParseFileName(Outfile)
    OptionsInfo["OutfilePrefix"] = OutfileRoot
    OptionsInfo["OutfileExt"] = OutfileExt

    OptionsInfo["Outfile"] = Outfile
    OptionsInfo["OutfileJS"] = "%s.js" % OutfileRoot
    OptionsInfo["OutfileLSHForest"] = "%s.dat" % OutfileRoot

    # Input file layout...
    ProcessInfileDelimiterOption()
    RetrieveColumnNames()

    # Column options...
    ProcessColumnModeOption()
    ProcessSMILESColOption()

    OptionsInfo["CategoricalDataMaxDisplay"] = int(Options["--categoricalDataMaxDisplay"])
    ProcessCategoricalDataColsOption()
    ProcessCategoricalDataColormapsOption()

    ProcessNumericalDataColsOption()
    ProcessNumericalDataColormapsOption()

    ProcessStructureDisplayDataColsOption()

    # Faerun options...
    ProcessFaerunConfigParametersOption()
    ProcessFaerunScatterPlotParamsOption()

    # LSH forest options. The LSH forest file must exist for restoring it...
    OptionsInfo["LSHForestFileWriteMode"] = IsYes("--lshForestFileWrite")
    OptionsInfo["LSHForestFileRestoreMode"] = IsYes("--lshForestFileRestore")
    if OptionsInfo["LSHForestFileRestoreMode"] and not os.path.isfile(OptionsInfo["OutfileLSHForest"]):
        MiscUtil.PrintError(
            'The LSH forest file, %s, must be present for, %s, value of "--lshForestFileRestore" option.'
            % (OptionsInfo["OutfileLSHForest"], Options["--lshForestFileRestore"])
        )

    ProcessLSHForestParamsOption()
    ProcessLSHLayoutConfigParamsOption()

    OptionsInfo["MergeHTMLandJSFilesMode"] = IsYes("--mergeHTMLandJSFiles")

    ProcessMinHashFPParamsOption()

    # Multiprocessing options...
    OptionsInfo["MPMode"] = IsYes("--mp")
    OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])

    OptionsInfo["Overwrite"] = Options["--overwrite"]
    OptionsInfo["QuietMode"] = IsYes("--quiet")

    OptionsInfo["TMAPDisplayMsg"] = Options["--tmapDisplayMsg"]
1365
1366
def RetrieveOptions():
    """Retrieve command line options into the global Options.

    The command line is parsed using docopt along with the usage text of this
    script. The current working directory is changed to the directory
    specified using "--workingdir" option. For "--examples" option, the
    examples available in the usage text are printed before exiting with
    status 0.
    """

    global Options

    # Get options...
    Options = docopt(_docoptUsage_)

    # Set current working directory to the specified directory...
    SpecifiedWorkingDir = Options["--workingdir"]
    if SpecifiedWorkingDir:
        os.chdir(SpecifiedWorkingDir)

    # Handle examples option...
    ExamplesRequested = "--examples" in Options and Options["--examples"]
    if not ExamplesRequested:
        return

    ExamplesText = MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)
    MiscUtil.PrintInfo(ExamplesText)
    sys.exit(0)
1383
1384
def ValidateOptions():
    """Validate option values available in the global Options.

    The validation covers input and output file names, column mode, column
    numbers for column number mode, color map counts, and yes or no options.
    At least one categorical or numerical data column must be specified.

    The column specifications are only validated as positive integers for
    column number mode. The existence of columns in the input file is checked
    during the processing of options.

    The failures are reported by MiscUtil validation functions or
    MiscUtil.PrintError. The error messages refer to option names exactly as
    they are specified on the command line.
    """

    MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
    MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "smi csv tsv txt")

    MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "html")
    MiscUtil.ValidateOptionsOutputFileOverwrite(
        "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
    )
    MiscUtil.ValidateOptionsDistinctFileNames(
        "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]
    )

    MiscUtil.ValidateOptionTextValue("-c, --colmode", Options["--colmode"], "collabel colnum")

    # At least one data column is required to color TMAP...
    if re.match("^none$", Options["--categoricalDataCols"], re.I) and re.match(
        "^none$", Options["--numericalDataCols"], re.I
    ):
        MiscUtil.PrintError(
            'You must specify at least one categorical or numerical data column using option "--categoricalDataCols" or "--numericalDataCols". It is used to color TMAP.'
        )

    # Column specifications must be positive integers for column number mode...
    ColnumMode = True if re.match("^colnum$", Options["--colmode"], re.I) else False
    if ColnumMode and not re.match("^auto$", Options["--colSMILES"], re.I):
        MiscUtil.ValidateOptionIntegerValue("--colSMILES", Options["--colSMILES"], {">": 0})

    if ColnumMode and not re.match("^none$", Options["--categoricalDataCols"], re.I):
        MiscUtil.ValidateOptionNumberValues(
            "--categoricalDataCols", Options["--categoricalDataCols"], 0, ",", "integer", {">": 0}
        )

    MiscUtil.ValidateOptionIntegerValue("--categoricalDataMaxDisplay", Options["--categoricalDataMaxDisplay"], {">": 0})

    # Number of explicit color maps must match the number of columns...
    if not re.match("^auto$", Options["--categoricalDataColormaps"], re.I):
        ColormapCount = len(Options["--categoricalDataColormaps"].split(","))
        ColCount = len(Options["--categoricalDataCols"].split(","))
        if ColormapCount != ColCount:
            MiscUtil.PrintError(
                'The number of colormaps, %s, specified using option "--categoricalDataColormaps" must be equal to number of columns, %s, specified using option "--categoricalDataCols". '
                % (ColormapCount, ColCount)
            )

    if ColnumMode and not re.match("^none$", Options["--numericalDataCols"], re.I):
        MiscUtil.ValidateOptionNumberValues(
            "--numericalDataCols", Options["--numericalDataCols"], 0, ",", "integer", {">": 0}
        )

    if not re.match("^auto$", Options["--numericalDataColormaps"], re.I):
        ColormapCount = len(Options["--numericalDataColormaps"].split(","))
        ColCount = len(Options["--numericalDataCols"].split(","))
        if ColormapCount != ColCount:
            MiscUtil.PrintError(
                'The number of colormaps, %s, specified using option "--numericalDataColormaps" must be equal to number of columns, %s, specified using option "--numericalDataCols". '
                % (ColormapCount, ColCount)
            )

    if not re.match("^auto$", Options["--structureDisplayDataCols"], re.I):
        if ColnumMode and not re.match("^none$", Options["--structureDisplayDataCols"], re.I):
            MiscUtil.ValidateOptionNumberValues(
                "--structureDisplayDataCols", Options["--structureDisplayDataCols"], 0, ",", "integer", {">": 0}
            )

    if not re.match("^auto$", Options["--infileDelimiter"], re.I):
        MiscUtil.ValidateOptionTextValue("--infileDelimiter", Options["--infileDelimiter"], "comma tab space")

    MiscUtil.ValidateOptionTextValue("--lshForestFileWrite", Options["--lshForestFileWrite"], "yes no")
    MiscUtil.ValidateOptionTextValue("--lshForestFileRestore", Options["--lshForestFileRestore"], "yes no")
    MiscUtil.ValidateOptionTextValue("--mergeHTMLandJSFiles", Options["--mergeHTMLandJSFiles"], "yes no")

    MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
1456
1457
1458 # Setup a usage string for docopt...
1459 _docoptUsage_ = """
1460 VisualizeChemspaceUsingTMAP.py - Visualize chemspace
1461
1462 Usage:
1463 VisualizeChemspaceUsingTMAP.py [--categoricalDataCols <collabel1,... or colnum1,...>] [--categoricalDataColormaps <Colormap1, Colormap2,...>]
1464 [--categoricalDataMaxDisplay <number>] [--colmode <collabel or colnum>] [--colSMILES <text or number>]
1465 [--faerunConfigParams <Name,Value,...>] [--faerunScatterPlotParams <Name,Value,...>]
1466 [--infileDelimiter <comma, tab, or space>] [--lshForestFileWrite <yes or no>] [--lshForestFileRestore <yes or no>]
1467 [--lshForestParams <Name,Value,...>] [--lshLayoutConfigParams <Name,Value,...>] [--mergeHTMLandJSFiles <yes or no>]
1468 [--minHashFPParams <Name,Value,...>] [--mp <yes or no>] [--mpParams <Name,Value,...>]
1469 [--numericalDataCols <collabel1,... or colnum1,...>] [--numericalDataColormaps <Colormap1, Colormap2,...>]
1470 [--overwrite] [--quiet <yes or no>] [--structureDisplayDataCols <collabel1,... or colnum1,...> ]
1471 [--tmapDisplayMsg <text>] [-w <dir>] -i <infile> -o <outfile>
1472 VisualizeChemspaceUsingTMAP.py -h | --help | -e | --examples
1473
1474 Description:
1475 Generate an interactive TreeMAP (TMAP) [Ref 171, 172] visualization for molecules
1476 in a text input file. The text input file must have a column containing SMILES strings.
1477 In addition, it must contain at least one column corresponding to categorical or
1478 numerical data for coloring TMAP nodes. You may optionally map multiple categorical
1479 and numerical data columns on to a TMAP visualization. A HTML file is generated for
1480 interactive visualization of chemspace in a browser.
1481
1482 The TMAP methodology is able to generate a reasonably interactive visualization
1483 for relatively large data sets. A brief description of the methodology is as follows.
1484 A set of MinHash Fingerprints (MHFPs) are calculated for molecules in input file
1485 followed by the generation of a Locality Sensitivity Hashing (LSH) forest employing
1486 MHFPs. A c-approximate k-Nearest Neighbor Graph (c-k-NNG) is constructed from
1487 LSH, which is used to construct a Minimum Spanning Tree (MST) or Forest (MSF).
1488 The final TMAP visualization is generated by laying out MST and MSF on a plane
1489 using an algorithm provided by the Open Graph Drawing Framework (OGDF). The
1490 OGDF provides flexibility to adjust graph layout methodology in terms of not only
1491 aesthetics but also computational time.
1492
1493 The supported input file formats are: CSV (.csv) TSV (.txt or .tsv),
1494 SMILES (.smi)
1495
1496 The supported output file format is: HTML (.html).
1497
1498 Options:
1499 --categoricalDataCols <collabel1,... or colnum1,...> [default: none]
1500 A comma delimited list of column labels or numbers corresponding to
1501 categorical data to map on a TMAP visualization.
1502 --categoricalDataColormaps <Colormap1, Colormap2,...> [default: auto]
1503 A comma delimited list of color map names corresponding to categorical
1504 data. The default is to use 'tab10' color map name for mapping categorical
1505 data on a TMAP. The number of specified color maps must match the number
1506 of categorical data columns. You must specify valid color map names
1507 supported by Matplotlib. No validation is performed. Example color map
1508 names for categorical data: Pastel1, Pastel2, Paired, Accent, Dark2, Set1,
1509 Set2, Set3, tab10, tab20, tab20b, tab20c.
1510 --categoricalDataMaxDisplay <number> [default: 6]
1511 Maximum number of categories in a category column to display on a TMAP
1512 visualization. The rest of the categories are aggregated under a new
1513 category named 'Other' before mapping on to a TMAP visualization.
1514 -c, --colmode <collabel or colnum> [default: collabel]
1515 Use column number or name for the specification of columns in input
1516 text file containing SMILES strings and molecule names along with any
1517 categorical or numerical data.
1518 --colSMILES <text or number> [default: auto]
1519 Column name or number corresponding to SMILES strings. The default value
1520 is automatically set based on the value of '-c, --colmode': 'SMILES' for
1521 'collabel'; SMILES string column number for 'colnum'. SMILES strings must
1522 be present in input file.
1523 -e, --examples
1524 Print examples.
1525 --faerunConfigParams <Name,Value,...> [default: auto]
1526 A comma delimited list of parameter name and value pairs for configuring
1527 faerun (Ref 172) to generate a TMAP visualization.
1528
1529 The supported parameter names along with their default and possible
1530 values are shown below:
1531
1532 clearColor, #000000
1533 showLegend, yes [ Possible values: yes or no ]
1534 legendTitle, Legend
1535 legendOrientation, vertical [ Possible values: vertical or
1536 horizontal ]
1537 legendNumberFormat, {:.2f}
1538 scale, 750.0
1539 alphaBlending, no [ Possible values: yes or no ]
1540 antiAliasing, yes [Possible values: yes or no]
1541 thumbnailWidth, 250
1542 thumbnailFixed, no [ Possible values: yes or no ]
1543
1544 A brief description of parameters, as available in the code for faerun, is
1545 provided below:
1546
1547 clearColor: Background color
1548 showLegend: Show legend at lower right
1549 legendTitle: Legend title
1550 legendOrientation: Legend Orientation
1551 legendNumberFormat: Number string format applied to numbers
1552 displayed in legend
1553 scale: Scaling factor for scaling normalized coordinates
1554 AlphaBlending: Activate alpha blending. It is required for smoothCircle
1555 shader.
1556 antiAliasing: Activate anti-aliasing. It might adversly impact
1557 rendering performance.
1558 thumbnailWidth: Width of thumbnail images for structures
1559 thumbnailFixed: Show thumbnail images at a fixed location at the
1560 top instead of next to the mouse
1561
1562 --faerunScatterPlotParams <Name,Value,...> [default: auto]
1563 A comma delimited list of parameter name and value pairs for generating
1564 scatter plot representing a TMAP using faerun (Ref 172).
1565
1566 The supported parameter names along with their default and possible
1567 values are shown below:
1568
1569 shader, circle [ Possible values: circle, smoothCircle,
1570 sphere, or any valid value]
1571 pointScale, auto [ 4 if MolCout<=10K; 2 if MolCount<=100K; else 1 ]
1572 maxPointSize, 100.0
1573 fogIntensity, 0.0
1574 interactive, yes [ Possible values: yes or no ]
1575
1576 A brief description of parameters is provided below:
1577
1578 shader: Shader to use for visualizating data points
1579 pointScale: Relative size of data points
1580 maxPointSize: Maximum size of the data points during zooming
1581 fogIntensity: Intensity of distance fog
1582 interactive: Generate interactive scatter plot
1583
1584 -h, --help
1585 Print this help message.
1586 -i, --infile <infile>
1587 Input file name. The SMILES strings must be present in the input file.
1588 Supported formats: CSV (.csv) TSV (.txt or .tsv), or SMILES (.smi)
1589 --infileDelimiter <comma, tab, or space> [default: auto]
1590 Input file delimiter for processing data. The default value is automatically
1591 set based on the type of input file: comma - CSV (.csv); tab - TSV (.txt or
1592 .tsv); space - SMILES (.smi)
1593 --lshForestFileWrite <yes or no> [default: yes]
1594 Write LSH forest data a file for subsequent generation of a TMAP visualization.
1595 Default file name: <OutfileRoot>_LSHForest.dat. The LSH forest data is
1596 generated using MinHash fingerprints. You may restore LSH forest data
1597 using '--lshForestFileRestore' option to skip the generation of fingerprints.
1598 --lshForestFileRestore <yes or no> [default: no]
1599 Check and restore LSH forest data from a file for generating a TMAP
1600 visualization and skip the generation of MinHash fingerprints. Default file
1601 name: <OutfileRoot>_LSHForest.dat
1602 --lshForestParams <Name,Value,...> [default: auto]
1603 A comma delimited list of parameter name and value pairs for generating
1604 LSH (Locality Sensitivity Hashing) forest from MinHash fingerprints.
1605
1606 The supported parameter names along with their default and possible
1607 values are shown below:
1608
1609 dim, 2048
1610 numPrefixTrees, auto [ 128 if MolCount <= 10K else 8 ]
1611 store, yes [ Possible values: yes or no ]
1612
1613 A brief description of parameters, as available in the code for LSH, is
1614 provided below:
1615
1616 dim: Dimensionality of MinHashes to be added to LSHForest
1617 numPrefixTrees: Number of prefix trees to use
1618 store: store the data for enhanced retrieval
1619
1620 --lshLayoutConfigParams <Name,Value,...> [default: auto]
1621 A comma delimited list of parameter name and value pairs for configuring
1622 LSH (Locality Sensitive Hashing) layout.
1623
1624 The supported parameter names along with their default and possible
1625 values are shown below:
1626
1627 k, auto [ 75 if MolCount <= 10K else 10]
1628 kc, auto [ 20 if MolCount <= 10K else 10]
1629 fmeIterations, 1000
1630 fmeRandomize, no [ Possible values: yes or no ]
1631 fmeThreads, 4
1632 fmePrecision, 4
1633 slRepeats, auto [ 2 if MolCount <= 10K else 1]
1634 slExtraScalingSteps, auto [ 4 if MolCount <= 10K else 2 ]
1635 slScalingMin, 1.0
1636 slScalingMax, 1.0
1637 slScalingType, RelativeToDrawing [ Possible values: Absolute,
1638 RelativeToAvgLength, RelativeToDesiredLength, or
1639 RelativeToDrawing ]
1640 mmmRepeats, auto [ 2 if MolCount <= 10K else 1 ]
1641 placer, Barycenter [ Possible values: Barycenter, Solar, Circle,
1642 Median, Random, or Zero ]
1643 merger, LocalBiconnected [ Possible values: EdgeCover,
1644 LocalBiconnected, Solar, or IndependentSet ]
1645 mergerFactor, 2.0
1646 mergerAdjustment, 0
1647 nodeSizeDenominator, auto [ 65 if MolCount <= 10K else 70.0]
1648
1649 A brief description of parameters, as available in the code for LSH, is
1650 provided below:
1651
1652 k: Number of nearest neighbors used to create k-nearest neighbor
1653 graph
1654 kc: Scalar by which k is multiplied before querying LSH forest.
1655 The results are then sorted in decreasing order based on linear
1656 scan distances.
1657 fmeIterations: Maximum number of iterations of Fast Multipole
1658 Embedder (FME)
1659 fmeRandomize: Randomize FME layout at the start
1660 fmeThreads: Number of threads for FME
1661 fmePrecision: Number of coefficients of multipole expansion
1662 slRepeats: Number of repeats of scaling layout algorithm
1663 slExtraScalingSteps: Number of repeats of scaling
1664 slScalingMin: Minimum scaling factor
1665 slScalingMax: Maximum scaling factor.
1666 slScalingType: Scaling type corresponding to relative scale of graph
1667 mmmRepeats: Number of repeats of layout at each level
1668 placer: Methodology for defining initial positions of vertices in a
1669 graph at each level
1670 merger: Vertex merging methodology used during coarsening phase
1671 of multilevel algorithm
1672 mergerFactor: Ratio of sizes between two levels up to which merging
1673 is performed. It doesn't apply to all merging methodologies.
1674 mergerAdjustment: Edge length adjustment for merging methodology.
1675 It doesn't apply to all merging methodologies.
1676 nodeSizeDenominator: Node size denominator affecting the magnitude
1677 of repelling force between nodes. Node size corresponds to
1678 1.0 / nodeSizeDenominator. You may want to increase the value of
1679 nodeSizeDenominator to decrease node size and resolve overlaps
1680 in a crowded tree.
1681
1682 --mergeHTMLandJSFiles <yes or no> [default: yes]
1683 Merge TMAP JS data file into HTML file and delete JS data file. Default
1684 file names: <OutfileRoot>.html, <OutfileRoot>.js.
1685 --minHashFPParams <Name,Value,...> [default: auto]
1686 A comma delimited list of parameter name and value pairs for generating
1687 Min Hash Fingerprints (MHFP).
1688
1689 The supported parameter names along with their default and possible
1690 values are shown below:
1691
1692 radius, 3
1693 rings, yes [ Possible values: yes or no ]
1694 kekulize, yes [ Possible values: yes or no ]
1695 sanitize, yes [ Possible values: yes or no ]
1696 minRadius, 1
1697 numPermutations, 2048
1698 seed, 42
1699
1700 A brief description of parameters, as available in the code for MHFP, is
1701 provided below:
1702
1703 radius: MHFP radius (A radius of 3 corresponds to MHFP6)
1704 rings: Include rings in shingling
1705 kekulize: Kekulize SMILES
1706 sanitize: Sanitize SMILES
1707 minRadius: Minimum radius that is used to extract n-grams
1708 numPermutations: Number of permutations used for hashing
1709 seed: Random number seed for numpy.random
1710
1711 --mp <yes or no> [default: no]
1712 Use multiprocessing for the generation of fingerprints.
1713
1714 By default, input data is retrieved in a lazy manner via mp.Pool.imap()
1715 function employing lazy RDKit data iterable. This allows processing of
1716 arbitrary large data sets without any additional memory requirements.
1717
1718 All input data may be optionally loaded into memory by mp.Pool.map()
1719 before starting worker processes in a process pool by setting the value
1720 of 'inputDataMode' to 'InMemory' in '--mpParams' option.
1721
1722 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
1723 data mode may adversely impact the performance. The '--mpParams' section
1724 provides additional information to tune the value of 'chunkSize'.
1725 --mpParams <Name,Value,...> [default: auto]
1726 A comma delimited list of parameter name and value pairs to configure
1727 multiprocessing during the generation of fingerprints.
1728
1729 The supported parameter names along with their default and possible
1730 values are shown below:
1731
1732 chunkSize, auto
1733 inputDataMode, Lazy [ Possible values: InMemory or Lazy ]
1734 numProcesses, auto [ Default: mp.cpu_count() ]
1735
1736 These parameters are used by the following functions to configure and
1737 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
1738 mp.Pool.imap().
1739
1740 The chunkSize determines chunks of input data passed to each worker
1741 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
1742 The default value of chunkSize is dependent on the value of 'inputDataMode'.
1743
1744 The mp.Pool.map() function, invoked during 'InMemory' input data mode,
1745 automatically converts RDKit data iterable into a list, loads all data into
1746 memory, and calculates the default chunkSize using the following method
1747 as shown in its code:
1748
1749 chunkSize, extra = divmod(len(dataIterable), numProcesses * 4)
1750 if extra: chunkSize += 1
1751
1752 For example, the default chunkSize will be 7 for a pool of 4 worker processes
1753 and 100 data items.
1754
1755 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
1756 'lazy' RDKit data iterable to retrieve data as needed, without loading all the
1757 data into memory. Consequently, the size of input data is not known a priori.
1758 It's not possible to estimate an optimal value for the chunkSize. The default
1759 chunkSize is set to 1.
1760
1761 The default value for the chunkSize during 'Lazy' data mode may adversely
1762 impact the performance due to the overhead associated with exchanging
1763 small chunks of data. It is generally a good idea to explicitly set chunkSize to
1764 a larger value during 'Lazy' input data mode, based on the size of your input
1765 data and number of processes in the process pool.
1766
1767 The mp.Pool.map() function waits for all worker processes to process all
1768 the data and return the results. The mp.Pool.imap() function, however,
1769 returns the results obtained from worker processes as soon as the
1770 results become available for specified chunks of data.
1771
1772 The order of data in the results returned by both mp.Pool.map() and
1773 mp.Pool.imap() functions always corresponds to the input data.
1774 --numericalDataCols <collabel1,... or colnum1,...> [default: none]
1775 A comma delimited list of column labels or numbers corresponding to
1776 numerical data to map on a TMAP visualization.
1777 --numericalDataColormaps <Colormap1, Colormap2,...> [default: auto]
1778 A comma delimited list of color map names corresponding to numerical
1779 data. The default is to use 'viridis' color map name for mapping numerical
1780 data on a TMAP. The number of specified color maps must match the number
1781 of numerical data columns. You must specify valid color map names
1782 supported by Matplotlib. No validation is performed. Example color map
1783 names for numerical data: viridis, plasma, inferno, magma, cividis.
1784 -o, --outfile <outfile>
1785 Output HTML file name for writing out a TMAP visualization.
1786 --overwrite
1787 Overwrite existing files.
1788 -q, --quiet <yes or no> [default: no]
1789 Use quiet mode. The warning and information messages will not be printed.
1790 --structureDisplayDataCols <collabel1,... or colnum1,...> [default: auto]
1791 A comma delimited list of column labels or numbers corresponding to data
1792 to display under a thumbnail image of a structure in a TMAP visualization.
1793 The default column is set to 'Name' and it is automatically shown. In addition,
1794 the SMILES string column is always used to display SMILES under the structures.
1795 -t, --tmapDisplayMsg <text> [default: auto]
1796 A brief message to display at the top left in HTML page containing a TMAP
1797 visualization. You must specify a valid HTML string. No validation is
1798 performed. Default message: TMAP chemspace visualization<br/>
1799 Input file: <InfileName><br/>Number of molecules: <Count>
1800 -w, --workingdir <dir>
1801 Location of working directory which defaults to the current directory.
1802
1803 Examples:
1804 To visualize chemspace for SMILES strings present in a column named SMILES in
1805 input file, mapping a categorical data column on TMAP, writing out LSH forest
1806 for subsequent use to skip the generation of fingerprints, merging TMAP JS file
1807 into HTML file, and write out a HTML file containing TMAP visualization, type:
1808
1809 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
1810 -i SampleChemspace.csv -o SampleChemspace.html
1811
1812 To run the first example for SMILES strings in a column named SMILES in input file
1813 and write out a HTML file containing TMAP visualization, type:
1814
1815 % VisualizeChemspaceUsingTMAP.py --colSMILES SMILES
1816 --categoricalDataCols Source
1817 -i SampleChemspace.csv -o SampleChemspace.html
1818
1819 To run the first example for mapping categorical data in column number 4 in
1820 input file and write out a HTML file containing TMAP visualization, type:
1821
1822 % VisualizeChemspaceUsingTMAP.py --colmode colnum
1823 --categoricalDataCols 4
1824 -i SampleChemspace.csv -o SampleChemspace.html
1825
1826 To run the first example for mapping both categorical and numerical data
1827 columns and write out a HTML file containing TMAP visualization, type:
1828
1829 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source"
1830 --numericalDataCols "MolWt,MolLogP"
1831 -i SampleChemspace.csv -o SampleChemspace.html
1832
1833 To run the first example for mapping both categorical and numerical data
1834 columns along with specified colormaps and write out a HTML file containing
1835 TMAP visualization, type:
1836
1837 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source"
1838 --categoricalDataColormaps "tab10"
1839 --numericalDataCols "MolWt,MolLogP"
1840 --numericalDataColormaps "viridis, plasma"
1841 -i SampleChemspace.csv -o SampleChemspace.html
1842
1843 To run the first example for mapping both categorical and numerical data
1844 columns along with displaying specific data under the structure display and
1845 write out a HTML file containing TMAP visualization, type:
1846
1847 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols "Source"
1848 --numericalDataCols "MolWt,NHOHCount,NOCount,MolLogP,
1849 NumRotatableBonds,TPSA" --structureDisplayDataCols "Name,ID"
1850 -i SampleChemspace.csv -o SampleChemspace.html
1851
1852 To run the first example for restoring LSH forest data from a file to skip the
1853 generation of fingerprints and write out a HTML file containing TMAP
1854 visualization, type:
1855
1856 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
1857 --lshForestFileRestore yes -i SampleChemspace.csv -o SampleChemspace.html
1858
1859 To run the first example in multiprocessing mode on all available CPUs without
1860 loading all data into memory and write out a HTML file containing TMAP
1861 visualization, type:
1862
1863 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
1864 --mp yes -i SampleChemspace.csv -o SampleChemspace.html
1865
1866 To run the first example in multiprocessing mode on all available CPUs by
1867 loading all data into memory and write out a HTML file containing TMAP
1868 visualization, type:
1869
1870 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
1871 --mp yes --mpParams "inputDataMode,InMemory"
1872 -i SampleChemspace.csv -o SampleChemspace.html
1873
1874 To run the first example in multiprocessing mode on specific number of CPUs
1875 and chunk size without loading all data into memory and write out a HTML file
1876 containing TMAP visualization, type:
1877
1878 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
1879 --mp yes --mpParams "inputDataMode,lazy,numProcesses,4,
1880 chunkSize,50" -i SampleChemspace.csv -o SampleChemspace.html
1881
1882 To run the first example using a set of specified parameters to generate
1883 fingerprints and LSH forest, configure faerun and scatter plot layout, and
1884 write out a HTML file containing TMAP visualization, type:
1885
1886 % VisualizeChemspaceUsingTMAP.py --categoricalDataCols Source
1887 --minHashFPParams "radius,3,numPermutations,2048"
1888 --lshForestParams "dim,2048,numPrefixTrees,128"
1889 --lshLayoutConfigParams "k,75,kc,20,slRepeats,2,
1890 slExtraScalingSteps,4,mmmRepeats,2"
1891 --faerunConfigParams "clearColor, #000000,thumbnailWidth, 250"
1892 --faerunScatterPlotParams "shader,circle,pointScale,4"
1893 --tmapDisplayMsg "TMAP Chemspace visualization"
1894 -i SampleChemspace.csv -o SampleChemspace.html
1895
1896 Author:
1897 Manish Sud(msud@san.rr.com)
1898
1899 See also:
1900 RDKitConvertFileFormat.py, RDKitCalculateMolecularDescriptors.py,
1901 RDKitStandardizeMolecules.py
1902
1903 Copyright:
1904 Copyright (C) 2026 Manish Sud. All rights reserved.
1905
1906 The functionality available in this script is implemented using TMAP and
1907 Faerun, open source software packages for visualizing chemspace, and
1908 RDKit, an open source toolkit for cheminformatics developed by Greg
1909 Landrum.
1910
1911 This file is part of MayaChemTools.
1912
1913 MayaChemTools is free software; you can redistribute it and/or modify it under
1914 the terms of the GNU Lesser General Public License as published by the Free
1915 Software Foundation; either version 3 of the License, or (at your option) any
1916 later version.
1917
1918 """
1919
# Script entry point: invoke main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()