1 #!/bin/env python
2 #
3 # File: RDKitFilterTorsionLibraryAlerts.py
4 # Author: Manish Sud <msud@san.rr.com>
5 #
6 # Collaborator: Pat Walters
7 #
8 # Acknowledgments: Wolfgang Guba, Patrick Penner, and Levi Pierce
9 #
10 # Copyright (C) 2026 Manish Sud. All rights reserved.
11 #
12 # This script uses the Torsion Library jointly developed by the University
13 # of Hamburg, Center for Bioinformatics, Hamburg, Germany and
14 # F. Hoffmann-La-Roche Ltd., Basel, Switzerland.
15 #
16 # The functionality available in this script is implemented using RDKit, an
17 # open source toolkit for cheminformatics developed by Greg Landrum.
18 #
19 # This file is part of MayaChemTools.
20 #
21 # MayaChemTools is free software; you can redistribute it and/or modify it under
22 # the terms of the GNU Lesser General Public License as published by the Free
23 # Software Foundation; either version 3 of the License, or (at your option) any
24 # later version.
25 #
26 # MayaChemTools is distributed in the hope that it will be useful, but without
27 # any warranty; without even the implied warranty of merchantability of fitness
28 # for a particular purpose. See the GNU Lesser General Public License for more
29 # details.
30 #
31 # You should have received a copy of the GNU Lesser General Public License
32 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
33 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
34 # Boston, MA, 02111-1307, USA.
35 #
36
37 from __future__ import print_function
38
39 import os
40 import sys
41 import time
42 import re
43 import glob
44 import multiprocessing as mp
45
46 # RDKit imports...
47 try:
48 from rdkit import rdBase
49 from rdkit import Chem
50 except ImportError as ErrMsg:
51 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
52 sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
53 sys.exit(1)
54
55 # MayaChemTools imports...
56 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
57 try:
58 from docopt import docopt
59 import MiscUtil
60 import RDKitUtil
61 from TorsionAlerts.TorsionLibraryAlerts import TorsionLibraryAlerts
62 except ImportError as ErrMsg:
63 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
64 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
65 sys.exit(1)
66
67 ScriptName = os.path.basename(sys.argv[0])
68 Options = {}
69 OptionsInfo = {}
70
71
72 def main():
73 """Start execution of the script."""
74
75 MiscUtil.PrintInfo(
76 "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
77 % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
78 )
79
80 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
81
82 # Retrieve command line arguments and options...
83 RetrieveOptions()
84
85 if Options["--list"]:
86 # Handle listing of torsion library information...
87 ProcessListTorsionLibraryOption()
88 else:
89 # Process and validate command line arguments and options...
90 ProcessOptions()
91
92 # Perform actions required by the script...
93 PerformFiltering()
94
95 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
96 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
97
98
99 def PerformFiltering():
100 """Filter molecules using SMARTS torsion rules in the torsion library file."""
101
102 # Setup a molecule reader...
103 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"])
104 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"])
105
106 MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount = ProcessMolecules(Mols)
107
108 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
109 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
110 MiscUtil.PrintInfo("Number of molecules failed during writing: %d" % WriteFailedCount)
111 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + WriteFailedCount))
112
113 MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount)
114 MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount))
115
116
117 def ProcessMolecules(Mols):
118 """Process and filter molecules."""
119
120 if OptionsInfo["MPMode"]:
121 return ProcessMoleculesUsingMultipleProcesses(Mols)
122 else:
123 return ProcessMoleculesUsingSingleProcess(Mols)
124
125
126 def ProcessMoleculesUsingSingleProcess(Mols):
127 """Process and filter molecules using a single process."""
128
129 # Instantiate torsion library alerts class...
130 TorsionLibraryAlertsHandle = InstantiateTorsionLibraryAlertsClass()
131
132 MiscUtil.PrintInfo("\nFiltering molecules...")
133
134 OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
135
136 # Set up writers...
137 OutfilesWriters = SetupOutfilesWriters()
138
139 WriterRemaining = OutfilesWriters["WriterRemaining"]
140 WriterFiltered = OutfilesWriters["WriterFiltered"]
141 WriterAlertSummary = OutfilesWriters["WriterAlertSummary"]
142
143 # Initialize alerts summary info...
144 TorsionAlertsSummaryInfo = InitializeTorsionAlertsSummaryInfo()
145
146 (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount, FilteredMolWriteCount) = [0] * 5
147 for Mol in Mols:
148 MolCount += 1
149
150 if Mol is None:
151 continue
152
153 if RDKitUtil.IsMolEmpty(Mol):
154 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % RDKitUtil.GetMolName(Mol, MolCount))
155 continue
156
157 # Check for 3D flag...
158 if not Mol.GetConformer().Is3D():
159 MiscUtil.PrintWarning("3D tag is not set. Ignoring molecule: %s\n" % RDKitUtil.GetMolName(Mol, MolCount))
160 continue
161
162 ValidMolCount += 1
163
164 # Identify torsion library alerts for rotatable bonds..
165 RotBondsAlertsStatus, RotBondsAlertsInfo = (
166 TorsionLibraryAlertsHandle.IdentifyTorsionLibraryAlertsForRotatableBonds(Mol)
167 )
168
169 TrackTorsionAlertsSummaryInfo(TorsionAlertsSummaryInfo, RotBondsAlertsInfo)
170
171 # Write out filtered and remaining molecules...
172 WriteStatus = True
173 if RotBondsAlertsStatus:
174 if OutfileFilteredMode:
175 WriteStatus = WriteMolecule(WriterFiltered, Mol, RotBondsAlertsInfo)
176 if WriteStatus:
177 FilteredMolWriteCount += 1
178 else:
179 RemainingMolCount += 1
180 WriteStatus = WriteMolecule(WriterRemaining, Mol, RotBondsAlertsInfo)
181
182 if not WriteStatus:
183 WriteFailedCount += 1
184
185 WriteTorsionAlertsSummaryInfo(WriterAlertSummary, TorsionAlertsSummaryInfo)
186 CloseOutfilesWriters(OutfilesWriters)
187
188 if FilteredMolWriteCount:
189 WriteTorsionAlertsFilteredByRulesInfo(TorsionAlertsSummaryInfo)
190
191 return (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount)
192
193
194 def ProcessMoleculesUsingMultipleProcesses(Mols):
195 """Process and filter molecules using multiprocessing."""
196
197 MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...")
198
199 MPParams = OptionsInfo["MPParams"]
200 OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"]
201
202 # Instantiate torsion strain energy alerts class to list torsion library information...
203 InstantiateTorsionLibraryAlertsClass()
204
205 # Set up writers...
206 OutfilesWriters = SetupOutfilesWriters()
207
208 WriterRemaining = OutfilesWriters["WriterRemaining"]
209 WriterFiltered = OutfilesWriters["WriterFiltered"]
210 WriterAlertSummary = OutfilesWriters["WriterAlertSummary"]
211
212 # Initialize alerts summary info...
213 TorsionAlertsSummaryInfo = InitializeTorsionAlertsSummaryInfo()
214
215 # Setup data for initializing a worker process...
216 MiscUtil.PrintInfo("Encoding options info and rotatable bond pattern molecule...")
217 InitializeWorkerProcessArgs = (
218 MiscUtil.ObjectToBase64EncodedString(Options),
219 MiscUtil.ObjectToBase64EncodedString(OptionsInfo),
220 )
221
222 # Setup a encoded mols data iterable for a worker process...
223 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols)
224
225 # Setup process pool along with data initialization for each process...
226 MiscUtil.PrintInfo(
227 "\nConfiguring multiprocessing using %s method..."
228 % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")
229 )
230 MiscUtil.PrintInfo(
231 "NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n"
232 % (
233 MPParams["NumProcesses"],
234 MPParams["InputDataMode"],
235 ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]),
236 )
237 )
238
239 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs)
240
241 # Start processing...
242 if re.match("^Lazy$", MPParams["InputDataMode"], re.I):
243 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
244 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I):
245 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"])
246 else:
247 MiscUtil.PrintError(
248 'The value, %s, specified for "--inputDataMode" is not supported.' % (MPParams["InputDataMode"])
249 )
250
251 (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount, FilteredMolWriteCount) = [0] * 5
252 for Result in Results:
253 MolCount += 1
254 MolIndex, EncodedMol, RotBondsAlertsStatus, RotBondsAlertsInfo = Result
255
256 if EncodedMol is None:
257 continue
258 ValidMolCount += 1
259
260 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
261
262 TrackTorsionAlertsSummaryInfo(TorsionAlertsSummaryInfo, RotBondsAlertsInfo)
263
264 # Write out filtered and remaining molecules...
265 WriteStatus = True
266 if RotBondsAlertsStatus:
267 if OutfileFilteredMode:
268 WriteStatus = WriteMolecule(WriterFiltered, Mol, RotBondsAlertsInfo)
269 if WriteStatus:
270 FilteredMolWriteCount += 1
271 else:
272 RemainingMolCount += 1
273 WriteStatus = WriteMolecule(WriterRemaining, Mol, RotBondsAlertsInfo)
274
275 if not WriteStatus:
276 WriteFailedCount += 1
277
278 WriteTorsionAlertsSummaryInfo(WriterAlertSummary, TorsionAlertsSummaryInfo)
279 CloseOutfilesWriters(OutfilesWriters)
280
281 if FilteredMolWriteCount:
282 WriteTorsionAlertsFilteredByRulesInfo(TorsionAlertsSummaryInfo)
283
284 return (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount)
285
286
287 def InitializeWorkerProcess(*EncodedArgs):
288 """Initialize data for a worker process."""
289
290 global Options, OptionsInfo
291
292 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid())
293
294 # Decode Options and OptionInfo...
295 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0])
296 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1])
297
298 # Instantiate torsion slibrary alerts class...
299 OptionsInfo["TorsionLibraryAlertsHandle"] = InstantiateTorsionLibraryAlertsClass(Quiet=True)
300
301
302 def WorkerProcess(EncodedMolInfo):
303 """Process data for a worker process."""
304
305 MolIndex, EncodedMol = EncodedMolInfo
306
307 if EncodedMol is None:
308 return [MolIndex, None, False, None]
309
310 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol)
311 if RDKitUtil.IsMolEmpty(Mol):
312 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
313 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
314 return [MolIndex, None, False, None]
315
316 # Check for 3D flag...
317 if not Mol.GetConformer().Is3D():
318 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1))
319 MiscUtil.PrintWarning("3D tag is not set. Ignoring molecule: %s\n" % MolName)
320 return [MolIndex, None, False, None]
321
322 # Identify torsion library alerts for rotatable bonds..
323 TorsionTorsionLibraryAlertsHandle = OptionsInfo["TorsionLibraryAlertsHandle"]
324 RotBondsAlertsStatus, RotBondsAlertsInfo = (
325 TorsionTorsionLibraryAlertsHandle.IdentifyTorsionLibraryAlertsForRotatableBonds(Mol)
326 )
327
328 return [MolIndex, EncodedMol, RotBondsAlertsStatus, RotBondsAlertsInfo]
329
330
331 def InitializeTorsionAlertsSummaryInfo():
332 """Initialize torsion alerts summary."""
333
334 if OptionsInfo["CountMode"]:
335 return None
336
337 if not OptionsInfo["TrackAlertsSummaryInfo"]:
338 return None
339
340 TorsionAlertsSummaryInfo = {}
341 TorsionAlertsSummaryInfo["RuleIDs"] = []
342
343 for DataLabel in [
344 "SMARTSToRuleIDs",
345 "RuleSMARTS",
346 "HierarchyClassName",
347 "HierarchySubClassName",
348 "TorsionRulePeaks",
349 "TorsionRuleTolerances1",
350 "TorsionRuleTolerances2",
351 "AlertTypes",
352 "AlertTypesMolCount",
353 ]:
354 TorsionAlertsSummaryInfo[DataLabel] = {}
355
356 return TorsionAlertsSummaryInfo
357
358
359 def TrackTorsionAlertsSummaryInfo(TorsionAlertsSummaryInfo, RotBondsAlertsInfo):
360 """Track torsion alerts summary information for matched torsion rules in a
361 molecule."""
362
363 if OptionsInfo["CountMode"]:
364 return
365
366 if not OptionsInfo["TrackAlertsSummaryInfo"]:
367 return
368
369 if RotBondsAlertsInfo is None:
370 return
371
372 MolAlertsInfo = {}
373 MolAlertsInfo["RuleIDs"] = []
374 MolAlertsInfo["AlertTypes"] = {}
375
376 for ID in RotBondsAlertsInfo["IDs"]:
377 if not RotBondsAlertsInfo["MatchStatus"][ID]:
378 continue
379
380 if OptionsInfo["OutfileAlertsOnly"]:
381 if RotBondsAlertsInfo["AlertTypes"][ID] not in OptionsInfo["SpecifiedAlertsModeList"]:
382 continue
383
384 AlertType = RotBondsAlertsInfo["AlertTypes"][ID]
385 TorsionRuleNodeID = RotBondsAlertsInfo["TorsionRuleNodeID"][ID]
386 TorsionRuleSMARTS = RotBondsAlertsInfo["TorsionRuleSMARTS"][ID]
387
388 # Track data for torsion alert summary information across molecules...
389 if TorsionRuleNodeID not in TorsionAlertsSummaryInfo["RuleSMARTS"]:
390 TorsionAlertsSummaryInfo["RuleIDs"].append(TorsionRuleNodeID)
391 TorsionAlertsSummaryInfo["SMARTSToRuleIDs"][TorsionRuleSMARTS] = TorsionRuleNodeID
392
393 TorsionAlertsSummaryInfo["RuleSMARTS"][TorsionRuleNodeID] = TorsionRuleSMARTS
394 TorsionAlertsSummaryInfo["HierarchyClassName"][TorsionRuleNodeID] = RotBondsAlertsInfo[
395 "HierarchyClassNames"
396 ][ID]
397 TorsionAlertsSummaryInfo["HierarchySubClassName"][TorsionRuleNodeID] = RotBondsAlertsInfo[
398 "HierarchySubClassNames"
399 ][ID]
400
401 TorsionAlertsSummaryInfo["TorsionRulePeaks"][TorsionRuleNodeID] = RotBondsAlertsInfo["TorsionRulePeaks"][ID]
402 TorsionAlertsSummaryInfo["TorsionRuleTolerances1"][TorsionRuleNodeID] = RotBondsAlertsInfo[
403 "TorsionRuleTolerances1"
404 ][ID]
405 TorsionAlertsSummaryInfo["TorsionRuleTolerances2"][TorsionRuleNodeID] = RotBondsAlertsInfo[
406 "TorsionRuleTolerances2"
407 ][ID]
408
409 # Initialize number of alert types across all molecules...
410 TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID] = {}
411
412 # Initialize number of molecules flagged by each alert type...
413 TorsionAlertsSummaryInfo["AlertTypesMolCount"][TorsionRuleNodeID] = {}
414
415 if AlertType not in TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID]:
416 TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID][AlertType] = 0
417 TorsionAlertsSummaryInfo["AlertTypesMolCount"][TorsionRuleNodeID][AlertType] = 0
418
419 TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID][AlertType] += 1
420
421 # Track data for torsion alert information in a molecule...
422 if TorsionRuleNodeID not in MolAlertsInfo["AlertTypes"]:
423 MolAlertsInfo["RuleIDs"].append(TorsionRuleNodeID)
424 MolAlertsInfo["AlertTypes"][TorsionRuleNodeID] = {}
425
426 if AlertType not in MolAlertsInfo["AlertTypes"][TorsionRuleNodeID]:
427 MolAlertsInfo["AlertTypes"][TorsionRuleNodeID][AlertType] = 0
428 MolAlertsInfo["AlertTypes"][TorsionRuleNodeID][AlertType] += 1
429
430 # Track number of molecules flagged by a specific torsion alert...
431 for TorsionRuleNodeID in MolAlertsInfo["RuleIDs"]:
432 for AlertType in MolAlertsInfo["AlertTypes"][TorsionRuleNodeID]:
433 if MolAlertsInfo["AlertTypes"][TorsionRuleNodeID][AlertType]:
434 TorsionAlertsSummaryInfo["AlertTypesMolCount"][TorsionRuleNodeID][AlertType] += 1
435
436
437 def WriteTorsionAlertsSummaryInfo(Writer, TorsionAlertsSummaryInfo):
438 """Write out torsion alerts summary informatio to a CSV file."""
439
440 if OptionsInfo["CountMode"]:
441 return
442
443 if not OptionsInfo["OutfileSummaryMode"]:
444 return
445
446 if len(TorsionAlertsSummaryInfo["RuleIDs"]) == 0:
447 return
448
449 # Write headers...
450 QuoteValues = True
451 Values = [
452 "TorsionRule",
453 "TorsionPeaks",
454 "Tolerances1",
455 "Tolerances2",
456 "HierarchyClass",
457 "HierarchySubClass",
458 "TorsionAlertTypes",
459 "TorsionAlertCount",
460 "TorsionAlertMolCount",
461 ]
462 Writer.write("%s\n" % MiscUtil.JoinWords(Values, ",", QuoteValues))
463
464 SortedRuleIDs = GetSortedTorsionAlertsSummaryInfoRuleIDs(TorsionAlertsSummaryInfo)
465
466 # Write alerts information...
467 for ID in SortedRuleIDs:
468 # Remove any double quotes in SMARTS...
469 RuleSMARTS = TorsionAlertsSummaryInfo["RuleSMARTS"][ID]
470 RuleSMARTS = re.sub('"', "", RuleSMARTS, re.I)
471
472 HierarchyClassName = TorsionAlertsSummaryInfo["HierarchyClassName"][ID]
473 HierarchySubClassName = TorsionAlertsSummaryInfo["HierarchySubClassName"][ID]
474
475 TorsionPeaks = MiscUtil.JoinWords(
476 ["%s" % Value for Value in TorsionAlertsSummaryInfo["TorsionRulePeaks"][ID]], ","
477 )
478 TorsionRuleTolerances1 = MiscUtil.JoinWords(
479 ["%s" % Value for Value in TorsionAlertsSummaryInfo["TorsionRuleTolerances1"][ID]], ","
480 )
481 TorsionRuleTolerances2 = MiscUtil.JoinWords(
482 ["%s" % Value for Value in TorsionAlertsSummaryInfo["TorsionRuleTolerances2"][ID]], ","
483 )
484
485 AlertTypes = []
486 AlertTypeCount = []
487 AlertTypeMolCount = []
488 for AlertType in sorted(TorsionAlertsSummaryInfo["AlertTypes"][ID]):
489 AlertTypes.append(AlertType)
490 AlertTypeCount.append("%s" % TorsionAlertsSummaryInfo["AlertTypes"][ID][AlertType])
491 AlertTypeMolCount.append("%s" % TorsionAlertsSummaryInfo["AlertTypesMolCount"][ID][AlertType])
492
493 Values = [
494 RuleSMARTS,
495 TorsionPeaks,
496 TorsionRuleTolerances1,
497 TorsionRuleTolerances2,
498 HierarchyClassName,
499 HierarchySubClassName,
500 "%s" % MiscUtil.JoinWords(AlertTypes, ","),
501 "%s" % (MiscUtil.JoinWords(AlertTypeCount, ",")),
502 "%s" % (MiscUtil.JoinWords(AlertTypeMolCount, ",")),
503 ]
504 Writer.write("%s\n" % MiscUtil.JoinWords(Values, ",", QuoteValues))
505
506
507 def GetSortedTorsionAlertsSummaryInfoRuleIDs(TorsionAlertsSummaryInfo):
508 """Sort torsion rule IDs by alert types molecule count in descending order."""
509
510 SortedRuleIDs = []
511
512 RuleIDs = TorsionAlertsSummaryInfo["RuleIDs"]
513 if len(RuleIDs) == 0:
514 return SortedRuleIDs
515
516 # Setup a map from AlertTypesMolCount to IDs for sorting alerts...
517 RuleIDs = TorsionAlertsSummaryInfo["RuleIDs"]
518 MolCountMap = {}
519 for ID in RuleIDs:
520 MolCount = 0
521 for AlertType in sorted(TorsionAlertsSummaryInfo["AlertTypes"][ID]):
522 MolCount += TorsionAlertsSummaryInfo["AlertTypesMolCount"][ID][AlertType]
523 MolCountMap[ID] = MolCount
524
525 SortedRuleIDs = sorted(RuleIDs, key=lambda ID: MolCountMap[ID], reverse=True)
526
527 return SortedRuleIDs
528
529
530 def WriteTorsionAlertsFilteredByRulesInfo(TorsionAlertsSummaryInfo):
531 """Write out torsion alerts SD files for individual torsion rules."""
532
533 if OptionsInfo["CountMode"]:
534 return
535
536 if not OptionsInfo["OutfilesFilteredByRulesMode"]:
537 return
538
539 if len(TorsionAlertsSummaryInfo["RuleIDs"]) == 0:
540 return
541
542 # Setup a molecule reader for filtered molecules...
543 FilteredMols = RDKitUtil.ReadMolecules(OptionsInfo["OutfileFiltered"], **OptionsInfo["InfileParams"])
544
545 # Get torsion rule IDs for writing out filtered SD files for individual torsion alert rules...
546 TorsionRuleIDs = GetTorsionAlertsFilteredByRuleFilesRuleIDs(TorsionAlertsSummaryInfo)
547
548 # Setup writers...
549 ByRuleOutfilesWriters = SetupByRuleOutfilesWriters(TorsionRuleIDs)
550
551 for Mol in FilteredMols:
552 # Retrieve torsion alerts info...
553 TorsionAlertsInfo = RetrieveTorsionAlertsInfo(Mol, TorsionAlertsSummaryInfo)
554 if TorsionAlertsInfo is None:
555 continue
556
557 for TorsionRuleID in TorsionRuleIDs:
558 if TorsionRuleID not in TorsionAlertsInfo["RuleSMARTS"]:
559 continue
560
561 WriteMoleculeFilteredByRuleID(
562 ByRuleOutfilesWriters[TorsionRuleID], Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo
563 )
564
565 CloseByRuleOutfilesWriters(ByRuleOutfilesWriters)
566
567
568 def GetTorsionAlertsFilteredByRuleFilesRuleIDs(TorsionAlertsSummaryInfo):
569 """Get torsion rule IDs for writing out individual SD files filtered by torsion alert rules."""
570
571 # Get torsion rule IDs triggering torsion alerts sorted in the order from the most to
572 # the least number of unique molecules...
573 RuleIDs = GetSortedTorsionAlertsSummaryInfoRuleIDs(TorsionAlertsSummaryInfo)
574
575 # Select torsion rule IDs for writing out SD files...
576 if not OptionsInfo["OutfilesFilteredByRulesAllMode"]:
577 MaxRuleIDs = OptionsInfo["OutfilesFilteredByRulesMaxCount"]
578 if MaxRuleIDs < len(RuleIDs):
579 RuleIDs = RuleIDs[0:MaxRuleIDs]
580
581 return RuleIDs
582
583
584 def RetrieveTorsionAlertsInfo(Mol, TorsionAlertsSummaryInfo):
585 """Parse torsion alerts data field value to retrieve alerts information for rotatable bonds."""
586
587 TorsionAlertsLabel = OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"]
588 TorsionAlerts = Mol.GetProp(TorsionAlertsLabel) if Mol.HasProp(TorsionAlertsLabel) else None
589
590 if TorsionAlerts is None or len(TorsionAlerts) == 0:
591 return None
592
593 # Initialize for tracking by rule IDs...
594 TorsionAlertsInfo = {}
595 TorsionAlertsInfo["RuleIDs"] = []
596
597 for DataLabel in [
598 "RuleSMARTS",
599 "HierarchyClassName",
600 "HierarchySubClassName",
601 "TorsionRulePeaks",
602 "TorsionRuleTolerances1",
603 "TorsionRuleTolerances2",
604 "AlertTypes",
605 "AtomIndices",
606 "TorsionAtomIndices",
607 "TorsionAngles",
608 "TorsionAngleViolations",
609 "AlertTypesCount",
610 ]:
611 TorsionAlertsInfo[DataLabel] = {}
612
613 ValuesDelimiter = OptionsInfo["IntraSetValuesDelim"]
614 TorsionAlertsSetSize = 11
615
616 TorsionAlertsWords = TorsionAlerts.split()
617 if len(TorsionAlertsWords) % TorsionAlertsSetSize:
618 MiscUtil.PrintError(
619 "The number of space delimited values, %s, for TorsionAlerts data field in filtered SD file must be a multiple of %s."
620 % (len(TorsionAlertsWords), TorsionAlertsSetSize)
621 )
622
623 ID = 0
624 for Index in range(0, len(TorsionAlertsWords), TorsionAlertsSetSize):
625 ID += 1
626
627 (
628 RotBondIndices,
629 TorsionAlertType,
630 TorsionIndices,
631 TorsionAngle,
632 TorsionAngleViolation,
633 HierarchyClass,
634 HierarchySubClass,
635 TorsionPeaks,
636 Tolerances1,
637 Tolerances2,
638 TorsionRule,
639 ) = TorsionAlertsWords[Index : Index + TorsionAlertsSetSize]
640 RotBondIndices = RotBondIndices.split(ValuesDelimiter)
641 TorsionIndices = TorsionIndices.split(ValuesDelimiter)
642 TorsionPeaks = TorsionPeaks.split(ValuesDelimiter)
643 Tolerances1 = Tolerances1.split(ValuesDelimiter)
644 Tolerances2 = Tolerances2.split(ValuesDelimiter)
645
646 if TorsionRule not in TorsionAlertsSummaryInfo["SMARTSToRuleIDs"]:
647 MiscUtil.PrintWarning(
648 "The SMARTS pattern, %s, for TorsionAlerts data field in filtered SD file doesn't map to any torsion rule..."
649 % TorsionRule
650 )
651 continue
652 TorsionRuleNodeID = TorsionAlertsSummaryInfo["SMARTSToRuleIDs"][TorsionRule]
653
654 # Track data for torsion alerts in a molecule...
655 if TorsionRuleNodeID not in TorsionAlertsInfo["RuleSMARTS"]:
656 TorsionAlertsInfo["RuleIDs"].append(TorsionRuleNodeID)
657
658 TorsionAlertsInfo["RuleSMARTS"][TorsionRuleNodeID] = TorsionRule
659 TorsionAlertsInfo["HierarchyClassName"][TorsionRuleNodeID] = HierarchyClass
660 TorsionAlertsInfo["HierarchySubClassName"][TorsionRuleNodeID] = HierarchySubClass
661 TorsionAlertsInfo["TorsionRulePeaks"][TorsionRuleNodeID] = TorsionPeaks
662 TorsionAlertsInfo["TorsionRuleTolerances1"][TorsionRuleNodeID] = Tolerances1
663 TorsionAlertsInfo["TorsionRuleTolerances2"][TorsionRuleNodeID] = Tolerances2
664
665 TorsionAlertsInfo["AlertTypes"][TorsionRuleNodeID] = []
666 TorsionAlertsInfo["AtomIndices"][TorsionRuleNodeID] = []
667 TorsionAlertsInfo["TorsionAtomIndices"][TorsionRuleNodeID] = []
668 TorsionAlertsInfo["TorsionAngles"][TorsionRuleNodeID] = []
669 TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleNodeID] = []
670
671 TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID] = {}
672
673 # Track multiple values for a rule ID...
674 TorsionAlertsInfo["AlertTypes"][TorsionRuleNodeID].append(TorsionAlertType)
675 TorsionAlertsInfo["AtomIndices"][TorsionRuleNodeID].append(RotBondIndices)
676 TorsionAlertsInfo["TorsionAtomIndices"][TorsionRuleNodeID].append(TorsionIndices)
677 TorsionAlertsInfo["TorsionAngles"][TorsionRuleNodeID].append(TorsionAngle)
678 TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleNodeID].append(TorsionAngleViolation)
679
680 # Count alert type for a rule ID...
681 if TorsionAlertType not in TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID]:
682 TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID][TorsionAlertType] = 0
683 TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID][TorsionAlertType] += 1
684
685 return TorsionAlertsInfo
686
687
688 def WriteMolecule(Writer, Mol, RotBondsAlertsInfo):
689 """Write out molecule."""
690
691 if OptionsInfo["CountMode"]:
692 return True
693
694 SetupMolPropertiesForAlertsInformation(Mol, RotBondsAlertsInfo)
695
696 try:
697 Writer.write(Mol)
698 except Exception as ErrMsg:
699 MiscUtil.PrintWarning("Failed to write molecule %s:\n%s\n" % (RDKitUtil.GetMolName(Mol), ErrMsg))
700 return False
701
702 return True
703
704
705 def SetupMolPropertiesForAlertsInformation(Mol, RotBondsAlertsInfo):
706 """Setup molecule properties containing alerts information for rotatable bonds."""
707
708 if not OptionsInfo["OutfileAlerts"]:
709 return
710
711 # Setup rotatable bonds count...
712 RotBondsCount = 0
713 if RotBondsAlertsInfo is not None:
714 RotBondsCount = len(RotBondsAlertsInfo["IDs"])
715 Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["RotBondsCountLabel"], "%s" % RotBondsCount)
716
717 # Setup alert counts for rotatable bonds...
718 AlertsCount = []
719 if RotBondsAlertsInfo is not None:
720 for AlertType in ["Green", "Orange", "Red"]:
721 if AlertType in RotBondsAlertsInfo["Count"]:
722 AlertsCount.append("%s" % RotBondsAlertsInfo["Count"][AlertType])
723 else:
724 AlertsCount.append("0")
725
726 if len(AlertsCount):
727 Mol.SetProp(
728 OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsCountLabel"], "%s" % MiscUtil.JoinWords(AlertsCount, " ")
729 )
730
731 # Setup alert information for rotatable bonds...
732 AlertsInfoValues = []
733
734 # Delimiter for multiple values corresponding to specific set of information for
735 # a rotatable bond. For example: TorsionAtomIndices
736 ValuesDelim = OptionsInfo["IntraSetValuesDelim"]
737
738 # Delimiter for various values for a rotatable bond...
739 RotBondValuesDelim = OptionsInfo["InterSetValuesDelim"]
740
741 # Delimiter for values corresponding to multiple rotatable bonds...
742 AlertsInfoValuesDelim = OptionsInfo["InterSetValuesDelim"]
743
744 if RotBondsAlertsInfo is not None:
745 for ID in RotBondsAlertsInfo["IDs"]:
746 if not RotBondsAlertsInfo["MatchStatus"][ID]:
747 continue
748
749 if OptionsInfo["OutfileAlertsOnly"]:
750 if RotBondsAlertsInfo["AlertTypes"][ID] not in OptionsInfo["SpecifiedAlertsModeList"]:
751 continue
752
753 RotBondValues = []
754
755 # Bond atom indices...
756 Values = ["%s" % Value for Value in RotBondsAlertsInfo["AtomIndices"][ID]]
757 RotBondValues.append(ValuesDelim.join(Values))
758
759 # Alert type...
760 RotBondValues.append(RotBondsAlertsInfo["AlertTypes"][ID])
761
762 # Torsion atom indices...
763 TorsionAtomIndices = SetupTorsionAtomIndicesValues(
764 RotBondsAlertsInfo["TorsionAtomIndices"][ID], ValuesDelim
765 )
766 RotBondValues.append(TorsionAtomIndices)
767
768 # Torsion angle...
769 RotBondValues.append("%.2f" % RotBondsAlertsInfo["TorsionAngles"][ID])
770
771 # Torsion angle violation...
772 RotBondValues.append("%.2f" % RotBondsAlertsInfo["TorsionAngleViolations"][ID])
773
774 # Hierarchy class and subclass names...
775 RotBondValues.append("%s" % RotBondsAlertsInfo["HierarchyClassNames"][ID])
776 RotBondValues.append("%s" % RotBondsAlertsInfo["HierarchySubClassNames"][ID])
777
778 # Torsion rule peaks...
779 Values = ["%s" % Value for Value in RotBondsAlertsInfo["TorsionRulePeaks"][ID]]
780 RotBondValues.append(ValuesDelim.join(Values))
781
782 # Torsion rule tolerances...
783 Values = ["%s" % Value for Value in RotBondsAlertsInfo["TorsionRuleTolerances1"][ID]]
784 RotBondValues.append(ValuesDelim.join(Values))
785 Values = ["%s" % Value for Value in RotBondsAlertsInfo["TorsionRuleTolerances2"][ID]]
786 RotBondValues.append(ValuesDelim.join(Values))
787
788 # Torsion rule SMARTS...
789 RotBondValues.append("%s" % RotBondsAlertsInfo["TorsionRuleSMARTS"][ID])
790
791 # Track joined values for a rotatable bond...
792 AlertsInfoValues.append("%s" % RotBondValuesDelim.join(RotBondValues))
793
794 if len(AlertsInfoValues):
795 Mol.SetProp(
796 OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"],
797 "%s" % ("%s" % AlertsInfoValuesDelim.join(AlertsInfoValues)),
798 )
799
800
801 def WriteMoleculeFilteredByRuleID(Writer, Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo):
802 """Write out molecule."""
803
804 if OptionsInfo["CountMode"]:
805 return
806
807 SetupMolPropertiesForFilteredByRuleIDAlertsInformation(
808 Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo
809 )
810
811 Writer.write(Mol)
812
813
814 def SetupMolPropertiesForFilteredByRuleIDAlertsInformation(
815 Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo
816 ):
817 """Setup molecule properties containing alerts information for torsion alerts
818 fileted by Rule IDs."""
819
820 # Delete torsion alerts information for rotatable bonds...
821 if Mol.HasProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"]):
822 Mol.ClearProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"])
823
824 # Delimiter for values...
825 IntraSetValuesDelim = OptionsInfo["IntraSetValuesDelim"]
826 InterSetValuesDelim = OptionsInfo["InterSetValuesDelim"]
827
828 # Setup alert rule information...
829 AlertRuleInfoValues = []
830
831 AlertRuleInfoValues.append("%s" % TorsionAlertsInfo["HierarchyClassName"][TorsionRuleID])
832 AlertRuleInfoValues.append("%s" % TorsionAlertsInfo["HierarchySubClassName"][TorsionRuleID])
833
834 Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionRulePeaks"][TorsionRuleID]]
835 AlertRuleInfoValues.append(IntraSetValuesDelim.join(Values))
836
837 Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionRuleTolerances1"][TorsionRuleID]]
838 AlertRuleInfoValues.append(IntraSetValuesDelim.join(Values))
839 Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionRuleTolerances2"][TorsionRuleID]]
840 AlertRuleInfoValues.append(IntraSetValuesDelim.join(Values))
841
842 AlertRuleInfoValues.append("%s" % TorsionAlertsInfo["RuleSMARTS"][TorsionRuleID])
843
844 Mol.SetProp(
845 OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleLabel"],
846 "%s" % ("%s" % InterSetValuesDelim.join(AlertRuleInfoValues)),
847 )
848
849 # Setup alerts count for torsion rule...
850 AlertsCount = []
851 for AlertType in ["Green", "Orange", "Red"]:
852 if AlertType in TorsionAlertsInfo["AlertTypesCount"][TorsionRuleID]:
853 AlertsCount.append("%s" % TorsionAlertsInfo["AlertTypesCount"][TorsionRuleID][AlertType])
854 else:
855 AlertsCount.append("0")
856
857 Mol.SetProp(
858 OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleAlertsCountLabel"], "%s" % (InterSetValuesDelim.join(AlertsCount))
859 )
860
861 # Setup torsion rule alerts...
862 AlertsInfoValues = []
863 for Index in range(0, len(TorsionAlertsInfo["AlertTypes"][TorsionRuleID])):
864 RotBondInfoValues = []
865
866 # Bond atom indices...
867 Values = ["%s" % Value for Value in TorsionAlertsInfo["AtomIndices"][TorsionRuleID][Index]]
868 RotBondInfoValues.append(IntraSetValuesDelim.join(Values))
869
870 # Alert type...
871 RotBondInfoValues.append(TorsionAlertsInfo["AlertTypes"][TorsionRuleID][Index])
872
873 # Torsion atom indices retrieved from the filtered SD file and stored as strings...
874 Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionAtomIndices"][TorsionRuleID][Index]]
875 RotBondInfoValues.append(IntraSetValuesDelim.join(Values))
876
877 # Torsion angle...
878 RotBondInfoValues.append(TorsionAlertsInfo["TorsionAngles"][TorsionRuleID][Index])
879
880 # Torsion angle violation...
881 RotBondInfoValues.append(TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleID][Index])
882
883 # Track alerts informaiton...
884 AlertsInfoValues.append("%s" % InterSetValuesDelim.join(RotBondInfoValues))
885
886 Mol.SetProp(
887 OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleAlertsLabel"], "%s" % (InterSetValuesDelim.join(AlertsInfoValues))
888 )
889
890 # Setup torsion rule alert max angle violation...
891 TorsionAngleViolations = [float(Angle) for Angle in TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleID]]
892 Mol.SetProp(
893 OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleMaxAngleViolationLabel"], "%.2f" % (max(TorsionAngleViolations))
894 )
895
896
897 def SetupTorsionAtomIndicesValues(TorsionAtomIndicesList, ValuesDelim):
898 """Setup torsion atom indices value for output files."""
899
900 # Check for any list values in the list of torsion atom indices used as placeholders
901 # for positions of lone pairs in torsion rules containing N_lp...
902 TorsionAtomsInfo = []
903 for Value in TorsionAtomIndicesList:
904 if type(Value) is list:
905 TorsionAtomsInfo.append("N_lp")
906 else:
907 TorsionAtomsInfo.append(Value)
908
909 Values = ["%s" % Value for Value in TorsionAtomsInfo]
910
911 return ValuesDelim.join(Values)
912
913
914 def SetupOutfilesWriters():
915 """Setup molecule and summary writers."""
916
917 OutfilesWriters = {"WriterRemaining": None, "WriterFiltered": None, "WriterAlertSummary": None}
918
919 # Writers for SD files...
920 WriterRemaining, WriterFiltered = SetupMoleculeWriters()
921 OutfilesWriters["WriterRemaining"] = WriterRemaining
922 OutfilesWriters["WriterFiltered"] = WriterFiltered
923
924 # Writer for alert summary CSV file...
925 WriterAlertSummary = SetupAlertSummaryWriter()
926 OutfilesWriters["WriterAlertSummary"] = WriterAlertSummary
927
928 return OutfilesWriters
929
930
931 def SetupMoleculeWriters():
932 """Setup molecule writers."""
933
934 Writer = None
935 WriterFiltered = None
936
937 if OptionsInfo["CountMode"]:
938 return (Writer, WriterFiltered)
939
940 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"])
941 if Writer is None:
942 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"])
943 MiscUtil.PrintInfo("\nGenerating file %s..." % OptionsInfo["Outfile"])
944
945 if OptionsInfo["OutfileFilteredMode"]:
946 WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"])
947 if WriterFiltered is None:
948 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"])
949 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"])
950
951 return (Writer, WriterFiltered)
952
953
954 def SetupAlertSummaryWriter():
955 """Setup a alert summary writer."""
956
957 Writer = None
958
959 if OptionsInfo["CountMode"]:
960 return Writer
961
962 if not OptionsInfo["OutfileSummaryMode"]:
963 return Writer
964
965 Outfile = OptionsInfo["OutfileSummary"]
966 Writer = open(Outfile, "w")
967 if Writer is None:
968 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
969
970 MiscUtil.PrintInfo("Generating file %s..." % Outfile)
971
972 return Writer
973
974
975 def CloseOutfilesWriters(OutfilesWriters):
976 """Close outfile writers."""
977
978 for WriterType, Writer in OutfilesWriters.items():
979 if Writer is not None:
980 Writer.close()
981
982
983 def SetupByRuleOutfilesWriters(RuleIDs):
984 """Setup by rule outfiles writers."""
985
986 # Initialize...
987 OutfilesWriters = {}
988 for RuleID in RuleIDs:
989 OutfilesWriters[RuleID] = None
990
991 if OptionsInfo["CountMode"]:
992 return OutfilesWriters
993
994 if not OptionsInfo["OutfilesFilteredByRulesMode"]:
995 return OutfilesWriters
996
997 FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
998 OutfilesRoot = "%s_Filtered_TopRule" % FileName
999 OutfilesExt = "sdf"
1000
1001 MsgTxt = (
1002 "all"
1003 if OptionsInfo["OutfilesFilteredByRulesAllMode"]
1004 else "top %s" % OptionsInfo["OutfilesFilteredByRulesMaxCount"]
1005 )
1006 MiscUtil.PrintInfo(
1007 "\nGenerating output files %s*.%s for %s torsion rules triggering alerts..."
1008 % (OutfilesRoot, OutfilesExt, MsgTxt)
1009 )
1010
1011 # Delete any existing output files...
1012 Outfiles = glob.glob("%s*.%s" % (OutfilesRoot, OutfilesExt))
1013 if len(Outfiles):
1014 MiscUtil.PrintInfo("Deleting existing output files %s*.%s..." % (OutfilesRoot, OutfilesExt))
1015 for Outfile in Outfiles:
1016 try:
1017 os.remove(Outfile)
1018 except Exception as ErrMsg:
1019 MiscUtil.PrintWarning("Failed to delete file: %s" % ErrMsg)
1020
1021 RuleIndex = 0
1022 for RuleID in RuleIDs:
1023 RuleIndex += 1
1024 Outfile = "%s%s.%s" % (OutfilesRoot, RuleIndex, OutfilesExt)
1025 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
1026 if Writer is None:
1027 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
1028
1029 OutfilesWriters[RuleID] = Writer
1030
1031 return OutfilesWriters
1032
1033
1034 def CloseByRuleOutfilesWriters(OutfilesWriters):
1035 """Close by rule outfile writers."""
1036
1037 for RuleID, Writer in OutfilesWriters.items():
1038 if Writer is not None:
1039 Writer.close()
1040
1041
1042 def InstantiateTorsionLibraryAlertsClass(Quiet=False):
1043 """Initialize torsion library alerts class."""
1044
1045 try:
1046 TorsionLibraryAlertsHandle = TorsionLibraryAlerts(
1047 AlertsMode=OptionsInfo["AlertsMode"],
1048 MinAlertsCount=OptionsInfo["MinAlertsCount"],
1049 NitrogenLonePairAllowHydrogenNbrs=OptionsInfo["NitrogenLonePairParams"]["AllowHydrogenNbrs"],
1050 NitrogenLonePairPlanarityTolerance=OptionsInfo["NitrogenLonePairParams"]["PlanarityTolerance"],
1051 RotBondsSMARTSMode=OptionsInfo["RotBondsSMARTSMode"],
1052 RotBondsSMARTSPattern=OptionsInfo["RotBondsSMARTSPattern"],
1053 TorsionLibraryFilePath=OptionsInfo["TorsionLibraryFile"],
1054 )
1055 except Exception as ErrMsg:
1056 MiscUtil.PrintError("Failed to instantiate TorsionLibraryAlerts:\n%s\n" % (ErrMsg))
1057
1058 if not Quiet:
1059 MiscUtil.PrintInfo(
1060 "\nRetrieving data from library file %s..." % TorsionLibraryAlertsHandle.GetTorsionLibraryFilePath()
1061 )
1062 TorsionLibraryAlertsHandle.ListTorsionLibraryInfo()
1063
1064 return TorsionLibraryAlertsHandle
1065
1066
1067 def ProcessRotatableBondsSMARTSMode():
1068 """ "Process SMARTS pattern for rotatable bonds."""
1069
1070 RotBondsMode = OptionsInfo["RotBondsSMARTSMode"]
1071
1072 RotBondsSMARTSPattern = None
1073 RotBondsSMARTSPatternSpecified = OptionsInfo["RotBondsSMARTSPatternSpecified"]
1074
1075 if re.match("^(NonStrict|SemiStrict|Strict)$", RotBondsMode, re.I):
1076 RotBondsSMARTSPattern = None
1077 elif re.match("Specify", RotBondsMode, re.I):
1078 RotBondsSMARTSPatternSpecified = RotBondsSMARTSPatternSpecified.strip()
1079 if not len(RotBondsSMARTSPatternSpecified):
1080 MiscUtil.PrintError(
1081 'Empty value specified for SMILES/SMARTS pattern in "--rotBondsSMARTSPattern" option, %s.'
1082 % RotBondsMode
1083 )
1084
1085 RotBondsPatternMol = Chem.MolFromSmarts(RotBondsSMARTSPatternSpecified)
1086 if RotBondsPatternMol is None:
1087 MiscUtil.PrintError(
1088 'Failed to create rotatable bonds pattern molecule. The rotatable bonds SMARTS pattern, "%s", specified using "--rotBondsSMARTSPattern" option is not valid.'
1089 % (RotBondsSMARTSPatternSpecified)
1090 )
1091 else:
1092 MiscUtil.PrintError(
1093 'The value, %s, specified for option "-r, --rotBondsSMARTSMode" is not valid. ' % RotBondsMode
1094 )
1095
1096 OptionsInfo["RotBondsSMARTSPattern"] = RotBondsSMARTSPattern
1097
1098
1099 def ProcessSDFieldLabelsOption():
1100 """Process SD data field label option."""
1101
1102 ParamsOptionName = "--outfileSDFieldLabels"
1103 ParamsOptionValue = Options["--outfileSDFieldLabels"]
1104
1105 ParamsIDsToLabels = {
1106 "RotBondsCountLabel": "RotBondsCount",
1107 "TorsionAlertsCountLabel": "TorsionAlertsCount (Green Orange Red)",
1108 "TorsionAlertsLabel": "TorsionAlerts (RotBondIndices TorsionAlert TorsionIndices TorsionAngle TorsionAngleViolation HierarchyClass HierarchySubClass TorsionPeaks Tolerances1 Tolerances2 TorsionRule)",
1109 "TorsionRuleLabel": "TorsionRule (HierarchyClass HierarchySubClass TorsionPeaks Tolerances1 Tolerances2 TorsionRule)",
1110 "TorsionRuleAlertsCountLabel": "TorsionRuleAlertsCount (Green Orange Red)",
1111 "TorsionRuleAlertsLabel": "TorsionRuleAlerts (RotBondIndices TorsionAlert TorsionIndices TorsionAngle TorsionAngleViolation)",
1112 "TorsionRuleMaxAngleViolationLabel": "TorsionRuleMaxAngleViolation",
1113 }
1114
1115 if re.match("^auto$", ParamsOptionValue, re.I):
1116 OptionsInfo["SDFieldIDsToLabels"] = ParamsIDsToLabels
1117 return
1118
1119 # Setup a canonical paramater names...
1120 ValidParamNames = []
1121 CanonicalParamNamesMap = {}
1122 for ParamName in sorted(ParamsIDsToLabels):
1123 ValidParamNames.append(ParamName)
1124 CanonicalParamNamesMap[ParamName.lower()] = ParamName
1125
1126 ParamsOptionValue = ParamsOptionValue.strip()
1127 if not ParamsOptionValue:
1128 MiscUtil.PrintError('No valid parameter name and value pairs specified using "%s" option' % ParamsOptionName)
1129
1130 ParamsOptionValueWords = ParamsOptionValue.split(",")
1131 if len(ParamsOptionValueWords) % 2:
1132 MiscUtil.PrintError(
1133 'The number of comma delimited paramater names and values, %d, specified using "%s" option must be an even number.'
1134 % (len(ParamsOptionValueWords), ParamsOptionName)
1135 )
1136
1137 # Validate paramater name and value pairs...
1138 for Index in range(0, len(ParamsOptionValueWords), 2):
1139 Name = ParamsOptionValueWords[Index].strip()
1140 Value = ParamsOptionValueWords[Index + 1].strip()
1141
1142 CanonicalName = Name.lower()
1143 if CanonicalName not in CanonicalParamNamesMap:
1144 MiscUtil.PrintError(
1145 'The parameter name, %s, specified using "%s" is not a valid name. Supported parameter names: %s'
1146 % (Name, ParamsOptionName, " ".join(ValidParamNames))
1147 )
1148
1149 ParamName = CanonicalParamNamesMap[CanonicalName]
1150 ParamValue = Value
1151
1152 # Set value...
1153 ParamsIDsToLabels[ParamName] = ParamValue
1154
1155 OptionsInfo["SDFieldIDsToLabels"] = ParamsIDsToLabels
1156
1157
1158 def ProcessOptionNitrogenLonePairParameters():
1159 """Process nitrogen lone pair parameters option."""
1160
1161 ParamsOptionName = "--nitrogenLonePairParams"
1162 ParamsOptionValue = Options["--nitrogenLonePairParams"]
1163
1164 ParamsInfo = {
1165 "AllowHydrogenNbrs": True,
1166 "PlanarityTolerance": 1.0,
1167 }
1168
1169 if re.match("^auto$", ParamsOptionValue, re.I):
1170 OptionsInfo["NitrogenLonePairParams"] = ParamsInfo
1171 return
1172
1173 # Setup a canonical paramater names...
1174 ValidParamNames = []
1175 CanonicalParamNamesMap = {}
1176 for ParamName in sorted(ParamsInfo):
1177 ValidParamNames.append(ParamName)
1178 CanonicalParamNamesMap[ParamName.lower()] = ParamName
1179
1180 ParamsOptionValue = ParamsOptionValue.strip()
1181 if not ParamsOptionValue:
1182 MiscUtil.PrintError('No valid parameter name and value pairs specified using "%s" option' % ParamsOptionName)
1183
1184 ParamsOptionValueWords = ParamsOptionValue.split(",")
1185 if len(ParamsOptionValueWords) % 2:
1186 MiscUtil.PrintError(
1187 'The number of comma delimited paramater names and values, %d, specified using "%s" option must be an even number.'
1188 % (len(ParamsOptionValueWords), ParamsOptionName)
1189 )
1190
1191 # Validate paramater name and value pairs...
1192 for Index in range(0, len(ParamsOptionValueWords), 2):
1193 Name = ParamsOptionValueWords[Index].strip()
1194 Value = ParamsOptionValueWords[Index + 1].strip()
1195
1196 CanonicalName = Name.lower()
1197 if CanonicalName not in CanonicalParamNamesMap:
1198 MiscUtil.PrintError(
1199 'The parameter name, %s, specified using "%s" is not a valid name. Supported parameter names: %s'
1200 % (Name, ParamsOptionName, " ".join(ValidParamNames))
1201 )
1202
1203 ParamName = CanonicalParamNamesMap[CanonicalName]
1204 ParamValue = Value
1205
1206 if re.match("^PlanarityTolerance$", ParamName, re.I):
1207 Value = float(Value)
1208 if Value < 0:
1209 MiscUtil.PrintError(
1210 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: >= 0'
1211 % (Value, Name, ParamsOptionName)
1212 )
1213 ParamValue = Value
1214 elif re.match("^AllowHydrogenNbrs$", ParamName, re.I):
1215 if not re.match("^(yes|no)$", Value, re.I):
1216 MiscUtil.PrintError(
1217 'The parameter value, %s, specified for parameter name, %s, using "%s" option is not a valid value. Supported values: yes or no'
1218 % (Value, Name, ParamsOptionName)
1219 )
1220 ParamValue = True if re.match("^yes$", Value, re.I) else False
1221
1222 # Set value...
1223 ParamsInfo[ParamName] = ParamValue
1224
1225 OptionsInfo["NitrogenLonePairParams"] = ParamsInfo
1226
1227
1228 def ProcessOptions():
1229 """Process and validate command line arguments and options."""
1230
1231 MiscUtil.PrintInfo("Processing options...")
1232
1233 # Validate options...
1234 ValidateOptions()
1235
1236 OptionsInfo["Infile"] = Options["--infile"]
1237 ParamsDefaultInfoOverride = {"RemoveHydrogens": False}
1238 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters(
1239 "--infileParams",
1240 Options["--infileParams"],
1241 InfileName=Options["--infile"],
1242 ParamsDefaultInfo=ParamsDefaultInfoOverride,
1243 )
1244
1245 OptionsInfo["Outfile"] = Options["--outfile"]
1246 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters(
1247 "--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]
1248 )
1249
1250 FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"])
1251 OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt)
1252 OptionsInfo["OutfileFiltered"] = OutfileFiltered
1253 OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False
1254
1255 OutfileSummary = "%s_AlertsSummary.csv" % (FileName)
1256 OptionsInfo["OutfileSummary"] = OutfileSummary
1257 OptionsInfo["OutfileSummaryMode"] = True if re.match("^yes$", Options["--outfileSummary"], re.I) else False
1258
1259 OptionsInfo["OutfilesFilteredByRulesMode"] = (
1260 True if re.match("^yes$", Options["--outfilesFilteredByRules"], re.I) else False
1261 )
1262 OptionsInfo["TrackAlertsSummaryInfo"] = (
1263 True if (OptionsInfo["OutfileSummaryMode"] or OptionsInfo["OutfilesFilteredByRulesMode"]) else False
1264 )
1265
1266 OutfilesFilteredByRulesMaxCount = Options["--outfilesFilteredByRulesMaxCount"]
1267 if not re.match("^All$", OutfilesFilteredByRulesMaxCount, re.I):
1268 OutfilesFilteredByRulesMaxCount = int(OutfilesFilteredByRulesMaxCount)
1269 OptionsInfo["OutfilesFilteredByRulesMaxCount"] = OutfilesFilteredByRulesMaxCount
1270 OptionsInfo["OutfilesFilteredByRulesAllMode"] = (
1271 True if re.match("^All$", Options["--outfilesFilteredByRulesMaxCount"], re.I) else False
1272 )
1273
1274 OptionsInfo["OutfileAlerts"] = True if re.match("^yes$", Options["--outfileAlerts"], re.I) else False
1275
1276 if re.match("^yes$", Options["--outfilesFilteredByRules"], re.I):
1277 if not re.match("^yes$", Options["--outfileAlerts"], re.I):
1278 MiscUtil.PrintError(
1279 'The value "%s" specified for "--outfilesFilteredByRules" option is not valid. The specified value is only allowed during "yes" value of "--outfileAlerts" option.'
1280 % (Options["--outfilesFilteredByRules"])
1281 )
1282
1283 OptionsInfo["OutfileAlertsMode"] = Options["--outfileAlertsMode"]
1284 OptionsInfo["OutfileAlertsOnly"] = True if re.match("^AlertsOnly$", Options["--outfileAlertsMode"], re.I) else False
1285
1286 ProcessSDFieldLabelsOption()
1287
1288 OptionsInfo["Overwrite"] = Options["--overwrite"]
1289 OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False
1290
1291 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False
1292 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"])
1293
1294 ProcessOptionNitrogenLonePairParameters()
1295
1296 OptionsInfo["AlertsMode"] = Options["--alertsMode"]
1297 OptionsInfo["SpecifiedAlertsModeList"] = []
1298 if re.match("^Red$", Options["--alertsMode"], re.I):
1299 OptionsInfo["SpecifiedAlertsModeList"].append("Red")
1300 elif re.match("^RedAndOrange$", Options["--alertsMode"], re.I):
1301 OptionsInfo["SpecifiedAlertsModeList"].append("Red")
1302 OptionsInfo["SpecifiedAlertsModeList"].append("Orange")
1303
1304 OptionsInfo["MinAlertsCount"] = int(Options["--alertsMinCount"])
1305
1306 OptionsInfo["RotBondsSMARTSMode"] = Options["--rotBondsSMARTSMode"]
1307 OptionsInfo["RotBondsSMARTSPatternSpecified"] = Options["--rotBondsSMARTSPattern"]
1308 ProcessRotatableBondsSMARTSMode()
1309
1310 OptionsInfo["TorsionLibraryFile"] = Options["--torsionLibraryFile"]
1311
1312 # Setup delimiter for writing out torsion alert information to output files...
1313 OptionsInfo["IntraSetValuesDelim"] = ","
1314 OptionsInfo["InterSetValuesDelim"] = " "
1315
1316
1317 def RetrieveOptions():
1318 """Retrieve command line arguments and options."""
1319
1320 # Get options...
1321 global Options
1322 Options = docopt(_docoptUsage_)
1323
1324 # Set current working directory to the specified directory...
1325 WorkingDir = Options["--workingdir"]
1326 if WorkingDir:
1327 os.chdir(WorkingDir)
1328
1329 # Handle examples option...
1330 if "--examples" in Options and Options["--examples"]:
1331 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
1332 sys.exit(0)
1333
1334
1335 def ProcessListTorsionLibraryOption():
1336 """Process list torsion library information."""
1337
1338 # Validate and process dataFile option for listing torsion library information...
1339 OptionsInfo["TorsionLibraryFile"] = Options["--torsionLibraryFile"]
1340 if not re.match("^auto$", Options["--torsionLibraryFile"], re.I):
1341 MiscUtil.ValidateOptionFilePath("-t, --torsionLibraryFile", Options["--torsionLibraryFile"])
1342
1343 # Instantiate TorsionLibraryAlerts using defaults...
1344 TorsionLibraryAlertsHandle = TorsionLibraryAlerts(TorsionLibraryFilePath=OptionsInfo["TorsionLibraryFile"])
1345 MiscUtil.PrintInfo(
1346 "\nRetrieving data from torsion library file %s..." % TorsionLibraryAlertsHandle.GetTorsionLibraryFilePath()
1347 )
1348 TorsionLibraryAlertsHandle.ListTorsionLibraryInfo()
1349
1350
1351 def ValidateOptions():
1352 """Validate option values."""
1353
1354 MiscUtil.ValidateOptionTextValue("-a, --alertsMode", Options["--alertsMode"], "Red RedAndOrange")
1355 MiscUtil.ValidateOptionIntegerValue("--alertsMinCount", Options["--alertsMinCount"], {">=": 1})
1356
1357 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
1358 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol")
1359
1360 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd")
1361 if re.match("^filter$", Options["--mode"], re.I):
1362 MiscUtil.ValidateOptionsOutputFileOverwrite(
1363 "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
1364 )
1365 MiscUtil.ValidateOptionsDistinctFileNames(
1366 "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]
1367 )
1368
1369 MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no")
1370
1371 MiscUtil.ValidateOptionTextValue("--outfilesFilteredByRules", Options["--outfilesFilteredByRules"], "yes no")
1372 if not re.match("^All$", Options["--outfilesFilteredByRulesMaxCount"], re.I):
1373 MiscUtil.ValidateOptionIntegerValue(
1374 "--outfilesFilteredByRulesMaxCount", Options["--outfilesFilteredByRulesMaxCount"], {">": 0}
1375 )
1376
1377 MiscUtil.ValidateOptionTextValue("--outfileSummary", Options["--outfileSummary"], "yes no")
1378 MiscUtil.ValidateOptionTextValue("--outfileAlerts", Options["--outfileAlerts"], "yes no")
1379 MiscUtil.ValidateOptionTextValue("--outfileAlertsMode", Options["--outfileAlertsMode"], "All AlertsOnly")
1380
1381 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count")
1382 if re.match("^filter$", Options["--mode"], re.I):
1383 if not Options["--outfile"]:
1384 MiscUtil.PrintError(
1385 'The outfile must be specified using "-o, --outfile" during "filter" value of "-m, --mode" option'
1386 )
1387
1388 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no")
1389
1390 MiscUtil.ValidateOptionTextValue(
1391 "-r, --rotBondsSMARTSMode", Options["--rotBondsSMARTSMode"], "NonStrict SemiStrict Strict Specify"
1392 )
1393 if re.match("^Specify$", Options["--rotBondsSMARTSMode"], re.I):
1394 if not Options["--rotBondsSMARTSPattern"]:
1395 MiscUtil.PrintError(
1396 'The SMARTS pattern must be specified using "--rotBondsSMARTSPattern" during "Specify" value of "-r, --rotBondsSMARTS" option'
1397 )
1398
1399 if not re.match("^auto$", Options["--torsionLibraryFile"], re.I):
1400 MiscUtil.ValidateOptionFilePath("-t, --torsionLibraryFile", Options["--torsionLibraryFile"])
1401
1402
1403 # Setup a usage string for docopt...
1404 _docoptUsage_ = """
1405 RDKitFilterTorsionLibraryAlerts.py - Filter torsion library alerts
1406
1407 Usage:
1408 RDKitFilterTorsionLibraryAlerts.py [--alertsMode <Red, RedAndOrange>] [--alertsMinCount <Number>]
1409 [--infileParams <Name,Value,...>] [--mode <filter or count>] [--mp <yes or no>] [--mpParams <Name,Value,...>]
1410 [--nitrogenLonePairParams <Name,Value,...>] [--outfileAlerts <yes or no>]
1411 [--outfileAlertsMode <All or AlertsOnly>] [--outfileFiltered <yes or no>]
1412 [--outfilesFilteredByRules <yes or no>] [--outfilesFilteredByRulesMaxCount <All or number>]
1413 [--outfileSummary <yes or no>] [--outfileSDFieldLabels <Type,Label,...>]
1414 [--outfileParams <Name,Value,...>] [--overwrite] [ --rotBondsSMARTSMode <NonStrict, SemiStrict,...>]
1415 [--rotBondsSMARTSPattern <SMARTS>] [--torsionLibraryFile <FileName or auto>] [-w <dir>] -i <infile> -o <outfile>
1416 RDKitFilterTorsionLibraryAlerts.py [--torsionLibraryFile <FileName or auto>] -l | --list
1417 RDKitFilterTorsionLibraryAlerts.py -h | --help | -e | --examples
1418
1419 Description:
1420 Filter strained molecules from an input file for torsion library [ Ref 146, 152, 159 ]
1421 alerts by matching rotatable bonds against SMARTS patterns specified for torsion
1422 rules in a torsion library file and write out appropriate molecules to output
1423 files. The molecules must have 3D coordinates in input file.
1424
1425 The default torsion library file, TorsionLibrary.xml, is available under
1426 MAYACHEMTOOLS/lib/python/TorsionAlerts directory.
1427
1428 The data in torsion library file is organized in a hierarchical manner. It consists
1429 of one generic class and six specific classes at the highest level. Each class
1430 contains multiple subclasses corresponding to named functional groups or
1431 substructure patterns. The subclasses consist of torsion rules sorted from
1432 specific to generic torsion patterns. The torsion rule, in turn, contains a list
1433 of peak values for torsion angles and two tolerance values. A pair of tolerance
1434 values define torsion bins around a torsion peak value. For example:
1435
1436 <library>
1437 <hierarchyClass name="GG" id1="G" id2="G">
1438 ...
1439 </hierarchyClass>
1440 <hierarchyClass name="CO" id1="C" id2="O">
1441 <hierarchySubClass name="Ester bond I" smarts="O=[C:2][O:3]">
1442 <torsionRule smarts="[O:1]=[C:2]!@[O:3]~[CH0:4]">
1443 <angleList>
1444 <angle value="0.0" tolerance1="20.00"
1445 tolerance2="25.00" score="56.52"/>
1446 </angleList>
1447 </torsionRule>
1448 ...
1449 ...
1450 ...
1451 </hierarchyClass>
1452 <hierarchyClass name="NC" id1="N" id2="C">
1453 ...
1454 </hierarchyClass>
1455 <hierarchyClass name="SN" id1="S" id2="N">
1456 ...
1457 </hierarchyClass>
1458 <hierarchyClass name="CS" id1="C" id2="S">
1459 ...
1460 </hierarchyClass>
1461 <hierarchyClass name="CC" id1="C" id2="C">
1462 ...
1463 </hierarchyClass>
1464 <hierarchyClass name="SS" id1="S" id2="S">
1465 ...
1466 </hierarchyClass>
1467 </library>
1468
1469 The rotatable bonds in a 3D molecule are identified using a default SMARTS pattern.
1470 A custom SMARTS pattern may be optionally specified to detect rotatable bonds.
1471 Each rotatable bond is matched to a torsion rule in the torsion library and
1472 assigned one of the following three alert categories: Green, Orange or Red. The
1473 rotatable bond is marked Green or Orange for the measured angle of the torsion
1474 pattern within the first or second tolerance bins around a torsion peak.
1475 Otherwise, it's marked Red implying that the measured angle is not observed in
1476 the structure databases employed to generate the torsion library.
1477
1478 The following output files are generated after the filtering:
1479
1480 <OutfileRoot>.sdf
1481 <OutfileRoot>_Filtered.sdf
1482 <OutfileRoot>_AlertsSummary.csv
1483 <OutfileRoot>_Filtered_TopRule*.sdf
1484
1485 The supported input file formats are: Mol (.mol), SD (.sdf, .sd)
1486
1487 The supported output file formats are: SD (.sdf, .sd)
1488
1489 Options:
1490 -a, --alertsMode <Red, RedAndOrange> [default: Red]
1491 Torsion library alert types to use for filtering molecules containing
1492 rotatable bonds marked with Green, Orange, or Red alerts. Possible
1493 values: Red or RedAndOrange.
1494 --alertsMinCount <Number> [default: 1]
1495 Minimum number of rotatable bond alerts in a molecule for filtering the
1496 molecule.
1497 -e, --examples
1498 Print examples.
1499 -h, --help
1500 Print this help message.
1501 -i, --infile <infile>
1502 Input file name.
1503 --infileParams <Name,Value,...> [default: auto]
1504 A comma delimited list of parameter name and value pairs for reading
1505 molecules from files. The supported parameter names for different file
1506 formats, along with their default values, are shown below:
1507
1508 SD, MOL: removeHydrogens,no,sanitize,yes,strictParsing,yes
1509
1510 -l, --list
1511 List torsion library information without performing any filtering.
1512 -m, --mode <filter or count> [default: filter]
1513 Specify whether to filter molecules for torsion library [ Ref 146, 152, 159 ] alerts
1514 by matching rotatable bonds against SMARTS patterns specified for torsion
1515 rules and write out the rest of the molecules to an outfile or simply count
1516 the number of matched molecules marked for filtering.
1517 --mp <yes or no> [default: no]
1518 Use multiprocessing.
1519
1520 By default, input data is retrieved in a lazy manner via mp.Pool.imap()
1521 function employing lazy RDKit data iterable. This allows processing of
1522 arbitrary large data sets without any additional requirements memory.
1523
1524 All input data may be optionally loaded into memory by mp.Pool.map()
1525 before starting worker processes in a process pool by setting the value
1526 of 'inputDataMode' to 'InMemory' in '--mpParams' option.
1527
1528 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input
1529 data mode may adversely impact the performance. The '--mpParams' section
1530 provides additional information to tune the value of 'chunkSize'.
1531 --mpParams <Name,Value,...> [default: auto]
1532 A comma delimited list of parameter name and value pairs to configure
1533 multiprocessing.
1534
1535 The supported parameter names along with their default and possible
1536 values are shown below:
1537
1538 chunkSize, auto
1539 inputDataMode, Lazy [ Possible values: InMemory or Lazy ]
1540 numProcesses, auto [ Default: mp.cpu_count() ]
1541
1542 These parameters are used by the following functions to configure and
1543 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and
1544 mp.Pool.imap().
1545
1546 The chunkSize determines chunks of input data passed to each worker
1547 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions.
1548 The default value of chunkSize is dependent on the value of 'inputDataMode'.
1549
1550 The mp.Pool.map() function, invoked during 'InMemory' input data mode,
1551 automatically converts RDKit data iterable into a list, loads all data into
1552 memory, and calculates the default chunkSize using the following method
1553 as shown in its code:
1554
1555 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4)
1556 if extra: chunkSize += 1
1557
1558 For example, the default chunkSize will be 7 for a pool of 4 worker processes
1559 and 100 data items.
1560
1561 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs
1562 'lazy' RDKit data iterable to retrieve data as needed, without loading all the
1563 data into memory. Consequently, the size of input data is not known a priori.
1564 It's not possible to estimate an optimal value for the chunkSize. The default
1565 chunkSize is set to 1.
1566
1567 The default value for the chunkSize during 'Lazy' data mode may adversely
1568 impact the performance due to the overhead associated with exchanging
1569 small chunks of data. It is generally a good idea to explicitly set chunkSize to
1570 a larger value during 'Lazy' input data mode, based on the size of your input
1571 data and number of processes in the process pool.
1572
1573 The mp.Pool.map() function waits for all worker processes to process all
1574 the data and return the results. The mp.Pool.imap() function, however,
1575 returns the the results obtained from worker processes as soon as the
1576 results become available for specified chunks of data.
1577
1578 The order of data in the results returned by both mp.Pool.map() and
1579 mp.Pool.imap() functions always corresponds to the input data.
1580 -n, --nitrogenLonePairParams <Name,Value,...> [default: auto]
1581 A comma delimited list of parameter name and value pairs to match
1582 torsion SMARTS patterns containing non-standard construct 'N_lp'
1583 corresponding to nitrogen lone pair.
1584
1585 The supported parameter names along with their default and possible
1586 values are shown below:
1587
1588 allowHydrogenNbrs, yes [ Possible values: yes or no ]
1589 planarityTolerance, 1 [Possible values: >=0]
1590
1591 These parameters are used during the matching of torsion rules containing
1592 'N_lp' in their SMARTS patterns. The 'allowHydrogensNbrs' allows the use
1593 hydrogen neighbors attached to nitrogen during the determination of its
1594 planarity. The 'planarityTolerance' in degrees represents the tolerance
1595 allowed for nitrogen to be considered coplanar with its three neighbors.
1596
1597 The torsion rules containing 'N_lp' in their SMARTS patterns are categorized
1598 into the following two types of rules:
1599
1600 TypeOne:
1601
1602 [CX4:1][CX4H2:2]!@[NX3;"N_lp":3][CX4:4]
1603 [C:1][CX4H2:2]!@[NX3;"N_lp":3][C:4]
1604 ... ... ...
1605
1606 TypeTwo:
1607
1608 [!#1:1][CX4:2]!@[NX3;"N_lp":3]
1609 [C:1][$(S(=O)=O):2]!@["N_lp":3]
1610 ... ... ...
1611
1612 The torsions are matched to torsion rules containing 'N_lp' using specified
1613 SMARTS patterns without the 'N_lp' along with additional constraints using
1614 the following methodology:
1615
1616 TypeOne:
1617
1618 . SMARTS pattern must contain four mapped atoms and the third
1619 mapped atom must be a nitrogen matched with 'NX3:3'
1620 . Nitrogen atom must have 3 neighbors. The 'allowHydrogens'
1621 parameter controls inclusion of hydrogens as its neighbors.
1622 . Nitrogen atom and its 3 neighbors must be coplanar.
1623 'planarityTolerance' parameter provides tolerance in degrees
1624 for nitrogen to be considered coplanar with its 3 neighbors.
1625
1626 TypeTwo:
1627
1628 . SMARTS pattern must contain three mapped atoms and the third
1629 mapped atom must be a nitrogen matched with 'NX3:3'. The
1630 third mapped atom may contain only 'N_lp:3' The missing 'NX3'
1631 is automatically detected.
1632 . Nitrogen atom must have 3 neighbors. 'allowHydrogens'
1633 parameter controls inclusion of hydrogens as neighbors.
1634 . Nitrogen atom and its 3 neighbors must not be coplanar.
1635 'planarityTolerance' parameter provides tolerance in degrees
1636 for nitrogen to be considered coplanar with its 3 neighbors.
1637 . Nitrogen lone pair position equivalent to VSEPR theory is
1638 determined based on the position of nitrogen and its neighbors.
1639 A vector normal to 3 nitrogen neighbors is calculated and added
1640 to the coordinates of nitrogen atom to determine the approximate
1641 position of the lone pair. It is used as the fourth position to
1642 calculate the torsion angle.
1643
1644 -o, --outfile <outfile>
1645 Output file name.
1646 --outfileAlerts <yes or no> [default: yes]
1647 Write out alerts information to SD output files.
1648 --outfileAlertsMode <All or AlertsOnly> [default: AlertsOnly]
1649 Write alerts information to SD output files for all alerts or only for alerts
1650 specified by '--AlertsMode' option. Possible values: All or AlertsOnly
1651 This option is only valid for 'Yes' value of '--outfileAlerts' option.
1652
1653 The following alerts information is added to SD output files using
1654 'TorsionAlerts' data field:
1655
1656 RotBondIndices TorsionAlert TorsionIndices TorsionAngle
1657 TorsionAngleViolation HierarchyClass HierarchySubClass
1658 TorsionRule TorsionPeaks Tolerances1 Tolerances2
1659
1660 The 'RotBondsCount' and 'TorsionAlertsCount' data fields are always added
1661 to SD output files containing both remaining and filtered molecules.
1662
1663 Format:
1664
1665 > <RotBondsCount>
1666 Number
1667
1668 > <TorsionAlertsCount (Green Orange Red)>
1669 Number Number Number
1670
1671 > <TorsionAlerts (RotBondIndices TorsionAlert TorsionIndices
1672 TorsionAngle TorsionAngleViolation HierarchyClass
1673 HierarchySubClass TorsionPeaks Tolerances1 Tolerances2
1674 TorsionRule)>
1675 AtomIndex2,AtomIndex3 AlertType AtomIndex1,AtomIndex2,AtomIndex3,
1676 AtomIndex4 Angle AngleViolation ClassName SubClassName
1677 CommaDelimPeakValues CommaDelimTol1Values CommDelimTol2Values
1678 SMARTS ... ... ...
1679 ... ... ...
1680
1681 A set of 11 values is written out as value of 'TorsionAlerts' data field for
1682 each torsion in a molecule. The space character is used as a delimiter
1683 to separate values with in a set and across set. The comma character
1684 is used to delimit multiple values for each value in a set.
1685
1686 The 'RotBondIndices' and 'TorsionIndices' contain 2 and 4 comma delimited
1687 values representing atom indices for a rotatable bond and matched torsion.
1688 The 'TorsionPeaks', 'Tolerances1', and 'Tolerances2' contain same number
1689 of comma delimited values corresponding to torsion angle peaks and
1690 tolerance intervals specified in torsion library. For example:
1691
1692 ... ... ...
1693 > <RotBondsCount> (1)
1694 7
1695
1696 > <TorsionAlertsCount (Green Orange Red)> (1)
1697 3 2 2
1698
1699 > <TorsionAlerts (RotBondIndices TorsionAlert TorsionIndices
1700 TorsionAngle TorsionAngleViolation HierarchyClass
1701 HierarchySubClass TorsionPeaks Tolerances1 Tolerances2
1702 TorsionRule)>
1703 1,2 Red 32,2,1,0 0.13 149.87 NC Anilines 180.0 10.0 30.0 [cH0:1][c:2]
1704 ([cH,nX2H0])!@[NX3H1:3][CX4:4] 8,9 Red 10,9,8,28 -0.85 GG
1705 None -90.0,90.0 30.0,30.0 60.0,60.0 [cH1:1][a:2]([cH1])!@[a:3]
1706 ([cH0])[cH0:4]
1707 ... ... ...
1708
1709 --outfileFiltered <yes or no> [default: yes]
1710 Write out a file containing filtered molecules. Its name is automatically
1711 generated from the specified output file. Default: <OutfileRoot>_
1712 Filtered.<OutfileExt>.
1713 --outfilesFilteredByRules <yes or no> [default: yes]
1714 Write out SD files containing filtered molecules for individual torsion
1715 rules triggering alerts in molecules. The name of SD files are automatically
1716 generated from the specified output file. Default file names: <OutfileRoot>_
1717 Filtered_TopRule*.sdf
1718
1719 The following alerts information is added to SD output files:
1720
1721 > <RotBondsCount>
1722 Number
1723
1724 > <TorsionAlertsCount (Green Orange Red)>
1725 Number Number Number
1726
1727 > <TorsionRule (HierarchyClass HierarchySubClass TorsionPeaks
1728 Tolerances1 Tolerances2 TorsionRule)>
1729 ClassName SubClassName CommaDelimPeakValues CommaDelimTol1Values
1730 CommDelimTol2Values SMARTS ... ... ...
1731 ... ... ...
1732
1733 > <TorsionRuleAlertsCount (Green Orange Red)>
1734 Number Number Number
1735
1736 > <TorsionRuleAlerts (RotBondIndices TorsionAlert TorsionIndices
1737 TorsionAngle TorsionAngleViolation)>
1738 AtomIndex2,AtomIndex3 AlertType AtomIndex1,AtomIndex2,AtomIndex3,
1739 AtomIndex4 Angle AngleViolation ... ... ...
1740
1741 > <TorsionRuleMaxAngleViolation>
1742 Number
1743 ... ... ...
1744
1745 For example:
1746
1747 ... ... ...
1748 > <RotBondsCount> (1)
1749 7
1750
1751 > <TorsionAlertsCount (Green Orange Red)> (1)
1752 3 2 2
1753
1754 > <TorsionRule (HierarchyClass HierarchySubClass TorsionPeaks
1755 Tolerances1 Tolerances2 TorsionRule)> (1)
1756 NC Anilines 180.0 10.0 30.0 [cH0:1][c:2]([cH,nX2H0])!@[NX3H1:3][CX4:4]
1757
1758 > <TorsionRuleAlertsCount (Green Orange Red)> (1)
1759 0 0 1
1760
1761 > <TorsionRuleAlerts (RotBondIndices TorsionAlert TorsionIndices
1762 TorsionAngle TorsionAngleViolation)> (1)
1763 1,2 Red 32,2,1,0 0.13 149.87
1764
1765 > <TorsionRuleMaxAngleViolation> (1)
1766 149.87
1767 ... ... ...
1768
1769 --outfilesFilteredByRulesMaxCount <All or number> [default: 10]
1770 Write out SD files containing filtered molecules for specified number of
1771 top N torsion rules triggering alerts for the largest number of molecules
1772 or for all torsion rules triggering alerts across all molecules.
1773 --outfileSummary <yes or no> [default: yes]
1774 Write out a CVS text file containing summary of torsions rules responsible
1775 for triggering torsion alerts. Its name is automatically generated from the
1776 specified output file. Default: <OutfileRoot>_AlertsSummary.csv.
1777
1778 The following alerts information is written to summary text file:
1779
1780 TorsionRule, TorsionPeaks, Tolerances1, Tolerances2,
1781 HierarchyClass, HierarchySubClass, TorsionAlertType,
1782 TorsionAlertCount, TorsionAlertMolCount
1783
1784 The double quotes characters are removed from SMART patterns before
1785 before writing them to a CSV file. In addition, the torsion rules are sorted by
1786 TorsionAlertMolCount. For example:
1787
1788 "TorsionRule","TorsionPeaks","Tolerances1","Tolerances2",
1789 "HierarchyClass","HierarchySubClass","TorsionAlertTypes",
1790 "TorsionAlertCount","TorsionAlertMolCount"
1791 "[!#1:1][CX4H2:2]!@[CX4H2:3][!#1:4]","-60.0,60.0,180.0",
1792 "20.0,20.0,20.0","30.0,30.0,30.0","CC","None/[CX4:2][CX4:3]",
1793 "Red","16","11"
1794 ... ... ...
1795
1796 --outfileSDFieldLabels <Type,Label,...> [default: auto]
1797 A comma delimited list of SD data field type and label value pairs for writing
1798 torsion alerts information along with molecules to SD files.
1799
1800 The supported SD data field label type along with their default values are
1801 shown below:
1802
1803 For all SD files:
1804
1805 RotBondsCountLabel, RotBondsCount
1806 TorsionAlertsCountLabel, TorsionAlertsCount (Green Orange Red)
1807 TorsionAlertsLabel, TorsionAlerts (RotBondIndices TorsionAlert
1808 TorsionIndices TorsionAngle TorsionAngleViolation
1809 HierarchyClass HierarchySubClass TorsionPeaks Tolerances1
1810 Tolerances2 TorsionRule)
1811
1812 For individual SD files filtered by torsion rules:
1813
1814 TorsionRuleLabel, TorsionRule (HierarchyClass HierarchySubClass
1815 TorsionPeaks Tolerances1 Tolerances2 TorsionRule)
1816 TorsionRuleAlertsCountLabel, TorsionRuleAlertsCount (Green Orange
1817 Red)
1818 TorsionRuleAlertsLabel, TorsionRuleAlerts (RotBondIndices
1819 TorsionAlert TorsionIndices TorsionAngle TorsionAngleViolation)
1820 TorsionRuleMaxAngleViolationLabel, TorsionRuleMaxAngleViolation
1821
1822 --outfileParams <Name,Value,...> [default: auto]
1823 A comma delimited list of parameter name and value pairs for writing
1824 molecules to files. The supported parameter names for different file
1825 formats, along with their default values, are shown below:
1826
1827 SD: kekulize,yes,forceV3000,no
1828
1829 --overwrite
1830 Overwrite existing files.
1831 -r, --rotBondsSMARTSMode <NonStrict, SemiStrict,...> [default: SemiStrict]
1832 SMARTS pattern to use for identifying rotatable bonds in a molecule
1833 for matching against torsion rules in the torsion library. Possible values:
1834 NonStrict, SemiStrict, Strict or Specify. The rotatable bond SMARTS matches
1835 are filtered to ensure that each atom in the rotatable bond is attached to
1836 at least two heavy atoms.
1837
1838 The following SMARTS patterns are used to identify rotatable bonds for
1839 different modes:
1840
1841 NonStrict: [!$(*#*)&!D1]-&!@[!$(*#*)&!D1]
1842
1843 SemiStrict:
1844 [!$(*#*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)
1845 &!$(C([CH3])([CH3])[CH3])]-!@[!$(*#*)&!D1&!$(C(F)(F)F)
1846 &!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])[CH3])]
1847
1848 Strict:
1849 [!$(*#*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)
1850 &!$(C([CH3])([CH3])[CH3])&!$([CD3](=[N,O,S])-!@[#7,O,S!D1])
1851 &!$([#7,O,S!D1]-!@[CD3]=[N,O,S])&!$([CD3](=[N+])-!@[#7!D1])
1852 &!$([#7!D1]-!@[CD3]=[N+])]-!@[!$(*#*)&!D1&!$(C(F)(F)F)
1853 &!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])[CH3])]
1854
1855 The 'NonStrict' and 'Strict' SMARTS patterns are available in RDKit. The
1856 'NonStrict' SMARTS pattern corresponds to original Daylight SMARTS
1857 specification for rotatable bonds. The 'SemiStrict' SMARTS pattern is
1858 derived from 'Strict' SMARTS patterns for its usage in this script.
1859
1860 You may use any arbitrary SMARTS pattern to identify rotatable bonds by
1861 choosing 'Specify' value for '-r, --rotBondsSMARTSMode' option and providing its
1862 value via '--rotBondsSMARTSPattern' option.
1863 --rotBondsSMARTSPattern <SMARTS>
1864 SMARTS pattern for identifying rotatable bonds. This option is only valid
1865 for 'Specify' value of '-r, --rotBondsSMARTSMode' option.
1866 -t, --torsionLibraryFile <FileName or auto> [default: auto]
1867 Specify a XML file name containing data for torsion library hierarchy
1868 or use default file, TorsionLibrary.xml, available in
1869 MAYACHEMTOOLS/lib/Python/TorsionAlerts directory.
1870
1871 The format of data in local XML file must match format of the data in Torsion
1872 Library [ Ref 146, 152, 159 ] file available in MAYACHEMTOOLS directory.
1873 -w, --workingdir <dir>
1874 Location of working directory which defaults to the current directory.
1875
1876 Examples:
1877 To filter molecules containing any rotatable bonds marked with Red alerts
1878 based on torsion rules in the torsion library and write out SD files containing
1879 remaining and filtered molecules, and individual SD files for torsion rules
1880 triggering alerts along with appropriate torsion information for red alerts,
1881 type:
1882
1883 % RDKitFilterTorsionLibraryAlerts.py -i Sample3D.sdf -o Sample3DOut.sdf
1884
1885 To run the first example for only counting number of alerts without writing
1886 out any SD files, type:
1887
1888 % RDKitFilterTorsionLibraryAlerts.py -m count -i Sample3D.sdf -o
1889 Sample3DOut.sdf
1890
1891 To run the first example for filtertering molecules marked with Orange or
1892 Red alerts and write out SD files, tye:
1893
1894 % RDKitFilterTorsionLibraryAlerts.py -m Filter --alertsMode RedAndOrange
1895 -i Sample3D.sdf -o Sample3DOut.sdf
1896
1897 To run the first example for filtering molecules and writing out torsion
1898 information for all alert types to SD files, type:
1899
1900 % RDKitFilterTorsionLibraryAlerts.py --outfileAlertsMode All
1901 -i Sample3D.sdf -o Sample3DOut.sdf
1902
1903 To run the first example for filtering molecules in multiprocessing mode on
1904 all available CPUs without loading all data into memory and write out SD files,
1905 type:
1906
1907 % RDKitFilterTorsionLibraryAlerts.py --mp yes -i Sample3D.sdf
1908 -o Sample3DOut.sdf
1909
1910 To run the first example for filtering molecules in multiprocessing mode on
1911 all available CPUs by loading all data into memory and write out a SD files,
1912 type:
1913
1914 % RDKitFilterTorsionLibraryAlerts.py --mp yes --mpParams
1915 "inputDataMode, InMemory" -i Sample3D.sdf -o Sample3DOut.sdf
1916
1917 To run the first example for filtering molecules in multiprocessing mode on
1918 specific number of CPUs and chunksize without loading all data into memory
1919 and write out SD files, type:
1920
1921 % RDKitFilterTorsionLibraryAlerts.py --mp yes --mpParams
1922 "inputDataMode,lazy,numProcesses,4,chunkSize,8" -i Sample3D.sdf
1923 -o Sample3DOut.sdf
1924
1925 To list information about default torsion library file without performing any
1926 filtering, type:
1927
1928 % RDKitFilterTorsionLibraryAlerts.py -l
1929
1930 To list information about a local torsion library XML file without performing
1931 any, filtering, type:
1932
1933 % RDKitFilterTorsionLibraryAlerts.py --torsionLibraryFile
1934 TorsionLibrary.xml -l
1935
1936 Author:
1937 Manish Sud (msud@san.rr.com)
1938
1939 Collaborator:
1940 Pat Walters
1941
1942 Acknowledgments:
1943 Wolfgang Guba, Patrick Penner, and Levi Pierce
1944
1945 See also:
1946 RDKitFilterChEMBLAlerts.py, RDKitFilterPAINS.py, RDKitFilterTorsionStrainEnergyAlerts.py,
1947 RDKitConvertFileFormat.py, RDKitSearchSMARTS.py
1948
1949 Copyright:
1950 Copyright (C) 2026 Manish Sud. All rights reserved.
1951
1952 This script uses the Torsion Library jointly developed by the University
1953 of Hamburg, Center for Bioinformatics, Hamburg, Germany and
1954 F. Hoffmann-La-Roche Ltd., Basel, Switzerland.
1955
1956 The functionality available in this script is implemented using RDKit, an
1957 open source toolkit for cheminformatics developed by Greg Landrum.
1958
1959 This file is part of MayaChemTools.
1960
1961 MayaChemTools is free software; you can redistribute it and/or modify it under
1962 the terms of the GNU Lesser General Public License as published by the Free
1963 Software Foundation; either version 3 of the License, or (at your option) any
1964 later version.
1965
1966 """
1967
1968 if __name__ == "__main__":
1969 main()