1 #!/bin/env python 2 # 3 # File: RDKitFilterTorsionLibraryAlerts.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Collaborator: Pat Walters 7 # 8 # Acknowledgments: Wolfgang Guba, Patrick Penner, and Levi Pierce 9 # 10 # Copyright (C) 2024 Manish Sud. All rights reserved. 11 # 12 # This script uses the Torsion Library jointly developed by the University 13 # of Hamburg, Center for Bioinformatics, Hamburg, Germany and 14 # F. Hoffmann-La-Roche Ltd., Basel, Switzerland. 15 # 16 # The functionality available in this script is implemented using RDKit, an 17 # open source toolkit for cheminformatics developed by Greg Landrum. 18 # 19 # This file is part of MayaChemTools. 20 # 21 # MayaChemTools is free software; you can redistribute it and/or modify it under 22 # the terms of the GNU Lesser General Public License as published by the Free 23 # Software Foundation; either version 3 of the License, or (at your option) any 24 # later version. 25 # 26 # MayaChemTools is distributed in the hope that it will be useful, but without 27 # any warranty; without even the implied warranty of merchantability of fitness 28 # for a particular purpose. See the GNU Lesser General Public License for more 29 # details. 30 # 31 # You should have received a copy of the GNU Lesser General Public License 32 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 33 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 34 # Boston, MA, 02111-1307, USA. 35 # 36 37 from __future__ import print_function 38 39 # Add local python path to the global path and import standard library modules... 40 import os 41 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 42 import time 43 import re 44 import glob 45 import multiprocessing as mp 46 47 # RDKit imports... 48 try: 49 from rdkit import rdBase 50 from rdkit import Chem 51 from rdkit.Chem import rdMolTransforms 52 except ImportError as ErrMsg: 53 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 54 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 55 sys.exit(1) 56 57 # MayaChemTools imports... 58 try: 59 from docopt import docopt 60 import MiscUtil 61 import RDKitUtil 62 from TorsionAlerts.TorsionLibraryAlerts import TorsionLibraryAlerts 63 except ImportError as ErrMsg: 64 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 65 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 66 sys.exit(1) 67 68 ScriptName = os.path.basename(sys.argv[0]) 69 Options = {} 70 OptionsInfo = {} 71 72 def main(): 73 """Start execution of the script.""" 74 75 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 76 77 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 78 79 # Retrieve command line arguments and options... 80 RetrieveOptions() 81 82 if Options["--list"]: 83 # Handle listing of torsion library information... 84 ProcessListTorsionLibraryOption() 85 else: 86 # Process and validate command line arguments and options... 87 ProcessOptions() 88 89 # Perform actions required by the script... 90 PerformFiltering() 91 92 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 93 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 94 95 def PerformFiltering(): 96 """Filter molecules using SMARTS torsion rules in the torsion library file.""" 97 98 # Setup a molecule reader... 99 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"]) 100 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"]) 101 102 MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount = ProcessMolecules(Mols) 103 104 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 105 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 106 MiscUtil.PrintInfo("Number of molecules failed during writing: %d" % WriteFailedCount) 107 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + WriteFailedCount)) 108 109 MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount) 110 MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount)) 111 112 def ProcessMolecules(Mols): 113 """Process and filter molecules.""" 114 115 if OptionsInfo["MPMode"]: 116 return ProcessMoleculesUsingMultipleProcesses(Mols) 117 else: 118 return ProcessMoleculesUsingSingleProcess(Mols) 119 120 def ProcessMoleculesUsingSingleProcess(Mols): 121 """Process and filter molecules using a single process.""" 122 123 # Instantiate torsion library alerts class... 124 TorsionLibraryAlertsHandle = InstantiateTorsionLibraryAlertsClass() 125 126 MiscUtil.PrintInfo("\nFiltering molecules...") 127 128 OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"] 129 130 # Set up writers... 131 OutfilesWriters = SetupOutfilesWriters() 132 133 WriterRemaining = OutfilesWriters["WriterRemaining"] 134 WriterFiltered = OutfilesWriters["WriterFiltered"] 135 WriterAlertSummary = OutfilesWriters["WriterAlertSummary"] 136 137 # Initialize alerts summary info... 138 TorsionAlertsSummaryInfo = InitializeTorsionAlertsSummaryInfo() 139 140 (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount, FilteredMolWriteCount) = [0] * 5 141 for Mol in Mols: 142 MolCount += 1 143 144 if Mol is None: 145 continue 146 147 if RDKitUtil.IsMolEmpty(Mol): 148 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % RDKitUtil.GetMolName(Mol, MolCount)) 149 continue 150 151 # Check for 3D flag... 152 if not Mol.GetConformer().Is3D(): 153 MiscUtil.PrintWarning("3D tag is not set. Ignoring molecule: %s\n" % RDKitUtil.GetMolName(Mol, MolCount)) 154 continue 155 156 ValidMolCount += 1 157 158 # Identify torsion library alerts for rotatable bonds.. 159 RotBondsAlertsStatus, RotBondsAlertsInfo = TorsionLibraryAlertsHandle.IdentifyTorsionLibraryAlertsForRotatableBonds(Mol) 160 161 TrackTorsionAlertsSummaryInfo(TorsionAlertsSummaryInfo, RotBondsAlertsInfo) 162 163 # Write out filtered and remaining molecules... 164 WriteStatus = True 165 if RotBondsAlertsStatus: 166 if OutfileFilteredMode: 167 WriteStatus = WriteMolecule(WriterFiltered, Mol, RotBondsAlertsInfo) 168 if WriteStatus: 169 FilteredMolWriteCount += 1 170 else: 171 RemainingMolCount += 1 172 WriteStatus = WriteMolecule(WriterRemaining, Mol, RotBondsAlertsInfo) 173 174 if not WriteStatus: 175 WriteFailedCount += 1 176 177 WriteTorsionAlertsSummaryInfo(WriterAlertSummary, TorsionAlertsSummaryInfo) 178 CloseOutfilesWriters(OutfilesWriters) 179 180 if FilteredMolWriteCount: 181 WriteTorsionAlertsFilteredByRulesInfo(TorsionAlertsSummaryInfo) 182 183 return (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount) 184 185 def ProcessMoleculesUsingMultipleProcesses(Mols): 186 """Process and filter molecules using multiprocessing.""" 187 188 MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...") 189 190 MPParams = OptionsInfo["MPParams"] 191 OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"] 192 193 # Instantiate torsion strain energy alerts class to list torsion library information... 194 TorsionLibraryAlertsHandle = InstantiateTorsionLibraryAlertsClass() 195 196 # Set up writers... 197 OutfilesWriters = SetupOutfilesWriters() 198 199 WriterRemaining = OutfilesWriters["WriterRemaining"] 200 WriterFiltered = OutfilesWriters["WriterFiltered"] 201 WriterAlertSummary = OutfilesWriters["WriterAlertSummary"] 202 203 # Initialize alerts summary info... 204 TorsionAlertsSummaryInfo = InitializeTorsionAlertsSummaryInfo() 205 206 # Setup data for initializing a worker process... 207 MiscUtil.PrintInfo("Encoding options info and rotatable bond pattern molecule...") 208 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo)) 209 210 # Setup a encoded mols data iterable for a worker process... 211 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols) 212 213 # Setup process pool along with data initialization for each process... 214 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 215 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 216 217 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs) 218 219 # Start processing... 220 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 221 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 222 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 223 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 224 else: 225 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 226 227 (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount, FilteredMolWriteCount) = [0] * 5 228 for Result in Results: 229 MolCount += 1 230 MolIndex, EncodedMol, RotBondsAlertsStatus, RotBondsAlertsInfo = Result 231 232 if EncodedMol is None: 233 continue 234 ValidMolCount += 1 235 236 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 237 238 TrackTorsionAlertsSummaryInfo(TorsionAlertsSummaryInfo, RotBondsAlertsInfo) 239 240 # Write out filtered and remaining molecules... 241 WriteStatus = True 242 if RotBondsAlertsStatus: 243 if OutfileFilteredMode: 244 WriteStatus = WriteMolecule(WriterFiltered, Mol, RotBondsAlertsInfo) 245 if WriteStatus: 246 FilteredMolWriteCount += 1 247 else: 248 RemainingMolCount += 1 249 WriteStatus = WriteMolecule(WriterRemaining, Mol, RotBondsAlertsInfo) 250 251 if not WriteStatus: 252 WriteFailedCount += 1 253 254 WriteTorsionAlertsSummaryInfo(WriterAlertSummary, TorsionAlertsSummaryInfo) 255 CloseOutfilesWriters(OutfilesWriters) 256 257 if FilteredMolWriteCount: 258 WriteTorsionAlertsFilteredByRulesInfo(TorsionAlertsSummaryInfo) 259 260 return (MolCount, ValidMolCount, RemainingMolCount, WriteFailedCount) 261 262 def InitializeWorkerProcess(*EncodedArgs): 263 """Initialize data for a worker process.""" 264 265 global Options, OptionsInfo 266 267 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 268 269 # Decode Options and OptionInfo... 270 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 271 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 272 273 # Instantiate torsion slibrary alerts class... 274 OptionsInfo["TorsionLibraryAlertsHandle"] = InstantiateTorsionLibraryAlertsClass(Quiet = True) 275 276 def WorkerProcess(EncodedMolInfo): 277 """Process data for a worker process.""" 278 279 MolIndex, EncodedMol = EncodedMolInfo 280 281 if EncodedMol is None: 282 return [MolIndex, None, False, None] 283 284 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 285 if RDKitUtil.IsMolEmpty(Mol): 286 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 287 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 288 return [MolIndex, None, False, None] 289 290 # Check for 3D flag... 291 if not Mol.GetConformer().Is3D(): 292 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 293 MiscUtil.PrintWarning("3D tag is not set. Ignoring molecule: %s\n" % MolName) 294 return [MolIndex, None, False, None] 295 296 # Identify torsion library alerts for rotatable bonds.. 297 TorsionTorsionLibraryAlertsHandle = OptionsInfo["TorsionLibraryAlertsHandle"] 298 RotBondsAlertsStatus, RotBondsAlertsInfo = TorsionTorsionLibraryAlertsHandle.IdentifyTorsionLibraryAlertsForRotatableBonds(Mol) 299 300 return [MolIndex, EncodedMol, RotBondsAlertsStatus, RotBondsAlertsInfo] 301 302 def InitializeTorsionAlertsSummaryInfo(): 303 """Initialize torsion alerts summary.""" 304 305 if OptionsInfo["CountMode"]: 306 return None 307 308 if not OptionsInfo["TrackAlertsSummaryInfo"]: 309 return None 310 311 TorsionAlertsSummaryInfo = {} 312 TorsionAlertsSummaryInfo["RuleIDs"] = [] 313 314 for DataLabel in ["SMARTSToRuleIDs", "RuleSMARTS", "HierarchyClassName", "HierarchySubClassName", "TorsionRulePeaks", "TorsionRuleTolerances1", "TorsionRuleTolerances2", "AlertTypes", "AlertTypesMolCount"]: 315 TorsionAlertsSummaryInfo[DataLabel] = {} 316 317 return TorsionAlertsSummaryInfo 318 319 def TrackTorsionAlertsSummaryInfo(TorsionAlertsSummaryInfo, RotBondsAlertsInfo): 320 """Track torsion alerts summary information for matched torsion rules in a 321 molecule.""" 322 323 if OptionsInfo["CountMode"]: 324 return 325 326 if not OptionsInfo["TrackAlertsSummaryInfo"]: 327 return 328 329 if RotBondsAlertsInfo is None: 330 return 331 332 MolAlertsInfo = {} 333 MolAlertsInfo["RuleIDs"] = [] 334 MolAlertsInfo["AlertTypes"] = {} 335 336 for ID in RotBondsAlertsInfo["IDs"]: 337 if not RotBondsAlertsInfo["MatchStatus"][ID]: 338 continue 339 340 if OptionsInfo["OutfileAlertsOnly"]: 341 if RotBondsAlertsInfo["AlertTypes"][ID] not in OptionsInfo["SpecifiedAlertsModeList"]: 342 continue 343 344 AlertType = RotBondsAlertsInfo["AlertTypes"][ID] 345 TorsionRuleNodeID = RotBondsAlertsInfo["TorsionRuleNodeID"][ID] 346 TorsionRuleSMARTS = RotBondsAlertsInfo["TorsionRuleSMARTS"][ID] 347 348 # Track data for torsion alert summary information across molecules... 349 if TorsionRuleNodeID not in TorsionAlertsSummaryInfo["RuleSMARTS"]: 350 TorsionAlertsSummaryInfo["RuleIDs"].append(TorsionRuleNodeID) 351 TorsionAlertsSummaryInfo["SMARTSToRuleIDs"][TorsionRuleSMARTS] = TorsionRuleNodeID 352 353 TorsionAlertsSummaryInfo["RuleSMARTS"][TorsionRuleNodeID] = TorsionRuleSMARTS 354 TorsionAlertsSummaryInfo["HierarchyClassName"][TorsionRuleNodeID] = RotBondsAlertsInfo["HierarchyClassNames"][ID] 355 TorsionAlertsSummaryInfo["HierarchySubClassName"][TorsionRuleNodeID] = RotBondsAlertsInfo["HierarchySubClassNames"][ID] 356 357 TorsionAlertsSummaryInfo["TorsionRulePeaks"][TorsionRuleNodeID] = RotBondsAlertsInfo["TorsionRulePeaks"][ID] 358 TorsionAlertsSummaryInfo["TorsionRuleTolerances1"][TorsionRuleNodeID] = RotBondsAlertsInfo["TorsionRuleTolerances1"][ID] 359 TorsionAlertsSummaryInfo["TorsionRuleTolerances2"][TorsionRuleNodeID] = RotBondsAlertsInfo["TorsionRuleTolerances2"][ID] 360 361 # Initialize number of alert types across all molecules... 362 TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID] = {} 363 364 # Initialize number of molecules flagged by each alert type... 365 TorsionAlertsSummaryInfo["AlertTypesMolCount"][TorsionRuleNodeID] = {} 366 367 if AlertType not in TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID]: 368 TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID][AlertType] = 0 369 TorsionAlertsSummaryInfo["AlertTypesMolCount"][TorsionRuleNodeID][AlertType] = 0 370 371 TorsionAlertsSummaryInfo["AlertTypes"][TorsionRuleNodeID][AlertType] += 1 372 373 # Track data for torsion alert information in a molecule... 374 if TorsionRuleNodeID not in MolAlertsInfo["AlertTypes"]: 375 MolAlertsInfo["RuleIDs"].append(TorsionRuleNodeID) 376 MolAlertsInfo["AlertTypes"][TorsionRuleNodeID] = {} 377 378 if AlertType not in MolAlertsInfo["AlertTypes"][TorsionRuleNodeID]: 379 MolAlertsInfo["AlertTypes"][TorsionRuleNodeID][AlertType] = 0 380 MolAlertsInfo["AlertTypes"][TorsionRuleNodeID][AlertType] += 1 381 382 # Track number of molecules flagged by a specific torsion alert... 383 for TorsionRuleNodeID in MolAlertsInfo["RuleIDs"]: 384 for AlertType in MolAlertsInfo["AlertTypes"][TorsionRuleNodeID]: 385 if MolAlertsInfo["AlertTypes"][TorsionRuleNodeID][AlertType]: 386 TorsionAlertsSummaryInfo["AlertTypesMolCount"][TorsionRuleNodeID][AlertType] += 1 387 388 def WriteTorsionAlertsSummaryInfo(Writer, TorsionAlertsSummaryInfo): 389 """Write out torsion alerts summary informatio to a CSV file.""" 390 391 if OptionsInfo["CountMode"]: 392 return 393 394 if not OptionsInfo["OutfileSummaryMode"]: 395 return 396 397 if len(TorsionAlertsSummaryInfo["RuleIDs"]) == 0: 398 return 399 400 # Write headers... 401 QuoteValues = True 402 Values = ["TorsionRule", "TorsionPeaks", "Tolerances1", "Tolerances2", "HierarchyClass", "HierarchySubClass", "TorsionAlertTypes", "TorsionAlertCount", "TorsionAlertMolCount"] 403 Writer.write("%s\n" % MiscUtil.JoinWords(Values, ",", QuoteValues)) 404 405 SortedRuleIDs = GetSortedTorsionAlertsSummaryInfoRuleIDs(TorsionAlertsSummaryInfo) 406 407 # Write alerts information... 408 for ID in SortedRuleIDs: 409 # Remove any double quotes in SMARTS... 410 RuleSMARTS = TorsionAlertsSummaryInfo["RuleSMARTS"][ID] 411 RuleSMARTS = re.sub("\"", "", RuleSMARTS, re.I) 412 413 HierarchyClassName = TorsionAlertsSummaryInfo["HierarchyClassName"][ID] 414 HierarchySubClassName = TorsionAlertsSummaryInfo["HierarchySubClassName"][ID] 415 416 TorsionPeaks = MiscUtil.JoinWords(["%s" % Value for Value in TorsionAlertsSummaryInfo["TorsionRulePeaks"][ID]], ",") 417 TorsionRuleTolerances1 = MiscUtil.JoinWords(["%s" % Value for Value in TorsionAlertsSummaryInfo["TorsionRuleTolerances1"][ID]], ",") 418 TorsionRuleTolerances2 = MiscUtil.JoinWords(["%s" % Value for Value in TorsionAlertsSummaryInfo["TorsionRuleTolerances2"][ID]], ",") 419 420 AlertTypes = [] 421 AlertTypeCount = [] 422 AlertTypeMolCount = [] 423 for AlertType in sorted(TorsionAlertsSummaryInfo["AlertTypes"][ID]): 424 AlertTypes.append(AlertType) 425 AlertTypeCount.append("%s" % TorsionAlertsSummaryInfo["AlertTypes"][ID][AlertType]) 426 AlertTypeMolCount.append("%s" % TorsionAlertsSummaryInfo["AlertTypesMolCount"][ID][AlertType]) 427 428 Values = [RuleSMARTS, TorsionPeaks, TorsionRuleTolerances1, TorsionRuleTolerances2, HierarchyClassName, HierarchySubClassName, "%s" % MiscUtil.JoinWords(AlertTypes, ","), "%s" % (MiscUtil.JoinWords(AlertTypeCount, ",")), "%s" % (MiscUtil.JoinWords(AlertTypeMolCount, ","))] 429 Writer.write("%s\n" % MiscUtil.JoinWords(Values, ",", QuoteValues)) 430 431 def GetSortedTorsionAlertsSummaryInfoRuleIDs(TorsionAlertsSummaryInfo): 432 """Sort torsion rule IDs by alert types molecule count in descending order.""" 433 434 SortedRuleIDs = [] 435 436 RuleIDs = TorsionAlertsSummaryInfo["RuleIDs"] 437 if len(RuleIDs) == 0: 438 return SortedRuleIDs 439 440 # Setup a map from AlertTypesMolCount to IDs for sorting alerts... 441 RuleIDs = TorsionAlertsSummaryInfo["RuleIDs"] 442 MolCountMap = {} 443 for ID in RuleIDs: 444 MolCount = 0 445 for AlertType in sorted(TorsionAlertsSummaryInfo["AlertTypes"][ID]): 446 MolCount += TorsionAlertsSummaryInfo["AlertTypesMolCount"][ID][AlertType] 447 MolCountMap[ID] = MolCount 448 449 SortedRuleIDs = sorted(RuleIDs, key = lambda ID: MolCountMap[ID], reverse = True) 450 451 return SortedRuleIDs 452 453 def WriteTorsionAlertsFilteredByRulesInfo(TorsionAlertsSummaryInfo): 454 """Write out torsion alerts SD files for individual torsion rules.""" 455 456 if OptionsInfo["CountMode"]: 457 return 458 459 if not OptionsInfo["OutfilesFilteredByRulesMode"]: 460 return 461 462 if len(TorsionAlertsSummaryInfo["RuleIDs"]) == 0: 463 return 464 465 # Setup a molecule reader for filtered molecules... 466 FilteredMols = RDKitUtil.ReadMolecules(OptionsInfo["OutfileFiltered"], **OptionsInfo["InfileParams"]) 467 468 # Get torsion rule IDs for writing out filtered SD files for individual torsion alert rules... 469 TorsionRuleIDs = GetTorsionAlertsFilteredByRuleFilesRuleIDs(TorsionAlertsSummaryInfo) 470 471 # Setup writers... 472 ByRuleOutfilesWriters = SetupByRuleOutfilesWriters(TorsionRuleIDs) 473 474 for Mol in FilteredMols: 475 # Retrieve torsion alerts info... 476 TorsionAlertsInfo = RetrieveTorsionAlertsInfo(Mol, TorsionAlertsSummaryInfo) 477 if TorsionAlertsInfo is None: 478 continue 479 480 for TorsionRuleID in TorsionRuleIDs: 481 if TorsionRuleID not in TorsionAlertsInfo["RuleSMARTS"]: 482 continue 483 484 WriteMoleculeFilteredByRuleID(ByRuleOutfilesWriters[TorsionRuleID], Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo) 485 486 CloseByRuleOutfilesWriters(ByRuleOutfilesWriters) 487 488 def GetTorsionAlertsFilteredByRuleFilesRuleIDs(TorsionAlertsSummaryInfo): 489 """Get torsion rule IDs for writing out individual SD files filtered by torsion alert rules.""" 490 491 # Get torsion rule IDs triggering torsion alerts sorted in the order from the most to 492 # the least number of unique molecules... 493 RuleIDs = GetSortedTorsionAlertsSummaryInfoRuleIDs(TorsionAlertsSummaryInfo) 494 495 # Select torsion rule IDs for writing out SD files... 496 if not OptionsInfo["OutfilesFilteredByRulesAllMode"]: 497 MaxRuleIDs = OptionsInfo["OutfilesFilteredByRulesMaxCount"] 498 if MaxRuleIDs < len(RuleIDs): 499 RuleIDs = RuleIDs[0:MaxRuleIDs] 500 501 return RuleIDs 502 503 def RetrieveTorsionAlertsInfo(Mol, TorsionAlertsSummaryInfo): 504 """Parse torsion alerts data field value to retrieve alerts information for rotatable bonds.""" 505 506 TorsionAlertsLabel = OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"] 507 TorsionAlerts = Mol.GetProp(TorsionAlertsLabel) if Mol.HasProp(TorsionAlertsLabel) else None 508 509 if TorsionAlerts is None or len(TorsionAlerts) == 0: 510 return None 511 512 # Initialize for tracking by rule IDs... 513 TorsionAlertsInfo = {} 514 TorsionAlertsInfo["RuleIDs"] = [] 515 516 for DataLabel in ["RuleSMARTS", "HierarchyClassName", "HierarchySubClassName", "TorsionRulePeaks", "TorsionRuleTolerances1", "TorsionRuleTolerances2", "AlertTypes", "AtomIndices", "TorsionAtomIndices", "TorsionAngles", "TorsionAngleViolations", "AlertTypesCount"]: 517 TorsionAlertsInfo[DataLabel] = {} 518 519 ValuesDelimiter = OptionsInfo["IntraSetValuesDelim"] 520 TorsionAlertsSetSize = 11 521 522 TorsionAlertsWords = TorsionAlerts.split() 523 if len(TorsionAlertsWords) % TorsionAlertsSetSize: 524 MiscUtil.PrintError("The number of space delimited values, %s, for TorsionAlerts data field in filtered SD file must be a multiple of %s." % (len(TorsionAlertsWords), TorsionAlertsSetSize)) 525 526 ID = 0 527 for Index in range(0, len(TorsionAlertsWords), TorsionAlertsSetSize): 528 ID += 1 529 530 RotBondIndices, TorsionAlertType, TorsionIndices, TorsionAngle, TorsionAngleViolation, HierarchyClass, HierarchySubClass, TorsionPeaks, Tolerances1, Tolerances2, TorsionRule = TorsionAlertsWords[Index: Index + TorsionAlertsSetSize] 531 RotBondIndices = RotBondIndices.split(ValuesDelimiter) 532 TorsionIndices = TorsionIndices.split(ValuesDelimiter) 533 TorsionPeaks = TorsionPeaks.split(ValuesDelimiter) 534 Tolerances1 = Tolerances1.split(ValuesDelimiter) 535 Tolerances2 = Tolerances2.split(ValuesDelimiter) 536 537 if TorsionRule not in TorsionAlertsSummaryInfo["SMARTSToRuleIDs"]: 538 MiscUtil.PrintWarning("The SMARTS pattern, %s, for TorsionAlerts data field in filtered SD file doesn't map to any torsion rule..." % TorsionRule) 539 continue 540 TorsionRuleNodeID = TorsionAlertsSummaryInfo["SMARTSToRuleIDs"][TorsionRule] 541 542 # Track data for torsion alerts in a molecule... 543 if TorsionRuleNodeID not in TorsionAlertsInfo["RuleSMARTS"]: 544 TorsionAlertsInfo["RuleIDs"].append(TorsionRuleNodeID) 545 546 TorsionAlertsInfo["RuleSMARTS"][TorsionRuleNodeID] = TorsionRule 547 TorsionAlertsInfo["HierarchyClassName"][TorsionRuleNodeID] = HierarchyClass 548 TorsionAlertsInfo["HierarchySubClassName"][TorsionRuleNodeID] = HierarchySubClass 549 TorsionAlertsInfo["TorsionRulePeaks"][TorsionRuleNodeID] = TorsionPeaks 550 TorsionAlertsInfo["TorsionRuleTolerances1"][TorsionRuleNodeID] = Tolerances1 551 TorsionAlertsInfo["TorsionRuleTolerances2"][TorsionRuleNodeID] = Tolerances2 552 553 TorsionAlertsInfo["AlertTypes"][TorsionRuleNodeID] = [] 554 TorsionAlertsInfo["AtomIndices"][TorsionRuleNodeID] = [] 555 TorsionAlertsInfo["TorsionAtomIndices"][TorsionRuleNodeID] = [] 556 TorsionAlertsInfo["TorsionAngles"][TorsionRuleNodeID] = [] 557 TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleNodeID] = [] 558 559 TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID] = {} 560 561 # Track multiple values for a rule ID... 562 TorsionAlertsInfo["AlertTypes"][TorsionRuleNodeID].append(TorsionAlertType) 563 TorsionAlertsInfo["AtomIndices"][TorsionRuleNodeID].append(RotBondIndices) 564 TorsionAlertsInfo["TorsionAtomIndices"][TorsionRuleNodeID].append(TorsionIndices) 565 TorsionAlertsInfo["TorsionAngles"][TorsionRuleNodeID].append(TorsionAngle) 566 TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleNodeID].append(TorsionAngleViolation) 567 568 # Count alert type for a rule ID... 569 if TorsionAlertType not in TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID]: 570 TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID][TorsionAlertType] = 0 571 TorsionAlertsInfo["AlertTypesCount"][TorsionRuleNodeID][TorsionAlertType] += 1 572 573 return TorsionAlertsInfo 574 575 def WriteMolecule(Writer, Mol, RotBondsAlertsInfo): 576 """Write out molecule.""" 577 578 if OptionsInfo["CountMode"]: 579 return True 580 581 SetupMolPropertiesForAlertsInformation(Mol, RotBondsAlertsInfo) 582 583 try: 584 Writer.write(Mol) 585 except Exception as ErrMsg: 586 MiscUtil.PrintWarning("Failed to write molecule %s:\n%s\n" % (RDKitUtil.GetMolName(Mol), ErrMsg)) 587 return False 588 589 return True 590 591 def SetupMolPropertiesForAlertsInformation(Mol, RotBondsAlertsInfo): 592 """Setup molecule properties containing alerts information for rotatable bonds.""" 593 594 if not OptionsInfo["OutfileAlerts"]: 595 return 596 597 # Setup rotatable bonds count... 598 RotBondsCount = 0 599 if RotBondsAlertsInfo is not None: 600 RotBondsCount = len(RotBondsAlertsInfo["IDs"]) 601 Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["RotBondsCountLabel"], "%s" % RotBondsCount) 602 603 # Setup alert counts for rotatable bonds... 604 AlertsCount = [] 605 if RotBondsAlertsInfo is not None: 606 for AlertType in ["Green", "Orange", "Red"]: 607 if AlertType in RotBondsAlertsInfo["Count"]: 608 AlertsCount.append("%s" % RotBondsAlertsInfo["Count"][AlertType]) 609 else: 610 AlertsCount.append("0") 611 612 if len(AlertsCount): 613 Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsCountLabel"], "%s" % MiscUtil.JoinWords(AlertsCount, " ")) 614 615 # Setup alert information for rotatable bonds... 616 AlertsInfoValues = [] 617 618 # Delimiter for multiple values corresponding to specific set of information for 619 # a rotatable bond. For example: TorsionAtomIndices 620 ValuesDelim = OptionsInfo["IntraSetValuesDelim"] 621 622 # Delimiter for various values for a rotatable bond... 623 RotBondValuesDelim = OptionsInfo["InterSetValuesDelim"] 624 625 # Delimiter for values corresponding to multiple rotatable bonds... 626 AlertsInfoValuesDelim = OptionsInfo["InterSetValuesDelim"] 627 628 if RotBondsAlertsInfo is not None: 629 for ID in RotBondsAlertsInfo["IDs"]: 630 if not RotBondsAlertsInfo["MatchStatus"][ID]: 631 continue 632 633 if OptionsInfo["OutfileAlertsOnly"]: 634 if RotBondsAlertsInfo["AlertTypes"][ID] not in OptionsInfo["SpecifiedAlertsModeList"]: 635 continue 636 637 RotBondValues = [] 638 639 # Bond atom indices... 640 Values = ["%s" % Value for Value in RotBondsAlertsInfo["AtomIndices"][ID]] 641 RotBondValues.append(ValuesDelim.join(Values)) 642 643 # Alert type... 644 RotBondValues.append(RotBondsAlertsInfo["AlertTypes"][ID]) 645 646 # Torsion atom indices... 647 TorsionAtomIndices = SetupTorsionAtomIndicesValues(RotBondsAlertsInfo["TorsionAtomIndices"][ID], ValuesDelim) 648 RotBondValues.append(TorsionAtomIndices) 649 650 # Torsion angle... 651 RotBondValues.append("%.2f" % RotBondsAlertsInfo["TorsionAngles"][ID]) 652 653 # Torsion angle violation... 654 RotBondValues.append("%.2f" % RotBondsAlertsInfo["TorsionAngleViolations"][ID]) 655 656 # Hierarchy class and subclass names... 657 RotBondValues.append("%s" % RotBondsAlertsInfo["HierarchyClassNames"][ID]) 658 RotBondValues.append("%s" % RotBondsAlertsInfo["HierarchySubClassNames"][ID]) 659 660 # Torsion rule peaks... 661 Values = ["%s" % Value for Value in RotBondsAlertsInfo["TorsionRulePeaks"][ID]] 662 RotBondValues.append(ValuesDelim.join(Values)) 663 664 # Torsion rule tolerances... 665 Values = ["%s" % Value for Value in RotBondsAlertsInfo["TorsionRuleTolerances1"][ID]] 666 RotBondValues.append(ValuesDelim.join(Values)) 667 Values = ["%s" % Value for Value in RotBondsAlertsInfo["TorsionRuleTolerances2"][ID]] 668 RotBondValues.append(ValuesDelim.join(Values)) 669 670 # Torsion rule SMARTS... 671 RotBondValues.append("%s" % RotBondsAlertsInfo["TorsionRuleSMARTS"][ID]) 672 673 # Track joined values for a rotatable bond... 674 AlertsInfoValues.append("%s" % RotBondValuesDelim.join(RotBondValues)) 675 676 if len(AlertsInfoValues): 677 Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"], "%s" % ("%s" % AlertsInfoValuesDelim.join(AlertsInfoValues))) 678 679 def WriteMoleculeFilteredByRuleID(Writer, Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo): 680 """Write out molecule.""" 681 682 if OptionsInfo["CountMode"]: 683 return 684 685 SetupMolPropertiesForFilteredByRuleIDAlertsInformation(Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo) 686 687 Writer.write(Mol) 688 689 def SetupMolPropertiesForFilteredByRuleIDAlertsInformation(Mol, TorsionRuleID, TorsionAlertsSummaryInfo, TorsionAlertsInfo): 690 """Setup molecule properties containing alerts information for torsion alerts 691 fileted by Rule IDs.""" 692 693 # Delete torsion alerts information for rotatable bonds... 694 if Mol.HasProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"]): 695 Mol.ClearProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionAlertsLabel"]) 696 697 # Delimiter for values... 698 IntraSetValuesDelim = OptionsInfo["IntraSetValuesDelim"] 699 InterSetValuesDelim = OptionsInfo["InterSetValuesDelim"] 700 701 # Setup alert rule information... 702 AlertRuleInfoValues = [] 703 704 AlertRuleInfoValues.append("%s" % TorsionAlertsInfo["HierarchyClassName"][TorsionRuleID]) 705 AlertRuleInfoValues.append("%s" % TorsionAlertsInfo["HierarchySubClassName"][TorsionRuleID]) 706 707 Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionRulePeaks"][TorsionRuleID]] 708 AlertRuleInfoValues.append(IntraSetValuesDelim.join(Values)) 709 710 Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionRuleTolerances1"][TorsionRuleID]] 711 AlertRuleInfoValues.append(IntraSetValuesDelim.join(Values)) 712 Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionRuleTolerances2"][TorsionRuleID]] 713 AlertRuleInfoValues.append(IntraSetValuesDelim.join(Values)) 714 715 AlertRuleInfoValues.append("%s" % TorsionAlertsInfo["RuleSMARTS"][TorsionRuleID]) 716 717 Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleLabel"], "%s" % ("%s" % InterSetValuesDelim.join(AlertRuleInfoValues))) 718 719 # Setup alerts count for torsion rule... 720 AlertsCount = [] 721 for AlertType in ["Green", "Orange", "Red"]: 722 if AlertType in TorsionAlertsInfo["AlertTypesCount"][TorsionRuleID]: 723 AlertsCount.append("%s" % TorsionAlertsInfo["AlertTypesCount"][TorsionRuleID][AlertType]) 724 else: 725 AlertsCount.append("0") 726 727 Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleAlertsCountLabel"], "%s" % (InterSetValuesDelim.join(AlertsCount))) 728 729 # Setup torsion rule alerts... 730 AlertsInfoValues = [] 731 for Index in range(0, len(TorsionAlertsInfo["AlertTypes"][TorsionRuleID])): 732 RotBondInfoValues = [] 733 734 # Bond atom indices... 735 Values = ["%s" % Value for Value in TorsionAlertsInfo["AtomIndices"][TorsionRuleID][Index]] 736 RotBondInfoValues.append(IntraSetValuesDelim.join(Values)) 737 738 # Alert type... 739 RotBondInfoValues.append(TorsionAlertsInfo["AlertTypes"][TorsionRuleID][Index]) 740 741 # Torsion atom indices retrieved from the filtered SD file and stored as strings... 742 Values = ["%s" % Value for Value in TorsionAlertsInfo["TorsionAtomIndices"][TorsionRuleID][Index]] 743 RotBondInfoValues.append(IntraSetValuesDelim.join(Values)) 744 745 # Torsion angle... 746 RotBondInfoValues.append(TorsionAlertsInfo["TorsionAngles"][TorsionRuleID][Index]) 747 748 # Torsion angle violation... 749 RotBondInfoValues.append(TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleID][Index]) 750 751 # Track alerts informaiton... 752 AlertsInfoValues.append("%s" % InterSetValuesDelim.join(RotBondInfoValues)) 753 754 Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleAlertsLabel"], "%s" % (InterSetValuesDelim.join(AlertsInfoValues))) 755 756 # Setup torsion rule alert max angle violation... 757 TorsionAngleViolations = [float(Angle) for Angle in TorsionAlertsInfo["TorsionAngleViolations"][TorsionRuleID]] 758 Mol.SetProp(OptionsInfo["SDFieldIDsToLabels"]["TorsionRuleMaxAngleViolationLabel"], "%.2f" % (max(TorsionAngleViolations))) 759 760 def SetupTorsionAtomIndicesValues(TorsionAtomIndicesList, ValuesDelim): 761 """Setup torsion atom indices value for output files.""" 762 763 # Check for any list values in the list of torsion atom indices used as placeholders 764 # for positions of lone pairs in torsion rules containing N_lp... 765 TorsionAtomsInfo = [] 766 for Value in TorsionAtomIndicesList: 767 if type(Value) is list: 768 TorsionAtomsInfo.append("N_lp") 769 else: 770 TorsionAtomsInfo.append(Value) 771 772 Values = ["%s" % Value for Value in TorsionAtomsInfo] 773 774 return ValuesDelim.join(Values) 775 776 def SetupOutfilesWriters(): 777 """Setup molecule and summary writers.""" 778 779 OutfilesWriters = {"WriterRemaining": None, "WriterFiltered": None, "WriterAlertSummary": None} 780 781 # Writers for SD files... 782 WriterRemaining, WriterFiltered = SetupMoleculeWriters() 783 OutfilesWriters["WriterRemaining"] = WriterRemaining 784 OutfilesWriters["WriterFiltered"] = WriterFiltered 785 786 # Writer for alert summary CSV file... 787 WriterAlertSummary = SetupAlertSummaryWriter() 788 OutfilesWriters["WriterAlertSummary"] = WriterAlertSummary 789 790 return OutfilesWriters 791 792 def SetupMoleculeWriters(): 793 """Setup molecule writers.""" 794 795 Writer = None 796 WriterFiltered = None 797 798 if OptionsInfo["CountMode"]: 799 return (Writer, WriterFiltered) 800 801 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 802 if Writer is None: 803 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 804 MiscUtil.PrintInfo("\nGenerating file %s..." % OptionsInfo["Outfile"]) 805 806 if OptionsInfo["OutfileFilteredMode"]: 807 WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"]) 808 if WriterFiltered is None: 809 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"]) 810 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"]) 811 812 return (Writer, WriterFiltered) 813 814 def SetupAlertSummaryWriter(): 815 """Setup a alert summary writer.""" 816 817 Writer = None 818 819 if OptionsInfo["CountMode"]: 820 return Writer 821 822 if not OptionsInfo["OutfileSummaryMode"]: 823 return Writer 824 825 Outfile = OptionsInfo["OutfileSummary"] 826 Writer = open(Outfile, "w") 827 if Writer is None: 828 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 829 830 MiscUtil.PrintInfo("Generating file %s..." % Outfile) 831 832 return Writer 833 834 def CloseOutfilesWriters(OutfilesWriters): 835 """Close outfile writers.""" 836 837 for WriterType, Writer in OutfilesWriters.items(): 838 if Writer is not None: 839 Writer.close() 840 841 def SetupByRuleOutfilesWriters(RuleIDs): 842 """Setup by rule outfiles writers.""" 843 844 # Initialize... 845 OutfilesWriters = {} 846 for RuleID in RuleIDs: 847 OutfilesWriters[RuleID] = None 848 849 if OptionsInfo["CountMode"]: 850 return OutfilesWriters 851 852 if not OptionsInfo["OutfilesFilteredByRulesMode"]: 853 return OutfilesWriters 854 855 FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"]) 856 OutfilesRoot = "%s_Filtered_TopRule" % FileName 857 OutfilesExt = "sdf" 858 859 MsgTxt = "all" if OptionsInfo["OutfilesFilteredByRulesAllMode"] else "top %s" % OptionsInfo["OutfilesFilteredByRulesMaxCount"] 860 MiscUtil.PrintInfo("\nGenerating output files %s*.%s for %s torsion rules triggering alerts..." % (OutfilesRoot, OutfilesExt, MsgTxt)) 861 862 # Delete any existing output files... 863 Outfiles = glob.glob("%s*.%s" % (OutfilesRoot, OutfilesExt)) 864 if len(Outfiles): 865 MiscUtil.PrintInfo("Deleting existing output files %s*.%s..." % (OutfilesRoot, OutfilesExt)) 866 for Outfile in Outfiles: 867 try: 868 os.remove(Outfile) 869 except Exception as ErrMsg: 870 MiscUtil.PrintWarning("Failed to delete file: %s" % ErrMsg) 871 872 RuleIndex = 0 873 for RuleID in RuleIDs: 874 RuleIndex += 1 875 Outfile = "%s%s.%s" % (OutfilesRoot, RuleIndex, OutfilesExt) 876 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"]) 877 if Writer is None: 878 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 879 880 OutfilesWriters[RuleID] = Writer 881 882 return OutfilesWriters 883 884 def CloseByRuleOutfilesWriters(OutfilesWriters): 885 """Close by rule outfile writers.""" 886 887 for RuleID, Writer in OutfilesWriters.items(): 888 if Writer is not None: 889 Writer.close() 890 891 def InstantiateTorsionLibraryAlertsClass(Quiet = False): 892 """Initialize torsion library alerts class.""" 893 894 try: 895 TorsionLibraryAlertsHandle = TorsionLibraryAlerts(AlertsMode = OptionsInfo["AlertsMode"], MinAlertsCount = OptionsInfo["MinAlertsCount"], NitrogenLonePairAllowHydrogenNbrs = OptionsInfo["NitrogenLonePairParams"]["AllowHydrogenNbrs"], NitrogenLonePairPlanarityTolerance = OptionsInfo["NitrogenLonePairParams"]["PlanarityTolerance"], RotBondsSMARTSMode = OptionsInfo["RotBondsSMARTSMode"], RotBondsSMARTSPattern = OptionsInfo["RotBondsSMARTSPattern"], TorsionLibraryFilePath = OptionsInfo["TorsionLibraryFile"]) 896 except Exception as ErrMsg: 897 MiscUtil.PrintError("Failed to instantiate TorsionLibraryAlerts:\n%s\n" % (ErrMsg)) 898 899 if not Quiet: 900 MiscUtil.PrintInfo("\nRetrieving data from library file %s..." % TorsionLibraryAlertsHandle.GetTorsionLibraryFilePath()) 901 TorsionLibraryAlertsHandle.ListTorsionLibraryInfo() 902 903 return TorsionLibraryAlertsHandle 904 905 906 def ProcessRotatableBondsSMARTSMode(): 907 """"Process SMARTS pattern for rotatable bonds.""" 908 909 RotBondsMode = OptionsInfo["RotBondsSMARTSMode"] 910 911 RotBondsSMARTSPattern = None 912 RotBondsSMARTSPatternSpecified = OptionsInfo["RotBondsSMARTSPatternSpecified"] 913 914 if re.match("^(NonStrict|SemiStrict|Strict)$", RotBondsMode, re.I): 915 RotBondsSMARTSPattern = None 916 elif re.match("Specify", RotBondsMode, re.I): 917 RotBondsSMARTSPatternSpecified = RotBondsSMARTSPatternSpecified.strip() 918 if not len(RotBondsSMARTSPatternSpecified): 919 MiscUtil.PrintError("Empty value specified for SMILES/SMARTS pattern in \"--rotBondsSMARTSPattern\" option, %s." % RotBondsMode) 920 921 RotBondsPatternMol = Chem.MolFromSmarts(RotBondsSMARTSPatternSpecified) 922 if RotBondsPatternMol is None: 923 MiscUtil.PrintError("Failed to create rotatable bonds pattern molecule. The rotatable bonds SMARTS pattern, \"%s\", specified using \"--rotBondsSMARTSPattern\" option is not valid." % (RotBondsSMARTSPatternSpecified)) 924 else: 925 MiscUtil.PrintError("The value, %s, specified for option \"-r, --rotBondsSMARTSMode\" is not valid. " % RotBondsMode) 926 927 OptionsInfo["RotBondsSMARTSPattern"] = RotBondsSMARTSPattern 928 929 def ProcessSDFieldLabelsOption(): 930 """Process SD data field label option.""" 931 932 ParamsOptionName = "--outfileSDFieldLabels" 933 ParamsOptionValue = Options["--outfileSDFieldLabels"] 934 935 ParamsIDsToLabels = {"RotBondsCountLabel": "RotBondsCount", "TorsionAlertsCountLabel": "TorsionAlertsCount (Green Orange Red)", "TorsionAlertsLabel": "TorsionAlerts (RotBondIndices TorsionAlert TorsionIndices TorsionAngle TorsionAngleViolation HierarchyClass HierarchySubClass TorsionPeaks Tolerances1 Tolerances2 TorsionRule)", "TorsionRuleLabel": "TorsionRule (HierarchyClass HierarchySubClass TorsionPeaks Tolerances1 Tolerances2 TorsionRule)", "TorsionRuleAlertsCountLabel": "TorsionRuleAlertsCount (Green Orange Red)", "TorsionRuleAlertsLabel": "TorsionRuleAlerts (RotBondIndices TorsionAlert TorsionIndices TorsionAngle TorsionAngleViolation)", "TorsionRuleMaxAngleViolationLabel": "TorsionRuleMaxAngleViolation"} 936 937 if re.match("^auto$", ParamsOptionValue, re.I): 938 OptionsInfo["SDFieldIDsToLabels"] = ParamsIDsToLabels 939 return 940 941 # Setup a canonical paramater names... 942 ValidParamNames = [] 943 CanonicalParamNamesMap = {} 944 for ParamName in sorted(ParamsIDsToLabels): 945 ValidParamNames.append(ParamName) 946 CanonicalParamNamesMap[ParamName.lower()] = ParamName 947 948 ParamsOptionValue = ParamsOptionValue.strip() 949 if not ParamsOptionValue: 950 PrintError("No valid parameter name and value pairs specified using \"%s\" option" % ParamsOptionName) 951 952 ParamsOptionValueWords = ParamsOptionValue.split(",") 953 if len(ParamsOptionValueWords) % 2: 954 MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"%s\" option must be an even number." % (len(ParamsOptionValueWords), ParamsOptionName)) 955 956 # Validate paramater name and value pairs... 957 for Index in range(0, len(ParamsOptionValueWords), 2): 958 Name = ParamsOptionValueWords[Index].strip() 959 Value = ParamsOptionValueWords[Index + 1].strip() 960 961 CanonicalName = Name.lower() 962 if not CanonicalName in CanonicalParamNamesMap: 963 MiscUtil.PrintError("The parameter name, %s, specified using \"%s\" is not a valid name. Supported parameter names: %s" % (Name, ParamsOptionName, " ".join(ValidParamNames))) 964 965 ParamName = CanonicalParamNamesMap[CanonicalName] 966 ParamValue = Value 967 968 # Set value... 969 ParamsIDsToLabels[ParamName] = ParamValue 970 971 OptionsInfo["SDFieldIDsToLabels"] = ParamsIDsToLabels 972 973 def ProcessOptionNitrogenLonePairParameters(): 974 """Process nitrogen lone pair parameters option.""" 975 976 ParamsOptionName = "--nitrogenLonePairParams" 977 ParamsOptionValue = Options["--nitrogenLonePairParams"] 978 979 ParamsInfo = {"AllowHydrogenNbrs": True, "PlanarityTolerance": 1.0,} 980 981 if re.match("^auto$", ParamsOptionValue, re.I): 982 OptionsInfo["NitrogenLonePairParams"] = ParamsInfo 983 return 984 985 # Setup a canonical paramater names... 986 ValidParamNames = [] 987 CanonicalParamNamesMap = {} 988 for ParamName in sorted(ParamsInfo): 989 ValidParamNames.append(ParamName) 990 CanonicalParamNamesMap[ParamName.lower()] = ParamName 991 992 ParamsOptionValue = ParamsOptionValue.strip() 993 if not ParamsOptionValue: 994 PrintError("No valid parameter name and value pairs specified using \"%s\" option" % ParamsOptionName) 995 996 ParamsOptionValueWords = ParamsOptionValue.split(",") 997 if len(ParamsOptionValueWords) % 2: 998 MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"%s\" option must be an even number." % (len(ParamsOptionValueWords), ParamsOptionName)) 999 1000 # Validate paramater name and value pairs... 1001 for Index in range(0, len(ParamsOptionValueWords), 2): 1002 Name = ParamsOptionValueWords[Index].strip() 1003 Value = ParamsOptionValueWords[Index + 1].strip() 1004 1005 CanonicalName = Name.lower() 1006 if not CanonicalName in CanonicalParamNamesMap: 1007 MiscUtil.PrintError("The parameter name, %s, specified using \"%s\" is not a valid name. Supported parameter names: %s" % (Name, ParamsOptionName, " ".join(ValidParamNames))) 1008 1009 ParamName = CanonicalParamNamesMap[CanonicalName] 1010 ParamValue = Value 1011 1012 if re.match("^PlanarityTolerance$", ParamName, re.I): 1013 Value = float(Value) 1014 if Value < 0: 1015 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: >= 0" % (Value, Name, ParamsOptionName)) 1016 ParamValue = Value 1017 elif re.match("^AllowHydrogenNbrs$", ParamName, re.I): 1018 if not re.match("^(yes|no)$", Value, re.I): 1019 MiscUtil.PrintError("The parameter value, %s, specified for parameter name, %s, using \"%s\" option is not a valid value. Supported values: yes or no" % (Value, Name, ParamsOptionName)) 1020 ParamValue = True if re.match("^yes$", Value, re.I) else False 1021 1022 # Set value... 1023 ParamsInfo[ParamName] = ParamValue 1024 1025 OptionsInfo["NitrogenLonePairParams"] = ParamsInfo 1026 1027 def ProcessOptions(): 1028 """Process and validate command line arguments and options.""" 1029 1030 MiscUtil.PrintInfo("Processing options...") 1031 1032 # Validate options... 1033 ValidateOptions() 1034 1035 OptionsInfo["Infile"] = Options["--infile"] 1036 ParamsDefaultInfoOverride = {"RemoveHydrogens": False} 1037 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], InfileName = Options["--infile"], ParamsDefaultInfo = ParamsDefaultInfoOverride) 1038 1039 OptionsInfo["Outfile"] = Options["--outfile"] 1040 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 1041 1042 FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"]) 1043 OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt) 1044 OptionsInfo["OutfileFiltered"] = OutfileFiltered 1045 OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False 1046 1047 OutfileSummary = "%s_AlertsSummary.csv" % (FileName) 1048 OptionsInfo["OutfileSummary"] = OutfileSummary 1049 OptionsInfo["OutfileSummaryMode"] = True if re.match("^yes$", Options["--outfileSummary"], re.I) else False 1050 1051 OptionsInfo["OutfilesFilteredByRulesMode"] = True if re.match("^yes$", Options["--outfilesFilteredByRules"], re.I) else False 1052 OptionsInfo["TrackAlertsSummaryInfo"] = True if (OptionsInfo["OutfileSummaryMode"] or OptionsInfo["OutfilesFilteredByRulesMode"]) else False 1053 1054 OutfilesFilteredByRulesMaxCount = Options["--outfilesFilteredByRulesMaxCount"] 1055 if not re.match("^All$", OutfilesFilteredByRulesMaxCount, re.I): 1056 OutfilesFilteredByRulesMaxCount = int(OutfilesFilteredByRulesMaxCount) 1057 OptionsInfo["OutfilesFilteredByRulesMaxCount"] = OutfilesFilteredByRulesMaxCount 1058 OptionsInfo["OutfilesFilteredByRulesAllMode"] = True if re.match("^All$", Options["--outfilesFilteredByRulesMaxCount"], re.I) else False 1059 1060 OptionsInfo["OutfileAlerts"] = True if re.match("^yes$", Options["--outfileAlerts"], re.I) else False 1061 1062 if re.match("^yes$", Options["--outfilesFilteredByRules"], re.I): 1063 if not re.match("^yes$", Options["--outfileAlerts"], re.I): 1064 MiscUtil.PrintError("The value \"%s\" specified for \"--outfilesFilteredByRules\" option is not valid. The specified value is only allowed during \"yes\" value of \"--outfileAlerts\" option." % (Options["--outfilesFilteredByRules"])) 1065 1066 OptionsInfo["OutfileAlertsMode"] = Options["--outfileAlertsMode"] 1067 OptionsInfo["OutfileAlertsOnly"] = True if re.match("^AlertsOnly$", Options["--outfileAlertsMode"], re.I) else False 1068 1069 ProcessSDFieldLabelsOption() 1070 1071 OptionsInfo["Overwrite"] = Options["--overwrite"] 1072 OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False 1073 1074 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 1075 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 1076 1077 ProcessOptionNitrogenLonePairParameters() 1078 1079 OptionsInfo["AlertsMode"] = Options["--alertsMode"] 1080 OptionsInfo["SpecifiedAlertsModeList"] = [] 1081 if re.match("^Red$", Options["--alertsMode"], re.I): 1082 OptionsInfo["SpecifiedAlertsModeList"].append("Red") 1083 elif re.match("^RedAndOrange$", Options["--alertsMode"], re.I): 1084 OptionsInfo["SpecifiedAlertsModeList"].append("Red") 1085 OptionsInfo["SpecifiedAlertsModeList"].append("Orange") 1086 1087 OptionsInfo["MinAlertsCount"] = int(Options["--alertsMinCount"]) 1088 1089 OptionsInfo["RotBondsSMARTSMode"] = Options["--rotBondsSMARTSMode"] 1090 OptionsInfo["RotBondsSMARTSPatternSpecified"] = Options["--rotBondsSMARTSPattern"] 1091 ProcessRotatableBondsSMARTSMode() 1092 1093 OptionsInfo["TorsionLibraryFile"] = Options["--torsionLibraryFile"] 1094 1095 # Setup delimiter for writing out torsion alert information to output files... 1096 OptionsInfo["IntraSetValuesDelim"] = "," 1097 OptionsInfo["InterSetValuesDelim"] = " " 1098 1099 def RetrieveOptions(): 1100 """Retrieve command line arguments and options.""" 1101 1102 # Get options... 1103 global Options 1104 Options = docopt(_docoptUsage_) 1105 1106 # Set current working directory to the specified directory... 1107 WorkingDir = Options["--workingdir"] 1108 if WorkingDir: 1109 os.chdir(WorkingDir) 1110 1111 # Handle examples option... 1112 if "--examples" in Options and Options["--examples"]: 1113 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 1114 sys.exit(0) 1115 1116 def ProcessListTorsionLibraryOption(): 1117 """Process list torsion library information.""" 1118 1119 # Validate and process dataFile option for listing torsion library information... 1120 OptionsInfo["TorsionLibraryFile"] = Options["--torsionLibraryFile"] 1121 if not re.match("^auto$", Options["--torsionLibraryFile"], re.I): 1122 MiscUtil.ValidateOptionFilePath("-t, --torsionLibraryFile", Options["--torsionLibraryFile"]) 1123 1124 # Instantiate TorsionLibraryAlerts using defaults... 1125 TorsionLibraryAlertsHandle = TorsionLibraryAlerts(TorsionLibraryFilePath = OptionsInfo["TorsionLibraryFile"]) 1126 MiscUtil.PrintInfo("\nRetrieving data from torsion library file %s..." % TorsionLibraryAlertsHandle.GetTorsionLibraryFilePath()) 1127 TorsionLibraryAlertsHandle.ListTorsionLibraryInfo() 1128 1129 def ValidateOptions(): 1130 """Validate option values.""" 1131 1132 MiscUtil.ValidateOptionTextValue("-a, --alertsMode", Options["--alertsMode"], "Red RedAndOrange") 1133 MiscUtil.ValidateOptionIntegerValue("--alertsMinCount", Options["--alertsMinCount"], {">=": 1}) 1134 1135 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 1136 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol") 1137 1138 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd") 1139 if re.match("^filter$", Options["--mode"], re.I): 1140 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 1141 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 1142 1143 MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no") 1144 1145 MiscUtil.ValidateOptionTextValue("--outfilesFilteredByRules", Options["--outfilesFilteredByRules"], "yes no") 1146 if not re.match("^All$", Options["--outfilesFilteredByRulesMaxCount"], re.I): 1147 MiscUtil.ValidateOptionIntegerValue("--outfilesFilteredByRulesMaxCount", Options["--outfilesFilteredByRulesMaxCount"], {">": 0}) 1148 1149 MiscUtil.ValidateOptionTextValue("--outfileSummary", Options["--outfileSummary"], "yes no") 1150 MiscUtil.ValidateOptionTextValue("--outfileAlerts", Options["--outfileAlerts"], "yes no") 1151 MiscUtil.ValidateOptionTextValue("--outfileAlertsMode", Options["--outfileAlertsMode"], "All AlertsOnly") 1152 1153 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count") 1154 if re.match("^filter$", Options["--mode"], re.I): 1155 if not Options["--outfile"]: 1156 MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, --mode\" option") 1157 1158 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 1159 1160 MiscUtil.ValidateOptionTextValue("-r, --rotBondsSMARTSMode", Options["--rotBondsSMARTSMode"], "NonStrict SemiStrict Strict Specify") 1161 if re.match("^Specify$", Options["--rotBondsSMARTSMode"], re.I): 1162 if not Options["--rotBondsSMARTSPattern"]: 1163 MiscUtil.PrintError("The SMARTS pattern must be specified using \"--rotBondsSMARTSPattern\" during \"Specify\" value of \"-r, --rotBondsSMARTS\" option") 1164 1165 if not re.match("^auto$", Options["--torsionLibraryFile"], re.I): 1166 MiscUtil.ValidateOptionFilePath("-t, --torsionLibraryFile", Options["--torsionLibraryFile"]) 1167 1168 # Setup a usage string for docopt... 1169 _docoptUsage_ = """ 1170 RDKitFilterTorsionLibraryAlerts.py - Filter torsion library alerts 1171 1172 Usage: 1173 RDKitFilterTorsionLibraryAlerts.py [--alertsMode <Red, RedAndOrange>] [--alertsMinCount <Number>] 1174 [--infileParams <Name,Value,...>] [--mode <filter or count>] [--mp <yes or no>] [--mpParams <Name,Value,...>] 1175 [--nitrogenLonePairParams <Name,Value,...>] [--outfileAlerts <yes or no>] 1176 [--outfileAlertsMode <All or AlertsOnly>] [--outfileFiltered <yes or no>] 1177 [--outfilesFilteredByRules <yes or no>] [--outfilesFilteredByRulesMaxCount <All or number>] 1178 [--outfileSummary <yes or no>] [--outfileSDFieldLabels <Type,Label,...>] 1179 [--outfileParams <Name,Value,...>] [--overwrite] [ --rotBondsSMARTSMode <NonStrict, SemiStrict,...>] 1180 [--rotBondsSMARTSPattern <SMARTS>] [--torsionLibraryFile <FileName or auto>] [-w <dir>] -i <infile> -o <outfile> 1181 RDKitFilterTorsionLibraryAlerts.py [--torsionLibraryFile <FileName or auto>] -l | --list 1182 RDKitFilterTorsionLibraryAlerts.py -h | --help | -e | --examples 1183 1184 Description: 1185 Filter strained molecules from an input file for torsion library [ Ref 146, 152, 159 ] 1186 alerts by matching rotatable bonds against SMARTS patterns specified for torsion 1187 rules in a torsion library file and write out appropriate molecules to output 1188 files. The molecules must have 3D coordinates in input file. 1189 1190 The default torsion library file, TorsionLibrary.xml, is available under 1191 MAYACHEMTOOLS/lib/python/TorsionAlerts directory. 1192 1193 The data in torsion library file is organized in a hierarchical manner. It consists 1194 of one generic class and six specific classes at the highest level. Each class 1195 contains multiple subclasses corresponding to named functional groups or 1196 substructure patterns. The subclasses consist of torsion rules sorted from 1197 specific to generic torsion patterns. The torsion rule, in turn, contains a list 1198 of peak values for torsion angles and two tolerance values. A pair of tolerance 1199 values define torsion bins around a torsion peak value. For example: 1200 1201 <library> 1202 <hierarchyClass name="GG" id1="G" id2="G"> 1203 ... 1204 </hierarchyClass> 1205 <hierarchyClass name="CO" id1="C" id2="O"> 1206 <hierarchySubClass name="Ester bond I" smarts="O=[C:2][O:3]"> 1207 <torsionRule smarts="[O:1]=[C:2]!@[O:3]~[CH0:4]"> 1208 <angleList> 1209 <angle value="0.0" tolerance1="20.00" 1210 tolerance2="25.00" score="56.52"/> 1211 </angleList> 1212 </torsionRule> 1213 ... 1214 ... 1215 ... 1216 </hierarchyClass> 1217 <hierarchyClass name="NC" id1="N" id2="C"> 1218 ... 1219 </hierarchyClass> 1220 <hierarchyClass name="SN" id1="S" id2="N"> 1221 ... 1222 </hierarchyClass> 1223 <hierarchyClass name="CS" id1="C" id2="S"> 1224 ... 1225 </hierarchyClass> 1226 <hierarchyClass name="CC" id1="C" id2="C"> 1227 ... 1228 </hierarchyClass> 1229 <hierarchyClass name="SS" id1="S" id2="S"> 1230 ... 1231 </hierarchyClass> 1232 </library> 1233 1234 The rotatable bonds in a 3D molecule are identified using a default SMARTS pattern. 1235 A custom SMARTS pattern may be optionally specified to detect rotatable bonds. 1236 Each rotatable bond is matched to a torsion rule in the torsion library and 1237 assigned one of the following three alert categories: Green, Orange or Red. The 1238 rotatable bond is marked Green or Orange for the measured angle of the torsion 1239 pattern within the first or second tolerance bins around a torsion peak. 1240 Otherwise, it's marked Red implying that the measured angle is not observed in 1241 the structure databases employed to generate the torsion library. 1242 1243 The following output files are generated after the filtering: 1244 1245 <OutfileRoot>.sdf 1246 <OutfileRoot>_Filtered.sdf 1247 <OutfileRoot>_AlertsSummary.csv 1248 <OutfileRoot>_Filtered_TopRule*.sdf 1249 1250 The supported input file formats are: Mol (.mol), SD (.sdf, .sd) 1251 1252 The supported output file formats are: SD (.sdf, .sd) 1253 1254 Options: 1255 -a, --alertsMode <Red, RedAndOrange> [default: Red] 1256 Torsion library alert types to use for filtering molecules containing 1257 rotatable bonds marked with Green, Orange, or Red alerts. Possible 1258 values: Red or RedAndOrange. 1259 --alertsMinCount <Number> [default: 1] 1260 Minimum number of rotatable bond alerts in a molecule for filtering the 1261 molecule. 1262 -e, --examples 1263 Print examples. 1264 -h, --help 1265 Print this help message. 1266 -i, --infile <infile> 1267 Input file name. 1268 --infileParams <Name,Value,...> [default: auto] 1269 A comma delimited list of parameter name and value pairs for reading 1270 molecules from files. The supported parameter names for different file 1271 formats, along with their default values, are shown below: 1272 1273 SD, MOL: removeHydrogens,no,sanitize,yes,strictParsing,yes 1274 1275 -l, --list 1276 List torsion library information without performing any filtering. 1277 -m, --mode <filter or count> [default: filter] 1278 Specify whether to filter molecules for torsion library [ Ref 146, 152, 159 ] alerts 1279 by matching rotatable bonds against SMARTS patterns specified for torsion 1280 rules and write out the rest of the molecules to an outfile or simply count 1281 the number of matched molecules marked for filtering. 1282 --mp <yes or no> [default: no] 1283 Use multiprocessing. 1284 1285 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 1286 function employing lazy RDKit data iterable. This allows processing of 1287 arbitrary large data sets without any additional requirements memory. 1288 1289 All input data may be optionally loaded into memory by mp.Pool.map() 1290 before starting worker processes in a process pool by setting the value 1291 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 1292 1293 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 1294 data mode may adversely impact the performance. The '--mpParams' section 1295 provides additional information to tune the value of 'chunkSize'. 1296 --mpParams <Name,Value,...> [default: auto] 1297 A comma delimited list of parameter name and value pairs to configure 1298 multiprocessing. 1299 1300 The supported parameter names along with their default and possible 1301 values are shown below: 1302 1303 chunkSize, auto 1304 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 1305 numProcesses, auto [ Default: mp.cpu_count() ] 1306 1307 These parameters are used by the following functions to configure and 1308 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 1309 mp.Pool.imap(). 1310 1311 The chunkSize determines chunks of input data passed to each worker 1312 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 1313 The default value of chunkSize is dependent on the value of 'inputDataMode'. 1314 1315 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 1316 automatically converts RDKit data iterable into a list, loads all data into 1317 memory, and calculates the default chunkSize using the following method 1318 as shown in its code: 1319 1320 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 1321 if extra: chunkSize += 1 1322 1323 For example, the default chunkSize will be 7 for a pool of 4 worker processes 1324 and 100 data items. 1325 1326 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 1327 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 1328 data into memory. Consequently, the size of input data is not known a priori. 1329 It's not possible to estimate an optimal value for the chunkSize. The default 1330 chunkSize is set to 1. 1331 1332 The default value for the chunkSize during 'Lazy' data mode may adversely 1333 impact the performance due to the overhead associated with exchanging 1334 small chunks of data. It is generally a good idea to explicitly set chunkSize to 1335 a larger value during 'Lazy' input data mode, based on the size of your input 1336 data and number of processes in the process pool. 1337 1338 The mp.Pool.map() function waits for all worker processes to process all 1339 the data and return the results. The mp.Pool.imap() function, however, 1340 returns the the results obtained from worker processes as soon as the 1341 results become available for specified chunks of data. 1342 1343 The order of data in the results returned by both mp.Pool.map() and 1344 mp.Pool.imap() functions always corresponds to the input data. 1345 -n, --nitrogenLonePairParams <Name,Value,...> [default: auto] 1346 A comma delimited list of parameter name and value pairs to match 1347 torsion SMARTS patterns containing non-standard construct 'N_lp' 1348 corresponding to nitrogen lone pair. 1349 1350 The supported parameter names along with their default and possible 1351 values are shown below: 1352 1353 allowHydrogenNbrs, yes [ Possible values: yes or no ] 1354 planarityTolerance, 1 [Possible values: >=0] 1355 1356 These parameters are used during the matching of torsion rules containing 1357 'N_lp' in their SMARTS patterns. The 'allowHydrogensNbrs' allows the use 1358 hydrogen neighbors attached to nitrogen during the determination of its 1359 planarity. The 'planarityTolerance' in degrees represents the tolerance 1360 allowed for nitrogen to be considered coplanar with its three neighbors. 1361 1362 The torsion rules containing 'N_lp' in their SMARTS patterns are categorized 1363 into the following two types of rules: 1364 1365 TypeOne: 1366 1367 [CX4:1][CX4H2:2]!@[NX3;"N_lp":3][CX4:4] 1368 [C:1][CX4H2:2]!@[NX3;"N_lp":3][C:4] 1369 ... ... ... 1370 1371 TypeTwo: 1372 1373 [!#1:1][CX4:2]!@[NX3;"N_lp":3] 1374 [C:1][$(S(=O)=O):2]!@["N_lp":3] 1375 ... ... ... 1376 1377 The torsions are matched to torsion rules containing 'N_lp' using specified 1378 SMARTS patterns without the 'N_lp' along with additional constraints using 1379 the following methodology: 1380 1381 TypeOne: 1382 1383 . SMARTS pattern must contain four mapped atoms and the third 1384 mapped atom must be a nitrogen matched with 'NX3:3' 1385 . Nitrogen atom must have 3 neighbors. The 'allowHydrogens' 1386 parameter controls inclusion of hydrogens as its neighbors. 1387 . Nitrogen atom and its 3 neighbors must be coplanar. 1388 'planarityTolerance' parameter provides tolerance in degrees 1389 for nitrogen to be considered coplanar with its 3 neighbors. 1390 1391 TypeTwo: 1392 1393 . SMARTS pattern must contain three mapped atoms and the third 1394 mapped atom must be a nitrogen matched with 'NX3:3'. The 1395 third mapped atom may contain only 'N_lp:3' The missing 'NX3' 1396 is automatically detected. 1397 . Nitrogen atom must have 3 neighbors. 'allowHydrogens' 1398 parameter controls inclusion of hydrogens as neighbors. 1399 . Nitrogen atom and its 3 neighbors must not be coplanar. 1400 'planarityTolerance' parameter provides tolerance in degrees 1401 for nitrogen to be considered coplanar with its 3 neighbors. 1402 . Nitrogen lone pair position equivalent to VSEPR theory is 1403 determined based on the position of nitrogen and its neighbors. 1404 A vector normal to 3 nitrogen neighbors is calculated and added 1405 to the coordinates of nitrogen atom to determine the approximate 1406 position of the lone pair. It is used as the fourth position to 1407 calculate the torsion angle. 1408 1409 -o, --outfile <outfile> 1410 Output file name. 1411 --outfileAlerts <yes or no> [default: yes] 1412 Write out alerts information to SD output files. 1413 --outfileAlertsMode <All or AlertsOnly> [default: AlertsOnly] 1414 Write alerts information to SD output files for all alerts or only for alerts 1415 specified by '--AlertsMode' option. Possible values: All or AlertsOnly 1416 This option is only valid for 'Yes' value of '--outfileAlerts' option. 1417 1418 The following alerts information is added to SD output files using 1419 'TorsionAlerts' data field: 1420 1421 RotBondIndices TorsionAlert TorsionIndices TorsionAngle 1422 TorsionAngleViolation HierarchyClass HierarchySubClass 1423 TorsionRule TorsionPeaks Tolerances1 Tolerances2 1424 1425 The 'RotBondsCount' and 'TorsionAlertsCount' data fields are always added 1426 to SD output files containing both remaining and filtered molecules. 1427 1428 Format: 1429 1430 > <RotBondsCount> 1431 Number 1432 1433 > <TorsionAlertsCount (Green Orange Red)> 1434 Number Number Number 1435 1436 > <TorsionAlerts (RotBondIndices TorsionAlert TorsionIndices 1437 TorsionAngle TorsionAngleViolation HierarchyClass 1438 HierarchySubClass TorsionPeaks Tolerances1 Tolerances2 1439 TorsionRule)> 1440 AtomIndex2,AtomIndex3 AlertType AtomIndex1,AtomIndex2,AtomIndex3, 1441 AtomIndex4 Angle AngleViolation ClassName SubClassName 1442 CommaDelimPeakValues CommaDelimTol1Values CommDelimTol2Values 1443 SMARTS ... ... ... 1444 ... ... ... 1445 1446 A set of 11 values is written out as value of 'TorsionAlerts' data field for 1447 each torsion in a molecule. The space character is used as a delimiter 1448 to separate values with in a set and across set. The comma character 1449 is used to delimit multiple values for each value in a set. 1450 1451 The 'RotBondIndices' and 'TorsionIndices' contain 2 and 4 comma delimited 1452 values representing atom indices for a rotatable bond and matched torsion. 1453 The 'TorsionPeaks', 'Tolerances1', and 'Tolerances2' contain same number 1454 of comma delimited values corresponding to torsion angle peaks and 1455 tolerance intervals specified in torsion library. For example: 1456 1457 ... ... ... 1458 > <RotBondsCount> (1) 1459 7 1460 1461 > <TorsionAlertsCount (Green Orange Red)> (1) 1462 3 2 2 1463 1464 > <TorsionAlerts (RotBondIndices TorsionAlert TorsionIndices 1465 TorsionAngle TorsionAngleViolation HierarchyClass 1466 HierarchySubClass TorsionPeaks Tolerances1 Tolerances2 1467 TorsionRule)> 1468 1,2 Red 32,2,1,0 0.13 149.87 NC Anilines 180.0 10.0 30.0 [cH0:1][c:2] 1469 ([cH,nX2H0])!@[NX3H1:3][CX4:4] 8,9 Red 10,9,8,28 -0.85 GG 1470 None -90.0,90.0 30.0,30.0 60.0,60.0 [cH1:1][a:2]([cH1])!@[a:3] 1471 ([cH0])[cH0:4] 1472 ... ... ... 1473 1474 --outfileFiltered <yes or no> [default: yes] 1475 Write out a file containing filtered molecules. Its name is automatically 1476 generated from the specified output file. Default: <OutfileRoot>_ 1477 Filtered.<OutfileExt>. 1478 --outfilesFilteredByRules <yes or no> [default: yes] 1479 Write out SD files containing filtered molecules for individual torsion 1480 rules triggering alerts in molecules. The name of SD files are automatically 1481 generated from the specified output file. Default file names: <OutfileRoot>_ 1482 Filtered_TopRule*.sdf 1483 1484 The following alerts information is added to SD output files: 1485 1486 > <RotBondsCount> 1487 Number 1488 1489 > <TorsionAlertsCount (Green Orange Red)> 1490 Number Number Number 1491 1492 > <TorsionRule (HierarchyClass HierarchySubClass TorsionPeaks 1493 Tolerances1 Tolerances2 TorsionRule)> 1494 ClassName SubClassName CommaDelimPeakValues CommaDelimTol1Values 1495 CommDelimTol2Values SMARTS ... ... ... 1496 ... ... ... 1497 1498 > <TorsionRuleAlertsCount (Green Orange Red)> 1499 Number Number Number 1500 1501 > <TorsionRuleAlerts (RotBondIndices TorsionAlert TorsionIndices 1502 TorsionAngle TorsionAngleViolation)> 1503 AtomIndex2,AtomIndex3 AlertType AtomIndex1,AtomIndex2,AtomIndex3, 1504 AtomIndex4 Angle AngleViolation ... ... ... 1505 1506 > <TorsionRuleMaxAngleViolation> 1507 Number 1508 ... ... ... 1509 1510 For example: 1511 1512 ... ... ... 1513 > <RotBondsCount> (1) 1514 7 1515 1516 > <TorsionAlertsCount (Green Orange Red)> (1) 1517 3 2 2 1518 1519 > <TorsionRule (HierarchyClass HierarchySubClass TorsionPeaks 1520 Tolerances1 Tolerances2 TorsionRule)> (1) 1521 NC Anilines 180.0 10.0 30.0 [cH0:1][c:2]([cH,nX2H0])!@[NX3H1:3][CX4:4] 1522 1523 > <TorsionRuleAlertsCount (Green Orange Red)> (1) 1524 0 0 1 1525 1526 > <TorsionRuleAlerts (RotBondIndices TorsionAlert TorsionIndices 1527 TorsionAngle TorsionAngleViolation)> (1) 1528 1,2 Red 32,2,1,0 0.13 149.87 1529 1530 > <TorsionRuleMaxAngleViolation> (1) 1531 149.87 1532 ... ... ... 1533 1534 --outfilesFilteredByRulesMaxCount <All or number> [default: 10] 1535 Write out SD files containing filtered molecules for specified number of 1536 top N torsion rules triggering alerts for the largest number of molecules 1537 or for all torsion rules triggering alerts across all molecules. 1538 --outfileSummary <yes or no> [default: yes] 1539 Write out a CVS text file containing summary of torsions rules responsible 1540 for triggering torsion alerts. Its name is automatically generated from the 1541 specified output file. Default: <OutfileRoot>_AlertsSummary.csv. 1542 1543 The following alerts information is written to summary text file: 1544 1545 TorsionRule, TorsionPeaks, Tolerances1, Tolerances2, 1546 HierarchyClass, HierarchySubClass, TorsionAlertType, 1547 TorsionAlertCount, TorsionAlertMolCount 1548 1549 The double quotes characters are removed from SMART patterns before 1550 before writing them to a CSV file. In addition, the torsion rules are sorted by 1551 TorsionAlertMolCount. For example: 1552 1553 "TorsionRule","TorsionPeaks","Tolerances1","Tolerances2", 1554 "HierarchyClass","HierarchySubClass","TorsionAlertTypes", 1555 "TorsionAlertCount","TorsionAlertMolCount" 1556 "[!#1:1][CX4H2:2]!@[CX4H2:3][!#1:4]","-60.0,60.0,180.0", 1557 "20.0,20.0,20.0","30.0,30.0,30.0","CC","None/[CX4:2][CX4:3]", 1558 "Red","16","11" 1559 ... ... ... 1560 1561 --outfileSDFieldLabels <Type,Label,...> [default: auto] 1562 A comma delimited list of SD data field type and label value pairs for writing 1563 torsion alerts information along with molecules to SD files. 1564 1565 The supported SD data field label type along with their default values are 1566 shown below: 1567 1568 For all SD files: 1569 1570 RotBondsCountLabel, RotBondsCount 1571 TorsionAlertsCountLabel, TorsionAlertsCount (Green Orange Red) 1572 TorsionAlertsLabel, TorsionAlerts (RotBondIndices TorsionAlert 1573 TorsionIndices TorsionAngle TorsionAngleViolation 1574 HierarchyClass HierarchySubClass TorsionPeaks Tolerances1 1575 Tolerances2 TorsionRule) 1576 1577 For individual SD files filtered by torsion rules: 1578 1579 TorsionRuleLabel, TorsionRule (HierarchyClass HierarchySubClass 1580 TorsionPeaks Tolerances1 Tolerances2 TorsionRule) 1581 TorsionRuleAlertsCountLabel, TorsionRuleAlertsCount (Green Orange 1582 Red) 1583 TorsionRuleAlertsLabel, TorsionRuleAlerts (RotBondIndices 1584 TorsionAlert TorsionIndices TorsionAngle TorsionAngleViolation) 1585 TorsionRuleMaxAngleViolationLabel, TorsionRuleMaxAngleViolation 1586 1587 --outfileParams <Name,Value,...> [default: auto] 1588 A comma delimited list of parameter name and value pairs for writing 1589 molecules to files. The supported parameter names for different file 1590 formats, along with their default values, are shown below: 1591 1592 SD: kekulize,yes,forceV3000,no 1593 1594 --overwrite 1595 Overwrite existing files. 1596 -r, --rotBondsSMARTSMode <NonStrict, SemiStrict,...> [default: SemiStrict] 1597 SMARTS pattern to use for identifying rotatable bonds in a molecule 1598 for matching against torsion rules in the torsion library. Possible values: 1599 NonStrict, SemiStrict, Strict or Specify. The rotatable bond SMARTS matches 1600 are filtered to ensure that each atom in the rotatable bond is attached to 1601 at least two heavy atoms. 1602 1603 The following SMARTS patterns are used to identify rotatable bonds for 1604 different modes: 1605 1606 NonStrict: [!$(*#*)&!D1]-&!@[!$(*#*)&!D1] 1607 1608 SemiStrict: 1609 [!$(*#*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br) 1610 &!$(C([CH3])([CH3])[CH3])]-!@[!$(*#*)&!D1&!$(C(F)(F)F) 1611 &!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])[CH3])] 1612 1613 Strict: 1614 [!$(*#*)&!D1&!$(C(F)(F)F)&!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br) 1615 &!$(C([CH3])([CH3])[CH3])&!$([CD3](=[N,O,S])-!@[#7,O,S!D1]) 1616 &!$([#7,O,S!D1]-!@[CD3]=[N,O,S])&!$([CD3](=[N+])-!@[#7!D1]) 1617 &!$([#7!D1]-!@[CD3]=[N+])]-!@[!$(*#*)&!D1&!$(C(F)(F)F) 1618 &!$(C(Cl)(Cl)Cl)&!$(C(Br)(Br)Br)&!$(C([CH3])([CH3])[CH3])] 1619 1620 The 'NonStrict' and 'Strict' SMARTS patterns are available in RDKit. The 1621 'NonStrict' SMARTS pattern corresponds to original Daylight SMARTS 1622 specification for rotatable bonds. The 'SemiStrict' SMARTS pattern is 1623 derived from 'Strict' SMARTS patterns for its usage in this script. 1624 1625 You may use any arbitrary SMARTS pattern to identify rotatable bonds by 1626 choosing 'Specify' value for '-r, --rotBondsSMARTSMode' option and providing its 1627 value via '--rotBondsSMARTSPattern' option. 1628 --rotBondsSMARTSPattern <SMARTS> 1629 SMARTS pattern for identifying rotatable bonds. This option is only valid 1630 for 'Specify' value of '-r, --rotBondsSMARTSMode' option. 1631 -t, --torsionLibraryFile <FileName or auto> [default: auto] 1632 Specify a XML file name containing data for torsion library hierarchy 1633 or use default file, TorsionLibrary.xml, available in 1634 MAYACHEMTOOLS/lib/Python/TorsionAlerts directory. 1635 1636 The format of data in local XML file must match format of the data in Torsion 1637 Library [ Ref 146, 152, 159 ] file available in MAYACHEMTOOLS directory. 1638 -w, --workingdir <dir> 1639 Location of working directory which defaults to the current directory. 1640 1641 Examples: 1642 To filter molecules containing any rotatable bonds marked with Red alerts 1643 based on torsion rules in the torsion library and write out SD files containing 1644 remaining and filtered molecules, and individual SD files for torsion rules 1645 triggering alerts along with appropriate torsion information for red alerts, 1646 type: 1647 1648 % RDKitFilterTorsionLibraryAlerts.py -i Sample3D.sdf -o Sample3DOut.sdf 1649 1650 To run the first example for only counting number of alerts without writing 1651 out any SD files, type: 1652 1653 % RDKitFilterTorsionLibraryAlerts.py -m count -i Sample3D.sdf -o 1654 Sample3DOut.sdf 1655 1656 To run the first example for filtertering molecules marked with Orange or 1657 Red alerts and write out SD files, tye: 1658 1659 % RDKitFilterTorsionLibraryAlerts.py -m Filter --alertsMode RedAndOrange 1660 -i Sample3D.sdf -o Sample3DOut.sdf 1661 1662 To run the first example for filtering molecules and writing out torsion 1663 information for all alert types to SD files, type: 1664 1665 % RDKitFilterTorsionLibraryAlerts.py --outfileAlertsMode All 1666 -i Sample3D.sdf -o Sample3DOut.sdf 1667 1668 To run the first example for filtering molecules in multiprocessing mode on 1669 all available CPUs without loading all data into memory and write out SD files, 1670 type: 1671 1672 % RDKitFilterTorsionLibraryAlerts.py --mp yes -i Sample3D.sdf 1673 -o Sample3DOut.sdf 1674 1675 To run the first example for filtering molecules in multiprocessing mode on 1676 all available CPUs by loading all data into memory and write out a SD files, 1677 type: 1678 1679 % RDKitFilterTorsionLibraryAlerts.py --mp yes --mpParams 1680 "inputDataMode, InMemory" -i Sample3D.sdf -o Sample3DOut.sdf 1681 1682 To run the first example for filtering molecules in multiprocessing mode on 1683 specific number of CPUs and chunksize without loading all data into memory 1684 and write out SD files, type: 1685 1686 % RDKitFilterTorsionLibraryAlerts.py --mp yes --mpParams 1687 "inputDataMode,lazy,numProcesses,4,chunkSize,8" -i Sample3D.sdf 1688 -o Sample3DOut.sdf 1689 1690 To list information about default torsion library file without performing any 1691 filtering, type: 1692 1693 % RDKitFilterTorsionLibraryAlerts.py -l 1694 1695 To list information about a local torsion library XML file without performing 1696 any, filtering, type: 1697 1698 % RDKitFilterTorsionLibraryAlerts.py --torsionLibraryFile 1699 TorsionLibrary.xml -l 1700 1701 Author: 1702 Manish Sud (msud@san.rr.com) 1703 1704 Collaborator: 1705 Pat Walters 1706 1707 Acknowledgments: 1708 Wolfgang Guba, Patrick Penner, and Levi Pierce 1709 1710 See also: 1711 RDKitFilterChEMBLAlerts.py, RDKitFilterPAINS.py, RDKitFilterTorsionStrainEnergyAlerts.py, 1712 RDKitConvertFileFormat.py, RDKitSearchSMARTS.py 1713 1714 Copyright: 1715 Copyright (C) 2024 Manish Sud. All rights reserved. 1716 1717 This script uses the Torsion Library jointly developed by the University 1718 of Hamburg, Center for Bioinformatics, Hamburg, Germany and 1719 F. Hoffmann-La-Roche Ltd., Basel, Switzerland. 1720 1721 The functionality available in this script is implemented using RDKit, an 1722 open source toolkit for cheminformatics developed by Greg Landrum. 1723 1724 This file is part of MayaChemTools. 1725 1726 MayaChemTools is free software; you can redistribute it and/or modify it under 1727 the terms of the GNU Lesser General Public License as published by the Free 1728 Software Foundation; either version 3 of the License, or (at your option) any 1729 later version. 1730 1731 """ 1732 1733 if __name__ == "__main__": 1734 main()