1 #!/bin/env python 2 # 3 # File: RDKitFilterChEMBLAlerts.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem import AllChem 43 except ImportError as ErrMsg: 44 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 45 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 46 sys.exit(1) 47 48 # MayaChemTools imports... 49 try: 50 from docopt import docopt 51 import MiscUtil 52 import RDKitUtil 53 except ImportError as ErrMsg: 54 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 55 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 56 sys.exit(1) 57 58 ScriptName = os.path.basename(sys.argv[0]) 59 Options = {} 60 OptionsInfo = {} 61 62 def main(): 63 """Start execution of the script.""" 64 65 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 66 67 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 68 69 # Retrieve command line arguments and options... 70 RetrieveOptions() 71 72 # Process and validate command line arguments and options... 73 ProcessOptions() 74 75 # Perform actions required by the script... 76 PerformFiltering() 77 78 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 79 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 80 81 def PerformFiltering(): 82 """Filter molecules using SMARTS specified in ChEMBL filters file.""" 83 84 # Setup ChEMBL patterns and pattern mols... 85 MiscUtil.PrintInfo("\nSetting up ChEMBL pattern molecules for performing substructure search...") 86 ChEMBLPatternMols = SetupChEMBLPatternMols() 87 88 # Setup a molecule reader... 89 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"]) 90 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"]) 91 92 # Set up molecule writers... 93 Writer, WriterFiltered = SetupMoleculeWriters() 94 95 MolCount, ValidMolCount, RemainingMolCount = ProcessMolecules(Mols, ChEMBLPatternMols, Writer, WriterFiltered) 96 97 if Writer is not None: 98 Writer.close() 99 if WriterFiltered is not None: 100 WriterFiltered.close() 101 102 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 103 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 104 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount)) 105 106 MiscUtil.PrintInfo("\nNumber of remaining molecules: %d" % RemainingMolCount) 107 MiscUtil.PrintInfo("Number of filtered molecules: %d" % (ValidMolCount - RemainingMolCount)) 108 109 def ProcessMolecules(Mols, ChEMBLPatternMols, Writer, WriterFiltered): 110 """Process and filter molecules. """ 111 112 if OptionsInfo["MPMode"]: 113 return ProcessMoleculesUsingMultipleProcesses(Mols, ChEMBLPatternMols, Writer, WriterFiltered) 114 else: 115 return ProcessMoleculesUsingSingleProcess(Mols, ChEMBLPatternMols, Writer, WriterFiltered) 116 117 def ProcessMoleculesUsingSingleProcess(Mols, ChEMBLPatternMols, Writer, WriterFiltered): 118 """Process and filter molecules using a single process.""" 119 120 NegateMatch = OptionsInfo["NegateMatch"] 121 OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"] 122 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 123 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 124 125 MiscUtil.PrintInfo("\nFiltering molecules...") 126 127 (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3 128 FirstMol = True 129 for Mol in Mols: 130 MolCount += 1 131 132 if Mol is None: 133 continue 134 135 if RDKitUtil.IsMolEmpty(Mol): 136 MolName = RDKitUtil.GetMolName(Mol, MolCount) 137 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 138 continue 139 140 ValidMolCount += 1 141 if FirstMol: 142 FirstMol = False 143 if SetSMILESMolProps: 144 SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol) 145 146 MolMatched, AlertsInfo = DoesMoleculeContainsChEMBLPattern(Mol, ChEMBLPatternMols) 147 if MolMatched == NegateMatch: 148 RemainingMolCount += 1 149 WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords) 150 else: 151 if OutfileFilteredMode: 152 WriteMolecule(WriterFiltered, Mol, AlertsInfo, Compute2DCoords) 153 154 return (MolCount, ValidMolCount, RemainingMolCount) 155 156 def ProcessMoleculesUsingMultipleProcesses(Mols, ChEMBLPatternMols, Writer, WriterFiltered): 157 """Process and filter molecules using multiprocessing.""" 158 159 MiscUtil.PrintInfo("\nFiltering molecules using multiprocessing...") 160 161 MPParams = OptionsInfo["MPParams"] 162 NegateMatch = OptionsInfo["NegateMatch"] 163 OutfileFilteredMode = OptionsInfo["OutfileFilteredMode"] 164 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 165 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 166 167 # Setup data for initializing a worker process... 168 MiscUtil.PrintInfo("Encoding options info and ChEMBL alert pattern molecules...") 169 OptionsInfo["EncodedChEMBLPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in ChEMBLPatternMols] 170 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo)) 171 172 # Setup a encoded mols data iterable for a worker process... 173 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols) 174 175 # Setup process pool along with data initialization for each process... 176 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 177 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 178 179 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs) 180 181 # Start processing... 182 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 183 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 184 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 185 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 186 else: 187 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 188 189 (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3 190 FirstMol = True 191 for Result in Results: 192 MolCount += 1 193 MolIndex, EncodedMol, MolMatched, AlertsInfo = Result 194 195 if EncodedMol is None: 196 continue 197 ValidMolCount += 1 198 199 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 200 201 if FirstMol: 202 FirstMol = False 203 if SetSMILESMolProps: 204 SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol) 205 206 if MolMatched == NegateMatch: 207 RemainingMolCount += 1 208 WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords) 209 else: 210 if OutfileFilteredMode: 211 WriteMolecule(WriterFiltered, Mol, AlertsInfo, Compute2DCoords) 212 213 return (MolCount, ValidMolCount, RemainingMolCount) 214 215 def InitializeWorkerProcess(*EncodedArgs): 216 """Initialize data for a worker process.""" 217 218 global Options, OptionsInfo 219 220 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 221 222 # Decode Options and OptionInfo... 223 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 224 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 225 226 # Decode ChEMBLPatternMols... 227 OptionsInfo["ChEMBLPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedChEMBLPatternMols"]] 228 229 def WorkerProcess(EncodedMolInfo): 230 """Process data for a worker process.""" 231 232 MolIndex, EncodedMol = EncodedMolInfo 233 234 if EncodedMol is None: 235 return [MolIndex, None, False, None] 236 237 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 238 if RDKitUtil.IsMolEmpty(Mol): 239 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 240 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 241 return [MolIndex, None, False, None] 242 243 MolMatched, AlertsInfo = DoesMoleculeContainsChEMBLPattern(Mol, OptionsInfo["ChEMBLPatternMols"]) 244 245 return [MolIndex, EncodedMol, MolMatched, AlertsInfo] 246 247 def WriteMolecule(Writer, Mol, AlertsInfo, Compute2DCoords): 248 """Write out molecule.""" 249 250 if OptionsInfo["CountMode"]: 251 return 252 253 if Compute2DCoords: 254 AllChem.Compute2DCoords(Mol) 255 256 if AlertsInfo is not None and len(AlertsInfo): 257 AlertsCount = "%s" % len(AlertsInfo) 258 Alerts = "; ".join(AlertsInfo) 259 if OptionsInfo["WriteAlertsCount"]: 260 Mol.SetProp(OptionsInfo["AlertsCountLabel"], AlertsCount) 261 Mol.SetProp(OptionsInfo["AlertsLabel"], Alerts) 262 263 Writer.write(Mol) 264 265 def SetupMoleculeWriters(): 266 """Setup molecule writers.""" 267 268 Writer = None 269 WriterFiltered = None 270 271 if OptionsInfo["CountMode"]: 272 return (Writer, WriterFiltered) 273 274 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 275 if Writer is None: 276 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 277 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"]) 278 279 if OptionsInfo["OutfileFilteredMode"]: 280 WriterFiltered = RDKitUtil.MoleculesWriter(OptionsInfo["OutfileFiltered"], **OptionsInfo["OutfileParams"]) 281 if WriterFiltered is None: 282 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["OutfileFiltered"]) 283 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["OutfileFiltered"]) 284 285 return (Writer, WriterFiltered) 286 287 def SetupSMILESMoleculeWritersProps(Writer, WriterFiltered, Mol): 288 """Setup properties to write for SMILES molecule writers.""" 289 290 if not OptionsInfo["OutfileParams"]["SetSMILESMolProps"]: 291 return 292 293 NegateMatch = OptionsInfo["NegateMatch"] 294 SetSMILESMolAlertsProp = OptionsInfo["SetSMILESMolAlertsProp"] 295 SMILESMolAlertsPropList = OptionsInfo["SMILESMolAlertsPropList"] 296 297 if Writer is not None: 298 RDKitUtil.SetWriterMolProps(Writer, Mol) 299 if SetSMILESMolAlertsProp: 300 if NegateMatch: 301 Writer.SetProps(SMILESMolAlertsPropList) 302 303 if WriterFiltered is not None: 304 RDKitUtil.SetWriterMolProps(WriterFiltered, Mol) 305 if SetSMILESMolAlertsProp: 306 if not NegateMatch: 307 WriterFiltered.SetProps(SMILESMolAlertsPropList) 308 309 def DoesMoleculeContainsChEMBLPattern(Mol, ChEMBLPatternMols): 310 """Check presence of ChEMBL alerts pattern in the molecule.""" 311 312 MatchAllAlerts = OptionsInfo["MatchAllAlerts"] 313 AlertsInfo = [] 314 for PatternMol in ChEMBLPatternMols: 315 if Mol.HasSubstructMatch(PatternMol, useChirality = True): 316 AlertsInfo.append("%s: %s" % (PatternMol.GetProp("FilterType"), PatternMol.GetProp("FilterID"))) 317 if not MatchAllAlerts: 318 break 319 320 if len(AlertsInfo) == 0: 321 MolMatched = False 322 AlertsInfo = None 323 else: 324 MolMatched = True 325 326 return (MolMatched, AlertsInfo) 327 328 def SetupChEMBLPatternMols(): 329 """Set up ChEMBL pattern mols for substructure search corresponding to alert mode""" 330 331 PatternMols = [] 332 for FilterType in OptionsInfo["SpecifiedFilterTypes"]: 333 for Index, Pattern in enumerate(OptionsInfo["ChEMBLFiltersMap"]["SMARTS"][FilterType]): 334 ID = OptionsInfo["ChEMBLFiltersMap"]["IDs"][FilterType][Index] 335 336 PatternMol = Chem.MolFromSmarts(Pattern) 337 if PatternMol is None: 338 MiscUtil.PrintWarning("Failed to convert ChEMBL pattern, %s, into a molecule..." % Pattern) 339 continue 340 341 # Setup FilterType and PattenMol as property of PatternMol 342 PatternMol.SetProp("FilterType", FilterType) 343 PatternMol.SetProp("FilterID", ID) 344 345 PatternMols.append(PatternMol) 346 347 return PatternMols 348 349 def ProcessChEMBLAlertsMode(): 350 """Process specified alerts mode.""" 351 352 OptionsInfo["AlertsMode"] = Options["--alertsMode"] 353 354 # Retrieve filetrs information... 355 RetrieveChEMBLFiltersInfo() 356 357 # Process alerts mode... 358 OptionsInfo["SpecifiedFilterTypes"] = OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"] 359 if re.match("^All$", OptionsInfo["AlertsMode"], re.I): 360 return 361 362 AlertsMode = re.sub(" ", "", OptionsInfo["AlertsMode"]) 363 if not len(AlertsMode): 364 MiscUtil.PrintError("The alerts mode specified using \"-a, --alertsMode\" option are empty.") 365 366 CanonicalFilterTypesMap = {} 367 for FilterType in OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"]: 368 CanonicalFilterTypesMap[FilterType.lower()] = FilterType 369 370 SpecifiedFilterTypes = [] 371 for FilterType in AlertsMode.split(","): 372 CanonicalFilterType = FilterType.lower() 373 if not CanonicalFilterType in CanonicalFilterTypesMap: 374 MiscUtil.PrintError("The altert mode, %s, specified using \"-a, --alertsMode\" is not valid. Supported alert modes: %s" % (FilterType, ", ".join(OptionsInfo["ChEMBLFiltersMap"]["FilterTypes"]))) 375 376 SpecifiedFilterTypes.append(CanonicalFilterTypesMap[CanonicalFilterType]) 377 378 OptionsInfo["SpecifiedFilterTypes"] = SpecifiedFilterTypes 379 380 def ProcessChEMBLAlertsMatch(): 381 """Process specified alerts match.""" 382 383 AlertsMatch = Options["--alertsMatch"] 384 385 MatchFirstAlert, MatchAllAlerts = [False] * 2 386 if re.match("^First$", AlertsMatch, re.I): 387 MatchFirstAlert = True 388 elif re.match("^All$", AlertsMatch, re.I): 389 MatchAllAlerts = True 390 else: 391 MiscUtil.PrintError("The value %s, specified using \"--alertsMatch\" option is not valid. Supported values: First or All" % (AlertsMatch)) 392 393 OptionsInfo["AlertsMatch"] = AlertsMatch 394 OptionsInfo["MatchFirstAlert"] = MatchFirstAlert 395 OptionsInfo["MatchAllAlerts"] = MatchAllAlerts 396 397 # Setup labels for writing out alerts match information... 398 OptionsInfo["AlertsCountLabel"] = "ChEMBLAlertsCount" 399 OptionsInfo["AlertsLabel"] = "FirstChEMBLAlert" if MatchFirstAlert else "ChEMBLAlerts" 400 401 # Write out alerts count only for match all alerts... 402 OptionsInfo["WriteAlertsCount"] = True if MatchAllAlerts else False 403 404 # Write out alerts match information to comma or tab delimited SMILES files... 405 SMILESDelimiter = OptionsInfo["OutfileParams"]["SMILESDelimiter"] 406 OptionsInfo["SetSMILESMolAlertsProp"] = True if re.match("^[\t,]", SMILESDelimiter, re.I) else False 407 408 SMILESMolAlertsPropList = [] 409 if OptionsInfo["WriteAlertsCount"]: 410 SMILESMolAlertsPropList.append(OptionsInfo["AlertsCountLabel"]) 411 SMILESMolAlertsPropList.append(OptionsInfo["AlertsLabel"]) 412 OptionsInfo["SMILESMolAlertsPropList"] = SMILESMolAlertsPropList 413 414 def RetrieveChEMBLFiltersInfo(): 415 """Retrieve information for ChEMBL filters.""" 416 417 MayaChemToolsDataDir = MiscUtil.GetMayaChemToolsLibDataPath() 418 ChEMBLFiltersFilePath = os.path.join(MayaChemToolsDataDir, "ChEMBLFilters.csv") 419 420 MiscUtil.PrintInfo("\nRetrieving ChEMBL alerts SMARTS patterns from file %s" % (ChEMBLFiltersFilePath)) 421 422 Delimiter = ',' 423 QuoteChar = '"' 424 IgnoreHeaderLine = True 425 FilterLinesWords = MiscUtil.GetTextLinesWords(ChEMBLFiltersFilePath, Delimiter, QuoteChar, IgnoreHeaderLine) 426 427 ChEMBLFiltersMap = {} 428 ChEMBLFiltersMap["FilterTypes"] = [] 429 ChEMBLFiltersMap["IDs"] = {} 430 ChEMBLFiltersMap["SMARTS"] = {} 431 432 for LineWords in FilterLinesWords: 433 FilterType = LineWords[0] 434 ID = LineWords[1] 435 SMARTS = LineWords[2] 436 437 if not FilterType in ChEMBLFiltersMap["FilterTypes"]: 438 ChEMBLFiltersMap["FilterTypes"].append(FilterType) 439 ChEMBLFiltersMap["IDs"][FilterType] = [] 440 ChEMBLFiltersMap["SMARTS"][FilterType] = [] 441 442 ChEMBLFiltersMap["IDs"][FilterType].append(ID) 443 ChEMBLFiltersMap["SMARTS"][FilterType].append(SMARTS) 444 445 OptionsInfo["ChEMBLFiltersMap"] = ChEMBLFiltersMap 446 447 MiscUtil.PrintInfo("\nTotal number alerts: %d" % len(FilterLinesWords)) 448 MiscUtil.PrintInfo("Number of filter family types: %d\nFilter familty types: %s\n" % (len(ChEMBLFiltersMap["FilterTypes"]), ", ".join(ChEMBLFiltersMap["FilterTypes"]))) 449 450 for FilterType in ChEMBLFiltersMap["FilterTypes"]: 451 MiscUtil.PrintInfo("Filter family type: %s; Number of alerts: %d" % (FilterType, len(ChEMBLFiltersMap["IDs"][FilterType]))) 452 MiscUtil.PrintInfo("") 453 454 def ProcessOptions(): 455 """Process and validate command line arguments and options.""" 456 457 MiscUtil.PrintInfo("Processing options...") 458 459 # Validate options... 460 ValidateOptions() 461 462 OptionsInfo["Infile"] = Options["--infile"] 463 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 464 465 OptionsInfo["Outfile"] = Options["--outfile"] 466 ParamsDefaultInfoOverride = {"SMILESMolProps": True} 467 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"], ParamsDefaultInfo = ParamsDefaultInfoOverride) 468 469 FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"]) 470 OutfileFiltered = "%s_Filtered.%s" % (FileName, FileExt) 471 OptionsInfo["OutfileFiltered"] = OutfileFiltered 472 OptionsInfo["OutfileFilteredMode"] = True if re.match("^yes$", Options["--outfileFiltered"], re.I) else False 473 474 OptionsInfo["Overwrite"] = Options["--overwrite"] 475 476 OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False 477 OptionsInfo["NegateMatch"] = True if re.match("^yes$", Options["--negate"], re.I) else False 478 479 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 480 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 481 482 ProcessChEMBLAlertsMode() 483 ProcessChEMBLAlertsMatch() 484 485 def RetrieveOptions(): 486 """Retrieve command line arguments and options.""" 487 488 # Get options... 489 global Options 490 Options = docopt(_docoptUsage_) 491 492 # Set current working directory to the specified directory... 493 WorkingDir = Options["--workingdir"] 494 if WorkingDir: 495 os.chdir(WorkingDir) 496 497 # Handle examples option... 498 if "--examples" in Options and Options["--examples"]: 499 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 500 sys.exit(0) 501 502 def ValidateOptions(): 503 """Validate option values.""" 504 505 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 506 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv") 507 508 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi") 509 if re.match("^filter$", Options["--mode"], re.I): 510 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 511 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 512 513 MiscUtil.ValidateOptionTextValue("--alertsMatch", Options["--alertsMatch"], "First All") 514 515 MiscUtil.ValidateOptionTextValue("--outfileFiltered", Options["--outfileFiltered"], "yes no") 516 517 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "filter count") 518 if re.match("^filter$", Options["--mode"], re.I): 519 if not Options["--outfile"]: 520 MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"filter\" value of \"-m, --mode\" option") 521 522 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 523 MiscUtil.ValidateOptionTextValue("-n, --negate", Options["--negate"], "yes no") 524 525 # Setup a usage string for docopt... 526 _docoptUsage_ = """ 527 RDKitFilterChEMBLAlterts.py - Filter ChEMBL alerts 528 529 Usage: 530 RDKitFilterChEMBLAlerts.py [--alertsMode <All or Type,Type,...>] [--alertsMatch <First or All>] 531 [--infileParams <Name,Value,...>] [--mode <filter or count>] 532 [--mp <yes or no>] [--mpParams <Name,Value,...>] 533 [--outfileFiltered <yes or no>] [ --outfileParams <Name,Value,...>] 534 [--negate <yes or no>] [--overwrite] [-w <dir>] -i <infile> -o <outfile> 535 RDKitFilterChEMBLAlerts.py -h | --help | -e | --examples 536 537 Description: 538 Filter molecules from an input file for ChEMBL structural alerts by performing 539 a substructure search using SMARTS patterns specified in MAYACHEMTOOLS/ 540 lib/data/ChEMBLFilters.csv file and write out appropriate molecules to an 541 output file or simply count the number of filtered molecules. 542 543 The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, 544 .tsv, .txt) 545 546 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi) 547 548 Options: 549 -a, --alertsMode <All or Type, Type,...> [default: All] 550 All or a comma delimited list of ChEMBL filter types to use for filtering 551 molecules. 552 553 The supported filter family types, along with a description, are show below: 554 555 BMS: Bristol-Myers Squibb HTS Deck Filters 556 Dundee: University of Dundee NTD Screening Library Filters 557 Glaxo: Bristol-Myers Squibb HTS Deck Filters 558 Inpharmatica 559 MLSMR: NIH MLSMR Excluded Functionality Filters 560 PfizerLINT: Pfizer LINT filters 561 SureChEMBL 562 563 --alertsMatch <First or All> [default: First] 564 Stop after matching only first alert or match all ChEMBL alerts for 565 filtering molecules. 566 567 The 'ChEMBLAlertsCount' and 'ChEMBLAlerts' data fields are added to 568 SD file containing filtered molecules for 'All' value of '-altersMatch'. In 569 addition, these data fields are only written to tab or comma delimited 570 SMILES file. 571 572 Format: 573 574 > <ChEMBLAlertsCount> 575 Number 576 577 > <ChEMBLAlerts> 578 FilterType: ID; FilterType: ID... ... ...`` 579 580 -e, --examples 581 Print examples. 582 -h, --help 583 Print this help message. 584 -i, --infile <infile> 585 Input file name. 586 --infileParams <Name,Value,...> [default: auto] 587 A comma delimited list of parameter name and value pairs for reading 588 molecules from files. The supported parameter names for different file 589 formats, along with their default values, are shown below: 590 591 SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes 592 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 593 smilesTitleLine,auto,sanitize,yes 594 595 Possible values for smilesDelimiter: space, comma or tab. 596 -m, --mode <filter or count> [default: filter] 597 Specify whether to filter the matched molecules and write out the rest of the 598 molecules to an outfile or simply count the number of matched molecules 599 marked for filtering. 600 --mp <yes or no> [default: no] 601 Use multiprocessing. 602 603 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 604 function employing lazy RDKit data iterable. This allows processing of 605 arbitrary large data sets without any additional requirements memory. 606 607 All input data may be optionally loaded into memory by mp.Pool.map() 608 before starting worker processes in a process pool by setting the value 609 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 610 611 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 612 data mode may adversely impact the performance. The '--mpParams' section 613 provides additional information to tune the value of 'chunkSize'. 614 --mpParams <Name,Value,...> [default: auto] 615 A comma delimited list of parameter name and value pairs to configure 616 multiprocessing. 617 618 The supported parameter names along with their default and possible 619 values are shown below: 620 621 chunkSize, auto 622 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 623 numProcesses, auto [ Default: mp.cpu_count() ] 624 625 These parameters are used by the following functions to configure and 626 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 627 mp.Pool.imap(). 628 629 The chunkSize determines chunks of input data passed to each worker 630 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 631 The default value of chunkSize is dependent on the value of 'inputDataMode'. 632 633 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 634 automatically converts RDKit data iterable into a list, loads all data into 635 memory, and calculates the default chunkSize using the following method 636 as shown in its code: 637 638 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 639 if extra: chunkSize += 1 640 641 For example, the default chunkSize will be 7 for a pool of 4 worker processes 642 and 100 data items. 643 644 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 645 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 646 data into memory. Consequently, the size of input data is not known a priori. 647 It's not possible to estimate an optimal value for the chunkSize. The default 648 chunkSize is set to 1. 649 650 The default value for the chunkSize during 'Lazy' data mode may adversely 651 impact the performance due to the overhead associated with exchanging 652 small chunks of data. It is generally a good idea to explicitly set chunkSize to 653 a larger value during 'Lazy' input data mode, based on the size of your input 654 data and number of processes in the process pool. 655 656 The mp.Pool.map() function waits for all worker processes to process all 657 the data and return the results. The mp.Pool.imap() function, however, 658 returns the the results obtained from worker processes as soon as the 659 results become available for specified chunks of data. 660 661 The order of data in the results returned by both mp.Pool.map() and 662 mp.Pool.imap() functions always corresponds to the input data. 663 -n, --negate <yes or no> [default: no] 664 Specify whether to filter molecules not matching the ChEMBL filters specified by 665 SMARTS patterns. 666 -o, --outfile <outfile> 667 Output file name. 668 --outfileFiltered <yes or no> [default: no] 669 Write out a file containing filtered molecules. Its name is automatically 670 generated from the specified output file. Default: <OutfileRoot>_ 671 Filtered.<OutfileExt>. 672 --outfileParams <Name,Value,...> [default: auto] 673 A comma delimited list of parameter name and value pairs for writing 674 molecules to files. The supported parameter names for different file 675 formats, along with their default values, are shown below: 676 677 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 678 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 679 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,yes 680 681 Default value for compute2DCoords: yes for SMILES input file; no for all other 682 file types. 683 --overwrite 684 Overwrite existing files. 685 -w, --workingdir <dir> 686 Location of working directory which defaults to the current directory. 687 688 Examples: 689 To count the number of molecules not containing any substructure corresponding 690 to any ChEMBL SMARTS patterns and write out SMILES files containing these molecules, 691 type: 692 693 % RDKitFilterChEMBLAlerts.py -i Sample.smi -o SampleOut.smi 694 695 To count the number of molecules not containing any substructure corresponding 696 to any ChEMBL SMARTS patterns and write out comma delmited SMILES files 697 containing these and filtered molecules along with the alerts information for 698 filtered molecules matching first pattern, type: 699 700 % RDKitFilterChEMBLAlerts.py --outfileFiltered yes --outfileParams 701 "SMILESDelimiter,comma" -i Sample.smi -o SampleOut.smi 702 703 To count the number of molecules not containing any substructure corresponding 704 to any ChEMBL SMARTS patterns and write out comma delmited SMILES files 705 containing these and filtered molecules along with the alerts information for 706 filtered molecules matching all patterns, type: 707 708 % RDKitFilterChEMBLAlerts.py --alertsMatch All --outfileFiltered yes 709 --outfileParams "SMILESDelimiter,comma" -i Sample.smi 710 -o SampleOut.smi 711 712 To count the number of molecules not containing any substructure corresponding 713 to any ChEMBL SMARTS patterns and write out SD files containing these and filtered 714 molecules along with the alerts information for filtered molecules matching all 715 patterns, type: 716 717 % RDKitFilterChEMBLAlerts.py --alertsMatch All --outfileFiltered yes 718 -i Sample.smi -o SampleOut.sdf 719 720 To count the number of molecules not containing any substructure corresponding to 721 ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on all 722 available CPUs without loading all data into memory, and write out a SMILES file, type: 723 724 % RDKitFilterChEMBLAlerts.py --mp yes -i Sample.smi -o SampleOut.smi 725 726 To count the number of molecules not containing any substructure corresponding to 727 ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on all 728 available CPUs by loading all data into memory, and write out a SD file, type: 729 730 % RDKitFilterChEMBLAlerts.py --mp yes --mpParams "inputDataMode, 731 InMemory" -i Sample.smi -o SampleOut.sdf 732 733 To count the number of molecules not containing any substructure corresponding to 734 ChEMBL SMARTS patterns, perform filtering in multiprocessing mode on specific 735 number of CPUs and chunk size without loading all data into memory, and 736 write out a SD file, type: 737 738 % RDKitFilterChEMBLAlerts.py --mp yes --mpParams "inputDataMode,Lazy, 739 numProcesses,4,chunkSize,8" -i Sample.smi -o SampleOut.sdf 740 741 To only count the number of molecules not containing any substructure corresponding 742 to BMS ChEMBL SMARTS patterns without writing out any files, type: 743 744 % RDKitFilterChEMBLAlerts.py -m count -a BMS -i Sample.sdf 745 -o SampleOut.smi 746 747 To count the number of molecules not containing any substructure corresponding 748 to Pfizer LINT ChEMBL SMARTS patterns in a CSV SMILES file and write out a SD file, 749 type: 750 751 % RDKitFilterChEMBLAlerts.py --altertsMode PfizerLINT --infileParams 752 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 753 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 754 -i SampleSMILES.csv -o SampleOut.sdf 755 756 Author: 757 Manish Sud(msud@san.rr.com) 758 759 See also: 760 RDKitFilterPAINS.py, RDKitConvertFileFormat.py, RDKitSearchSMARTS.py 761 762 Copyright: 763 Copyright (C) 2024 Manish Sud. All rights reserved. 764 765 The functionality available in this script is implemented using RDKit, an 766 open source toolkit for cheminformatics developed by Greg Landrum. 767 768 This file is part of MayaChemTools. 769 770 MayaChemTools is free software; you can redistribute it and/or modify it under 771 the terms of the GNU Lesser General Public License as published by the Free 772 Software Foundation; either version 3 of the License, or (at your option) any 773 later version. 774 775 """ 776 777 if __name__ == "__main__": 778 main()