1 #!/bin/env python 2 # 3 # File: RDKitSearchFunctionalGroups.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 38 # RDKit imports... 39 try: 40 from rdkit import rdBase 41 from rdkit import Chem 42 from rdkit.Chem import AllChem 43 from rdkit.Chem import FunctionalGroups 44 except ImportError as ErrMsg: 45 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 46 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 47 sys.exit(1) 48 49 # MayaChemTools imports... 50 try: 51 from docopt import docopt 52 import MiscUtil 53 import RDKitUtil 54 except ImportError as ErrMsg: 55 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 56 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 57 sys.exit(1) 58 59 ScriptName = os.path.basename(sys.argv[0]) 60 Options = {} 61 OptionsInfo = {} 62 63 FunctionalGroupsMap = {} 64 65 def main(): 66 """Start execution of the script.""" 67 68 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 69 70 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 71 72 # Retrieve command line arguments and options... 73 RetrieveOptions() 74 75 # Process and validate command line arguments and options... 76 ProcessOptions() 77 78 # Perform actions required by the script... 79 PerformFunctionalGroupsSearch() 80 81 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 82 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 83 84 def PerformFunctionalGroupsSearch(): 85 """Retrieve functional groups information and perform search.""" 86 87 # Process functional groups info... 88 ProcessFunctionalGroupsInfo() 89 90 # Setup pattern mols for functional group SMARTS... 91 GroupsPatternMols = SetupFunctionalGroupsSMARTSPatterns() 92 93 # Setup a molecule reader... 94 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"]) 95 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"]) 96 97 # Set up molecule writers... 98 Writer, GroupOutfilesWriters = SetupMoleculeWriters() 99 100 MolCount, ValidMolCount, RemainingMolCount, GroupsPatternMatchCountList = ProcessMolecules(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters) 101 102 if Writer is not None: 103 Writer.close() 104 for GroupOutfileWriter in GroupOutfilesWriters: 105 GroupOutfileWriter.close() 106 107 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 108 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 109 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount)) 110 111 MiscUtil.PrintInfo("\nTotal number of molecules matched against specified match criteria: %d" % RemainingMolCount) 112 113 MiscUtil.PrintInfo("\nNumber of molecuels matched against individual functional groups:") 114 MiscUtil.PrintInfo("FunctionalGroupName,MatchCount") 115 116 for GroupIndex in range(0, len(OptionsInfo["SpecifiedFunctionalGroups"])): 117 GroupName = OptionsInfo["SpecifiedFunctionalGroups"][GroupIndex] 118 if OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"][GroupIndex]: 119 GroupName = '!' + GroupName 120 GroupMatchCount = GroupsPatternMatchCountList[GroupIndex] 121 MiscUtil.PrintInfo("%s,%d" % (GroupName, GroupMatchCount)) 122 123 def ProcessMolecules(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters): 124 """Process and search molecules.""" 125 126 if OptionsInfo["MPMode"]: 127 return ProcessMoleculesUsingMultipleProcesses(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters) 128 else: 129 return ProcessMoleculesUsingSingleProcess(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters) 130 131 def ProcessMoleculesUsingSingleProcess(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters): 132 """Process and search molecules using a single process.""" 133 134 MiscUtil.PrintInfo("\nSearching functional groups...") 135 136 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 137 CombineMatchResults = OptionsInfo["CombineMatchResults"] 138 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 139 140 GroupsPatternsMatchCountList = [0] * len(OptionsInfo["SpecifiedFunctionalGroups"]) 141 (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3 142 143 FirstMol = True 144 for Mol in Mols: 145 MolCount += 1 146 147 if Mol is None: 148 continue 149 150 if RDKitUtil.IsMolEmpty(Mol): 151 MolName = RDKitUtil.GetMolName(Mol, MolCount) 152 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 153 continue 154 155 ValidMolCount += 1 156 if FirstMol: 157 FirstMol = False 158 if SetSMILESMolProps: 159 if Writer is not None: 160 RDKitUtil.SetWriterMolProps(Writer, Mol) 161 for GroupOutfileWriter in GroupOutfilesWriters: 162 if GroupOutfileWriter is not None: 163 RDKitUtil.SetWriterMolProps(GroupOutfileWriter, Mol) 164 165 # Match molecule against functional group patterns... 166 MolMatched, GroupsPatternMatchStatusList = MatchMolecule(Mol, GroupsPatternMols) 167 168 # Update functional group match count... 169 for GroupIndex, MatchStatus in enumerate(GroupsPatternMatchStatusList): 170 if MatchStatus: 171 GroupsPatternsMatchCountList[GroupIndex] += 1 172 173 if not MolMatched: 174 continue 175 176 RemainingMolCount += 1 177 WriteMolecule(Writer, GroupOutfilesWriters, Mol, Compute2DCoords, CombineMatchResults, GroupsPatternMatchStatusList) 178 179 return (MolCount, ValidMolCount, RemainingMolCount, GroupsPatternsMatchCountList) 180 181 def ProcessMoleculesUsingMultipleProcesses(Mols, GroupsPatternMols, Writer, GroupOutfilesWriters): 182 """Process and search molecules using multiprocessing.""" 183 184 MiscUtil.PrintInfo("\nSearching functional groups using multiprocessing...") 185 186 MPParams = OptionsInfo["MPParams"] 187 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 188 CombineMatchResults = OptionsInfo["CombineMatchResults"] 189 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 190 191 # Setup data for initializing a worker process... 192 MiscUtil.PrintInfo("Encoding options info and functional groups pattern molecules...") 193 OptionsInfo["EncodedGroupPatternMols"] = [RDKitUtil.MolToBase64EncodedMolString(PatternMol) for PatternMol in GroupsPatternMols] 194 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo), MiscUtil.ObjectToBase64EncodedString(FunctionalGroupsMap)) 195 196 # Setup a encoded mols data iterable for a worker process... 197 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols) 198 199 # Setup process pool along with data initialization for each process... 200 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 201 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 202 203 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs) 204 205 # Start processing... 206 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 207 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 208 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 209 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 210 else: 211 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 212 213 GroupsPatternsMatchCountList = [0] * len(OptionsInfo["SpecifiedFunctionalGroups"]) 214 215 (MolCount, ValidMolCount, RemainingMolCount) = [0] * 3 216 217 FirstMol = True 218 for Result in Results: 219 MolCount += 1 220 MolIndex, EncodedMol, MolMatched, GroupsPatternMatchStatusList = Result 221 222 if EncodedMol is None: 223 continue 224 ValidMolCount += 1 225 226 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 227 228 if FirstMol: 229 FirstMol = False 230 if SetSMILESMolProps: 231 if Writer is not None: 232 RDKitUtil.SetWriterMolProps(Writer, Mol) 233 for GroupOutfileWriter in GroupOutfilesWriters: 234 if GroupOutfileWriter is not None: 235 RDKitUtil.SetWriterMolProps(GroupOutfileWriter, Mol) 236 237 # Update functional group match count... 238 for GroupIndex, MatchStatus in enumerate(GroupsPatternMatchStatusList): 239 if MatchStatus: 240 GroupsPatternsMatchCountList[GroupIndex] += 1 241 242 if not MolMatched: 243 continue 244 245 RemainingMolCount += 1 246 WriteMolecule(Writer, GroupOutfilesWriters, Mol, Compute2DCoords, CombineMatchResults, GroupsPatternMatchStatusList) 247 248 return (MolCount, ValidMolCount, RemainingMolCount, GroupsPatternsMatchCountList) 249 250 def InitializeWorkerProcess(*EncodedArgs): 251 """Initialize data for a worker process.""" 252 253 global Options, OptionsInfo, FunctionalGroupsMap 254 255 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 256 257 # Decode Options and OptionInfo... 258 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 259 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 260 FunctionalGroupsMap = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[2]) 261 262 # Decode ChEMBLPatternMols... 263 OptionsInfo["GroupPatternMols"] = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) for EncodedMol in OptionsInfo["EncodedGroupPatternMols"]] 264 265 def WorkerProcess(EncodedMolInfo): 266 """Process data for a worker process.""" 267 268 MolIndex, EncodedMol = EncodedMolInfo 269 270 if EncodedMol is None: 271 return [MolIndex, None, False, None] 272 273 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 274 if RDKitUtil.IsMolEmpty(Mol): 275 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 276 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 277 return [MolIndex, None, False, None] 278 279 # Match molecule against functional group patterns... 280 MolMatched, GroupsPatternMatchStatusList = MatchMolecule(Mol, OptionsInfo["GroupPatternMols"]) 281 282 return [MolIndex, EncodedMol, MolMatched, GroupsPatternMatchStatusList] 283 284 def MatchMolecule(Mol, GroupsPatternMols): 285 """Search for functional groups in a molecule.""" 286 287 GroupsPatternMatchStatusList = [] 288 289 # Match pattern mols... 290 for GroupIndex in range(0, len(OptionsInfo["SpecifiedFunctionalGroups"])): 291 Status = DoesPatternMolMatch(GroupsPatternMols[GroupIndex], Mol, OptionsInfo["UseChirality"], OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"][GroupIndex]) 292 GroupsPatternMatchStatusList.append(Status) 293 294 # Match mol against all specified criteria... 295 MolMatched = DoesMolMeetSpecifiedMatchCriteria(GroupsPatternMatchStatusList, OptionsInfo["CombineMatchResults"], OptionsInfo["AndCombineOperatorMode"]) 296 297 return (MolMatched, GroupsPatternMatchStatusList) 298 299 def DoesMolMeetSpecifiedMatchCriteria(GroupsPatternMolsMatchStatus, CombineMatchResults, AndCombineOperatorMode): 300 """Match molecule using specified match criteia.""" 301 302 if CombineMatchResults and AndCombineOperatorMode: 303 # Must match all specified SMARTS 304 Status = True 305 for MatchStatus in GroupsPatternMolsMatchStatus: 306 if not MatchStatus: 307 Status = False 308 break 309 else: 310 # One match is enough... 311 Status = False 312 for MatchStatus in GroupsPatternMolsMatchStatus: 313 if MatchStatus: 314 Status = True 315 break 316 317 return Status 318 319 def WriteMolecule(Writer, GroupOutfilesWriters, Mol, Compute2DCoords, CombineMatchResults, GroupsPatternMatchStatusList): 320 """Write out molecule.""" 321 322 if OptionsInfo["CountMode"]: 323 return 324 325 if Compute2DCoords: 326 AllChem.Compute2DCoords(Mol) 327 328 if CombineMatchResults: 329 Writer.write(Mol) 330 else: 331 for GroupIndex in range(0, len(GroupsPatternMatchStatusList)): 332 if GroupsPatternMatchStatusList[GroupIndex]: 333 GroupOutfilesWriters[GroupIndex].write(Mol) 334 335 def SetupMoleculeWriters(): 336 """Set up molecule writers for output files.""" 337 338 Writer = None 339 GroupOutfilesWriters = [] 340 341 if OptionsInfo["CountMode"]: 342 return (Writer, GroupOutfilesWriters) 343 344 Outfile = OptionsInfo["Outfile"] 345 CombineMatchResults = OptionsInfo["CombineMatchResults"] 346 GroupsOutfiles = OptionsInfo["SpecifiedFunctionalGroupsOutfiles"] 347 348 if CombineMatchResults: 349 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"]) 350 if Writer is None: 351 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 352 MiscUtil.PrintInfo("Generating file %s..." % Outfile) 353 else: 354 for GroupOutfile in GroupsOutfiles: 355 GroupOutfileWriter = RDKitUtil.MoleculesWriter(GroupOutfile, **OptionsInfo["OutfileParams"]) 356 if GroupOutfileWriter is None: 357 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Writer) 358 GroupOutfilesWriters.append(GroupOutfileWriter) 359 360 GroupsCount = len(GroupsOutfiles) 361 if GroupsCount > 4: 362 MiscUtil.PrintInfo("Generating %d output files with the following file name format: %s<GroupName>.%s" % (GroupsCount, OptionsInfo["OutfileBasename"], OptionsInfo["OutfileExt"])) 363 else: 364 Delmiter = ', ' 365 OutfileNames = Delmiter.join(GroupsOutfiles) 366 MiscUtil.PrintInfo("Generating %d output files: %s..." % (GroupsCount, OutfileNames)) 367 368 return (Writer, GroupOutfilesWriters) 369 370 def DoesPatternMolMatch(PatternMol, Mol, UseChirality, NegateMatch): 371 """Perform a substructure match for the presence of pattern molecule in a molecule.""" 372 373 MolMatched = Mol.HasSubstructMatch(PatternMol, useChirality = UseChirality) 374 if NegateMatch: 375 if MolMatched: 376 MolMatched = False 377 else: 378 MolMatched = True 379 380 return MolMatched 381 382 def ProcessFunctionalGroupsInfo(): 383 """Process functional groups information.""" 384 385 RetrieveFunctionalGroupsInfo() 386 ProcessSpecifiedFunctionalGroups() 387 388 SetupFunctionalGroupsOutputFileNames() 389 390 def ProcessSpecifiedFunctionalGroups(): 391 """Process and validate specified functional groups.""" 392 393 OptionsInfo["SpecifiedFunctionalGroups"] = [] 394 OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"] = [] 395 396 if re.match("^All$", OptionsInfo["FunctionalGroups"], re.I): 397 OptionsInfo["SpecifiedFunctionalGroups"] = FunctionalGroupsMap['Names'] 398 OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"] = [False] * len(OptionsInfo["SpecifiedFunctionalGroups"]) 399 return 400 401 # Set up a map of valid group names for checking specified group names... 402 CanonicalGroupNameMap = {} 403 for GroupName in FunctionalGroupsMap['Names']: 404 CanonicalGroupNameMap[GroupName.lower()] = GroupName 405 406 # Parse and validate specified names... 407 GroupNames = re.sub(" ", "", OptionsInfo["FunctionalGroups"]) 408 if not GroupNames: 409 MiscUtil.PrintError("No functional group name specified for \"-f, --functionalGroups\" option") 410 411 SpecifiedFunctionalGroups = [] 412 SpecifiedNegateMatchStatus = [] 413 414 for GroupName in GroupNames.split(","): 415 CanonicalGroupName = GroupName.lower() 416 NegateMatchStatus = False 417 if re.match("^!", CanonicalGroupName, re.I): 418 NegateMatchStatus = True 419 CanonicalGroupName = re.sub("^!", "", CanonicalGroupName) 420 if CanonicalGroupName in CanonicalGroupNameMap: 421 SpecifiedFunctionalGroups.append(CanonicalGroupNameMap[CanonicalGroupName]) 422 SpecifiedNegateMatchStatus.append(NegateMatchStatus) 423 else: 424 MiscUtil.PrintWarning("The functional group name, %s, specified using \"-f, --functionalGroups\" option is not a valid name." % (GroupName)) 425 426 if not len(SpecifiedFunctionalGroups): 427 MiscUtil.PrintError("No valid functional group names specified for \"-f, --functionalGroups\" option") 428 429 OptionsInfo["SpecifiedFunctionalGroups"] = SpecifiedFunctionalGroups 430 OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"] = SpecifiedNegateMatchStatus 431 432 def SetupFunctionalGroupsSMARTSPatterns(): 433 """Setup SMARTS patterns for specified functional groups.""" 434 435 OptionsInfo["SpecifiedFunctionalGroupsSMARTSPatterns"] = [] 436 FunctionalGroupsPatternMols = [] 437 438 for Name in OptionsInfo["SpecifiedFunctionalGroups"]: 439 SMARTSPattern = FunctionalGroupsMap['SMARTSPattern'][Name] 440 PatternMol = Chem.MolFromSmarts(SMARTSPattern) 441 if PatternMol is None: 442 MiscUtil.PrintError("Failed to parse SMARTS pattern, %s, for function group, %s" % (SMARTSPattern, Name)) 443 444 OptionsInfo["SpecifiedFunctionalGroupsSMARTSPatterns"].append(SMARTSPattern) 445 FunctionalGroupsPatternMols.append(PatternMol) 446 447 return FunctionalGroupsPatternMols 448 449 def SetupFunctionalGroupsOutputFileNames(): 450 """Setup output file names for specified functional group names.""" 451 452 OptionsInfo["SpecifiedFunctionalGroupsOutfiles"] = [] 453 454 if OptionsInfo["CountMode"]: 455 # No need of any output file... 456 return 457 458 if OptionsInfo["CombineMatchResults"]: 459 # No need of output files for specified functional groups... 460 return 461 462 OutfileBasename = OptionsInfo["OutfileBasename"] 463 OutfileExt = OptionsInfo["OutfileExt"] 464 SpecifiedFunctionalGroupsOutfiles = [] 465 466 GroupsCount = len(OptionsInfo["SpecifiedFunctionalGroups"]) 467 for GroupIndex in range(0, GroupsCount): 468 GroupName = OptionsInfo["SpecifiedFunctionalGroups"][GroupIndex] 469 if OptionsInfo["SpecifiedFunctionalGroupsNegateMatch"][GroupIndex]: 470 GroupName = "Not" + GroupName 471 GroupName = re.sub("\.", "", GroupName) 472 473 GroupOutfile = "%s%s.%s" % (OutfileBasename, GroupName, OutfileExt) 474 SpecifiedFunctionalGroupsOutfiles.append(GroupOutfile) 475 476 OptionsInfo["SpecifiedFunctionalGroupsOutfiles"] = SpecifiedFunctionalGroupsOutfiles 477 478 def RetrieveFunctionalGroupsInfo(): 479 """Retrieve functional groups information.""" 480 481 MiscUtil.PrintInfo("\nRetrieving data from default RDKit functional groups hierarchy file Functional_Group_Hierarchy.txt...") 482 483 FunctionalGroupNamesFile = OptionsInfo["GroupNamesFile"] 484 FunctionalGroupsNodes = FunctionalGroups.BuildFuncGroupHierarchy(FunctionalGroupNamesFile) 485 486 FunctionalGroupsMap['Names'] = [] 487 FunctionalGroupsMap['SMARTSPattern'] = {} 488 489 RetrieveDataFromFunctionalGroupsHierarchy(FunctionalGroupsNodes) 490 491 if not len(FunctionalGroupsMap['Names']): 492 MiscUtil.PrintError("Failed to retrieve any functional group names and SMARTS patterns...") 493 494 MiscUtil.PrintInfo("Total number of functional groups present functional group hierarchy: %d" % (len(FunctionalGroupsMap['Names']))) 495 496 def RetrieveDataFromFunctionalGroupsHierarchy(FGNodes): 497 """Retrieve functional groups data by recursively visiting functional group nodes.""" 498 499 for FGNode in FGNodes: 500 Name = FGNode.label 501 SMARTSPattern = FGNode.smarts 502 503 if Name in FunctionalGroupsMap['SMARTSPattern']: 504 MiscUtil.PrintWarning("Ignoring duplicate functional group name: %s..." % Name) 505 else: 506 FunctionalGroupsMap['Names'].append(Name) 507 FunctionalGroupsMap['SMARTSPattern'][Name] = SMARTSPattern 508 509 RetrieveDataFromFunctionalGroupsHierarchy(FGNode.children) 510 511 def ListFunctionalGroupsInfo(): 512 """List functional groups information.""" 513 514 MiscUtil.PrintInfo("\nListing available functional groups names and SMARTS patterns...") 515 MiscUtil.PrintInfo("\nFunctionalGroupName\tSMARTSPattern") 516 517 for Name in sorted(FunctionalGroupsMap['Names']): 518 SMARTSPattern = FunctionalGroupsMap['SMARTSPattern'][Name] 519 MiscUtil.PrintInfo("%s\t%s" % (Name, SMARTSPattern)) 520 521 MiscUtil.PrintInfo("") 522 523 def ProcessOptions(): 524 """Process and validate command line arguments and options.""" 525 526 MiscUtil.PrintInfo("Processing options...") 527 528 # Validate options... 529 ValidateOptions() 530 531 OptionsInfo["CombineMatches"] = Options["--combineMatches"] 532 533 OptionsInfo["CombineMatchResults"] = True 534 if re.match("^No$", Options["--combineMatches"], re.I): 535 OptionsInfo["CombineMatchResults"] = False 536 if Options["--outfile"]: 537 FileDir, FileName, FileExt = MiscUtil.ParseFileName(Options["--outfile"]) 538 OptionsInfo["OutfileBasename"] = FileName 539 OptionsInfo["OutfileExt"] = FileExt 540 541 OptionsInfo["CombineOperator"] = Options["--combineOperator"] 542 OptionsInfo["AndCombineOperatorMode"] = True 543 if re.match("^or$", Options["--combineOperator"], re.I): 544 OptionsInfo["AndCombineOperatorMode"] = False 545 546 OptionsInfo["GroupNamesFile"] = None 547 if not re.match("^auto$", Options["--groupNamesFile"], re.I): 548 OptionsInfo["GroupNamesFile"] = Options["--groupNamesFile"] 549 550 OptionsInfo["FunctionalGroups"] = Options["--functionalGroups"] 551 552 OptionsInfo["Infile"] = Options["--infile"] 553 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 554 555 OptionsInfo["Outfile"] = Options["--outfile"] 556 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 557 558 OptionsInfo["Overwrite"] = Options["--overwrite"] 559 560 OptionsInfo["CountMode"] = False 561 if re.match("^count$", Options["--mode"], re.I): 562 OptionsInfo["CountMode"] = True 563 564 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 565 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 566 567 OptionsInfo["UseChirality"] = False 568 if re.match("^yes$", Options["--useChirality"], re.I): 569 OptionsInfo["UseChirality"] = True 570 571 def RetrieveOptions(): 572 """Retrieve command line arguments and options.""" 573 574 # Get options... 575 global Options 576 Options = docopt(_docoptUsage_) 577 578 # Set current working directory to the specified directory... 579 WorkingDir = Options["--workingdir"] 580 if WorkingDir: 581 os.chdir(WorkingDir) 582 583 # Handle examples option... 584 if "--examples" in Options and Options["--examples"]: 585 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 586 sys.exit(0) 587 588 # Handle listing of functional group information... 589 if Options and Options["--list"]: 590 ProcessListFunctionalGroupsOption() 591 sys.exit(0) 592 593 def ProcessListFunctionalGroupsOption(): 594 """Process list functional groups information.""" 595 596 # Validate and process dataFile option for listing functional groups information... 597 OptionsInfo["GroupNamesFile"] = None 598 if not re.match("^auto$", Options["--groupNamesFile"], re.I): 599 MiscUtil.ValidateOptionFilePath("-g, --groupNamesFile", Options["--groupNamesFile"]) 600 OptionsInfo["GroupNamesFile"] = Options["--groupNamesFile"] 601 602 RetrieveFunctionalGroupsInfo() 603 ListFunctionalGroupsInfo() 604 605 def ValidateOptions(): 606 """Validate option values.""" 607 608 MiscUtil.ValidateOptionTextValue("-c, --combineMatches", Options["--combineMatches"], "yes no") 609 MiscUtil.ValidateOptionTextValue("--combineOperator", Options["--combineOperator"], "and or") 610 611 if not re.match("^auto$", Options["--groupNamesFile"], re.I): 612 MiscUtil.ValidateOptionFilePath("-g, groupNamesFile", Options["--groupNamesFile"]) 613 614 if re.match("^none$", Options["--functionalGroups"], re.I): 615 MiscUtil.PrintError("The name(s) of functional groups must be specified using \"-f, --functionalGroups\" option") 616 617 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 618 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv") 619 if Options["--outfile"]: 620 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi") 621 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 622 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 623 624 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "retrieve count") 625 if re.match("^retrieve$", Options["--mode"], re.I): 626 if not Options["--outfile"]: 627 MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"retrieve\" value of \"-m, --mode\" option") 628 629 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 630 631 MiscUtil.ValidateOptionTextValue("--useChirality", Options["--useChirality"], "yes no") 632 633 # Setup a usage string for docopt... 634 _docoptUsage_ = """ 635 RDKitSearchFunctionalGroups.py - Search for functional groups using SMARTS patterns 636 637 Usage: 638 RDKitSearchFunctionalGroups.py [--combineMatches <yes or no>] [--combineOperator <and or or>] 639 [--groupNamesFile <FileName or auto>] [--infileParams <Name,Value,...>] 640 [--mode <retrieve or count>] [--mp <yes or no>] [--mpParams <Name,Value,...>] 641 [--negate <yes or no>] [--outfileParams <Name,Value,...>] [--overwrite] 642 [--useChirality <yes or no>] [-w <dir>] [-o <outfile>] -i <infile> -f <Name1,Name2,Name3... or All> 643 RDKitSearchFunctionalGroups.py [--groupNamesFile <FileName or auto>] -l | --list 644 RDKitSearchFunctionalGroups.py -h | --help | -e | --examples 645 646 Description: 647 Perform a substructure search in an input file using SMARTS patterns for functional 648 groups and write out the matched molecules to an output file or simply count the 649 number of matches. 650 651 The SMARTS patterns for specified functional group(s) are retrieved from file, 652 Functional_Group_Hierarchy.txt, available in RDKit data directory. 653 654 The names of valid functional groups and hierarchies are dynamically retrieved from the 655 functional groups hierarchy file and are shown below: 656 657 AcidChloride, AcidChloride.Aromatic, AcidChloride.Aliphatic 658 Alcohol, Alcohol.Aromatic, Alcohol.Aliphatic 659 Aldehyde, Aldehyde.Aromatic, Aldehyde.Aliphatic 660 Amine, Amine.Primary, Amine.Primary.Aromatic, Amine.Primary.Aliphatic, 661 Amine.Secondary, Amine.Secondary.Aromatic, Amine.Secondary.Aliphatic 662 Amine.Tertiary, Amine.Tertiary.Aromatic, Amine.Tertiary.Aliphatic 663 Amine.Aromatic, Amine.Aliphatic, Amine.Cyclic 664 Azide, Azide.Aromatic, Azide.Aliphatic 665 BoronicAcid, BoronicAcid.Aromatic, BoronicAcid.Aliphatic 666 CarboxylicAcid, CarboxylicAcid.Aromatic, CarboxylicAcid.Aliphatic, 667 CarboxylicAcid.AlphaAmino 668 Halogen, Halogen.Aromatic, Halogen.Aliphatic 669 Halogen.NotFluorine, Halogen.NotFluorine.Aliphatic, 670 Halogen.NotFluorine.Aromatic 671 Halogen.Bromine, Halogen.Bromine.Aliphatic, Halogen.Bromine.Aromatic, 672 Halogen.Bromine.BromoKetone 673 Isocyanate, Isocyanate.Aromatic, Isocyanate.Aliphatic 674 Nitro, Nitro.Aromatic, Nitro.Aliphatic, 675 SulfonylChloride, SulfonylChloride.Aromatic, SulfonylChloride.Aliphatic 676 TerminalAlkyne 677 678 The supported input file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, .tsv, .txt) 679 680 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi) 681 682 Options: 683 -c, --combineMatches <yes or no> [default: yes] 684 Combine search results for matching SMARTS patterns of specified functional groups 685 against a molecule. Possible values: yes or no. 686 687 The matched molecules are written to a single output file for "yes" value. Otherwise, 688 multiple output files are generated, one for each functional group. The names of 689 these files correspond to a combination of the basename of the specified output file 690 and the name of the functional group. 691 692 No output files are generated during "count" value of "-m, --mode" option. 693 --combineOperator <and or or> [default: and] 694 Logical operator to use for combining match results corresponding to specified 695 functional group names before writing out a single file. This option is ignored 696 during "No" value of "-c, --combineMatches" option. 697 -e, --examples 698 Print examples. 699 -g, --groupNamesFile <FileName or auto> [default: auto] 700 Specify a file name containing data for functional groups hierarchy or use functional 701 group hierarchy file, Functional_Group_Hierarchy.txt, available in RDKit data directory. 702 703 RDKit data format: Name<tab>Smarts<tab>Label<tab>RemovalReaction (optional) 704 705 The format of data in local functional group hierarchy must match format of the 706 data in functional group file available in RDKit data directory. 707 -f, --functionalGroups <Name1,Name2,Name3... or All> [default: none] 708 Functional group names for performing substructure SMARTS search. Possible values: 709 Comma delimited list of valid functional group names or All. The current set of valid 710 functional group names are listed in the description section. 711 712 The match results for multiple functional group names are combined using 'and' 713 operator before writing them out to single file. No merging of match results takes 714 place during generation of individual result files corresponding to fictional group 715 names. 716 717 The functional group name may be started with an exclamation mark to negate 718 the match result for that fictional group. 719 -h, --help 720 Print this help message. 721 -i, --infile <infile> 722 Input file name. 723 --infileParams <Name,Value,...> [default: auto] 724 A comma delimited list of parameter name and value pairs for reading 725 molecules from files. The supported parameter names for different file 726 formats, along with their default values, are shown below: 727 728 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 729 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 730 smilesTitleLine,auto,sanitize,yes 731 732 Possible values for smilesDelimiter: space, comma or tab. 733 -l, --list 734 List functional groups information without performing any search. 735 -m, --mode <retrieve or count> [default: retrieve] 736 Specify whether to retrieve and write out matched molecules to an output 737 file or simply count the number of matches. 738 --mp <yes or no> [default: no] 739 Use multiprocessing. 740 741 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 742 function employing lazy RDKit data iterable. This allows processing of 743 arbitrary large data sets without any additional requirements memory. 744 745 All input data may be optionally loaded into memory by mp.Pool.map() 746 before starting worker processes in a process pool by setting the value 747 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 748 749 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 750 data mode may adversely impact the performance. The '--mpParams' section 751 provides additional information to tune the value of 'chunkSize'. 752 --mpParams <Name,Value,...> [default: auto] 753 A comma delimited list of parameter name and value pairs to configure 754 multiprocessing. 755 756 The supported parameter names along with their default and possible 757 values are shown below: 758 759 chunkSize, auto 760 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 761 numProcesses, auto [ Default: mp.cpu_count() ] 762 763 These parameters are used by the following functions to configure and 764 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 765 mp.Pool.imap(). 766 767 The chunkSize determines chunks of input data passed to each worker 768 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 769 The default value of chunkSize is dependent on the value of 'inputDataMode'. 770 771 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 772 automatically converts RDKit data iterable into a list, loads all data into 773 memory, and calculates the default chunkSize using the following method 774 as shown in its code: 775 776 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 777 if extra: chunkSize += 1 778 779 For example, the default chunkSize will be 7 for a pool of 4 worker processes 780 and 100 data items. 781 782 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 783 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 784 data into memory. Consequently, the size of input data is not known a priori. 785 It's not possible to estimate an optimal value for the chunkSize. The default 786 chunkSize is set to 1. 787 788 The default value for the chunkSize during 'Lazy' data mode may adversely 789 impact the performance due to the overhead associated with exchanging 790 small chunks of data. It is generally a good idea to explicitly set chunkSize to 791 a larger value during 'Lazy' input data mode, based on the size of your input 792 data and number of processes in the process pool. 793 794 The mp.Pool.map() function waits for all worker processes to process all 795 the data and return the results. The mp.Pool.imap() function, however, 796 returns the the results obtained from worker processes as soon as the 797 results become available for specified chunks of data. 798 799 The order of data in the results returned by both mp.Pool.map() and 800 mp.Pool.imap() functions always corresponds to the input data. 801 -o, --outfile <outfile> 802 Output file name. 803 --outfileParams <Name,Value,...> [default: auto] 804 A comma delimited list of parameter name and value pairs for writing 805 molecules to files. The supported parameter names for different file 806 formats, along with their default values, are shown below: 807 808 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 809 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 810 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no 811 812 Default value for compute2DCoords: yes for SMILES input file; no for all other 813 file types. 814 --overwrite 815 Overwrite existing files. 816 -u, --useChirality <yes or no> [default: no] 817 Use stereochemistry information for SMARTS search. 818 -w, --workingdir <dir> 819 Location of working directory which defaults to the current directory. 820 821 Examples: 822 To list names of all available functional groups along with their SMARTS 823 patterns, type: 824 825 % RDKitSearchFunctionalGroups.py -l 826 827 To retrieve molecules containing amine functional group and write out a 828 SMILES file, type: 829 830 % RDKitSearchFunctionalGroups.py -f Amine -i Sample.smi -o SampleOut.smi 831 832 To retrieve molecules containing amine functional group, perform search in 833 multiprocessing mode on all available CPUs without loading all data into 834 memory, and write out a SMILES file, type: 835 836 % RDKitSearchFunctionalGroups.py --mp yes -f Amine -i Sample.smi 837 -o SampleOut.smi 838 839 To retrieve molecules containing amine functional group, perform search in 840 multiprocessing mode on all available CPUs by loading all data into memory, 841 and write out a SMILES file, type: 842 843 % RDKitSearchFunctionalGroups.py --mp yes --mpParams "inputDataMode, 844 InMemory" -f Amine -i Sample.smi -o SampleOut.smi 845 846 To retrieve molecules containing amine functional group, perform search in 847 multiprocessing mode on specific number of CPUs and chunksize without loading 848 all data into memory, and write out a SMILES file, type: 849 850 % RDKitSearchFunctionalGroups.py --mp yes --mpParams "inputDataMode, 851 lazy,numProcesses,4,chunkSize,8" -f Amine -i Sample.smi -o 852 SampleOut.smi 853 854 To retrieve molecules containing amine functional group but not halogens and carboxylic 855 acid functional groups and write out a SMILES file, type: 856 857 % RDKitSearchFunctionalGroups.py -f 'Amine,!Halogen,!CarboxylicAcid' 858 -i Sample.smi -o SampleOut.smi 859 860 To retrieve molecules containing amine, halogens or carboxylic acid functional groups 861 and write out a SMILES file, type: 862 863 % RDKitSearchFunctionalGroups.py -f 'Amine,Halogen,CarboxylicAcid' 864 --combineOperator or -i Sample.smi -o SampleOut.smi 865 866 To retrieve molecules containing amine and carboxylic acid functional groups defined in 867 a local functional groups hierarchy file and write out individual SD files for each 868 funcitonal group, type: 869 870 % RDKitSearchFunctionalGroups.py -f 'Amine,CarboxylicAcid' -i Sample.sdf 871 -g Custom_Functional_Group_Hierarchy.txt --combineMatches No -o SampleOut.sdf 872 873 To count number of all functional groups in molecules without writing out an output 874 files, type: 875 876 % RDKitSearchFunctionalGroups.py -m count -f All --combineMatches no -i Sample.smi 877 878 To retrieve molecule not containing aromatic alcohol and aromatic halogen functional 879 group along with the use of chirality during substructure search and write out individual 880 SMILES files for each functional group, type: 881 882 % RDKitSearchFunctionalGroups.py --combineMatches no -u yes 883 -f '!Alcohol.Aromatic,!Halogen.Aromatic' -i Sample.smi -o SampleOut.smi 884 885 To retrieve molecule containing amine functional group from a CSV SMILES file, 886 SMILES strings in column 1, name in column 2, and write out a SD file, type: 887 888 % RDKitSearchFunctionalGroups.py -f Amine --infileParams 889 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 890 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 891 -i SampleSMILES.csv -o SampleOut.sdf 892 893 Author: 894 Manish Sud(msud@san.rr.com) 895 896 See also: 897 RDKitConvertFileFormat.py, RDKitFilterPAINS.py, RDKitSearchSMARTS.py 898 899 Copyright: 900 Copyright (C) 2024 Manish Sud. All rights reserved. 901 902 The functionality available in this script is implemented using RDKit, an 903 open source toolkit for cheminformatics developed by Greg Landrum. 904 905 This file is part of MayaChemTools. 906 907 MayaChemTools is free software; you can redistribute it and/or modify it under 908 the terms of the GNU Lesser General Public License as published by the Free 909 Software Foundation; either version 3 of the License, or (at your option) any 910 later version. 911 912 """ 913 914 if __name__ == "__main__": 915 main()