1 #!/bin/env python 2 # 3 # File: RDKitPerformPositionalAnalogueScan.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 import multiprocessing as mp 37 from itertools import combinations 38 from itertools import permutations 39 40 # RDKit imports... 41 try: 42 from rdkit import rdBase 43 from rdkit import Chem 44 from rdkit.Chem import AllChem 45 except ImportError as ErrMsg: 46 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 47 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 48 sys.exit(1) 49 50 # MayaChemTools imports... 51 try: 52 from docopt import docopt 53 import MiscUtil 54 import RDKitUtil 55 except ImportError as ErrMsg: 56 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 57 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 58 sys.exit(1) 59 60 ScriptName = os.path.basename(sys.argv[0]) 61 Options = {} 62 OptionsInfo = {} 63 64 def main(): 65 """Start execution of the script.""" 66 67 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 68 69 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 70 71 # Retrieve command line arguments and options... 72 RetrieveOptions() 73 74 # Process and validate command line arguments and options... 75 ProcessOptions() 76 77 # Perform actions required by the script... 78 PerformPositionalAnalogueScan() 79 80 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 81 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 82 83 def PerformPositionalAnalogueScan(): 84 """Perform positional analogue scan.""" 85 86 # Setup a molecule reader for input file... 87 MiscUtil.PrintInfo("\nProcessing file %s..." % OptionsInfo["Infile"]) 88 OptionsInfo["InfileParams"]["AllowEmptyMols"] = True 89 Mols = RDKitUtil.ReadMolecules(OptionsInfo["Infile"], **OptionsInfo["InfileParams"]) 90 91 # Set up a molecule writer... 92 Writer = RDKitUtil.MoleculesWriter(OptionsInfo["Outfile"], **OptionsInfo["OutfileParams"]) 93 if Writer is None: 94 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % OptionsInfo["Outfile"]) 95 MiscUtil.PrintInfo("Generating file %s..." % OptionsInfo["Outfile"]) 96 97 MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount = ProcessMolecules(Mols, Writer) 98 99 if Writer is not None: 100 Writer.close() 101 102 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 103 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 104 if not OptionsInfo["RxnSMARTSMode"]: 105 MiscUtil.PrintInfo("Number of molecules with missing search pattern: %d" % SearchPatternMissingCount) 106 MiscUtil.PrintInfo("Number of molecules failed during generation of analogues: %d" % AnaloguesGenerationFailedCount) 107 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount + SearchPatternMissingCount + AnaloguesGenerationFailedCount)) 108 109 MiscUtil.PrintInfo("\nTotal number of analogues for valid molecules: %d" % AnaloguesCount) 110 MiscUtil.PrintInfo("Average number of analogues per valid molecule: %d" % (int(AnaloguesCount/(ValidMolCount)))) 111 112 def ProcessMolecules(Mols, Writer): 113 """Process molecules to generate analogues.""" 114 115 if OptionsInfo["MPMode"]: 116 return ProcessMoleculesUsingMultipleProcesses( Mols, Writer) 117 else: 118 return ProcessMoleculesUsingSingleProcess(Mols, Writer) 119 120 def ProcessMoleculesUsingSingleProcess(Mols, Writer): 121 """Process molecules to generate analogues using a single process.""" 122 123 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 124 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 125 126 (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount) = [0] * 5 127 128 # Initialize search and target patterns info... 129 SetupSearchAndTargetPatternsInfo() 130 131 FirstMol = True 132 for Mol in Mols: 133 MolCount += 1 134 135 if Mol is None: 136 continue 137 138 if RDKitUtil.IsMolEmpty(Mol): 139 if not OptionsInfo["QuietMode"]: 140 MolName = RDKitUtil.GetMolName(Mol, MolCount) 141 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 142 continue 143 ValidMolCount += 1 144 145 MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus = GenerateMolAnalogues(Mol, MolCount) 146 147 if FirstMol: 148 FirstMol = False 149 if SetSMILESMolProps: 150 if Writer is not None: 151 RDKitUtil.SetWriterMolProps(Writer, Mol) 152 153 if not OptionsInfo["RxnSMARTSMode"]: 154 if SearchPatternMissingStatus: 155 SearchPatternMissingCount += 1 156 WriteMolecule(Writer, Mol, Compute2DCoords) 157 continue 158 159 if not AnaloguesGenerationStatus: 160 AnaloguesGenerationFailedCount += 1 161 WriteMolecule(Writer, Mol, Compute2DCoords) 162 continue 163 164 # Write out molecues and analogues... 165 WriteMolAnalogues(Writer, Mol, MolCount, MolAnalogues, Compute2DCoords) 166 167 AnaloguesCount += len(MolAnalogues) 168 169 return (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount) 170 171 def ProcessMoleculesUsingMultipleProcesses(Mols, Writer): 172 """Process molecules to generate analogues using multiprocessing.""" 173 174 MPParams = OptionsInfo["MPParams"] 175 176 # Setup data for initializing a worker process... 177 MiscUtil.PrintInfo("Encoding options info...") 178 179 InitializeWorkerProcessArgs = (MiscUtil.ObjectToBase64EncodedString(Options), MiscUtil.ObjectToBase64EncodedString(OptionsInfo)) 180 181 # Setup a encoded mols data iterable for a worker process... 182 WorkerProcessDataIterable = RDKitUtil.GenerateBase64EncodedMolStrings(Mols) 183 184 # Setup process pool along with data initialization for each process... 185 MiscUtil.PrintInfo("\nConfiguring multiprocessing using %s method..." % ("mp.Pool.imap()" if re.match("^Lazy$", MPParams["InputDataMode"], re.I) else "mp.Pool.map()")) 186 MiscUtil.PrintInfo("NumProcesses: %s; InputDataMode: %s; ChunkSize: %s\n" % (MPParams["NumProcesses"], MPParams["InputDataMode"], ("automatic" if MPParams["ChunkSize"] is None else MPParams["ChunkSize"]))) 187 188 ProcessPool = mp.Pool(MPParams["NumProcesses"], InitializeWorkerProcess, InitializeWorkerProcessArgs) 189 190 # Start processing... 191 if re.match("^Lazy$", MPParams["InputDataMode"], re.I): 192 Results = ProcessPool.imap(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 193 elif re.match("^InMemory$", MPParams["InputDataMode"], re.I): 194 Results = ProcessPool.map(WorkerProcess, WorkerProcessDataIterable, MPParams["ChunkSize"]) 195 else: 196 MiscUtil.PrintError("The value, %s, specified for \"--inputDataMode\" is not supported." % (MPParams["InputDataMode"])) 197 198 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 199 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 200 201 (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount) = [0] * 5 202 FirstMol = True 203 for Result in Results: 204 MolCount += 1 205 MolIndex, EncodedMol, EncodedMolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus = Result 206 207 if EncodedMol is None: 208 continue 209 ValidMolCount += 1 210 211 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 212 213 if FirstMol: 214 FirstMol = False 215 if SetSMILESMolProps: 216 if Writer is not None: 217 RDKitUtil.SetWriterMolProps(Writer, Mol) 218 219 if not OptionsInfo["RxnSMARTSMode"]: 220 if SearchPatternMissingStatus: 221 SearchPatternMissingCount += 1 222 WriteMolecule(Writer, Mol, Compute2DCoords) 223 continue 224 225 if not AnaloguesGenerationStatus: 226 AnaloguesGenerationFailedCount += 1 227 WriteMolecule(Writer, Mol, Compute2DCoords) 228 continue 229 230 MolAnalogues = [RDKitUtil.MolFromBase64EncodedMolString(EncodedMolAnalogue) for EncodedMolAnalogue in EncodedMolAnalogues] 231 232 # Write out molecues and analogues... 233 WriteMolAnalogues(Writer, Mol, MolCount, MolAnalogues, Compute2DCoords) 234 235 AnaloguesCount += len(MolAnalogues) 236 237 return (MolCount, ValidMolCount, SearchPatternMissingCount, AnaloguesGenerationFailedCount, AnaloguesCount) 238 239 def InitializeWorkerProcess(*EncodedArgs): 240 """Initialize data for a worker process.""" 241 242 global Options, OptionsInfo 243 244 MiscUtil.PrintInfo("Starting process (PID: %s)..." % os.getpid()) 245 246 # Decode Options and OptionInfo... 247 Options = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[0]) 248 OptionsInfo = MiscUtil.ObjectFromBase64EncodedString(EncodedArgs[1]) 249 250 # Initialize search and target patterns info... 251 SetupSearchAndTargetPatternsInfo() 252 253 def WorkerProcess(EncodedMolInfo): 254 """Process data for a worker process.""" 255 256 MolIndex, EncodedMol = EncodedMolInfo 257 258 MolAnalogues = None 259 SearchPatternMissingStatus = False 260 AnaloguesGenerationStatus = True 261 262 if EncodedMol is None: 263 return [MolIndex, None, MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus] 264 265 Mol = RDKitUtil.MolFromBase64EncodedMolString(EncodedMol) 266 if RDKitUtil.IsMolEmpty(Mol): 267 if not OptionsInfo["QuietMode"]: 268 MolName = RDKitUtil.GetMolName(Mol, (MolIndex + 1)) 269 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 270 return [MolIndex, None, MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus] 271 272 MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus = GenerateMolAnalogues(Mol, (MolIndex +1)) 273 274 EncodedMolAnalogues = None 275 if MolAnalogues is not None: 276 EncodedMolAnalogues = [RDKitUtil.MolToBase64EncodedMolString(MolAnalogue, PropertyPickleFlags = Chem.PropertyPickleOptions.MolProps | Chem.PropertyPickleOptions.PrivateProps) for MolAnalogue in MolAnalogues] 277 278 return [MolIndex, EncodedMol, EncodedMolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus] 279 280 def GenerateMolAnalogues(Mol, MolNum = None): 281 """Generate analogues.""" 282 283 if OptionsInfo["AttachAtomsMode"] or OptionsInfo["ReplaceAtomsMode"] or OptionsInfo["AttachSMILESMode"]: 284 return GenerateMolAnaloguesForMultipleScanModes(Mol, MolNum) 285 elif OptionsInfo["RxnSMARTSMode"]: 286 return GenerateMolAnaloguesForRxnSMARTSMode(Mol, MolNum) 287 elif OptionsInfo["TandemScanMode"]: 288 return GenerateMolAnaloguesForTandemScanMode(Mol, MolNum) 289 else: 290 MiscUtil.PrintError("Failed to generate analogues. The mode value, %s, specified using \"-m, --mode\" is not supported." % (OptionsInfo["Mode"])) 291 292 def GenerateMolAnaloguesForMultipleScanModes(Mol, MolNum = None): 293 """Generate analogues for the following scan modes: AttachAtom, ReplaceAtom 294 or AttachSMILES.""" 295 296 MolAnalogues = None 297 SearchPatternMissingStatus = False 298 AnaloguesGenerationStatus = True 299 300 # Retrieve matched atoms... 301 MolMatchedAtomIndices = SetupMolMatchedAtomIndices(Mol, MolNum) 302 if MolMatchedAtomIndices is None: 303 SearchPatternMissingStatus = True 304 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus) 305 306 MolAnalogues = [] 307 MolAnaloguesSet = set() 308 309 TargetPatternInfo = OptionsInfo["TargetPatternInfo"] 310 for TargetPatternIndex, TargetPatternObject in enumerate(TargetPatternInfo["PatternObjects"]): 311 for AtomIndicesCombination in combinations(MolMatchedAtomIndices, OptionsInfo["ComboSearchAtoms"]): 312 MolAnalogue = Chem.RWMol(Mol) 313 314 for AtomIndex in AtomIndicesCombination: 315 if OptionsInfo["AttachAtomsMode"]: 316 AtomicNumber = TargetPatternObject 317 MolAnalogue = AttachAtom(MolAnalogue, AtomIndex, AtomicNumber, BondOrder = Chem.rdchem.BondType.SINGLE) 318 elif OptionsInfo["ReplaceAtomsMode"]: 319 AtomicNumber = TargetPatternObject 320 MolAnalogue = ReplaceAtom(MolAnalogue, AtomIndex, AtomicNumber) 321 elif OptionsInfo["AttachSMILESMode"]: 322 AttachAtomIndex = MolAnalogue.GetNumAtoms() - 1 + TargetPatternInfo["AttachSMILESAtomIndexOffsets"][TargetPatternIndex] 323 SMILESMol = TargetPatternObject 324 MolAnalogue = AttachMol(MolAnalogue, AtomIndex, SMILESMol, AttachAtomIndex, BondOrder = Chem.rdchem.BondType.SINGLE) 325 326 if SanitizeMolAnalogue(MolAnalogue, MolNum): 327 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues) 328 329 if len(MolAnalogues) == 0: 330 AnaloguesGenerationStatus = False 331 332 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus) 333 334 def GenerateMolAnaloguesForRxnSMARTSMode(Mol, MolNum = None): 335 """Generate analogues for reaction SMARTS mode.""" 336 337 SearchPatternMissingStatus = False 338 AnaloguesGenerationStatus = True 339 340 MolAnalogues = [] 341 MolAnaloguesSet = set() 342 343 TargetPatternInfo = OptionsInfo["TargetPatternInfo"] 344 for Rxn in TargetPatternInfo["PatternObjects"]: 345 RxnProducts = Rxn.RunReactants((Mol,)) 346 347 if not len(RxnProducts): 348 continue 349 350 for RxnProduct in RxnProducts: 351 MolAnalogue = RxnProduct[0] 352 353 if SanitizeMolAnalogue(MolAnalogue, MolNum): 354 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues) 355 356 if len(MolAnalogues) == 0: 357 AnaloguesGenerationStatus = False 358 359 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus) 360 361 def GenerateMolAnaloguesForTandemScanMode(Mol, MolNum = None): 362 """Generate analogues for tandem scan mode.""" 363 364 MolAnalogues = None 365 SearchPatternMissingStatus = False 366 AnaloguesGenerationStatus = True 367 368 # Retrieve matched atoms... 369 MolMatchedAtomIndices = SetupMolMatchedAtomIndices(Mol, MolNum) 370 if MolMatchedAtomIndices is None: 371 SearchPatternMissingStatus = True 372 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus) 373 374 TargetPatternTandemScanInfo = OptionsInfo["TargetPatternTandemScanInfo"] 375 NumOfTandemOperations = 0 376 if TargetPatternTandemScanInfo is not None and len(TargetPatternTandemScanInfo): 377 NumOfTandemOperations = len(TargetPatternTandemScanInfo[0]["Names"]) 378 379 if len(MolMatchedAtomIndices) < NumOfTandemOperations: 380 if not OptionsInfo["QuietMode"]: 381 MolName = RDKitUtil.GetMolName(Mol, MolNum) 382 MiscUtil.PrintWarning("Number of matched search pattern atoms, %s, must be >= number of specified operations, %s, during TandemScan mode for molecule %s..." % (len(MolMatchedAtomIndices), NumOfTandemOperations, MolName)) 383 SearchPatternMissingStatus = True 384 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus) 385 386 MolAnalogues = [] 387 MolAnaloguesSet = set() 388 for AtomIndicesCombination in combinations(MolMatchedAtomIndices, NumOfTandemOperations): 389 AtomIndicesPermutations = [] 390 if OptionsInfo["PermuteTandemScan"]: 391 for AtomIndicesPermutation in permutations(AtomIndicesCombination, NumOfTandemOperations): 392 AtomIndicesPermutations.append(list(AtomIndicesPermutation)) 393 else: 394 AtomIndicesPermutations.append(list(AtomIndicesCombination)) 395 396 for AtomIndicesPermutation in AtomIndicesPermutations: 397 for PatternInfo in TargetPatternTandemScanInfo: 398 MolAnalogue = Chem.RWMol(Mol) 399 400 for NameIndex, Name in enumerate(PatternInfo["Names"]): 401 AtomIndex = AtomIndicesPermutation[NameIndex] 402 PatternObject = PatternInfo["PatternObjects"][NameIndex] 403 404 if re.match("^AttachAtoms$", Name, re.I): 405 MolAnalogue = AttachAtom(MolAnalogue, AtomIndex, PatternObject, BondOrder = Chem.rdchem.BondType.SINGLE) 406 elif re.match("^ReplaceAtoms$", Name, re.I): 407 MolAnalogue = ReplaceAtom(MolAnalogue, AtomIndex, PatternObject) 408 elif re.match("^AttachSMILES$", Name, re.I): 409 AttachAtomIndex = MolAnalogue.GetNumAtoms() - 1 + PatternInfo["AttachSMILESAtomIndexOffsets"][NameIndex] 410 MolAnalogue = AttachMol(MolAnalogue, AtomIndex, PatternObject, AttachAtomIndex, BondOrder = Chem.rdchem.BondType.SINGLE) 411 MolAnalogue = Chem.RWMol(MolAnalogue) 412 413 if not SanitizeMolAnalogue(MolAnalogue, MolNum): 414 continue 415 416 TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues) 417 418 if len(MolAnalogues) == 0: 419 AnaloguesGenerationStatus = False 420 421 return (MolAnalogues, SearchPatternMissingStatus, AnaloguesGenerationStatus) 422 423 def TrackAnalogues(MolAnalogue, MolAnaloguesSet, MolAnalogues): 424 """Track analogues.""" 425 426 if OptionsInfo["UniqueAnalogues"]: 427 AnalogueSMILES = Chem.MolToSmiles(Chem.RemoveHs(MolAnalogue)) 428 if AnalogueSMILES not in MolAnaloguesSet: 429 MolAnaloguesSet.add(AnalogueSMILES) 430 MolAnalogues.append(MolAnalogue) 431 else: 432 MolAnalogues.append(MolAnalogue) 433 434 def ReplaceAtom(Mol, AtomIndex, AtomicNumber): 435 """Replace an existing atom with a new atom in a molecule.""" 436 437 Atom = Mol.GetAtomWithIdx(AtomIndex) 438 Atom.SetAtomicNum(AtomicNumber) 439 440 return Mol 441 442 def AttachAtom(Mol, AtomIndex, AtomicNumber, BondOrder = Chem.rdchem.BondType.SINGLE): 443 """Attach atom to an existing atom in a molecule.""" 444 445 NewAtomIndex = Mol.AddAtom(Chem.Atom(AtomicNumber)) 446 Mol.AddBond(AtomIndex, NewAtomIndex, BondOrder) 447 448 return Mol 449 450 def AttachMol(Mol, AtomIndex, AttachMol, AttachAtomIndex, BondOrder = Chem.rdchem.BondType.SINGLE): 451 """Attach molecule to an exisiting molecule.""" 452 453 NewMol = Chem.CombineMols(Mol, AttachMol) 454 455 NewMol = Chem.EditableMol(NewMol) 456 NewMol.AddBond(AtomIndex, AttachAtomIndex, order = BondOrder) 457 NewMol = NewMol.GetMol() 458 459 return NewMol 460 461 def SanitizeMolAnalogue(Mol, MolNum): 462 """Sanitize analogue and catch any errors.""" 463 464 Status = True 465 try: 466 Chem.SanitizeMol(Mol) 467 except Exception as ErrMsg: 468 if not OptionsInfo["QuietMode"]: 469 MolName = RDKitUtil.GetMolName(Mol, MolNum) 470 MiscUtil.PrintInfo("\n%s\nFailed to sanitize analogue of molecule %s...\nIgnoring analogue..." % (ErrMsg, MolName)) 471 Status = False 472 473 return Status 474 475 def SetupMolMatchedAtomIndices(Mol, MolNum): 476 """Setup a list of matched atom indices corresponding to first atom in search 477 SMARTS pattern for a molecule.""" 478 479 MatchedAtoms = [] 480 481 # Retrieve first atom match for PAS... 482 MatchedAtomIndices = [AtomIndices[0] for AtomIndices in Mol.GetSubstructMatches(OptionsInfo["SearchPatternMol"])] 483 if len(MatchedAtomIndices) == 0: 484 if not OptionsInfo["QuietMode"]: 485 MolName = RDKitUtil.GetMolName(Mol, MolNum) 486 MiscUtil.PrintWarning("No search pattern match for molecule %s..." % MolName) 487 return None 488 489 return MatchedAtomIndices 490 491 def WriteMolAnalogues(Writer, Mol, MolNum, MolAnalogues, Compute2DCoords): 492 """Write molecule analogues.""" 493 494 # Write out molecule... 495 WriteMolecule(Writer, Mol, Compute2DCoords) 496 497 if MolAnalogues is None: 498 return 499 500 # Write out analogues... 501 for Index, MolAnalogue in enumerate(MolAnalogues): 502 MolName = RDKitUtil.GetMolName(Mol, MolNum) 503 SetMolAnalogueName(MolAnalogue, MolName, (Index + 1)) 504 505 WriteMolecule(Writer, MolAnalogue, Compute2DCoords) 506 507 def SetMolAnalogueName(MolAnalogue, MolName, AnalogueCount): 508 """Set analogue name.""" 509 510 NameSuffix = OptionsInfo["NameSuffix"] 511 AnalogueName = "%s_%s%d" % (MolName, NameSuffix, AnalogueCount) 512 MolAnalogue.SetProp("_Name", AnalogueName) 513 514 def WriteMolecule(Writer, Mol, Compute2DCoords): 515 """Write out molecule.""" 516 517 if Compute2DCoords: 518 AllChem.Compute2DCoords(Mol) 519 520 Writer.write(Mol) 521 522 def SetupSearchAndTargetPatternsInfo(): 523 """Setup search and target patterns info.""" 524 525 # Setup search pattern molecule... 526 SearchPattern = OptionsInfo["SearchPattern"] 527 SearchPatternMol = None 528 if SearchPattern is not None: 529 SearchPatternMol = Chem.MolFromSmarts(SearchPattern) 530 if SearchPatternMol is None: 531 MiscUtil.PrintError("Failed to create search pattern molecule. The torsion SMILES/SMARTS pattern, \"%s\", specified using \"-s, --searchPattern\" option is not valid." % (SearchPattern)) 532 OptionsInfo["SearchPatternMol"] = SearchPatternMol 533 534 # Setup target pattern information... 535 Mode = OptionsInfo["Mode"] 536 TargetPattern = OptionsInfo["TargetPattern"] 537 538 TargetPatternInfo = OptionsInfo["TargetPatternInfo"] 539 if TargetPatternInfo is not None: 540 TargetPatternInfo["PatternObjects"] = [] 541 TargetPatternInfo["AttachSMILESAtomIndexOffsets"] = [] 542 543 if re.match("^(AttachAtoms|ReplaceAtoms)$", Mode, re.I): 544 for Pattern in TargetPatternInfo["Patterns"]: 545 TargetPatternInfo["PatternObjects"].append(Chem.GetPeriodicTable().GetAtomicNumber(Pattern)) 546 TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(None) 547 elif re.match("^AttachSMILES$", Mode, re.I): 548 for Pattern in TargetPatternInfo["Patterns"]: 549 SMILESMol = Chem.MolFromSmiles(Pattern) 550 AttachAtomIndexOffset = GetAttachSMILESAtomIndexOffset(SMILESMol) 551 552 TargetPatternInfo["PatternObjects"].append(SMILESMol) 553 TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(AttachAtomIndexOffset) 554 elif re.match("^RxnSMARTS$", Mode, re.I): 555 for Pattern in TargetPatternInfo["Patterns"]: 556 TargetPatternInfo["PatternObjects"].append(AllChem.ReactionFromSmarts(Pattern)) 557 TargetPatternInfo["AttachSMILESAtomIndexOffsets"].append(None) 558 elif re.match("^TandemScan$", Mode, re.I): 559 TargetPatternTandemScanInfo = OptionsInfo["TargetPatternTandemScanInfo"] 560 for PatternInfo in TargetPatternTandemScanInfo: 561 PatternInfo["PatternObjects"] = [] 562 PatternInfo["AttachSMILESAtomIndexOffsets"] = [] 563 for NameIndex, Name in enumerate(PatternInfo["Names"]): 564 Pattern = PatternInfo["Pattern"][NameIndex] 565 if re.match("^(AttachAtoms|ReplaceAtoms)$", Name, re.I): 566 PatternInfo["PatternObjects"].append(Chem.GetPeriodicTable().GetAtomicNumber(Pattern)) 567 PatternInfo["AttachSMILESAtomIndexOffsets"].append(None) 568 elif re.match("^AttachSMILES$", Name, re.I): 569 SMILESMol = Chem.MolFromSmiles(Pattern) 570 AttachAtomIndexOffset = GetAttachSMILESAtomIndexOffset(SMILESMol) 571 572 PatternInfo["PatternObjects"].append(SMILESMol) 573 PatternInfo["AttachSMILESAtomIndexOffsets"].append(AttachAtomIndexOffset) 574 575 def GetAttachSMILESAtomIndexOffset(Mol): 576 """Get attach atom index offset and optionally clear atom map numbers.""" 577 578 AttachAtomIndexOffset = 1 579 580 AtomMapIndices = RDKitUtil.GetAtomMapIndices(Mol) 581 if AtomMapIndices is not None and len(AtomMapIndices): 582 # Use the atom index corresponding to the lowest atom map number 583 # as the attachement atom. The AtomMapIndices list contains atom indices 584 # sorted by atom map numbers. 585 586 AttachAtomIndexOffset = AtomMapIndices[0] + 1 587 588 if OptionsInfo["ClearAtomMapNumbers"]: 589 RDKitUtil.ClearAtomMapNumbers(Mol) 590 591 return AttachAtomIndexOffset 592 593 def ProcessOptions(): 594 """Process and validate command line arguments and options.""" 595 596 MiscUtil.PrintInfo("Processing options...") 597 598 # Validate options... 599 ValidateOptions() 600 601 OptionsInfo["ComboSearchAtoms"] = int(Options["--comboSearchAtoms"]) 602 OptionsInfo["ClearAtomMapNumbers"] = True if re.match("^yes$", Options["--clearAtomMapNumbers"], re.I) else False 603 604 OptionsInfo["Infile"] = Options["--infile"] 605 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 606 607 OptionsInfo["Outfile"] = Options["--outfile"] 608 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 609 OptionsInfo["Overwrite"] = Options["--overwrite"] 610 611 OptionsInfo["Mode"] = Options["--mode"] 612 (AttachAtomsMode, ReplaceAtomsMode, AttachSMILESMode, RxnSMARTSMode, TandemScanMode) = [False] * 5 613 Mode = OptionsInfo["Mode"] 614 if re.match("^AttachAtoms$", Mode, re.I): 615 AttachAtomsMode = True 616 elif re.match("^ReplaceAtoms$", Mode, re.I): 617 ReplaceAtomsMode = True 618 elif re.match("^AttachSMILES$", Mode, re.I): 619 AttachSMILESMode = True 620 elif re.match("^RxnSMARTS$", Mode, re.I): 621 RxnSMARTSMode = True 622 elif re.match("^TandemScan$", Mode, re.I): 623 TandemScanMode = True 624 else: 625 MiscUtil.PrintError("The mode value , %s, specified using \"-m, --mode\" option is not valid" % (Mode)) 626 OptionsInfo["AttachAtomsMode"] = AttachAtomsMode 627 OptionsInfo["ReplaceAtomsMode"] = ReplaceAtomsMode 628 OptionsInfo["AttachSMILESMode"] = AttachSMILESMode 629 OptionsInfo["RxnSMARTSMode"] = RxnSMARTSMode 630 OptionsInfo["TandemScanMode"] = TandemScanMode 631 632 OptionsInfo["MPMode"] = True if re.match("^yes$", Options["--mp"], re.I) else False 633 OptionsInfo["MPParams"] = MiscUtil.ProcessOptionMultiprocessingParameters("--mpParams", Options["--mpParams"]) 634 635 OptionsInfo["NameSuffix"] = Options["--nameSuffix"] 636 OptionsInfo["PermuteTandemScan"] = True if re.match("^yes$", Options["--permuteTandemScan"], re.I) else False 637 OptionsInfo["QuietMode"] = True if re.match("^yes$", Options["--quiet"], re.I) else False 638 639 OptionsInfo["UniqueAnalogues"] = True if re.match("^yes$", Options["--uniqueAnalogues"], re.I) else False 640 641 # Process and validate search pattern... 642 SearchPattern = re.sub(" ", "", Options["--searchPattern"]) 643 if MiscUtil.IsEmpty(SearchPattern): 644 MiscUtil.PrintError("No value specified using \"-s, --searchPattern\" option.") 645 if re.match("^auto$", SearchPattern, re.I): 646 if re.match("^(AttachAtoms|ReplaceAtoms|AttachSMILES|TandemScan)$", Mode, re.I): 647 SearchPattern = "[cH]" 648 else: 649 SearchPattern = None 650 else: 651 if re.match("^RxnSMARTS$", Mode, re.I): 652 MiscUtil.PrintError("The specification of SMARTS search pattern, %s, using \"-s, --searchPattern\" option is not allowed during, %s, value of \"-m, --mode\"." % (SearchPattern, Mode)) 653 654 if SearchPattern is not None: 655 SearchPatternMol = Chem.MolFromSmarts(SearchPattern) 656 if SearchPatternMol is None: 657 MiscUtil.PrintError("Failed to create search pattern molecule. The torsion SMILES/SMARTS pattern, \"%s\", specified using \"-s, --searchPattern\" option is not valid." % (SearchPattern)) 658 OptionsInfo["SearchPattern"] = SearchPattern 659 660 # Process and validate target pattern... 661 TargetPattern = Options["--targetPattern"] 662 if MiscUtil.IsEmpty(TargetPattern): 663 MiscUtil.PrintError("No value specified using \"-t, --targetPattern\" option.") 664 TargetPattern = TargetPattern.strip() 665 666 if re.match("^auto$", TargetPattern, re.I): 667 # Setup default target pattern... 668 if re.match("^AttachAtoms$", Mode, re.I): 669 TargetPattern = "F" 670 elif re.match("^ReplaceAtoms$", Mode, re.I): 671 TargetPattern = "N" 672 elif re.match("^AttachSMILES$", Mode, re.I): 673 TargetPattern = "C(F)(F)(F)" 674 elif re.match("^RxnSMARTS$", Mode, re.I): 675 TargetPattern = "[cH:1]>>[N:1]" 676 elif re.match("^TandemScan$", Mode, re.I): 677 TargetPattern = "ReplaceAtoms,N,AttachAtoms,F" 678 else: 679 MiscUtil.PrintError("Failed to setup default target pattern for mode, %s, specified using \"-m, --mode\"" % (Mode)) 680 681 TargetPatternInfo = None 682 TargetPatternTandemScanInfo = None 683 684 if re.match("^(AttachAtoms|ReplaceAtoms)$", Mode, re.I): 685 TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern) 686 for ElementSymbol in TargetPatternInfo["Patterns"]: 687 if not RDKitUtil.IsValidElementSymbol(ElementSymbol): 688 MiscUtil.PrintError("The target pattern, \"%s\", specified using \"-t, --targetPattern\" must be a valid atomic symbol during, %s, value of \"-m, --mode\" option." % (ElementSymbol, Mode)) 689 elif re.match("^AttachSMILES$", Mode, re.I): 690 TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern) 691 for SMILES in TargetPatternInfo["Patterns"]: 692 ValidateSMILESTargetPattern(SMILES, Mode) 693 elif re.match("^RxnSMARTS$", Mode, re.I): 694 TargetPatternInfo = InitializeTargetPatternInfo(TargetPattern) 695 for RxnPattern in TargetPatternInfo["Patterns"]: 696 try: 697 Rxn = AllChem.ReactionFromSmarts(RxnPattern) 698 except Exception as ErrMsg: 699 MiscUtil.PrintError("The target pattern, \"%s\", specified using \"-t, --targetPattern\" must be a valid RxnSMARTS during, %s, value of \"-m, --mode\" option.\nErrMsg: %s" % (RxnPattern, Mode, ErrMsg)) 700 elif re.match("^TandemScan$", Mode, re.I): 701 TargetPatternTandemScanInfo = ProcessTandemScanTargetPattern(TargetPattern) 702 703 # Track target pattern info... 704 OptionsInfo["TargetPattern"] = TargetPattern 705 OptionsInfo["TargetPatternInfo"] = TargetPatternInfo 706 OptionsInfo["TargetPatternTandemScanInfo"] = TargetPatternTandemScanInfo 707 708 def InitializeTargetPatternInfo(TargetPattern): 709 """Initialize target pattern information.""" 710 711 TargetPatternInfo = {} 712 TargetPatternInfo["Patterns"] = TargetPattern.split() 713 714 return TargetPatternInfo 715 716 def ProcessTandemScanTargetPattern(TargetPattern): 717 """Process target pattern for tandem scan.""" 718 719 # Validate target pattern... 720 ValidatedTargetPatternInfo = ValidateTandemScanTargetPattern(TargetPattern) 721 722 # Expand target patterns to cover various combinations... 723 # 724 # For example: "ReplaceAtoms,N,Attach Atoms, F Cl" expands to 725 # "ReplaceAtoms,N,AttachAtoms,F" and "ReplaceAtoms,N,AttachAtoms,Cl" 726 # 727 ExpandedPatterns = None 728 for NameIndex, Name in enumerate(ValidatedTargetPatternInfo["Names"]): 729 Patterns = ValidatedTargetPatternInfo["Patterns"][NameIndex] 730 731 if ExpandedPatterns is None: 732 ExpandedPatterns = [] 733 for Pattern in Patterns: 734 ExpandedPatterns.append([Name, Pattern]) 735 continue 736 737 NewExpandedPatterns = [] 738 for ExpandedPattern in ExpandedPatterns: 739 for Pattern in Patterns: 740 NewExpandedPattern = [] 741 NewExpandedPattern.extend(ExpandedPattern) 742 743 NewExpandedPattern.append(Name) 744 NewExpandedPattern.append(Pattern) 745 746 NewExpandedPatterns.append(NewExpandedPattern) 747 748 ExpandedPatterns = NewExpandedPatterns 749 750 # Track target pattern info... 751 TargetPatternTandemScanInfo = [] 752 for ExpandedPattern in ExpandedPatterns: 753 PatternInfo = {} 754 PatternInfo["Names"] = [] 755 PatternInfo["Pattern"] = [] 756 757 for Index in range(0, len(ExpandedPattern), 2): 758 Name = ExpandedPattern[Index].strip() 759 Pattern = ExpandedPattern[Index + 1].strip() 760 PatternInfo["Names"].append(Name) 761 PatternInfo["Pattern"].append(Pattern) 762 763 TargetPatternTandemScanInfo.append(PatternInfo) 764 765 return TargetPatternTandemScanInfo 766 767 def ValidateTandemScanTargetPattern(TargetPattern): 768 """Validate target pattern for tandem scan.""" 769 770 ValidatedTargetPatternInfo = {} 771 ValidatedTargetPatternInfo["Names"] = [] 772 ValidatedTargetPatternInfo["Patterns"] = [] 773 774 Mode = OptionsInfo["Mode"] 775 776 TargetPatternWords = TargetPattern.split(",") 777 if len(TargetPatternWords) % 2: 778 MiscUtil.PrintError("The number of comma delimited paramater names and values, %d, specified using \"-t --targetPattern\" option must be an even number during, %s, value of \"-m, --mode\" option." % (len(TargetPatternWords), Mode)) 779 780 ValidOperationNames = ["AttachAtoms", "ReplaceAtoms", "AttachSMILES"] 781 CanonicalOperationNamesMap = {} 782 for OperationName in ValidOperationNames: 783 CanonicalOperationNamesMap[OperationName.lower()] = OperationName 784 785 # Validate PAS operation name and value pairs... 786 for Index in range(0, len(TargetPatternWords), 2): 787 OperationName = TargetPatternWords[Index].strip() 788 OperationPattern = TargetPatternWords[Index + 1].strip() 789 790 CanonicalOperationName = OperationName.lower() 791 if not CanonicalOperationName in CanonicalOperationNamesMap: 792 MiscUtil.PrintError("The operation name, %s, specified using \"-t, --targetPattern\" is not a valid during, %s, value of \"-m, --mode\" option. Supported operation names: %s" % (OperationName, Mode, " ".join(ValidOperationNames))) 793 794 if not OperationPattern: 795 MiscUtil.PrintError("Empty target pattern specified using \"-t, --targetPattern\" must be a valid atomic symbol for operation, %s, during, %s, value of \"-m, --mode\" option." % (OperationName, Mode)) 796 797 Name = CanonicalOperationNamesMap[CanonicalOperationName] 798 OperationPatterns = OperationPattern.split() 799 if re.match("^(AttachAtoms|ReplaceAtoms)$", OperationName, re.I): 800 for Pattern in OperationPatterns: 801 if not RDKitUtil.IsValidElementSymbol(Pattern): 802 MiscUtil.PrintError("The target pattern, \"%s\", specified using \"-t, --targetPattern\" must be a valid atomic symbol for operation, %s, during, %s, value of \"-m, --mode\" option." % (Pattern, OperationName, Mode)) 803 elif re.match("^AttachSMILES$", OperationName, re.I): 804 for Pattern in OperationPatterns: 805 ValidateSMILESTargetPattern(Pattern, Mode, OperationName) 806 807 ValidatedTargetPatternInfo["Names"].append(Name) 808 ValidatedTargetPatternInfo["Patterns"].append(OperationPatterns) 809 810 return ValidatedTargetPatternInfo 811 812 def ValidateSMILESTargetPattern(TargetPattern, Mode, OperationName = None): 813 """Validate SMILES pattern.""" 814 815 Mol = Chem.MolFromSmiles(TargetPattern) 816 if Mol is None: 817 if OperationName is None: 818 MiscUtil.PrintError("The target pattern, \"%s\", specified using \"-t, --targetPattern\" must be a valid SMILES during, %s, value of \"-m, --mode\" option." % (TargetPattern, Mode)) 819 else: 820 MiscUtil.PrintError("The target pattern, \"%s\", specified using \"-t, --targetPattern\" must be a valid SMILES for operation, %s, during, %s, value of \"-m, --mode\" option." % (TargetPattern, OperationName, Mode)) 821 822 def RetrieveOptions(): 823 """Retrieve command line arguments and options.""" 824 825 # Get options... 826 global Options 827 Options = docopt(_docoptUsage_) 828 829 # Set current working directory to the specified directory... 830 WorkingDir = Options["--workingdir"] 831 if WorkingDir: 832 os.chdir(WorkingDir) 833 834 # Handle examples option... 835 if "--examples" in Options and Options["--examples"]: 836 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 837 sys.exit(0) 838 839 def ValidateOptions(): 840 """Validate option values.""" 841 842 MiscUtil.ValidateOptionIntegerValue("-c, --comboSearchAtoms", Options["--comboSearchAtoms"], {">": 0}) 843 MiscUtil.ValidateOptionTextValue("--clearAtomMapNumbers", Options["--clearAtomMapNumbers"], "yes no") 844 845 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 846 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd mol smi txt csv tsv") 847 848 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi txt csv tsv") 849 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 850 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 851 852 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "ReplaceAtoms AttachAtoms AttachSMILES RxnSMARTS TandemScan") 853 854 MiscUtil.ValidateOptionTextValue("--mp", Options["--mp"], "yes no") 855 MiscUtil.ValidateOptionTextValue("-p, --permuteTandemScan", Options["--permuteTandemScan"], "yes no") 856 MiscUtil.ValidateOptionTextValue("-q, --quiet", Options["--quiet"], "yes no") 857 858 MiscUtil.ValidateOptionTextValue(" -u, --uniqueAnalogues", Options["--uniqueAnalogues"], "yes no") 859 860 # Setup a usage string for docopt... 861 _docoptUsage_ = """ 862 RDKitPerformPositionalAnalogueScan.py - Positional analogue scanning. 863 864 Usage: 865 RDKitPerformPositionalAnalogueScan.py [--comboSearchAtoms <number>] [--clearAtomMapNumbers <yes or no>] 866 [--infileParams <Name,Value,...>] [--mode <ReplaceAtoms, AttachAtoms, AttachSMILES, RxnSMARTS, TandemScan>] 867 [--mp <yes or no>] [--mpParams <Name,Value,...>] [ --nameSuffix <text>] [ --outfileParams <Name,Value,...> ] 868 [--overwrite] [--permuteTandemScan <yes or no>] [--quiet <yes or no>] [--searchPattern <SMARTSPattern>] 869 [--targetPattern <ElementSymbol, SMILES, RxnSMARTS,...>] [--uniqueAnalogues <yes or no> ] 870 [-w <dir>] -i <infile> -o <outfile> 871 RDKitPerformPositionalAnalogueScan.py -h | --help | -e | --examples 872 873 Description: 874 Perform Positional Analogue Scanning (PAS) to generate analogues of molecules 875 by applying chemical transformations to molecules [ Ref 147-148 ]. The chemical 876 transformations are defined using SMARTS, element symbols, SMILES, and 877 RxnSMARTS. Four different types of chemical transformations are available for 878 for generating analogues of molecules: replace atoms, attach atoms, attach SMILES, 879 and RxnSMARTS. Tandem positional analogue scanning may be performed by the 880 concurrent application of multiple chemical transformations. 881 882 A SMARTS search pattern identifies atoms in molecules for attachment or 883 replacement points during positional analogue scanning. It may retrieve multiple 884 substructure matches in a molecule. The first matched atom in each substructure 885 match comprises a set of attachment or replacement points. 886 887 A target pattern encompasses information regarding element symbol, SMILES, and 888 reaction SMARTS for replacing and attaching atoms, attaching SMILES, and applying 889 reaction SMARTS. In addition, multiple concurrent chemical transformations may 890 be specified during tandem positional analogue scanning. 891 892 The supported input file formats are: Mol (.mol), SD (.sdf, .sd), SMILES (.smi, 893 .csv, .tsv, .txt) 894 895 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi, .csv, 896 .tsv, .txt) 897 898 Options: 899 -c, --comboSearchAtoms <number> [default: 1] 900 Number of concurrent search pattern match atoms to use as attachment or 901 replacement points during positional analogue scanning in 'AttachAtoms', 902 'AttachSMILES', and 'ReplaceAtoms' modes. This value is ignored during 903 'RxnSMARTS' and 'TandemScan' values of '-m, --mode' option. 904 --clearAtomMapNumbers <yes or no> [default: yes] 905 Clear any atom map numbers in target SMILES specification for 'AttachSMILES' 906 before generating analogues. 907 -e, --examples 908 Print examples. 909 -h, --help 910 Print this help message. 911 -i, --infile <infile> 912 Input file name. 913 --infileParams <Name,Value,...> [default: auto] 914 A comma delimited list of parameter name and value pairs for reading 915 molecules from files. The supported parameter names for different file 916 formats, along with their default values, are shown below: 917 918 SD, MOL: removeHydrogens,yes,sanitize,yes,strictParsing,yes 919 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 920 smilesTitleLine,auto,sanitize,yes 921 922 Possible values for smilesDelimiter: space, comma or tab. 923 -m, --mode <ReplaceAtoms, AttachAtoms,...> [default: ReplaceAtoms] 924 Type of operations to perform during positional analogue scanning. The 925 supported values, along with a brief description, are shown below: 926 927 Mode Description 928 ReplaceAtoms Replace atoms with new atoms 929 AttachAtoms Attach new atoms to atoms 930 AttachSMILES Attach SMILES to atoms 931 RxnSMARTS Run reaction SMARTS 932 TandemScan Perform tandem scan by combining ReplaceAtoms, 933 AttachAtoms, and AttachSMILES 934 935 The chemical transformations of input molecules is dependent on the 936 values of '-s, --searchPattern' and '-t, --targetPattern' options. For 937 example, nitrogen-walk or nitrogen scan is performed by '[cH]' and 'N' 938 values for '-s, --searchPattern' and '-t, --targetPattern' options. 939 --mp <yes or no> [default: no] 940 Use multiprocessing. 941 942 By default, input data is retrieved in a lazy manner via mp.Pool.imap() 943 function employing lazy RDKit data iterable. This allows processing of 944 arbitrary large data sets without any additional requirements memory. 945 946 All input data may be optionally loaded into memory by mp.Pool.map() 947 before starting worker processes in a process pool by setting the value 948 of 'inputDataMode' to 'InMemory' in '--mpParams' option. 949 950 A word to the wise: The default 'chunkSize' value of 1 during 'Lazy' input 951 data mode may adversely impact the performance. The '--mpParams' section 952 provides additional information to tune the value of 'chunkSize'. 953 --mpParams <Name,Value,...> [default: auto] 954 A comma delimited list of parameter name and value pairs to configure 955 multiprocessing. 956 957 The supported parameter names along with their default and possible 958 values are shown below: 959 960 chunkSize, auto 961 inputDataMode, Lazy [ Possible values: InMemory or Lazy ] 962 numProcesses, auto [ Default: mp.cpu_count() ] 963 964 These parameters are used by the following functions to configure and 965 control the behavior of multiprocessing: mp.Pool(), mp.Pool.map(), and 966 mp.Pool.imap(). 967 968 The chunkSize determines chunks of input data passed to each worker 969 process in a process pool by mp.Pool.map() and mp.Pool.imap() functions. 970 The default value of chunkSize is dependent on the value of 'inputDataMode'. 971 972 The mp.Pool.map() function, invoked during 'InMemory' input data mode, 973 automatically converts RDKit data iterable into a list, loads all data into 974 memory, and calculates the default chunkSize using the following method 975 as shown in its code: 976 977 chunkSize, extra = divmod(len(dataIterable), len(numProcesses) * 4) 978 if extra: chunkSize += 1 979 980 For example, the default chunkSize will be 7 for a pool of 4 worker processes 981 and 100 data items. 982 983 The mp.Pool.imap() function, invoked during 'Lazy' input data mode, employs 984 'lazy' RDKit data iterable to retrieve data as needed, without loading all the 985 data into memory. Consequently, the size of input data is not known a priori. 986 It's not possible to estimate an optimal value for the chunkSize. The default 987 chunkSize is set to 1. 988 989 The default value for the chunkSize during 'Lazy' data mode may adversely 990 impact the performance due to the overhead associated with exchanging 991 small chunks of data. It is generally a good idea to explicitly set chunkSize to 992 a larger value during 'Lazy' input data mode, based on the size of your input 993 data and number of processes in the process pool. 994 995 The mp.Pool.map() function waits for all worker processes to process all 996 the data and return the results. The mp.Pool.imap() function, however, 997 returns the the results obtained from worker processes as soon as the 998 results become available for specified chunks of data. 999 1000 The order of data in the results returned by both mp.Pool.map() and 1001 mp.Pool.imap() functions always corresponds to the input data. 1002 -n, --nameSuffix <text> [default: Analogue] 1003 Name suffix for generating molecule names of analogues. Format of analogue 1004 names: <MolName>_<NameSuffix>_<MolNum> 1005 -o, --outfile <outfile> 1006 Output file name. 1007 --outfileParams <Name,Value,...> [default: auto] 1008 A comma delimited list of parameter name and value pairs for writing 1009 molecules to files. The supported parameter names for different file 1010 formats, along with their default values, are shown below: 1011 1012 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 1013 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 1014 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no 1015 1016 --overwrite 1017 Overwrite existing files. 1018 -p, --permuteTandemScan <yes or no> [default: yes] 1019 Permute atom positions matched by SMARTS search pattern in a molecule 1020 to generate all possible analogues during tandem positional analogue 1021 scanning. 1022 1023 This option is only valid for 'TandemScan' value of '-m, --mode' option. 1024 -q, --quiet <yes or no> [default: no] 1025 Use quiet mode. The warning and information messages will not be printed. 1026 -s, --searchPattern <SMARTSPattern> [default: auto] 1027 SMARTS search pattern identifying atoms in molecules for attachment or 1028 replacement points during positional analogue scanning. The SMARTS search 1029 pattern may retrieve multiple substructure matches in a molecule. The first 1030 matched atom in each substructure match comprises a set of attachment or 1031 replacement points. 1032 1033 The default values, dependent on the value of '-m, --mode' option, are 1034 shown below: 1035 1036 Mode Default Description 1037 ReplaceAtoms [cH] Aromatic carbon 1038 AttachAtoms [cH] Aromatic carbon 1039 AttachSMILES [cH] Aromatic carbon 1040 RxnSMARTS None Not applicable 1041 TandemScan [cH] Aromatic carbon 1042 1043 This option is ignored during 'RxnSMARTS' value of '-m, --mode' option. 1044 -t, --targetPattern <ElementSymbol, SMILES, RxnSMARTS...> [default: auto] 1045 Target pattern for performing chemical transformations during positional 1046 analogue scanning. These values are used in conjunction with the value of 1047 '-s, --searchPattern' to generate appropriate analogues. 1048 1049 The default values, dependent on the values of '-m, --mode' and 1050 '-s, --searchPattern' options, are shown below: 1051 1052 Mode Default Description 1053 ReplaceAtoms N Element symbol for nitrogen 1054 AttachAtoms F Element symbol for fluorine 1055 AttachSMILES C(F)(F)(F) SMILES for CF3 1056 RxnSMARTS [cH:1]>>[N:1] Replace aromatic carbon by nitrogen 1057 TandemScan ReplaceAtoms,N,AttachAtoms,F Replace and attach 1058 1059 The target specifications may contain multiple space delimted values. For 1060 example: 1061 1062 Mode Example 1063 ReplaceAtoms "N P" 1064 AttachAtoms "F Cl Br" 1065 AttachSMILES "C(F)(F)(F) CO" 1066 RxnSMARTS "[cH:1]>>[N:1] [cH:1]>>[P:1]" 1067 TandemScan "ReplaceAtoms,N P,AttachAtoms,F Cl" 1068 1069 The target specification for 'AttachSMILES' may contain atom map numbers. 1070 The attachment point corresponds to the matched atom with the lowest 1071 atom map number. Otherwise, the first atom in SMILES target specification 1072 represents the attachment point. For example: 1073 1074 Mode Example 1075 AttachSMILES "C(C)(C)(C) CO" 1076 AttachSMILES "C([C:1])(C)(C) C(F)(F)(F)" 1077 1078 Multiple concurrent chemical transformations are allowed during 'TandemScan'. The 1079 target pattern specification for 'TandemScan' is a comma delimited list of operation 1080 type and target pattern. Format: OperationType,TargetPattern,... 1081 1082 The supported operation types and target pattern for 'TandemScan' are shown 1083 below: 1084 1085 ReplaceAtoms,<ElementSymbol> 1086 AttachAtoms,<ElementSymbol> 1087 AttachSMILES,<SMILES> 1088 1089 ReplaceAtoms,"<ElementSymbol> <ElementSymbol>..." 1090 AttachAtoms,"<ElementSymbol> <ElementSymbol>..." 1091 AttachSMILES,"<SMILES> <SMIELS>..." 1092 1093 For example: 1094 1095 ReplaceAtoms,N,AttachAtoms,F 1096 ReplaceAtoms,N,AttachAtoms,F,AttachSMILES,C(F)(F)(F) 1097 1098 "ReplaceAtoms,N,AttachAtoms,F Cl" 1099 "ReplaceAtoms,N P,AttachAtoms,F Cl" 1100 "ReplaceAtoms,N P,AttachAtoms,F Cl,AttachSMILES,C(F)(F)(F) CO" 1101 1102 The number of chemical transformations in 'TandemScan' must be less than or 1103 equal to the total number atoms matched by SMARTS search pattern in a molecule. 1104 Otherwise, it is not possible to perform a 'TandemScan'. The matched atom positions 1105 may be optionally permuted to generate all possible analogues during positional 1106 analogue scanning using '-p, --permuteTandemScan' option. 1107 -u, --uniqueAnalogues <yes or no> [default: yes] 1108 Keep only unique analogues of a molecule corresponding to unique SMILES 1109 strings. The duplicate SMILES string may be generated during PAS due to 1110 symmetric replacement or attachment points in molecules. 1111 -w, --workingdir <dir> 1112 Location of working directory which defaults to the current directory. 1113 1114 Examples: 1115 To perform a nitrogen-walk or nitrogen scan by replacing aromatic carbons by 1116 nitrogens in molecules in a SMILES file and write out a SMILES file, type: 1117 1118 % RDKitPerformPositionalAnalogueScan.py -i Sample.smi -o SampleOut.smi 1119 1120 To run the first example by explicitly specifying search and target patterns 1121 for replacing aromatic carbons by nitogens in molecules in a SD file and write 1122 out a SD file, type: 1123 1124 % RDKitPerformPositionalAnalogueScan.py -m ReplaceAtoms -s "[cH]" 1125 -t "N" -i Sample.sdf -o SampleOut.sdf 1126 1127 To run the previous example for replacing aromatic carbons by N and P, type: 1128 1129 % RDKitPerformPositionalAnalogueScan.py -m ReplaceAtoms -s "[cH]" 1130 -t "N P" -i Sample.sdf -o SampleOut.sdf 1131 1132 To run the first example in multiprocessing mode on all available CPUs 1133 without loading all data into memory and write out a SD file, type: 1134 1135 % RDKitPerformPositionalAnalogueScan.py -i Sample.smi -o SampleOut.sdf 1136 --mp yes 1137 1138 To run the previous example in multiprocessing mode on all available CPUs 1139 by loading all data into memory and write out a SD file, type: 1140 1141 % RDKitPerformPositionalAnalogueScan.py -i Sample.smi -o SampleOut.sdf 1142 --mp yes --mpParams "inputDataMode,InMemory" 1143 1144 To run the previous example in multiprocessing mode on specific number of 1145 CPUs and chunk size without loading all data into memory and write out a SD file, 1146 type: 1147 1148 % RDKitPerformPositionalAnalogueScan.py -i Sample.smi -o SampleOut.sdf 1149 --mpParams "inputDataMode,Lazy,numProcesses,4,chunkSize,8" 1150 1151 To perform positional analogue scanning by simultaneously attaching fluorines 1152 to two aromatic carbons in molecules in a SMILES file and write out a SD file, 1153 type: 1154 1155 % RDKitPerformPositionalAnalogueScan.py -m AttachAtoms -s "[cH]" 1156 -t "F" -c 2 -i Sample.smi -o SampleOut.sdf 1157 1158 To perform positional analogue scanning by attaching SMILES to aromatic 1159 carbons in molecules in a SMILES file and write out a SD file, type: 1160 1161 % RDKitPerformPositionalAnalogueScan.py -m AttachSMILES -s "[cH]" 1162 -t "F" -i Sample.smi -o SampleOut.sdf 1163 1164 To run the previous example for attaching F and CF3 to aromatic carbons, 1165 type: 1166 1167 % RDKitPerformPositionalAnalogueScan.py -m AttachSMILES -s "[cH]" 1168 -t "F C(F)(F)(F)" -i Sample.smi -o SampleOut.sdf 1169 1170 To run the previous example for attaching "C(C)(C)(C)" at a specific attachment 1171 point corresponding to the lowest atom map number, type: 1172 1173 % RDKitPerformPositionalAnalogueScan.py -m AttachSMILES -s "[cH]" 1174 -t "C([C:1])([C:2])(C)" -i Sample.smi -o SampleOut.sdf 1175 1176 To perform a nitrogen-walk or nitrogen scan by using reaction SMARTS to replace 1177 aromatic carbons by nitrogens in molecules in a SMILES file and write out a SMILES 1178 file, type: 1179 1180 % RDKitPerformPositionalAnalogueScan.py -m RxnSMARTS 1181 -t "[cH:1]>>[N:1]" -i Sample.smi -o SampleOut.smi 1182 1183 To perform a tandem positional analogue scan by concurrently applying multiple 1184 chemical transformations to aromatic carbons, permute all matched search 1185 atom positions during analogue generation, and write out a SD file, type: 1186 1187 % RDKitPerformPositionalAnalogueScan.py -m TandemScan -s "[cH]" 1188 -t "ReplaceAtoms,N,AttachAtoms,F,AttachSMILES,OC" 1189 -p yes -i Sample.smi -o SampleOut.smi 1190 1191 To run the previous example for attaching multiple atoms and SMILES to 1192 aromatic carbons, type: 1193 1194 % RDKitPerformPositionalAnalogueScan.py -m TandemScan -s "[cH]" 1195 -t "ReplaceAtoms,N,AttachAtoms,F Cl,AttachSMILES,C(F)(F)(F) OC" 1196 -p yes -i Sample.smi -o SampleOut.smi 1197 1198 To perform a nitrogen-walk or nitrogen scan by replacing aromatic carbons by 1199 nitrogens in molecules in a SMILES CSV fileS, MILES strings in column 1, name 1200 in column 2, and write out a SD file, type: 1201 1202 % RDKitPerformPositionalAnalogueScan.py -m ReplaceAtoms -s "[cH]" 1203 -t "N" --infileParams "smilesDelimiter,comma, smilesTitleLine,yes, 1204 smilesColumn,1,smilesNameColumn,2" 1205 -i SampleSMILES.csv -o SampleOut.sdf 1206 1207 Author: 1208 Manish Sud(msud@san.rr.com) 1209 1210 See also: 1211 RDKitConvertFileFormat.py, RDKitEnumerateCompoundLibrary.py, 1212 RDKitPerformTorsionScan.py 1213 1214 Copyright: 1215 Copyright (C) 2024 Manish Sud. All rights reserved. 1216 1217 The functionality available in this script is implemented using RDKit, an 1218 open source toolkit for cheminformatics developed by Greg Landrum. 1219 1220 This file is part of MayaChemTools. 1221 1222 MayaChemTools is free software; you can redistribute it and/or modify it under 1223 the terms of the GNU Lesser General Public License as published by the Free 1224 Software Foundation; either version 3 of the License, or (at your option) any 1225 later version. 1226 1227 """ 1228 1229 if __name__ == "__main__": 1230 main()