1 #!/bin/env python 2 # File: RDKitUtil.py 3 # Author: Manish Sud <msud@san.rr.com> 4 # 5 # Copyright (C) 2024 Manish Sud. All rights reserved. 6 # 7 # The functionality available in this file is implemented using RDKit, an 8 # open source toolkit for cheminformatics developed by Greg Landrum. 9 # 10 # This file is part of MayaChemTools. 11 # 12 # MayaChemTools is free software; you can redistribute it and/or modify it under 13 # the terms of the GNU Lesser General Public License as published by the Free 14 # Software Foundation; either version 3 of the License, or (at your option) any 15 # later version. 16 # 17 # MayaChemTools is distributed in the hope that it will be useful, but without 18 # any warranty; without even the implied warranty of merchantability of fitness 19 # for a particular purpose. See the GNU Lesser General Public License for more 20 # details. 21 # 22 # You should have received a copy of the GNU Lesser General Public License 23 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 24 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 25 # Boston, MA, 02111-1307, USA. 26 # 27 28 from __future__ import print_function 29 30 import os 31 import sys 32 import re 33 import base64 34 import pickle 35 36 from rdkit import Chem 37 from rdkit.Chem import AllChem 38 from rdkit.Chem import Draw 39 40 import MiscUtil 41 42 __all__ = ["AreAtomIndicesSequentiallyConnected", "AreAtomMapNumbersPresentInMol", "AreHydrogensMissingInMolecule", "CalculateFormalCharge", "CalculateSpinMultiplicity", "ClearAtomMapNumbers", "ConstrainAndEmbed", "FilterSubstructureMatchByAtomMapNumbers", "FilterSubstructureMatchesByAtomMapNumbers", "GetAtomIndices", "GetAtomMapIndices", "GetAtomMapIndicesAndMapNumbers", "GetAtomSymbols", "GetAtomPositions", "GetFormalCharge", "GetHeavyAtomNeighbors", "GetInlineSVGForMolecule", "GetInlineSVGForMolecules", "GetMolName", "GetNumFragments", "GetNumHeavyAtomNeighbors", "GetSpinMultiplicity", "GetSVGForMolecule", "GetSVGForMolecules", "GetPsi4XYZFormatString", "GetTorsionsAroundRotatableBonds", "GenerateBase64EncodedMolStrings", "GenerateBase64EncodedMolStringWithConfIDs", "IsAtomSymbolPresentInMol", "IsMolEmpty", "IsValidElementSymbol", "IsValidAtomIndex", "MolFromBase64EncodedMolString", "GenerateBase64EncodedMolStringsWithIDs", "MolToBase64EncodedMolString", "MolFromSubstructureMatch", "MolsFromSubstructureMatches", "ReadMolecules", "ReadAndValidateMolecules", "ReadMoleculesFromSDFile", "ReadMoleculesFromMolFile", "ReadMoleculesFromMol2File", "ReadMoleculesFromPDBFile", "ReadMoleculesFromSMILESFile", "ReorderAtomIndicesInSequentiallyConnectedManner", "SetAtomPositions", "SetWriterMolProps", "ValidateElementSymbols", "WriteMolecules"] 43 44 def GetMolName(Mol, MolNum = None): 45 """Get molecule name. 46 47 Arguments: 48 Mol (object): RDKit molecule object. 49 MolNum (int or None): Molecule number in input file. 50 51 Returns: 52 str : Molname corresponding to _Name property of a molecule, generated 53 from specieid MolNum using the format "Mol%d" % MolNum, or an 54 empty string. 55 56 """ 57 58 MolName = '' 59 if Mol.HasProp("_Name"): 60 MolName = Mol.GetProp("_Name") 61 62 if not len(MolName): 63 if MolNum is not None: 64 MolName = "Mol%d" % MolNum 65 66 return MolName 67 68 def GetInlineSVGForMolecule(Mol, Width, Height, Legend = None, AtomListToHighlight = None, BondListToHighlight = None, BoldText = True, Base64Encoded = True): 69 """Get SVG image text for a molecule suitable for inline embedding into a HTML page. 70 71 Arguments: 72 Mol (object): RDKit molecule object. 73 Width (int): Width of a molecule image in pixels. 74 Height (int): Height of a molecule image in pixels. 75 Legend (str): Text to display under the image. 76 AtomListToHighlight (list): List of atoms to highlight. 77 BondListToHighlight (list): List of bonds to highlight. 78 BoldText (bool): Flag to make text bold in the image of molecule. 79 Base64Encoded (bool): Flag to return base64 encoded string. 80 81 Returns: 82 str : SVG image text for inline embedding into a HTML page using "img" 83 tag: <img src="data:image/svg+xml;charset=UTF-8,SVGImageText> or 84 tag: <img src="> 85 86 """ 87 88 SVGText = GetSVGForMolecule(Mol, Width, Height, Legend, AtomListToHighlight, BondListToHighlight, BoldText) 89 return _ModifySVGForInlineEmbedding(SVGText, Base64Encoded) 90 91 def GetInlineSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends = None, AtomListsToHighlight = None, BondListsToHighLight = None, BoldText = True, Base64Encoded = True): 92 """Get SVG image text for molecules suitable for inline embedding into a HTML page. 93 94 Arguments: 95 Mols (list): List of RDKit molecule objects. 96 MolsPerRow (int): Number of molecules per row. 97 Width (int): Width of a molecule image in pixels. 98 Height (int): Height of a molecule image in pixels. 99 Legends (list): List containing strings to display under images. 100 AtomListsToHighlight (list): List of lists containing atoms to highlight 101 for molecules. 102 BondListsToHighlight (list): List of lists containing bonds to highlight 103 for molecules 104 BoldText (bool): Flag to make text bold in the image of molecules. 105 Base64Encoded (bool): Flag to return base64 encoded string. 106 107 Returns: 108 str : SVG image text for inline embedding into a HTML page using "img" 109 tag: <img src="data:image/svg+xml;charset=UTF-8,SVGImageText> or 110 tag: <img src="> 111 112 """ 113 114 SVGText = GetSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends, AtomListsToHighlight, BondListsToHighLight, BoldText) 115 return _ModifySVGForInlineEmbedding(SVGText, Base64Encoded) 116 117 def _ModifySVGForInlineEmbedding(SVGText, Base64Encoded): 118 """Modify SVG for inline embedding into a HTML page using "img" tag 119 along with performing base64 encoding. 120 """ 121 122 # Take out all tags till the start of '<svg' tag... 123 Pattern = re.compile("^.*<svg", re.I | re.S) 124 SVGText = Pattern.sub("<svg", SVGText) 125 126 # Add an extra space before the "width=..." tag. Otherwise, inline embedding may 127 # cause the following XML error on some browsers due to start of the "width=..." 128 # at the begining of the line in <svg ...> tag: 129 # 130 # XML5607: Whitespace expected. 131 # 132 SVGText = re.sub("width='", " width='", SVGText, flags = re.I) 133 134 # Take out trailing new line... 135 SVGText = SVGText.strip() 136 137 # Perform base64 encoding by turning text into byte stream using string 138 # encode and transform byte stream returned by b64encode into a string 139 # by string decode... 140 # 141 if Base64Encoded: 142 SVGText = base64.b64encode(SVGText.encode()).decode() 143 144 return SVGText 145 146 def GetSVGForMolecule(Mol, Width, Height, Legend = None, AtomListToHighlight = None, BondListToHighlight = None, BoldText = True): 147 """Get SVG image text for a molecule suitable for viewing in a browser. 148 149 Arguments: 150 Mol (object): RDKit molecule object. 151 Width (int): Width of a molecule image in pixels. 152 Height (int): Height of a molecule image in pixels. 153 Legend (str): Text to display under the image. 154 AtomListToHighlight (list): List of atoms to highlight. 155 BondListToHighlight (list): List of bonds to highlight. 156 BoldText (bool): Flag to make text bold in the image of molecule. 157 158 Returns: 159 str : SVG image text for writing to a SVG file for viewing in a browser. 160 161 """ 162 163 Mols = [Mol] 164 165 MolsPerRow = 1 166 MolWidth = Width 167 MolHeight = Height 168 169 Legends = [Legend] if Legend is not None else None 170 AtomListsToHighlight = [AtomListToHighlight] if AtomListToHighlight is not None else None 171 BondListsToHighLight = [BondListsToHighLight] if BondListToHighlight is not None else None 172 173 return GetSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends, AtomListsToHighlight, BondListsToHighLight, BoldText) 174 175 def GetSVGForMolecules(Mols, MolsPerRow, MolWidth, MolHeight, Legends = None, AtomListsToHighlight = None, BondListsToHighlight = None, BoldText = True): 176 """Get SVG image text for molecules suitable for viewing in a browser. 177 178 Arguments: 179 Mols (list): List of RDKit molecule objects. 180 MolsPerRow (int): Number of molecules per row. 181 Width (int): Width of a molecule image in pixels. 182 Height (int): Height of a molecule image in pixels. 183 Legends (list): List containing strings to display under images. 184 AtomListsToHighlight (list): List of lists containing atoms to highlight 185 for molecules. 186 BondListsToHighlight (list): List of lists containing bonds to highlight 187 for molecules 188 BoldText (bool): Flag to make text bold in the image of molecules. 189 190 Returns: 191 str : SVG image text for writing to a SVG file for viewing in a browser. 192 193 """ 194 195 SVGText = Draw.MolsToGridImage(Mols, molsPerRow = MolsPerRow, subImgSize = (MolWidth,MolHeight), legends = Legends, highlightAtomLists = AtomListsToHighlight, highlightBondLists = BondListsToHighlight, useSVG = True) 196 197 return _ModifySVGForBrowserViewing(SVGText, BoldText) 198 199 def _ModifySVGForBrowserViewing(SVGText, BoldText = True): 200 """Modify SVG for loading into a browser.""" 201 202 # It appears that the string 'xmlns:svg' needs to be replaced with 'xmlns' in the 203 # SVG image string generated by older versions of RDKit. Otherwise, the image 204 # doesn't load in web browsers. 205 # 206 if re.search("xmlns:svg", SVGText, re.I): 207 SVGText = re.sub("xmlns:svg", "xmlns", SVGText, flags = re.I) 208 209 # Make text bold... 210 if BoldText: 211 SVGText = re.sub("font-weight:normal;", "font-weight:bold;", SVGText, flags = re.I) 212 213 return SVGText 214 215 def IsMolEmpty(Mol): 216 """Check for the presence of atoms in a molecule. 217 218 Arguments: 219 Mol (object): RDKit molecule object. 220 221 Returns: 222 bool : True - No atoms in molecule; Otherwise, false. 223 224 """ 225 226 Status = False if Mol.GetNumAtoms() else True 227 228 return Status 229 230 def IsAtomSymbolPresentInMol(Mol, AtomSymbol, IgnoreCase = True): 231 """ Check for the presence of an atom symbol in a molecule. 232 233 Arguments: 234 Mol (object): RDKit molecule object. 235 AtomSymbol (str): Atom symbol. 236 237 Returns: 238 bool : True - Atom symbol in molecule; Otherwise, false. 239 240 """ 241 242 for Atom in Mol.GetAtoms(): 243 Symbol = Atom.GetSymbol() 244 if IgnoreCase: 245 if re.match("^%s$" % AtomSymbol, Symbol, re.I): 246 return True 247 else: 248 if re.match("^%s$" % AtomSymbol, Symbol): 249 return True 250 251 return False 252 253 def ValidateElementSymbols(ElementSymbols): 254 """Validate element symbols. 255 256 Arguments: 257 ElementSymbols (list): List of element symbols to validate. 258 259 Returns: 260 bool : True - All element symbols are valid; Otherwise, false. 261 262 """ 263 for ElementSymbol in ElementSymbols: 264 if not IsValidElementSymbol(ElementSymbol): 265 return False 266 267 return True 268 269 def GetAtomPositions(Mol, ConfID = -1): 270 """Retrieve a list of lists containing coordinates of all atoms in a 271 molecule. 272 273 Arguments: 274 Mol (object): RDKit molecule object. 275 ConfID (int): Conformer number. 276 277 Returns: 278 list : List of lists containing atom positions. 279 280 Examples: 281 282 for AtomPosition in RDKitUtil.GetAtomPositions(Mol): 283 print("X: %s; Y: %s; Z: %s" % (AtomPosition[0], AtomPosition[1], AtomPosition[2])) 284 285 """ 286 287 return Mol.GetConformer(id = ConfID).GetPositions().tolist() 288 289 def SetAtomPositions(Mol, AtomPositions, ConfID = -1): 290 """Set atom positions of all atoms in a molecule. 291 292 Arguments: 293 Mol (object): RDKit molecule object. 294 AtomPositions (object): List of lists containing atom positions. 295 ConfID (int): Conformer number. 296 297 Returns: 298 object : RDKit molecule object. 299 300 """ 301 302 MolConf = Mol.GetConformer(ConfID) 303 304 for Index in range(len(AtomPositions)): 305 MolConf.SetAtomPosition(Index, tuple(AtomPositions[Index])) 306 307 return Mol 308 309 def GetAtomSymbols(Mol): 310 """Retrieve a list containing atom symbols of all atoms a molecule. 311 312 Arguments: 313 Mol (object): RDKit molecule object. 314 315 Returns: 316 list : List of atom symbols. 317 318 """ 319 320 return [Atom.GetSymbol() for Atom in Mol.GetAtoms()] 321 322 def GetAtomIndices(Mol): 323 """Retrieve a list containing atom indices of all atoms a molecule. 324 325 Arguments: 326 Mol (object): RDKit molecule object. 327 328 Returns: 329 list : List of atom indices. 330 331 """ 332 333 return [Atom.GetIdx() for Atom in Mol.GetAtoms()] 334 335 def GetFormalCharge(Mol, CheckMolProp = True): 336 """Get formal charge of a molecule. The formal charge is either retrieved 337 from 'FormalCharge' molecule property or calculated using RDKit function 338 Chem.GetFormalCharge(Mol). 339 340 The 'FormalCharge' molecule property may contain multiple space delimited 341 values. The total formal charge corresponds to the sum of the specified formal 342 charge values. 343 344 Arguments: 345 Mol (object): RDKit molecule object. 346 CheckMolProp (bool): Check 'FormalCharge' molecule property to 347 retrieve formal charge. 348 349 Returns: 350 int : Formal charge. 351 352 """ 353 354 Name = 'FormalCharge' 355 if (CheckMolProp and Mol.HasProp(Name)): 356 FormalCharge = Mol.GetProp(Name) 357 Values = FormalCharge.split() 358 if len(Values) > 1: 359 MiscUtil.PrintWarning("RDKitUtil.GetFormalCharge: Molecule property, %s, contains multiple values, %s. Formal charge corresponds to sum of the specified values..." % (Name, FormalCharge)) 360 FormalCharge = 0.0 361 for Value in Values: 362 FormalCharge += float(Value) 363 FormalCharge = int(FormalCharge) 364 else: 365 FormalCharge = int(float(FormalCharge)) 366 else: 367 FormalCharge = CalculateFormalCharge(Mol) 368 369 return int(FormalCharge) 370 371 def CalculateFormalCharge(Mol): 372 """Calculate formal charge of a molecule. The formal charge is calculated 373 using RDKit function Chem.GetFormalCharge(Mol). 374 375 Arguments: 376 Mol (object): RDKit molecule object. 377 retrieve formal charge. 378 379 Returns: 380 int : Formal charge. 381 382 """ 383 384 return int(Chem.GetFormalCharge(Mol)) 385 386 def GetSpinMultiplicity(Mol, CheckMolProp = True): 387 """Get spin multiplicity of a molecule. The spin multiplicity is either 388 retrieved from 'SpinMultiplicity' molecule property or calculated 389 from the number of free radical electrons using Hund's rule of maximum 390 multiplicity defined as 2S + 1 where S is the total electron spin. The 391 total spin is 1/2 the number of free radical electrons in a molecule. 392 393 The 'SpinMultiplicity' molecule property may contain multiple space delimited 394 values. The total spin multiplicity corresponds to the total number of free radical 395 electrons which are calculated for each specified value. 396 397 Arguments: 398 Mol (object): RDKit molecule object. 399 CheckMolProp (bool): Check 'SpinMultiplicity' molecule property to 400 retrieve spin multiplicity. 401 402 Returns: 403 int : Spin multiplicity. 404 405 """ 406 407 Name = 'SpinMultiplicity' 408 if (CheckMolProp and Mol.HasProp(Name)): 409 SpinMultiplicity = Mol.GetProp(Name) 410 Values = SpinMultiplicity.split() 411 if len(Values) > 1: 412 MiscUtil.PrintWarning("RDKitUtil.GetSpinMultiplicity: Molecule property, %s, contains multiple values, %s. Calculating spin multiplicity corresponding to total number of free radical electrons for each specified value..." % (Name, SpinMultiplicity)) 413 NumRadicalElectrons = 0 414 for Value in Values: 415 NumRadicalElectrons += int(float(Value)) - 1 416 417 TotalElectronicSpin = NumRadicalElectrons/2 418 SpinMultiplicity = 2 * TotalElectronicSpin + 1 419 else: 420 SpinMultiplicity = int(float(SpinMultiplicity)) 421 else: 422 SpinMultiplicity = CalculateSpinMultiplicity(Mol) 423 424 return int(SpinMultiplicity) 425 426 def CalculateSpinMultiplicity(Mol): 427 """Calculate spin multiplicity of a molecule. The spin multiplicity is calculated 428 from the number of free radical electrons using Hund's rule of maximum 429 multiplicity defined as 2S + 1 where S is the total electron spin. The 430 total spin is 1/2 the number of free radical electrons in a molecule. 431 432 Arguments: 433 Mol (object): RDKit molecule object. 434 435 Returns: 436 int : Spin multiplicity. 437 438 """ 439 440 # Calculate spin multiplicity using Hund's rule of maximum multiplicity... 441 NumRadicalElectrons = 0 442 for Atom in Mol.GetAtoms(): 443 NumRadicalElectrons += Atom.GetNumRadicalElectrons() 444 445 TotalElectronicSpin = NumRadicalElectrons/2 446 SpinMultiplicity = 2 * TotalElectronicSpin + 1 447 448 return int(SpinMultiplicity) 449 450 def GetPsi4XYZFormatString(Mol, ConfID = -1, FormalCharge = "auto", SpinMultiplicity = "auto", Symmetry = "auto", NoCom = False, NoReorient = False, CheckFragments = False): 451 """Retrieve geometry string of a molecule in Psi4ish XYZ format to perform 452 Psi4 quantum chemistry calculations. 453 454 You may explicit specify multiple space delimited values for formal charge 455 and spin multiplicity. Otherwise, these values are either automatically 456 retrieved from 'FormalCharge' and 'SpinMultiplicity' molecule properties or 457 calculated using RDKit. The number of specified values for these properties 458 must match the number of fragments in the molecule during the processing 459 of the fragments. 460 461 Arguments: 462 Mol (object): RDKit molecule object. 463 ConfID (int): Conformer number. 464 FormalCharge (str): Specified formal charge or 'auto' to calculate 465 its value by RDKit. 466 SpinMultiplicity (str): Specified spin multiplicity or 'auto' to calculate 467 its value by RDKit. 468 Symmetry (str): Specified symmetry or 'auto' to calculate its value by 469 Psi4. 470 NoCom (bool): Flag to disable recentering of a molecule by Psi4. 471 NoReorient (bool): Flag to disable reorientation of a molecule by Psi4. 472 CheckFragments (bool): Check for fragments and setup geometry string 473 using -- separator between fragments. 474 475 Returns: 476 str : Geometry string of a molecule in Psi4ish XYZ format. 477 478 """ 479 480 # Check for fragments... 481 Mols = [Mol] 482 if CheckFragments: 483 Fragments = list(Chem.rdmolops.GetMolFrags(Mol, asMols = True)) 484 if len(Fragments) > 1: 485 Mols = Fragments 486 487 FragMolFormalCharges = _SetFormalChargesForPsi4XYZFormatString(Mol, Mols, FormalCharge, CheckFragments) 488 FragMolSpinMultiplicities = _SetSpinMultiplicitiesForPsi4XYZFormatString(Mol, Mols, SpinMultiplicity, CheckFragments) 489 490 # Setup geometry string for Ps4... 491 GeometryList = [] 492 FragMolCount = 0 493 494 for FragMolIndex, FragMol in enumerate(Mols): 495 FragMolCount += 1 496 if FragMolCount > 1: 497 GeometryList.append("--") 498 499 FragMolFormalCharge = FragMolFormalCharges[FragMolIndex] 500 FragMolSpinMultiplicity = FragMolSpinMultiplicities[FragMolIndex] 501 if FragMolFormalCharge is None or FragMolSpinMultiplicity is None: 502 MiscUtil.PrintInfo("") 503 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Failed to set formal charge and spin multiplicity values. Both formal charge, %s, and spin multiplicity, %s, must be valid values. These values are either specified explicitly or automatically calculated..." % (FragMolFormalCharge, FragMolSpinMultiplicity)) 504 else: 505 GeometryList.append("%s %s" % (FragMolFormalCharge, FragMolSpinMultiplicity)) 506 507 AtomSymbols = GetAtomSymbols(FragMol) 508 AtomPositions = GetAtomPositions(FragMol, ConfID) 509 510 for AtomSymbol, AtomPosition in zip(AtomSymbols, AtomPositions): 511 GeometryList.append("%s %s %s %s" % (AtomSymbol, AtomPosition[0], AtomPosition[1], AtomPosition[2])) 512 513 GeometryList.append("units angstrom") 514 515 if not re.match("^auto$", Symmetry, re.I): 516 Name = 'Symmetry' 517 if (Mol.HasProp(Name)): 518 Symmetry = Mol.GetProp(Name) 519 GeometryList.append("symmetry %s" % Symmetry) 520 521 if NoCom: 522 GeometryList.append("no_com") 523 524 if NoReorient: 525 GeometryList.append("no_reorient") 526 527 Geometry = "\n".join(GeometryList) 528 529 return Geometry 530 531 def _SetFormalChargesForPsi4XYZFormatString(Mol, FragMols, FormalCharge, CheckFragments): 532 """Setup formal charges for Psi4 XYZ format string. """ 533 534 if not CheckFragments: 535 if re.match("^auto$", FormalCharge, re.I): 536 MolFormalCharge = GetFormalCharge(Mol) 537 else: 538 MolFormalCharge = int(FormalCharge) 539 return [MolFormalCharge] 540 541 FragMolsCount = len(FragMols) 542 FormalCharges = [None] * FragMolsCount 543 544 if re.match("^auto$", FormalCharge, re.I): 545 PropName = "FormalCharge" 546 if Mol.HasProp(PropName): 547 FormalCharge = Mol.GetProp(PropName) 548 FormalChargeWords = FormalCharge.split() 549 if len(FormalChargeWords) == FragMolsCount: 550 FormalCharges = [int(float(FormalCharge)) for FormalCharge in FormalChargeWords] 551 else: 552 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for FormalCharge molecule property. The number of space delimted specified values, %s, must match number of fragments, %s, in the molecule..." % (FormalCharge, len(FormalChargeWords), FragMolsCount)) 553 else: 554 FormalCharges = [CalculateFormalCharge(FragMol) for FragMol in FragMols] 555 else: 556 FormalChargeWords = FormalCharge.split() 557 if len(FormalChargeWords) != FragMolsCount: 558 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for FormalCharge paramater. The number of space delimted specified values, %s, must match number of fragments, %s, in the molecule..." % (FormalCharge, len(FormalChargeWords), FragMolsCount)) 559 else: 560 FormalCharges = [int(FormalCharge) for FormalCharge in FormalChargeWords] 561 562 return FormalCharges 563 564 def _SetSpinMultiplicitiesForPsi4XYZFormatString(Mol, FragMols, SpinMultiplicity, CheckFragments): 565 """Setup spin multiplicites for Psi4 XYZ format string. """ 566 567 if not CheckFragments: 568 if re.match("^auto$", SpinMultiplicity, re.I): 569 MolSpinMultiplicity = GetSpinMultiplicity(Mol) 570 else: 571 MolSpinMultiplicity = int(SpinMultiplicity) 572 return [MolSpinMultiplicity] 573 574 FragMolsCount = len(FragMols) 575 SpinMultiplicities = [None] * FragMolsCount 576 577 if re.match("^auto$", SpinMultiplicity, re.I): 578 PropName = "SpinMultiplicity" 579 if Mol.HasProp(PropName): 580 SpinMultiplicity = Mol.GetProp(PropName) 581 SpinMultiplicityWords = SpinMultiplicity.split() 582 if len(SpinMultiplicityWords) == FragMolsCount: 583 SpinMultiplicities = [int(float(SpinMultiplicity)) for SpinMultiplicity in SpinMultiplicityWords] 584 else: 585 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for SpinMultiplicity molecule property. The number of space delimted specified values, %s, must match number of fragments, %s, in the molecule..." % (SpinMultiplicity, len(SpinMultiplicityWords), FragMolsCount)) 586 else: 587 SpinMultiplicities = [CalculateSpinMultiplicity(FragMol) for FragMol in FragMols] 588 else: 589 SpinMultiplicityWords = SpinMultiplicity.split() 590 if len(SpinMultiplicityWords) != FragMolsCount: 591 MiscUtil.PrintWarning("RDKitUtil.GetPsi4XYZFormatString: Ignoring specified value, %s, for SpinMultiplicity paramater. The number of space delimted specified values, %s, must match number of fragments, %s, in the molecule..." % (SpinMultiplicity, len(SpinMultiplicityWords), FragMolsCount)) 592 else: 593 SpinMultiplicities = [int(SpinMultiplicity) for SpinMultiplicity in SpinMultiplicityWords] 594 595 return SpinMultiplicities 596 597 def GetTorsionsAroundRotatableBonds(Mol, RotBondsPatternMol, IgnoreHydrogens = True): 598 """Identify torsions around rotatable bonds and return a list of lists 599 containing atom indices of torsions. 600 601 Arguments: 602 Mol (object): RDKit molecule object. 603 PatternMol (object): RDKit molecule object corresponding to SMARTS 604 pattern to identify rotatable bonds. 605 IgnoreHydrogens (bool): Flag to include torsions around rotatable bonds 606 containing hydrogens. 607 608 Returns: 609 list : List of lists containing atom indices of torsions around 610 rotatable bonds. 611 612 """ 613 614 # Match rotatable bonds... 615 RotBondsMatches = FilterSubstructureMatchesByAtomMapNumbers(Mol, RotBondsPatternMol, Mol.GetSubstructMatches(RotBondsPatternMol, useChirality = False)) 616 if not len(RotBondsMatches): 617 return None 618 619 # Identify torsions... 620 TorsioAtomIndicesList = [] 621 for RotBondMatch in RotBondsMatches: 622 if len(RotBondMatch) != 2: 623 continue 624 625 Atom1Index, Atom2Index = RotBondMatch 626 Atom1NbrIndices = _GetRotBondAtomNeighbors(Mol, Atom1Index, Atom2Index, IgnoreHydrogens) 627 Atom2NbrIndices = _GetRotBondAtomNeighbors(Mol, Atom2Index, Atom1Index, IgnoreHydrogens) 628 629 TorsionAtomIndices = [] 630 for Atom1NbrIndex in Atom1NbrIndices: 631 if Atom1NbrIndex in Atom2NbrIndices: 632 continue 633 for Atom2NbrIndex in Atom2NbrIndices: 634 if Atom2NbrIndex in Atom1NbrIndices: 635 continue 636 TorsionAtomIndices.append([Atom1NbrIndex, Atom1Index, Atom2Index, Atom2NbrIndex]) 637 638 TorsioAtomIndicesList.extend(TorsionAtomIndices) 639 640 return TorsioAtomIndicesList 641 642 def _GetRotBondAtomNeighbors(Mol, AtomIndex, BondedAtomIndex, IgnoreHydrogens): 643 """Get atom neigbors around a rotatable bond.""" 644 645 Atom = Mol.GetAtomWithIdx(AtomIndex) 646 AtomNeighbors = [] 647 648 for AtomNbr in Atom.GetNeighbors(): 649 AtomNbrIndex = AtomNbr.GetIdx() 650 if AtomNbrIndex == BondedAtomIndex: 651 continue 652 if IgnoreHydrogens: 653 if AtomNbr.GetAtomicNum() == 1: 654 continue 655 656 AtomNeighbors.append(AtomNbrIndex) 657 658 return AtomNeighbors 659 660 def GetNumFragments(Mol): 661 """Get number of fragment in a molecule. 662 663 Arguments: 664 Atom (object): RDKit molecule object. 665 666 Returns: 667 int : Number of fragments. 668 669 """ 670 671 Fragments = Chem.rdmolops.GetMolFrags(Mol, asMols = False) 672 673 return len(Fragments) if Fragments is not None else 0 674 675 def GetNumHeavyAtomNeighbors(Atom): 676 """Get number of heavy atom neighbors. 677 678 Arguments: 679 Atom (object): RDKit atom object. 680 681 Returns: 682 int : Number of neighbors. 683 684 """ 685 686 NbrCount = 0 687 for AtomNbr in Atom.GetNeighbors(): 688 if AtomNbr.GetAtomicNum() > 1: 689 NbrCount += 1 690 691 return NbrCount 692 693 def GetHeavyAtomNeighbors(Atom): 694 """Get a list of heavy atom neighbors. 695 696 Arguments: 697 Atom (object): RDKit atom object. 698 699 Returns: 700 list : List of heavy atom neighbors. 701 702 """ 703 704 AtomNeighbors = [] 705 for AtomNbr in Atom.GetNeighbors(): 706 if AtomNbr.GetAtomicNum() > 1: 707 AtomNeighbors.append(AtomNbr) 708 709 return AtomNeighbors 710 711 def IsValidElementSymbol(ElementSymbol): 712 """Validate element symbol. 713 714 Arguments: 715 ElementSymbol (str): Element symbol 716 717 Returns: 718 bool : True - Valid element symbol; Otherwise, false. 719 720 """ 721 722 try: 723 AtomicNumber = Chem.GetPeriodicTable().GetAtomicNumber(ElementSymbol) 724 Status = True if AtomicNumber > 0 else False 725 except Exception as ErrMsg: 726 Status = False 727 728 return Status 729 730 def IsValidAtomIndex(Mol, AtomIndex): 731 """Validate presence atom index in a molecule. 732 733 Arguments: 734 Mol (object): RDKit molecule object. 735 AtomIndex (int): Atom index. 736 737 Returns: 738 bool : True - Valid atom index; Otherwise, false. 739 740 """ 741 for Atom in Mol.GetAtoms(): 742 if AtomIndex == Atom.GetIdx(): 743 return True 744 745 return False 746 747 def AreHydrogensMissingInMolecule(Mol): 748 """Check for any missing hydrogens in in a molecue. 749 750 Arguments: 751 Mol (object): RDKit molecule object. 752 753 Returns: 754 bool : True - Missing hydrogens; Otherwise, false. 755 756 """ 757 758 for Atom in Mol.GetAtoms(): 759 NumExplicitAndImplicitHs = Atom.GetNumExplicitHs() + Atom.GetNumImplicitHs() 760 if NumExplicitAndImplicitHs > 0: 761 return True 762 763 return False 764 765 def AreAtomIndicesSequentiallyConnected(Mol, AtomIndices): 766 """Check for the presence bonds between sequential pairs of atoms in a 767 molecule. 768 769 Arguments: 770 Mol (object): RDKit molecule object. 771 AtomIndices (list): List of atom indices. 772 773 Returns: 774 bool : True - Sequentially connected; Otherwise, false. 775 776 """ 777 778 for Index in range(0, (len(AtomIndices) -1)): 779 Bond = Mol.GetBondBetweenAtoms(AtomIndices[Index], AtomIndices[Index + 1]) 780 if Bond is None: 781 return False 782 783 if Bond.GetIdx() is None: 784 return False 785 786 return True 787 788 def ReorderAtomIndicesInSequentiallyConnectedManner(Mol, AtomIndices): 789 """Check for the presence of sequentially connected list of atoms in an 790 arbitray list of atoms in molecule. 791 792 Arguments: 793 Mol (object): RDKit molecule object. 794 AtomIndices (list): List of atom indices. 795 796 Returns: 797 bool : True - Sequentially connected list found; Otherwise, false. 798 list : List of seqeuntially connected atoms or None. 799 800 """ 801 802 # Count the number of neighbors for specified atom indices ensuring 803 # that the neighbors are also part of atom indices... 804 AtomNbrsCount = {} 805 for AtomIndex in AtomIndices: 806 Atom = Mol.GetAtomWithIdx(AtomIndex) 807 808 AtomNbrsCount[AtomIndex] = 0 809 for AtomNbr in Atom.GetNeighbors(): 810 AtomNbrIndex = AtomNbr.GetIdx() 811 if AtomNbrIndex not in AtomIndices: 812 continue 813 AtomNbrsCount[AtomIndex] += 1 814 815 # Number of neighbors for each specified atom indices must be 1 or 2 816 # for sequentially connected list of atom indices... 817 AtomsWithOneNbr = [] 818 for AtomIndex, NbrsCount in AtomNbrsCount.items(): 819 if not (NbrsCount == 1 or NbrsCount ==2): 820 return (False, None) 821 822 if NbrsCount == 1: 823 AtomsWithOneNbr.append(AtomIndex) 824 825 # A sequentially connected list of indices must have two atom indices with 826 # exactly # one neighbor... 827 if len(AtomsWithOneNbr) != 2: 828 return (False, None) 829 830 # Setup a reordered list of sequentially connected atoms... 831 ReorderedAtomIndices = [] 832 833 AtomIndex1, AtomIndex2 = AtomsWithOneNbr 834 AtomIndex = AtomIndex1 if AtomIndex1 < AtomIndex2 else AtomIndex2 835 ReorderedAtomIndices.append(AtomIndex) 836 837 while (len(ReorderedAtomIndices) < len(AtomIndices)): 838 Atom = Mol.GetAtomWithIdx(AtomIndex) 839 840 for AtomNbr in Atom.GetNeighbors(): 841 AtomNbrIndex = AtomNbr.GetIdx() 842 if AtomNbrIndex not in AtomIndices: 843 continue 844 845 if AtomNbrIndex in ReorderedAtomIndices: 846 continue 847 848 # Treat neighbor as next connected atom... 849 AtomIndex = AtomNbrIndex 850 ReorderedAtomIndices.append(AtomIndex) 851 break 852 853 # Check reorderd list size... 854 if (len(ReorderedAtomIndices) != len(AtomIndices)): 855 return (False, None) 856 857 # A final check to validate reorderd list... 858 if not AreAtomIndicesSequentiallyConnected(Mol, ReorderedAtomIndices): 859 return (False, None) 860 861 return (True, ReorderedAtomIndices) 862 863 def MolToBase64EncodedMolString(Mol, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps): 864 """Encode RDkit molecule object into a base64 encoded string. The properties 865 can be optionally excluded. 866 867 The molecule is pickled using RDKit Mol.ToBinary() function before 868 their encoding. 869 870 Arguments: 871 Mol (object): RDKit molecule object. 872 PropertyPickleFlags: RDKit property pickle options. 873 874 Returns: 875 str : Base64 encode molecule string or None. 876 877 Notes: 878 The following property pickle flags are currently available in RDKit: 879 880 Chem.PropertyPickleOptions.NoProps 881 Chem.PropertyPickleOptions.MolProps 882 Chem.PropertyPickleOptions.AtomProps 883 Chem.PropertyPickleOptions.BondProps 884 Chem.PropertyPickleOptions.PrivateProps 885 Chem.PropertyPickleOptions.AllProps 886 887 """ 888 889 return None if Mol is None else base64.b64encode(Mol.ToBinary(PropertyPickleFlags)).decode() 890 891 def MolFromBase64EncodedMolString(EncodedMol): 892 """Generate a RDKit molecule object from a base64 encoded string. 893 894 Arguments: 895 str: Base64 encoded molecule string. 896 897 Returns: 898 object : RDKit molecule object or None. 899 900 """ 901 902 return None if EncodedMol is None else Chem.Mol(base64.b64decode(EncodedMol)) 903 904 def GenerateBase64EncodedMolStrings(Mols, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps): 905 """Setup an iterator for generating base64 encoded molecule string 906 from a RDKit molecule iterator. The iterator returns a list containing 907 a molecule index and encoded molecule string or None. 908 909 The molecules are pickled using RDKit Mol.ToBinary() function 910 before their encoding. 911 912 Arguments: 913 iterator: RDKit molecules iterator. 914 PropertyFlags: RDKit property pickle options. 915 916 Returns: 917 object : Base64 endcoded molecules iterator. The iterator returns a 918 list containing a molecule index and an encoded molecule string 919 or None. 920 921 Notes: 922 The following property pickle flags are currently available in RDKit: 923 924 Chem.PropertyPickleOptions.NoProps 925 Chem.PropertyPickleOptions.MolProps 926 Chem.PropertyPickleOptions.AtomProps 927 Chem.PropertyPickleOptions.BondProps 928 Chem.PropertyPickleOptions.PrivateProps 929 Chem.PropertyPickleOptions.AllProps 930 931 Examples: 932 933 EncodedMolsInfo = GenerateBase64EncodedMolStrings(Mols) 934 for MolIndex, EncodedMol in EncodedMolsInfo: 935 if EncodeMol is not None: 936 Mol = MolFromBase64EncodedMolString(EncodedMol) 937 938 """ 939 for MolIndex, Mol in enumerate(Mols): 940 yield [MolIndex, None] if Mol is None else [MolIndex, MolToBase64EncodedMolString(Mol, PropertyPickleFlags)] 941 942 def GenerateBase64EncodedMolStringsWithIDs(Mols, MolIDs, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps): 943 """Setup an iterator for generating base64 encoded molecule string 944 from a RDKit molecule iterator. The iterator returns a list containing 945 a molecule ID and encoded molecule string or None. 946 947 The molecules are pickled using RDKit Mol.ToBinary() function 948 before their encoding. 949 950 Arguments: 951 iterator: RDKit molecules iterator. 952 MolIDs (list): Molecule IDs. 953 PropertyFlags: RDKit property pickle options. 954 955 Returns: 956 object : Base64 endcoded molecules iterator. The iterator returns a 957 list containing a molecule ID and an encoded molecule string 958 or None. 959 960 Notes: 961 The following property pickle flags are currently available in RDKit: 962 963 Chem.PropertyPickleOptions.NoProps 964 Chem.PropertyPickleOptions.MolProps 965 Chem.PropertyPickleOptions.AtomProps 966 Chem.PropertyPickleOptions.BondProps 967 Chem.PropertyPickleOptions.PrivateProps 968 Chem.PropertyPickleOptions.AllProps 969 970 Examples: 971 972 EncodedMolsInfo = GenerateBase64EncodedMolStringsWithIDs(Mols) 973 for MolID, EncodedMol in EncodedMolsInfo: 974 if EncodeMol is not None: 975 Mol = MolFromBase64EncodedMolString(EncodedMol) 976 977 """ 978 for MolIndex, Mol in enumerate(Mols): 979 yield [MolIDs[MolIndex], None] if Mol is None else [MolIDs[MolIndex], MolToBase64EncodedMolString(Mol, PropertyPickleFlags)] 980 981 def GenerateBase64EncodedMolStringWithConfIDs(Mol, MolIndex, ConfIDs, PropertyPickleFlags = Chem.PropertyPickleOptions.AllProps): 982 """Setup an iterator generating base64 encoded molecule string for a 983 molecule. The iterator returns a list containing a molecule index, an encoded 984 molecule string, and conf ID. 985 986 The molecules are pickled using RDKit Mol.ToBinary() function 987 before their encoding. 988 989 Arguments: 990 Mol (object): RDKit molecule object. 991 MolIndex (int): Molecule index. 992 ConfIDs (list): Conformer IDs. 993 PropertyFlags: RDKit property pickle options. 994 995 Returns: 996 object : Base64 endcoded molecules iterator. The iterator returns a 997 list containing a molecule index, an encoded molecule string, and 998 conf ID. 999 1000 Notes: 1001 The following property pickle flags are currently available in RDKit: 1002 1003 Chem.PropertyPickleOptions.NoProps 1004 Chem.PropertyPickleOptions.MolProps 1005 Chem.PropertyPickleOptions.AtomProps 1006 Chem.PropertyPickleOptions.BondProps 1007 Chem.PropertyPickleOptions.PrivateProps 1008 Chem.PropertyPickleOptions.AllProps 1009 1010 Examples: 1011 1012 EncodedMolsInfo = GenerateBase64EncodedMolStringWithConfIDs(Mol, MolIndex, ConfIDs) 1013 for MolIndex, EncodedMol, ConfID in EncodedMolsInfo: 1014 if EncodeMol is not None: 1015 Mol = MolFromBase64EncodedMolString(EncodedMol) 1016 1017 """ 1018 for ConfID in ConfIDs: 1019 yield [MolIndex, None, ConfID] if Mol is None else [MolIndex, MolToBase64EncodedMolString(Mol, PropertyPickleFlags), ConfID] 1020 1021 def AreAtomMapNumbersPresentInMol(Mol): 1022 """Check for the presence of atom map numbers in a molecue. 1023 1024 Arguments: 1025 Mol (object): RDKit molecule object. 1026 1027 Returns: 1028 bool : True - Atom map numbers present; Otherwise, false. 1029 1030 """ 1031 1032 return False if _GetAtomMapIndices(Mol) is None else True 1033 1034 def ClearAtomMapNumbers(Mol, AllowImplicitValence = True, ClearRadicalElectrons = True): 1035 """Check and clear atom map numbers in a molecule. In addition, allow implicit 1036 valence and clear radical electrons for atoms with associated map numbers. 1037 1038 For example, the following atomic properties are assigned by RDKit to atom 1039 map number 1 in a molecule corresponding to SMILES C[C:1](C)C: 1040 1041 NoImplicit: True; ImplicitValence: 0; ExplicitValence: 3; NumExplicitHs: 0; 1042 NumImplicitHs: 0; NumRadicalElectrons: 1 1043 1044 This function clears atoms map numbers in the molecule leading to SMILES 1045 CC(C)C, along with optionally updating atomic properties as shown below: 1046 1047 NoImplicit: False; ImplicitValence: 1; ExplicitValence: 3; NumExplicitHs: 0; 1048 NumImplicitHs: 1; NumRadicalElectrons: 0 1049 1050 Arguments: 1051 Mol (object): RDKit molecule object. 1052 1053 Returns: 1054 Mol (object): RDKit molecule object. 1055 1056 """ 1057 1058 AtomMapIndices = GetAtomMapIndices(Mol) 1059 1060 if AtomMapIndices is None: 1061 return Mol 1062 1063 for AtomMapIndex in AtomMapIndices: 1064 Atom = Mol.GetAtomWithIdx(AtomMapIndex) 1065 1066 # Clear map number property 'molAtomMapNumber'... 1067 Atom.SetAtomMapNum(0) 1068 1069 # Allow implit valence... 1070 if AllowImplicitValence: 1071 Atom.SetNoImplicit(False) 1072 1073 # Set number of electrons to 0... 1074 if ClearRadicalElectrons: 1075 Atom.SetNumRadicalElectrons(0) 1076 1077 Atom.UpdatePropertyCache() 1078 1079 Mol.UpdatePropertyCache() 1080 1081 return Mol 1082 1083 def GetAtomMapIndices(Mol): 1084 """Get a list of available atom indices corresponding to atom map numbers 1085 present in a SMILES/SMARTS pattern used for creating a molecule. The list of 1086 atom indices is sorted in ascending order by atom map numbers. 1087 1088 Arguments: 1089 Mol (object): RDKit molecule object. 1090 1091 Returns: 1092 list : List of atom indices sorted in the ascending order of atom map 1093 numbers or None. 1094 1095 """ 1096 1097 return _GetAtomMapIndices(Mol) 1098 1099 def GetAtomMapIndicesAndMapNumbers(Mol): 1100 """Get lists of available atom indices and atom map numbers present in a 1101 SMILES/SMARTS pattern used for creating a molecule. Both lists are sorted 1102 in ascending order by atom map numbers. 1103 1104 Arguments: 1105 Mol (object): RDKit molecule object. 1106 1107 Returns: 1108 list : List of atom indices sorted in the ascending order of atom map 1109 numbers or None. 1110 list : List of atom map numbers sorted in the ascending order or None. 1111 1112 """ 1113 1114 return (_GetAtomMapIndicesAndMapNumbers(Mol)) 1115 1116 def MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, FilterByAtomMapNums = False): 1117 """Generate a RDKit molecule object for a list of matched atom indices 1118 present in a pattern molecule. The list of atom indices correspond to a 1119 list retrieved by RDKit function GetSubstructureMatches using SMILES/SMARTS 1120 pattern. The atom indices are optionally filtered by mapping atom numbers 1121 to appropriate atom indices during the generation of the molecule. 1122 For example: [O:1]=[S:2](=[O])[C:3][C:4]. 1123 1124 Arguments: 1125 Mol (object): RDKit molecule object. 1126 PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern. 1127 AtomIndices (list): Atom indices. 1128 FilterByAtomMapNums (bool): Filter matches by atom map numbers. 1129 1130 Returns: 1131 object : RDKit molecule object or None. 1132 1133 """ 1134 1135 AtomMapIndices = _GetAtomMapIndices(PatternMol) if FilterByAtomMapNums else None 1136 1137 return (_MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, AtomMapIndices)) 1138 1139 def MolsFromSubstructureMatches(Mol, PatternMol, AtomIndicesList, FilterByAtomMapNums = False): 1140 """Generate a list of RDKit molecule objects for a list containing lists of 1141 matched atom indices present in a pattern molecule. The list of atom indices 1142 correspond to a list retrieved by RDKit function GetSubstructureMatches using 1143 SMILES/SMARTS pattern. The atom indices are optionally filtered by mapping 1144 atom numbers to appropriate atom indices during the generation of the 1145 molecule. For example: [O:1]=[S:2](=[O])[C:3][C:4]. 1146 1147 Arguments: 1148 Mol (object): RDKit molecule object. 1149 PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern. 1150 AtomIndicesList (list): A list of lists containing atom indices. 1151 FilterByAtomMapNums (bool): Filter matches by atom map numbers. 1152 1153 Returns: 1154 list : A list of lists containg RDKit molecule objects or None. 1155 1156 """ 1157 1158 AtomMapIndices = _GetAtomMapIndices(PatternMol) if FilterByAtomMapNums else None 1159 1160 Mols = [] 1161 for AtomIndices in AtomIndicesList: 1162 Mols.append(_MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, AtomMapIndices)) 1163 1164 return Mols if len(Mols) else None 1165 1166 def FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices): 1167 """Filter a list of matched atom indices by map atom numbers present in a 1168 pattern molecule. The list of atom indices correspond to a list retrieved by 1169 RDKit function GetSubstructureMatches using SMILES/SMARTS pattern. The 1170 atom map numbers are mapped to appropriate atom indices during the generation 1171 of molecules. For example: [O:1]=[S:2](=[O])[C:3][C:4]. 1172 1173 Arguments: 1174 Mol (object): RDKit molecule object. 1175 PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern. 1176 AtomIndices (list): Atom indices. 1177 1178 Returns: 1179 list : A list of filtered atom indices. 1180 1181 """ 1182 AtomMapIndices = _GetAtomMapIndices(PatternMol) 1183 1184 return _FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices, AtomMapIndices) 1185 1186 def FilterSubstructureMatchesByAtomMapNumbers(Mol, PatternMol, AtomIndicesList): 1187 """Filter a list of lists containing matched atom indices by map atom numbers 1188 present in a pattern molecule. The list of atom indices correspond to a list retrieved by 1189 RDKit function GetSubstructureMatches using SMILES/SMARTS pattern. The 1190 atom map numbers are mapped to appropriate atom indices during the generation 1191 of molecules. For example: [O:1]=[S:2](=[O])[C:3][C:4]. 1192 1193 Arguments: 1194 Mol (object): RDKit molecule object. 1195 PatternMol (object): RDKit molecule object for a SMILES/SMARTS pattern. 1196 AtomIndicesList (list): A list of lists containing atom indices. 1197 1198 Returns: 1199 list : A list of lists containing filtered atom indices. 1200 1201 """ 1202 AtomMapIndices = _GetAtomMapIndices(PatternMol) 1203 1204 MatchedAtomIndicesList = [] 1205 for AtomIndices in AtomIndicesList: 1206 MatchedAtomIndicesList.append(_FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices, AtomMapIndices)) 1207 1208 return MatchedAtomIndicesList 1209 1210 def _MolFromSubstructureMatch(Mol, PatternMol, AtomIndices, AtomMapIndices): 1211 """Generate a RDKit molecule object for a list of matched atom indices and available 1212 atom map indices. 1213 """ 1214 1215 if AtomMapIndices is not None: 1216 MatchedAtomIndices = [AtomIndices[Index] for Index in AtomMapIndices] 1217 else: 1218 MatchedAtomIndices = list(AtomIndices) 1219 1220 return _GetMolFromAtomIndices(Mol, MatchedAtomIndices) 1221 1222 def _GetAtomMapIndices(Mol): 1223 """Get a list of available atom indices corresponding to sorted atom map 1224 numbers present in a SMILES/SMARTS pattern used for creating a molecule. 1225 """ 1226 1227 AtomMapIndices, AtomMapNumbers = _GetAtomMapIndicesAndMapNumbers(Mol) 1228 1229 return AtomMapIndices 1230 1231 def _GetAtomMapIndicesAndMapNumbers(Mol): 1232 """Get a list of available atom indices and atom map numbers present 1233 in a SMILES/SMARTS pattern used for creating a molecule. Both lists 1234 are sorted in ascending order by atom map numbers. 1235 """ 1236 1237 # Setup a atom map number to atom indices map.. 1238 AtomMapNumToIndices = {} 1239 for Atom in Mol.GetAtoms(): 1240 AtomMapNum = Atom.GetAtomMapNum() 1241 1242 if AtomMapNum: 1243 AtomMapNumToIndices[AtomMapNum] = Atom.GetIdx() 1244 1245 # Setup atom indices corresponding to sorted atom map numbers... 1246 AtomMapIndices = None 1247 AtomMapNumbers = None 1248 if len(AtomMapNumToIndices): 1249 AtomMapNumbers = sorted(AtomMapNumToIndices) 1250 AtomMapIndices = [AtomMapNumToIndices[AtomMapNum] for AtomMapNum in AtomMapNumbers] 1251 1252 return (AtomMapIndices, AtomMapNumbers) 1253 1254 def _FilterSubstructureMatchByAtomMapNumbers(Mol, PatternMol, AtomIndices, AtomMapIndices): 1255 """Filter substructure match atom indices by atom map indices corresponding to 1256 atom map numbers. 1257 """ 1258 1259 if AtomMapIndices is None: 1260 return list(AtomIndices) 1261 1262 return [AtomIndices[Index] for Index in AtomMapIndices] 1263 1264 def _GetMolFromAtomIndices(Mol, AtomIndices): 1265 """Generate a RDKit molecule object from atom indices returned by 1266 substructure search. 1267 """ 1268 1269 BondIndices = [] 1270 for AtomIndex in AtomIndices: 1271 Atom = Mol.GetAtomWithIdx(AtomIndex) 1272 1273 for AtomNbr in Atom.GetNeighbors(): 1274 AtomNbrIndex = AtomNbr.GetIdx() 1275 if AtomNbrIndex not in AtomIndices: 1276 continue 1277 1278 BondIndex = Mol.GetBondBetweenAtoms(AtomIndex, AtomNbrIndex).GetIdx() 1279 if BondIndex in BondIndices: 1280 continue 1281 1282 BondIndices.append(BondIndex) 1283 1284 MatchedMol = Chem.PathToSubmol(Mol, BondIndices) if len(BondIndices) else None 1285 1286 return MatchedMol 1287 1288 def ConstrainAndEmbed(mol, core, coreMatchesMol=None, useTethers=True, coreConfId=-1, randomseed=2342, getForceField=AllChem.UFFGetMoleculeForceField, **kwargs): 1289 """ 1290 The function is a local copy of RDKit fucntion AllChem.ConstrainedEmbed(). 1291 It has been enhanced to support an explicit list of core matches corresponding 1292 to the matched atom indices in the molecule. The number of matched atom indices 1293 must be equal to the number of atoms in core molecule. 1294 1295 Arguments: 1296 mol (object): RDKit molecule object to embed. 1297 core (object): RDKit molecule to use as a source of constraints. 1298 coreMatchesMol (list): A list matches atom indices in mol. 1299 useTethers: (bool) if True, the final conformation will be optimized 1300 subject to a series of extra forces that pull the matching atoms to 1301 the positions of the core atoms. Otherwise simple distance 1302 constraints based on the core atoms will be used in the 1303 optimization. 1304 coreConfId (int): ID of the core conformation to use. 1305 randomSeed (int): Seed for the random number generator 1306 1307 Returns: 1308 mol (object): RDKit molecule object. 1309 1310 """ 1311 if coreMatchesMol is None: 1312 match = mol.GetSubstructMatch(core) 1313 if not match: 1314 raise ValueError("Molecule doesn't match the core.") 1315 else: 1316 if core.GetNumAtoms() != len(coreMatchesMol): 1317 raise ValueError("Number of atoms, %s, in core molecule must match number of atom indices, %s, specified in the list coreMatchesMol." % (core.GetNumAtoms(), len(coreMatchesMol))) 1318 # Check specified matched atom indices in coreMatchesMol and use the match atom 1319 # indices returned by GetSubstructMatches() for embedding... 1320 coreMatch = None 1321 matches = mol.GetSubstructMatches(core) 1322 for match in matches: 1323 if len(match) != len(coreMatchesMol): 1324 continue 1325 matchFound = True 1326 for atomIndex in match: 1327 if atomIndex not in coreMatchesMol: 1328 matchFound = False 1329 break 1330 if matchFound: 1331 coreMatch = match 1332 break 1333 if coreMatch is None: 1334 raise ValueError("Molecule doesn't match the atom indices specified in the list coreMatchesMol.") 1335 match = coreMatch 1336 1337 coordMap = {} 1338 coreConf = core.GetConformer(coreConfId) 1339 for i, idxI in enumerate(match): 1340 corePtI = coreConf.GetAtomPosition(i) 1341 coordMap[idxI] = corePtI 1342 1343 ci = AllChem.EmbedMolecule(mol, coordMap=coordMap, randomSeed=randomseed, **kwargs) 1344 if ci < 0: 1345 raise ValueError('Could not embed molecule.') 1346 1347 algMap = [(j, i) for i, j in enumerate(match)] 1348 1349 if not useTethers: 1350 # clean up the conformation 1351 ff = getForceField(mol, confId=0) 1352 for i, idxI in enumerate(match): 1353 for j in range(i + 1, len(match)): 1354 idxJ = match[j] 1355 d = coordMap[idxI].Distance(coordMap[idxJ]) 1356 ff.AddDistanceConstraint(idxI, idxJ, d, d, 100.) 1357 ff.Initialize() 1358 n = 4 1359 more = ff.Minimize() 1360 while more and n: 1361 more = ff.Minimize() 1362 n -= 1 1363 # rotate the embedded conformation onto the core: 1364 rms = AllChem.AlignMol(mol, core, atomMap=algMap) 1365 else: 1366 # rotate the embedded conformation onto the core: 1367 rms = AllChem.AlignMol(mol, core, atomMap=algMap) 1368 ff = getForceField(mol, confId=0) 1369 conf = core.GetConformer() 1370 for i in range(core.GetNumAtoms()): 1371 p = conf.GetAtomPosition(i) 1372 pIdx = ff.AddExtraPoint(p.x, p.y, p.z, fixed=True) - 1 1373 ff.AddDistanceConstraint(pIdx, match[i], 0, 0, 100.) 1374 ff.Initialize() 1375 n = 4 1376 more = ff.Minimize(energyTol=1e-4, forceTol=1e-3) 1377 while more and n: 1378 more = ff.Minimize(energyTol=1e-4, forceTol=1e-3) 1379 n -= 1 1380 # realign 1381 rms = AllChem.AlignMol(mol, core, atomMap=algMap) 1382 1383 mol.SetProp('EmbedRMS', str(rms)) 1384 return mol 1385 1386 def ReadAndValidateMolecules(FileName, **KeyWordArgs): 1387 """Read molecules from an input file, validate all molecule objects, and return 1388 a list of valid and non-valid molecule objects along with their counts. 1389 1390 Arguments: 1391 FileName (str): Name of a file with complete path. 1392 **KeyWordArgs (dictionary) : Parameter name and value pairs for reading and 1393 processing molecules. 1394 1395 Returns: 1396 list : List of valid RDKit molecule objects. 1397 int : Number of total molecules in input file. 1398 int : Number of valid molecules in input file. 1399 1400 Notes: 1401 The file extension is used to determine type of the file and set up an appropriate 1402 file reader. 1403 1404 """ 1405 1406 AllowEmptyMols = True 1407 if "AllowEmptyMols" in KeyWordArgs: 1408 AllowEmptyMols = KeyWordArgs["AllowEmptyMols"] 1409 1410 Mols = ReadMolecules(FileName, **KeyWordArgs) 1411 1412 if AllowEmptyMols: 1413 ValidMols = [Mol for Mol in Mols if Mol is not None] 1414 else: 1415 ValidMols = [] 1416 MolCount = 0 1417 for Mol in Mols: 1418 MolCount += 1 1419 if Mol is None: 1420 continue 1421 1422 if IsMolEmpty(Mol): 1423 MolName = GetMolName(Mol, MolCount) 1424 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 1425 continue 1426 1427 ValidMols.append(Mol) 1428 1429 MolCount = len(Mols) 1430 ValidMolCount = len(ValidMols) 1431 1432 return (ValidMols, MolCount, ValidMolCount) 1433 1434 def ReadMolecules(FileName, **KeyWordArgs): 1435 """Read molecules from an input file without performing any validation 1436 and creation of molecule objects. 1437 1438 Arguments: 1439 FileName (str): Name of a file with complete path. 1440 **KeyWordArgs (dictionary) : Parameter name and value pairs for reading and 1441 processing molecules. 1442 1443 Returns: 1444 list : List of RDKit molecule objects. 1445 1446 Notes: 1447 The file extension is used to determine type of the file and set up an appropriate 1448 file reader. 1449 1450 """ 1451 1452 # Set default values for possible arguments... 1453 ReaderArgs = {"Sanitize": True, "RemoveHydrogens": True, "StrictParsing": True, "SMILESDelimiter" : ' ', "SMILESColumn": 1, "SMILESNameColumn": 2, "SMILESTitleLine": True } 1454 1455 # Set specified values for possible arguments... 1456 for Arg in ReaderArgs: 1457 if Arg in KeyWordArgs: 1458 ReaderArgs[Arg] = KeyWordArgs[Arg] 1459 1460 # Modify specific valeus for SMILES... 1461 if MiscUtil.CheckFileExt(FileName, "smi csv tsv txt"): 1462 Args = ["Sanitize", "SMILESTitleLine"] 1463 for Arg in Args: 1464 if ReaderArgs[Arg] is True: 1465 ReaderArgs[Arg] = 1 1466 else: 1467 ReaderArgs[Arg] = 0 1468 1469 Mols = [] 1470 if MiscUtil.CheckFileExt(FileName, "sdf sd"): 1471 return ReadMoleculesFromSDFile(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"], ReaderArgs['StrictParsing']) 1472 elif MiscUtil.CheckFileExt(FileName, "mol"): 1473 return ReadMoleculesFromMolFile(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"], ReaderArgs['StrictParsing']) 1474 elif MiscUtil.CheckFileExt(FileName, "mol2"): 1475 return ReadMoleculesFromMol2File(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"]) 1476 elif MiscUtil.CheckFileExt(FileName, "pdb"): 1477 return ReadMoleculesFromPDBFile(FileName, ReaderArgs["Sanitize"], ReaderArgs["RemoveHydrogens"]) 1478 elif MiscUtil.CheckFileExt(FileName, "smi txt csv tsv"): 1479 SMILESColumnIndex = ReaderArgs["SMILESColumn"] - 1 1480 SMILESNameColumnIndex = ReaderArgs["SMILESNameColumn"] - 1 1481 return ReadMoleculesFromSMILESFile(FileName, ReaderArgs["SMILESDelimiter"], SMILESColumnIndex, SMILESNameColumnIndex, ReaderArgs["SMILESTitleLine"], ReaderArgs["Sanitize"]) 1482 else: 1483 MiscUtil.PrintWarning("RDKitUtil.ReadMolecules: Non supported file type: %s" % FileName) 1484 1485 return Mols 1486 1487 def ReadMoleculesFromSDFile(FileName, Sanitize = True, RemoveHydrogens = True, StrictParsing = True): 1488 """Read molecules from a SD file. 1489 1490 Arguments: 1491 FileName (str): Name of a file with complete path. 1492 Sanitize (bool): Sanitize molecules. 1493 RemoveHydrogens (bool): Remove hydrogens from molecules. 1494 StrictParsing (bool): Perform strict parsing. 1495 1496 Returns: 1497 list : List of RDKit molecule objects. 1498 1499 """ 1500 return Chem.SDMolSupplier(FileName, sanitize = Sanitize, removeHs = RemoveHydrogens, strictParsing = StrictParsing) 1501 1502 def ReadMoleculesFromMolFile(FileName, Sanitize = True, RemoveHydrogens = True, StrictParsing = True): 1503 """Read molecule from a MDL Mol file. 1504 1505 Arguments: 1506 FileName (str): Name of a file with complete path. 1507 Sanitize (bool): Sanitize molecules. 1508 RemoveHydrogens (bool): Remove hydrogens from molecules. 1509 StrictParsing (bool): Perform strict parsing. 1510 1511 Returns: 1512 list : List of RDKit molecule objects. 1513 1514 """ 1515 1516 Mols = [] 1517 Mols.append(Chem.MolFromMolFile(FileName, sanitize = Sanitize, removeHs = RemoveHydrogens, strictParsing = StrictParsing)) 1518 return Mols 1519 1520 1521 def ReadMoleculesFromMol2File(FileName, Sanitize = True, RemoveHydrogens = True): 1522 """Read molecules from a Tripos Mol2 file. The first call to the function 1523 creates and returns a generator object using Python yield statement. The 1524 molecules are created during the subsequent iteration by the generator object. 1525 1526 Arguments: 1527 FileName (str): Name of a file with complete path. 1528 Sanitize (bool): Sanitize molecules. 1529 RemoveHydrogens (bool): Remove hydrogens from molecules. 1530 1531 Returns: 1532 list : A Python generator object for iterating over the molecules. 1533 1534 """ 1535 1536 return _Mol2MolSupplier(FileName, Sanitize, RemoveHydrogens) 1537 1538 def _Mol2MolSupplier(FileName, Sanitize = True, RemoveHydrogens = True): 1539 """Read molecules from a Tripos Mol2 file.""" 1540 1541 fh = open(FileName, 'r') 1542 1543 FirstMol = True 1544 ProcessingMol = False 1545 1546 for Line in fh: 1547 if re.match("^#", Line, re.I): 1548 continue 1549 1550 if re.match("^@<TRIPOS>MOLECULE", Line, re.I): 1551 ProcessingMol = True 1552 1553 if FirstMol: 1554 FirstMol = False 1555 1556 MolLines = [] 1557 MolLines.append(Line) 1558 continue 1559 1560 # Process lines for existing molecule... 1561 MolBlock = "".join(MolLines) 1562 1563 Mol = Chem.MolFromMol2Block(MolBlock, sanitize = Sanitize, removeHs = RemoveHydrogens) 1564 yield Mol 1565 1566 # Track lines for next molecule... 1567 MolLines = [] 1568 MolLines.append(Line) 1569 continue 1570 1571 if not ProcessingMol: 1572 continue 1573 1574 MolLines.append(Line) 1575 1576 fh.close 1577 1578 # Process last molecule... 1579 if len(MolLines): 1580 MolBlock = "".join(MolLines) 1581 Mol = Chem.MolFromMol2Block(MolBlock, sanitize = Sanitize, removeHs = RemoveHydrogens) 1582 yield Mol 1583 1584 def ReadMoleculesFromPDBFile(FileName, Sanitize = True, RemoveHydrogens = True): 1585 """Read molecule from a PDB file. 1586 1587 Arguments: 1588 FileName (str): Name of a file with complete path. 1589 Sanitize (bool): Sanitize molecules. 1590 RemoveHydrogens (bool): Remove hydrogens from molecules. 1591 1592 Returns: 1593 list : List of RDKit molecule objects. 1594 1595 """ 1596 1597 Mols = [] 1598 Mols.append(Chem.MolFromPDBFile(FileName, sanitize = Sanitize, removeHs = RemoveHydrogens)) 1599 return Mols 1600 1601 def ReadMoleculesFromSMILESFile(FileName, SMILESDelimiter = ' ', SMILESColIndex = 0, SMILESNameColIndex = 1, SMILESTitleLine = 1, Sanitize = 1): 1602 """Read molecules from a SMILES file. 1603 1604 Arguments: 1605 SMILESDelimiter (str): Delimiter for parsing SMILES line 1606 SMILESColIndex (int): Column index containing SMILES string. 1607 SMILESNameColIndex (int): Column index containing molecule name. 1608 SMILESTitleLine (int): Flag to indicate presence of title line. 1609 Sanitize (int): Sanitize molecules. 1610 1611 Returns: 1612 list : List of RDKit molecule objects. 1613 1614 """ 1615 1616 return Chem.SmilesMolSupplier(FileName, delimiter = SMILESDelimiter, smilesColumn = SMILESColIndex, nameColumn = SMILESNameColIndex, titleLine = SMILESTitleLine, sanitize = Sanitize) 1617 1618 def MoleculesWriter(FileName, **KeyWordArgs): 1619 """Set up a molecule writer. 1620 1621 Arguments: 1622 FileName (str): Name of a file with complete path. 1623 **KeyWordArgs (dictionary) : Parameter name and value pairs for writing and 1624 processing molecules. 1625 1626 Returns: 1627 RDKit object : Molecule writer. 1628 1629 Notes: 1630 The file extension is used to determine type of the file and set up an appropriate 1631 file writer. 1632 1633 """ 1634 1635 # Set default values for possible arguments... 1636 WriterArgs = {"Compute2DCoords" : False, "Kekulize": True, "ForceV3000": False, "SMILESKekulize": False, "SMILESDelimiter" : ' ', "SMILESIsomeric": True, "SMILESTitleLine": True, "SMILESMolName": True} 1637 1638 # Set specified values for possible arguments... 1639 for Arg in WriterArgs: 1640 if Arg in KeyWordArgs: 1641 WriterArgs[Arg] = KeyWordArgs[Arg] 1642 1643 Writer = None 1644 if MiscUtil.CheckFileExt(FileName, "sdf sd"): 1645 Writer = Chem.SDWriter(FileName) 1646 Writer.SetKekulize(WriterArgs["Kekulize"]) 1647 Writer.SetForceV3000(WriterArgs["ForceV3000"]) 1648 elif MiscUtil.CheckFileExt(FileName, "pdb"): 1649 Writer = Chem.PDBWriter(FileName) 1650 elif MiscUtil.CheckFileExt(FileName, "smi"): 1651 # Text for the name column in the title line. Blank indicates not to include name column 1652 # in the output file... 1653 NameHeader = 'Name' if WriterArgs["SMILESMolName"] else '' 1654 Writer = Chem.SmilesWriter(FileName, delimiter = WriterArgs["SMILESDelimiter"], nameHeader = NameHeader, includeHeader = WriterArgs["SMILESTitleLine"], isomericSmiles = WriterArgs["SMILESIsomeric"], kekuleSmiles = WriterArgs["SMILESKekulize"]) 1655 else: 1656 MiscUtil.PrintWarning("RDKitUtil.WriteMolecules: Non supported file type: %s" % FileName) 1657 1658 return Writer 1659 1660 def WriteMolecules(FileName, Mols, **KeyWordArgs): 1661 """Write molecules to an output file. 1662 1663 Arguments: 1664 FileName (str): Name of a file with complete path. 1665 Mols (list): List of RDKit molecule objects. 1666 **KeyWordArgs (dictionary) : Parameter name and value pairs for writing and 1667 processing molecules. 1668 1669 Returns: 1670 int : Number of total molecules. 1671 int : Number of processed molecules written to output file. 1672 1673 Notes: 1674 The file extension is used to determine type of the file and set up an appropriate 1675 file writer. 1676 1677 """ 1678 1679 Compute2DCoords = False 1680 if "Compute2DCoords" in KeyWordArgs: 1681 Compute2DCoords = KeyWordArgs["Compute2DCoords"] 1682 1683 SetSMILESMolProps = KeyWordArgs["SetSMILESMolProps"] if "SetSMILESMolProps" in KeyWordArgs else False 1684 1685 MolCount = 0 1686 ProcessedMolCount = 0 1687 1688 Writer = MoleculesWriter(FileName, **KeyWordArgs) 1689 1690 if Writer is None: 1691 return (MolCount, ProcessedMolCount) 1692 1693 FirstMol = True 1694 for Mol in Mols: 1695 MolCount += 1 1696 if Mol is None: 1697 continue 1698 1699 if FirstMol: 1700 FirstMol = False 1701 if SetSMILESMolProps: 1702 SetWriterMolProps(Writer, Mol) 1703 1704 ProcessedMolCount += 1 1705 if Compute2DCoords: 1706 AllChem.Compute2DCoords(Mol) 1707 1708 Writer.write(Mol) 1709 1710 Writer.close() 1711 1712 return (MolCount, ProcessedMolCount) 1713 1714 def SetWriterMolProps(Writer, Mol): 1715 """Setup molecule properties for a writer to output. 1716 1717 Arguments: 1718 Writer (object): RDKit writer object. 1719 Mol (object): RDKit molecule object. 1720 1721 Returns: 1722 object : Writer object. 1723 1724 """ 1725 PropNames = list(Mol.GetPropNames()) 1726 if len(PropNames): 1727 Writer.SetProps(PropNames) 1728 1729 return Writer 1730