1 #!/bin/env python 2 # 3 # File: RDKitRemoveInvalidMolecules.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 37 # RDKit imports... 38 try: 39 from rdkit import rdBase 40 from rdkit import Chem 41 from rdkit.Chem import AllChem 42 except ImportError as ErrMsg: 43 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 44 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 45 sys.exit(1) 46 47 # MayaChemTools imports... 48 try: 49 from docopt import docopt 50 import MiscUtil 51 import RDKitUtil 52 except ImportError as ErrMsg: 53 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 54 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 55 sys.exit(1) 56 57 ScriptName = os.path.basename(sys.argv[0]) 58 Options = {} 59 OptionsInfo = {} 60 61 def main(): 62 """Start execution of the script.""" 63 64 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 65 66 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 67 68 # Retrieve command line arguments and options... 69 RetrieveOptions() 70 71 # Process and validate command line arguments and options... 72 ProcessOptions() 73 74 # Perform actions required by the script... 75 RemoveInvalidMolecules() 76 77 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 78 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 79 80 def RemoveInvalidMolecules(): 81 """Identify and remove invalid molecules.""" 82 83 Infile = OptionsInfo["Infile"] 84 Outfile = OptionsInfo["Outfile"] 85 InvalidMolsOutfile = OptionsInfo["InvalidMolsOutfile"] 86 87 CountMode = OptionsInfo["CountMode"] 88 89 # Setup a molecule reader... 90 MiscUtil.PrintInfo("\nProcessing file %s..." % Infile) 91 Mols = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"]) 92 93 Writer = None 94 MolNumWriter = None 95 if not CountMode: 96 # Set up a molecule writer... 97 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"]) 98 if Writer is None: 99 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 100 101 # Set up a invalid molecule number writer... 102 InvalidMolsWriter = open(InvalidMolsOutfile, "w") 103 if InvalidMolsWriter is None: 104 MiscUtil.PrintError("Failed to open output fie %s " % InvalidMolsOutfile) 105 InvalidMolsWriter.write("MolName\n") 106 107 MiscUtil.PrintInfo("Generating files %s and %s..." % (Outfile, InvalidMolsOutfile)) 108 109 # Process molecules... 110 MolCount = 0 111 ValidMolCount = 0 112 113 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 114 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 115 116 FirstMol = True 117 for Mol in Mols: 118 MolCount += 1 119 120 if Mol is None: 121 MolName = "Mol%s" % MolCount 122 MiscUtil.PrintWarning("Ignoring invalid molecule: %s" % MolName) 123 if not CountMode: 124 InvalidMolsWriter.write("%s\n" % MolName) 125 continue 126 127 if RDKitUtil.IsMolEmpty(Mol): 128 MolName = RDKitUtil.GetMolName(Mol, MolCount) 129 MiscUtil.PrintWarning("Ignoring invalid empty molecule: %s" % MolName) 130 if not CountMode: 131 InvalidMolsWriter.write("%s\n" % MolName) 132 continue 133 ValidMolCount += 1 134 135 if FirstMol: 136 FirstMol = False 137 if not CountMode: 138 if SetSMILESMolProps: 139 RDKitUtil.SetWriterMolProps(Writer, Mol) 140 141 if Compute2DCoords: 142 if not CountMode: 143 AllChem.Compute2DCoords(Mol) 144 145 if not CountMode: 146 Writer.write(Mol) 147 148 if Writer is not None: 149 Writer.close() 150 151 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 152 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 153 MiscUtil.PrintInfo("Number of invalid molecules: %d" % (MolCount - ValidMolCount)) 154 155 def ProcessOptions(): 156 """Process and validate command line arguments and options.""" 157 158 MiscUtil.PrintInfo("Processing options...") 159 160 # Validate options... 161 ValidateOptions() 162 163 OptionsInfo["Infile"] = Options["--infile"] 164 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 165 166 OptionsInfo["Outfile"] = Options["--outfile"] 167 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 168 169 OptionsInfo["Overwrite"] = Options["--overwrite"] 170 171 OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False 172 173 # Setup outfile for writing out molecule number for invalid molecules... 174 OptionsInfo["InvalidMolsOutfile"] = "" 175 if not OptionsInfo["CountMode"] : 176 FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"]) 177 OptionsInfo["InvalidMolsOutfile"] = "%sInvalidMols.csv" % (FileName) 178 179 def RetrieveOptions(): 180 """Retrieve command line arguments and options.""" 181 182 # Get options... 183 global Options 184 Options = docopt(_docoptUsage_) 185 186 # Set current working directory to the specified directory... 187 WorkingDir = Options["--workingdir"] 188 if WorkingDir: 189 os.chdir(WorkingDir) 190 191 # Handle examples option... 192 if "--examples" in Options and Options["--examples"]: 193 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 194 sys.exit(0) 195 196 def ValidateOptions(): 197 """Validate option values.""" 198 199 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 200 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv") 201 202 if Options["--outfile"]: 203 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi") 204 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 205 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 206 207 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count") 208 if re.match("^remove$", Options["--mode"], re.I): 209 if not Options["--outfile"]: 210 MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"remove\" value of \"-m, --mode\" option") 211 212 # Setup a usage string for docopt... 213 _docoptUsage_ = """ 214 RDKitRemoveInvalidMolecules.py - Remove invalid molecules 215 216 Usage: 217 RDKitRemoveInvalidMolecules.py [--infileParams <Name,Value,...>] 218 [--mode <remove or count>] [ --outfileParams <Name,Value,...> ] 219 [--overwrite] [-w <dir>] [-o <outfile>] -i <infile> 220 RDKitRemoveInvalidMolecules.py -h | --help | -e | --examples 221 222 Description: 223 Identify and remove invalid molecules based on success or failure of RDKit molecule 224 readers or simply count the number of invalid molecules. 225 226 The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt) 227 228 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi) 229 230 Options: 231 -e, --examples 232 Print examples. 233 -h, --help 234 Print this help message. 235 -i, --infile <infile> 236 Input file name. 237 --infileParams <Name,Value,...> [default: auto] 238 A comma delimited list of parameter name and value pairs for reading 239 molecules from files. The supported parameter names for different file 240 formats, along with their default values, are shown below: 241 242 SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes 243 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 244 smilesTitleLine,auto,sanitize,yes 245 246 Possible values for smilesDelimiter: space, comma or tab. 247 -m, --mode <remove or count> [default: remove] 248 Specify whether to remove invalid molecules and write out filtered molecules 249 to output file or or simply count the number of invalid molecules. 250 -o, --outfile <outfile> 251 Output file name. 252 --outfileParams <Name,Value,...> [default: auto] 253 A comma delimited list of parameter name and value pairs for writing 254 molecules to files. The supported parameter names for different file 255 formats, along with their default values, are shown below: 256 257 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 258 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 259 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no 260 261 Default value for compute2DCoords: yes for SMILES input file; no for all other 262 file types. 263 --overwrite 264 Overwrite existing files. 265 -w, --workingdir <dir> 266 Location of working directory which defaults to the current directory. 267 268 Examples: 269 To remove invalid molecules and generate an output file SMILES file 270 containing valid molecules, type: 271 272 % RDKitRemoveInvalidMolecules.py -i Sample.smi -o SampleOut.smi 273 274 To count number of valid and invaid molecules without generating any 275 output file, type: 276 277 % RDKitRemoveInvalidMolecules.py -m count -i Sample.sdf 278 279 To remove invalid molecules from a CSV SMILES file, SMILES strings in 280 column 1, name in column 2, and generate output SD file containing valid 281 molecules, type: 282 283 % RDKitRemoveInvalidMolecules.py --infileParams 284 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 285 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 286 -i SampleSMILES.csv -o SampleOut.sdf 287 288 Author: 289 Manish Sud(msud@san.rr.com) 290 291 See also: 292 RDKitConvertFileFormat.py, RDKitRemoveDuplicateMolecules.py, 293 RDKitRemoveSalts, RDKitSearchFunctionalGroups.py, 294 RDKitSearchSMARTS.py, RDKitStandardizeMolecules.py 295 296 Copyright: 297 Copyright (C) 2024 Manish Sud. All rights reserved. 298 299 The functionality available in this script is implemented using RDKit, an 300 open source toolkit for cheminformatics developed by Greg Landrum. 301 302 This file is part of MayaChemTools. 303 304 MayaChemTools is free software; you can redistribute it and/or modify it under 305 the terms of the GNU Lesser General Public License as published by the Free 306 Software Foundation; either version 3 of the License, or (at your option) any 307 later version. 308 309 """ 310 311 if __name__ == "__main__": 312 main()