1 #!/bin/env python 2 # 3 # File: RDKitRemoveInvalidMolecules.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2025 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 import os 32 import sys 33 import time 34 import re 35 36 # RDKit imports... 37 try: 38 from rdkit import rdBase 39 from rdkit.Chem import AllChem 40 except ImportError as ErrMsg: 41 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 42 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 43 sys.exit(1) 44 45 # MayaChemTools imports... 46 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 47 try: 48 from docopt import docopt 49 import MiscUtil 50 import RDKitUtil 51 except ImportError as ErrMsg: 52 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 53 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 54 sys.exit(1) 55 56 ScriptName = os.path.basename(sys.argv[0]) 57 Options = {} 58 OptionsInfo = {} 59 60 61 def main(): 62 """Start execution of the script.""" 63 64 MiscUtil.PrintInfo( 65 "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" 66 % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()) 67 ) 68 69 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 70 71 # Retrieve command line arguments and options... 72 RetrieveOptions() 73 74 # Process and validate command line arguments and options... 75 ProcessOptions() 76 77 # Perform actions required by the script... 78 RemoveInvalidMolecules() 79 80 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 81 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 82 83 84 def RemoveInvalidMolecules(): 85 """Identify and remove invalid molecules.""" 86 87 Infile = OptionsInfo["Infile"] 88 Outfile = OptionsInfo["Outfile"] 89 InvalidMolsOutfile = OptionsInfo["InvalidMolsOutfile"] 90 91 CountMode = OptionsInfo["CountMode"] 92 93 # Setup a molecule reader... 94 MiscUtil.PrintInfo("\nProcessing file %s..." % Infile) 95 Mols = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"]) 96 97 Writer = None 98 if not CountMode: 99 # Set up a molecule writer... 100 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"]) 101 if Writer is None: 102 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 103 104 # Set up a invalid molecule number writer... 105 InvalidMolsWriter = open(InvalidMolsOutfile, "w") 106 if InvalidMolsWriter is None: 107 MiscUtil.PrintError("Failed to open output fie %s " % InvalidMolsOutfile) 108 InvalidMolsWriter.write("MolName\n") 109 110 MiscUtil.PrintInfo("Generating files %s and %s..." % (Outfile, InvalidMolsOutfile)) 111 112 # Process molecules... 113 MolCount = 0 114 ValidMolCount = 0 115 116 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 117 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 118 119 FirstMol = True 120 for Mol in Mols: 121 MolCount += 1 122 123 if Mol is None: 124 MolName = "Mol%s" % MolCount 125 MiscUtil.PrintWarning("Ignoring invalid molecule: %s" % MolName) 126 if not CountMode: 127 InvalidMolsWriter.write("%s\n" % MolName) 128 continue 129 130 if RDKitUtil.IsMolEmpty(Mol): 131 MolName = RDKitUtil.GetMolName(Mol, MolCount) 132 MiscUtil.PrintWarning("Ignoring invalid empty molecule: %s" % MolName) 133 if not CountMode: 134 InvalidMolsWriter.write("%s\n" % MolName) 135 continue 136 ValidMolCount += 1 137 138 if FirstMol: 139 FirstMol = False 140 if not CountMode: 141 if SetSMILESMolProps: 142 RDKitUtil.SetWriterMolProps(Writer, Mol) 143 144 if Compute2DCoords: 145 if not CountMode: 146 AllChem.Compute2DCoords(Mol) 147 148 if not CountMode: 149 Writer.write(Mol) 150 151 if Writer is not None: 152 Writer.close() 153 154 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 155 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 156 MiscUtil.PrintInfo("Number of invalid molecules: %d" % (MolCount - ValidMolCount)) 157 158 159 def ProcessOptions(): 160 """Process and validate command line arguments and options.""" 161 162 MiscUtil.PrintInfo("Processing options...") 163 164 # Validate options... 165 ValidateOptions() 166 167 OptionsInfo["Infile"] = Options["--infile"] 168 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters( 169 "--infileParams", Options["--infileParams"], Options["--infile"] 170 ) 171 172 OptionsInfo["Outfile"] = Options["--outfile"] 173 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters( 174 "--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"] 175 ) 176 177 OptionsInfo["Overwrite"] = Options["--overwrite"] 178 179 OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False 180 181 # Setup outfile for writing out molecule number for invalid molecules... 182 OptionsInfo["InvalidMolsOutfile"] = "" 183 if not OptionsInfo["CountMode"]: 184 FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"]) 185 OptionsInfo["InvalidMolsOutfile"] = "%sInvalidMols.csv" % (FileName) 186 187 188 def RetrieveOptions(): 189 """Retrieve command line arguments and options.""" 190 191 # Get options... 192 global Options 193 Options = docopt(_docoptUsage_) 194 195 # Set current working directory to the specified directory... 196 WorkingDir = Options["--workingdir"] 197 if WorkingDir: 198 os.chdir(WorkingDir) 199 200 # Handle examples option... 201 if "--examples" in Options and Options["--examples"]: 202 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 203 sys.exit(0) 204 205 206 def ValidateOptions(): 207 """Validate option values.""" 208 209 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 210 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv") 211 212 if Options["--outfile"]: 213 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi") 214 MiscUtil.ValidateOptionsOutputFileOverwrite( 215 "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"] 216 ) 217 MiscUtil.ValidateOptionsDistinctFileNames( 218 "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"] 219 ) 220 221 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count") 222 if re.match("^remove$", Options["--mode"], re.I): 223 if not Options["--outfile"]: 224 MiscUtil.PrintError( 225 'The outfile must be specified using "-o, --outfile" during "remove" value of "-m, --mode" option' 226 ) 227 228 229 # Setup a usage string for docopt... 230 _docoptUsage_ = """ 231 RDKitRemoveInvalidMolecules.py - Remove invalid molecules 232 233 Usage: 234 RDKitRemoveInvalidMolecules.py [--infileParams <Name,Value,...>] 235 [--mode <remove or count>] [ --outfileParams <Name,Value,...> ] 236 [--overwrite] [-w <dir>] [-o <outfile>] -i <infile> 237 RDKitRemoveInvalidMolecules.py -h | --help | -e | --examples 238 239 Description: 240 Identify and remove invalid molecules based on success or failure of RDKit molecule 241 readers or simply count the number of invalid molecules. 242 243 The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt) 244 245 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi) 246 247 Options: 248 -e, --examples 249 Print examples. 250 -h, --help 251 Print this help message. 252 -i, --infile <infile> 253 Input file name. 254 --infileParams <Name,Value,...> [default: auto] 255 A comma delimited list of parameter name and value pairs for reading 256 molecules from files. The supported parameter names for different file 257 formats, along with their default values, are shown below: 258 259 SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes 260 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 261 smilesTitleLine,auto,sanitize,yes 262 263 Possible values for smilesDelimiter: space, comma or tab. 264 -m, --mode <remove or count> [default: remove] 265 Specify whether to remove invalid molecules and write out filtered molecules 266 to output file or or simply count the number of invalid molecules. 267 -o, --outfile <outfile> 268 Output file name. 269 --outfileParams <Name,Value,...> [default: auto] 270 A comma delimited list of parameter name and value pairs for writing 271 molecules to files. The supported parameter names for different file 272 formats, along with their default values, are shown below: 273 274 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 275 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 276 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no 277 278 Default value for compute2DCoords: yes for SMILES input file; no for all other 279 file types. 280 --overwrite 281 Overwrite existing files. 282 -w, --workingdir <dir> 283 Location of working directory which defaults to the current directory. 284 285 Examples: 286 To remove invalid molecules and generate an output file SMILES file 287 containing valid molecules, type: 288 289 % RDKitRemoveInvalidMolecules.py -i Sample.smi -o SampleOut.smi 290 291 To count number of valid and invaid molecules without generating any 292 output file, type: 293 294 % RDKitRemoveInvalidMolecules.py -m count -i Sample.sdf 295 296 To remove invalid molecules from a CSV SMILES file, SMILES strings in 297 column 1, name in column 2, and generate output SD file containing valid 298 molecules, type: 299 300 % RDKitRemoveInvalidMolecules.py --infileParams 301 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 302 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 303 -i SampleSMILES.csv -o SampleOut.sdf 304 305 Author: 306 Manish Sud(msud@san.rr.com) 307 308 See also: 309 RDKitConvertFileFormat.py, RDKitRemoveDuplicateMolecules.py, 310 RDKitRemoveSalts, RDKitSearchFunctionalGroups.py, 311 RDKitSearchSMARTS.py, RDKitStandardizeMolecules.py 312 313 Copyright: 314 Copyright (C) 2025 Manish Sud. All rights reserved. 315 316 The functionality available in this script is implemented using RDKit, an 317 open source toolkit for cheminformatics developed by Greg Landrum. 318 319 This file is part of MayaChemTools. 320 321 MayaChemTools is free software; you can redistribute it and/or modify it under 322 the terms of the GNU Lesser General Public License as published by the Free 323 Software Foundation; either version 3 of the License, or (at your option) any 324 later version. 325 326 """ 327 328 if __name__ == "__main__": 329 main()