1 #!/bin/env python 2 # 3 # File: RDKitRemoveDuplicateMolecules.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2025 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 # Add local python path to the global path and import standard library modules... 32 import os 33 import sys; sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 34 import time 35 import re 36 37 # RDKit imports... 38 try: 39 from rdkit import rdBase 40 from rdkit import Chem 41 from rdkit.Chem import AllChem 42 except ImportError as ErrMsg: 43 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 44 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 45 sys.exit(1) 46 47 # MayaChemTools imports... 48 try: 49 from docopt import docopt 50 import MiscUtil 51 import RDKitUtil 52 except ImportError as ErrMsg: 53 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 54 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 55 sys.exit(1) 56 57 ScriptName = os.path.basename(sys.argv[0]) 58 Options = {} 59 OptionsInfo = {} 60 61 def main(): 62 """Start execution of the script.""" 63 64 MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())) 65 66 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 67 68 # Retrieve command line arguments and options... 69 RetrieveOptions() 70 71 # Process and validate command line arguments and options... 72 ProcessOptions() 73 74 # Perform actions required by the script... 75 RemoveDuplicates() 76 77 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 78 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 79 80 def RemoveDuplicates(): 81 """Identify and remove duplicate molecules based on canonical SMILES.""" 82 83 Infile = OptionsInfo["Infile"] 84 Outfile = OptionsInfo["Outfile"] 85 DuplicatesOutfile = OptionsInfo["DuplicatesOutfile"] 86 87 CountMode = OptionsInfo["CountMode"] 88 UseChirality = OptionsInfo["UseChirality"] 89 90 # Setup a molecule reader... 91 MiscUtil.PrintInfo("\nProcessing file %s..." % Infile) 92 Mols = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"]) 93 94 # Set up a molecule writer... 95 Writer = None 96 DuplicatesWriter = None 97 if not CountMode: 98 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"]) 99 if Writer is None: 100 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 101 DuplicatesWriter = RDKitUtil.MoleculesWriter(DuplicatesOutfile, **OptionsInfo["OutfileParams"]) 102 if DuplicatesWriter is None: 103 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % DuplicatesOutfile) 104 105 MiscUtil.PrintInfo("Generating files %s and %s..." % (Outfile, DuplicatesOutfile)) 106 107 # Process molecules... 108 MolCount = 0 109 ValidMolCount = 0 110 111 UniqueMolCount = 0 112 DuplicateMolCount = 0 113 114 CanonicalSMILESMap = {} 115 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 116 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 117 118 FirstMol = True 119 for Mol in Mols: 120 MolCount += 1 121 122 if Mol is None: 123 continue 124 125 if RDKitUtil.IsMolEmpty(Mol): 126 MolName = RDKitUtil.GetMolName(Mol, MolCount) 127 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 128 continue 129 130 ValidMolCount += 1 131 132 if FirstMol: 133 FirstMol = False 134 if not CountMode: 135 if SetSMILESMolProps: 136 RDKitUtil.SetWriterMolProps(Writer, Mol) 137 RDKitUtil.SetWriterMolProps(DuplicatesWriter, Mol) 138 139 CanonicalSMILES = Chem.MolToSmiles(Mol, isomericSmiles = UseChirality, canonical = True) 140 141 if Compute2DCoords: 142 if not CountMode: 143 AllChem.Compute2DCoords(Mol) 144 145 if CanonicalSMILES in CanonicalSMILESMap: 146 DuplicateMolCount += 1 147 if not CountMode: 148 DuplicatesWriter.write(Mol) 149 else: 150 UniqueMolCount += 1 151 CanonicalSMILESMap[CanonicalSMILES] = CanonicalSMILES 152 if not CountMode: 153 Writer.write(Mol) 154 155 if Writer is not None: 156 Writer.close() 157 158 if DuplicatesWriter is not None: 159 DuplicatesWriter.close() 160 161 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 162 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 163 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount)) 164 165 MiscUtil.PrintInfo("\nTotal number of unique molecules: %d" % UniqueMolCount) 166 MiscUtil.PrintInfo("Total number of duplicate molecules: %d" % DuplicateMolCount) 167 168 def ProcessOptions(): 169 """Process and validate command line arguments and options.""" 170 171 MiscUtil.PrintInfo("Processing options...") 172 173 # Validate options... 174 ValidateOptions() 175 176 OptionsInfo["Infile"] = Options["--infile"] 177 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"]) 178 179 OptionsInfo["Outfile"] = Options["--outfile"] 180 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]) 181 182 OptionsInfo["Overwrite"] = Options["--overwrite"] 183 184 OptionsInfo["CountMode"] = False 185 if re.match("^count$", Options["--mode"], re.I): 186 OptionsInfo["CountMode"] = True 187 188 # Setup outfile for writing out duplicates... 189 OptionsInfo["DuplicatesOutfile"] = "" 190 if not OptionsInfo["CountMode"] : 191 FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"]) 192 OptionsInfo["DuplicatesOutfile"] = "%sDuplicates.%s" % (FileName, FileExt) 193 194 OptionsInfo["UseChirality"] = False 195 if re.match("^yes$", Options["--useChirality"], re.I): 196 OptionsInfo["UseChirality"] = True 197 198 def RetrieveOptions(): 199 """Retrieve command line arguments and options.""" 200 201 # Get options... 202 global Options 203 Options = docopt(_docoptUsage_) 204 205 # Set current working directory to the specified directory... 206 WorkingDir = Options["--workingdir"] 207 if WorkingDir: 208 os.chdir(WorkingDir) 209 210 # Handle examples option... 211 if "--examples" in Options and Options["--examples"]: 212 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 213 sys.exit(0) 214 215 def ValidateOptions(): 216 """Validate option values.""" 217 218 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 219 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv") 220 221 if Options["--outfile"]: 222 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi") 223 MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]) 224 MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]) 225 226 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count") 227 if re.match("^remove$", Options["--mode"], re.I): 228 if not Options["--outfile"]: 229 MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"remove\" value of \"-m, --mode\" option") 230 231 MiscUtil.ValidateOptionTextValue("--useChirality", Options["--useChirality"], "yes no") 232 233 # Setup a usage string for docopt... 234 _docoptUsage_ = """ 235 RDKitRemoveDuplicateMolecules.py - Remove duplicate molecules 236 237 Usage: 238 RDKitRemoveDuplicateMolecules.py [--infileParams <Name,Value,...>] 239 [--mode <remove or count>] [ --outfileParams <Name,Value,...> ] 240 [--overwrite] [--useChirality <yes or no>] [-w <dir>] [-o <outfile>] -i <infile> 241 RDKitRemoveDuplicateMolecules.py -h | --help | -e | --examples 242 243 Description: 244 Identify and remove duplicate molecules based on canonical SMILES strings or 245 simply count the number of duplicate molecules. 246 247 The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt) 248 249 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi) 250 251 Options: 252 -e, --examples 253 Print examples. 254 -h, --help 255 Print this help message. 256 -i, --infile <infile> 257 Input file name. 258 --infileParams <Name,Value,...> [default: auto] 259 A comma delimited list of parameter name and value pairs for reading 260 molecules from files. The supported parameter names for different file 261 formats, along with their default values, are shown below: 262 263 SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes 264 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 265 smilesTitleLine,auto,sanitize,yes 266 267 Possible values for smilesDelimiter: space, comma or tab. 268 -m, --mode <remove or count> [default: remove] 269 Specify whether to remove duplicate molecules and write out filtered molecules 270 to output files or or simply count the number of duplicate molecules. 271 -o, --outfile <outfile> 272 Output file name. 273 --outfileParams <Name,Value,...> [default: auto] 274 A comma delimited list of parameter name and value pairs for writing 275 molecules to files. The supported parameter names for different file 276 formats, along with their default values, are shown below: 277 278 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 279 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 280 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no 281 282 Default value for compute2DCoords: yes for SMILES input file; no for all other 283 file types. 284 --overwrite 285 Overwrite existing files. 286 -u, --useChirality <yes or no> [default: yes] 287 Use stereochemistry information for generation of canonical SMILES strings 288 to identify duplicate molecules. 289 -w, --workingdir <dir> 290 Location of working directory which defaults to the current directory. 291 292 Examples: 293 To remove duplicate molecules and generate output files containing unique and 294 duplicate SMILES strings, type: 295 296 % RDKitRemoveDuplicateMolecules.py -i Sample.smi -o SampleOut.smi 297 298 To remove duplicate molecules without using stereochemistry information for 299 generation of canonical SMILES and generate output files containing unique and 300 duplicate SMILES strings, type: 301 302 % RDKitRemoveDuplicateMolecules.py -u no -i Sample.sdf -o SampleOut.sdf 303 304 To count number of unique and duplicate molecules without generating any 305 output files, type: 306 307 % RDKitRemoveDuplicateMolecules.py -m count -i Sample.sdf 308 309 To remove duplicate molecules from a CSV SMILES file, SMILES strings in 310 column 1, name in column 2, and generate output SD files containing unique and 311 duplicate molecules, type: 312 313 % RDKitRemoveDuplicateMolecules.py --infileParams 314 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 315 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 316 -i SampleSMILES.csv -o SampleOut.sdf 317 318 Author: 319 Manish Sud(msud@san.rr.com) 320 321 See also: 322 RDKitConvertFileFormat.py, RDKitRemoveInvalidMolecules.py, RDKitRemoveSalts, 323 RDKitSearchFunctionalGroups.py, RDKitSearchSMARTS.py, 324 RDKitStandardizeMolecules.py 325 326 Copyright: 327 Copyright (C) 2025 Manish Sud. All rights reserved. 328 329 The functionality available in this script is implemented using RDKit, an 330 open source toolkit for cheminformatics developed by Greg Landrum. 331 332 This file is part of MayaChemTools. 333 334 MayaChemTools is free software; you can redistribute it and/or modify it under 335 the terms of the GNU Lesser General Public License as published by the Free 336 Software Foundation; either version 3 of the License, or (at your option) any 337 later version. 338 339 """ 340 341 if __name__ == "__main__": 342 main()