1 #!/bin/env python 2 # 3 # File: RDKitRemoveDuplicateMolecules.py 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2026 Manish Sud. All rights reserved. 7 # 8 # The functionality available in this script is implemented using RDKit, an 9 # open source toolkit for cheminformatics developed by Greg Landrum. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 from __future__ import print_function 30 31 import os 32 import sys 33 import time 34 import re 35 36 # RDKit imports... 37 try: 38 from rdkit import rdBase 39 from rdkit import Chem 40 from rdkit.Chem import AllChem 41 except ImportError as ErrMsg: 42 sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg) 43 sys.stderr.write("Check/update your RDKit environment and try again.\n\n") 44 sys.exit(1) 45 46 # MayaChemTools imports... 47 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python")) 48 try: 49 from docopt import docopt 50 import MiscUtil 51 import RDKitUtil 52 except ImportError as ErrMsg: 53 sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg) 54 sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n") 55 sys.exit(1) 56 57 ScriptName = os.path.basename(sys.argv[0]) 58 Options = {} 59 OptionsInfo = {} 60 61 62 def main(): 63 """Start execution of the script.""" 64 65 MiscUtil.PrintInfo( 66 "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" 67 % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()) 68 ) 69 70 (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime() 71 72 # Retrieve command line arguments and options... 73 RetrieveOptions() 74 75 # Process and validate command line arguments and options... 76 ProcessOptions() 77 78 # Perform actions required by the script... 79 RemoveDuplicates() 80 81 MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName) 82 MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime)) 83 84 85 def RemoveDuplicates(): 86 """Identify and remove duplicate molecules based on canonical SMILES.""" 87 88 Infile = OptionsInfo["Infile"] 89 Outfile = OptionsInfo["Outfile"] 90 DuplicatesOutfile = OptionsInfo["DuplicatesOutfile"] 91 92 CountMode = OptionsInfo["CountMode"] 93 UseChirality = OptionsInfo["UseChirality"] 94 95 # Setup a molecule reader... 96 MiscUtil.PrintInfo("\nProcessing file %s..." % Infile) 97 Mols = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"]) 98 99 # Set up a molecule writer... 100 Writer = None 101 DuplicatesWriter = None 102 if not CountMode: 103 Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"]) 104 if Writer is None: 105 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile) 106 DuplicatesWriter = RDKitUtil.MoleculesWriter(DuplicatesOutfile, **OptionsInfo["OutfileParams"]) 107 if DuplicatesWriter is None: 108 MiscUtil.PrintError("Failed to setup a writer for output fie %s " % DuplicatesOutfile) 109 110 MiscUtil.PrintInfo("Generating files %s and %s..." % (Outfile, DuplicatesOutfile)) 111 112 # Process molecules... 113 MolCount = 0 114 ValidMolCount = 0 115 116 UniqueMolCount = 0 117 DuplicateMolCount = 0 118 119 CanonicalSMILESMap = {} 120 Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"] 121 SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"] 122 123 FirstMol = True 124 for Mol in Mols: 125 MolCount += 1 126 127 if Mol is None: 128 continue 129 130 if RDKitUtil.IsMolEmpty(Mol): 131 MolName = RDKitUtil.GetMolName(Mol, MolCount) 132 MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName) 133 continue 134 135 ValidMolCount += 1 136 137 if FirstMol: 138 FirstMol = False 139 if not CountMode: 140 if SetSMILESMolProps: 141 RDKitUtil.SetWriterMolProps(Writer, Mol) 142 RDKitUtil.SetWriterMolProps(DuplicatesWriter, Mol) 143 144 CanonicalSMILES = Chem.MolToSmiles(Mol, isomericSmiles=UseChirality, canonical=True) 145 146 if Compute2DCoords: 147 if not CountMode: 148 AllChem.Compute2DCoords(Mol) 149 150 if CanonicalSMILES in CanonicalSMILESMap: 151 DuplicateMolCount += 1 152 if not CountMode: 153 DuplicatesWriter.write(Mol) 154 else: 155 UniqueMolCount += 1 156 CanonicalSMILESMap[CanonicalSMILES] = CanonicalSMILES 157 if not CountMode: 158 Writer.write(Mol) 159 160 if Writer is not None: 161 Writer.close() 162 163 if DuplicatesWriter is not None: 164 DuplicatesWriter.close() 165 166 MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount) 167 MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount) 168 MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount)) 169 170 MiscUtil.PrintInfo("\nTotal number of unique molecules: %d" % UniqueMolCount) 171 MiscUtil.PrintInfo("Total number of duplicate molecules: %d" % DuplicateMolCount) 172 173 174 def ProcessOptions(): 175 """Process and validate command line arguments and options.""" 176 177 MiscUtil.PrintInfo("Processing options...") 178 179 # Validate options... 180 ValidateOptions() 181 182 OptionsInfo["Infile"] = Options["--infile"] 183 OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters( 184 "--infileParams", Options["--infileParams"], Options["--infile"] 185 ) 186 187 OptionsInfo["Outfile"] = Options["--outfile"] 188 OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters( 189 "--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"] 190 ) 191 192 OptionsInfo["Overwrite"] = Options["--overwrite"] 193 194 OptionsInfo["CountMode"] = False 195 if re.match("^count$", Options["--mode"], re.I): 196 OptionsInfo["CountMode"] = True 197 198 # Setup outfile for writing out duplicates... 199 OptionsInfo["DuplicatesOutfile"] = "" 200 if not OptionsInfo["CountMode"]: 201 FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"]) 202 OptionsInfo["DuplicatesOutfile"] = "%sDuplicates.%s" % (FileName, FileExt) 203 204 OptionsInfo["UseChirality"] = False 205 if re.match("^yes$", Options["--useChirality"], re.I): 206 OptionsInfo["UseChirality"] = True 207 208 209 def RetrieveOptions(): 210 """Retrieve command line arguments and options.""" 211 212 # Get options... 213 global Options 214 Options = docopt(_docoptUsage_) 215 216 # Set current working directory to the specified directory... 217 WorkingDir = Options["--workingdir"] 218 if WorkingDir: 219 os.chdir(WorkingDir) 220 221 # Handle examples option... 222 if "--examples" in Options and Options["--examples"]: 223 MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_)) 224 sys.exit(0) 225 226 227 def ValidateOptions(): 228 """Validate option values.""" 229 230 MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"]) 231 MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv") 232 233 if Options["--outfile"]: 234 MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi") 235 MiscUtil.ValidateOptionsOutputFileOverwrite( 236 "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"] 237 ) 238 MiscUtil.ValidateOptionsDistinctFileNames( 239 "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"] 240 ) 241 242 MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count") 243 if re.match("^remove$", Options["--mode"], re.I): 244 if not Options["--outfile"]: 245 MiscUtil.PrintError( 246 'The outfile must be specified using "-o, --outfile" during "remove" value of "-m, --mode" option' 247 ) 248 249 MiscUtil.ValidateOptionTextValue("--useChirality", Options["--useChirality"], "yes no") 250 251 252 # Setup a usage string for docopt... 253 _docoptUsage_ = """ 254 RDKitRemoveDuplicateMolecules.py - Remove duplicate molecules 255 256 Usage: 257 RDKitRemoveDuplicateMolecules.py [--infileParams <Name,Value,...>] 258 [--mode <remove or count>] [ --outfileParams <Name,Value,...> ] 259 [--overwrite] [--useChirality <yes or no>] [-w <dir>] [-o <outfile>] -i <infile> 260 RDKitRemoveDuplicateMolecules.py -h | --help | -e | --examples 261 262 Description: 263 Identify and remove duplicate molecules based on canonical SMILES strings or 264 simply count the number of duplicate molecules. 265 266 The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt) 267 268 The supported output file formats are: SD (.sdf, .sd), SMILES (.smi) 269 270 Options: 271 -e, --examples 272 Print examples. 273 -h, --help 274 Print this help message. 275 -i, --infile <infile> 276 Input file name. 277 --infileParams <Name,Value,...> [default: auto] 278 A comma delimited list of parameter name and value pairs for reading 279 molecules from files. The supported parameter names for different file 280 formats, along with their default values, are shown below: 281 282 SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes 283 SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space, 284 smilesTitleLine,auto,sanitize,yes 285 286 Possible values for smilesDelimiter: space, comma or tab. 287 -m, --mode <remove or count> [default: remove] 288 Specify whether to remove duplicate molecules and write out filtered molecules 289 to output files or or simply count the number of duplicate molecules. 290 -o, --outfile <outfile> 291 Output file name. 292 --outfileParams <Name,Value,...> [default: auto] 293 A comma delimited list of parameter name and value pairs for writing 294 molecules to files. The supported parameter names for different file 295 formats, along with their default values, are shown below: 296 297 SD: compute2DCoords,auto,kekulize,yes,forceV3000,no 298 SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes, 299 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no 300 301 Default value for compute2DCoords: yes for SMILES input file; no for all other 302 file types. 303 --overwrite 304 Overwrite existing files. 305 -u, --useChirality <yes or no> [default: yes] 306 Use stereochemistry information for generation of canonical SMILES strings 307 to identify duplicate molecules. 308 -w, --workingdir <dir> 309 Location of working directory which defaults to the current directory. 310 311 Examples: 312 To remove duplicate molecules and generate output files containing unique and 313 duplicate SMILES strings, type: 314 315 % RDKitRemoveDuplicateMolecules.py -i Sample.smi -o SampleOut.smi 316 317 To remove duplicate molecules without using stereochemistry information for 318 generation of canonical SMILES and generate output files containing unique and 319 duplicate SMILES strings, type: 320 321 % RDKitRemoveDuplicateMolecules.py -u no -i Sample.sdf -o SampleOut.sdf 322 323 To count number of unique and duplicate molecules without generating any 324 output files, type: 325 326 % RDKitRemoveDuplicateMolecules.py -m count -i Sample.sdf 327 328 To remove duplicate molecules from a CSV SMILES file, SMILES strings in 329 column 1, name in column 2, and generate output SD files containing unique and 330 duplicate molecules, type: 331 332 % RDKitRemoveDuplicateMolecules.py --infileParams 333 "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1, 334 smilesNameColumn,2" --outfileParams "compute2DCoords,yes" 335 -i SampleSMILES.csv -o SampleOut.sdf 336 337 Author: 338 Manish Sud(msud@san.rr.com) 339 340 See also: 341 RDKitConvertFileFormat.py, RDKitRemoveInvalidMolecules.py, RDKitRemoveSalts, 342 RDKitSearchFunctionalGroups.py, RDKitSearchSMARTS.py, 343 RDKitStandardizeMolecules.py 344 345 Copyright: 346 Copyright (C) 2026 Manish Sud. All rights reserved. 347 348 The functionality available in this script is implemented using RDKit, an 349 open source toolkit for cheminformatics developed by Greg Landrum. 350 351 This file is part of MayaChemTools. 352 353 MayaChemTools is free software; you can redistribute it and/or modify it under 354 the terms of the GNU Lesser General Public License as published by the Free 355 Software Foundation; either version 3 of the License, or (at your option) any 356 later version. 357 358 """ 359 360 if __name__ == "__main__": 361 main()