MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitRemoveDuplicateMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2026 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 import os
  32 import sys
  33 import time
  34 import re
  35 
  36 # RDKit imports...
  37 try:
  38     from rdkit import rdBase
  39     from rdkit import Chem
  40     from rdkit.Chem import AllChem
  41 except ImportError as ErrMsg:
  42     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  43     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  44     sys.exit(1)
  45 
  46 # MayaChemTools imports...
  47 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  48 try:
  49     from docopt import docopt
  50     import MiscUtil
  51     import RDKitUtil
  52 except ImportError as ErrMsg:
  53     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  54     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  55     sys.exit(1)
  56 
  57 ScriptName = os.path.basename(sys.argv[0])
  58 Options = {}
  59 OptionsInfo = {}
  60 
  61 
  62 def main():
  63     """Start execution of the script."""
  64 
  65     MiscUtil.PrintInfo(
  66         "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
  67         % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
  68     )
  69 
  70     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  71 
  72     # Retrieve command line arguments and options...
  73     RetrieveOptions()
  74 
  75     # Process and validate command line arguments and options...
  76     ProcessOptions()
  77 
  78     # Perform actions required by the script...
  79     RemoveDuplicates()
  80 
  81     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  82     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  83 
  84 
  85 def RemoveDuplicates():
  86     """Identify and remove duplicate molecules based on canonical SMILES."""
  87 
  88     Infile = OptionsInfo["Infile"]
  89     Outfile = OptionsInfo["Outfile"]
  90     DuplicatesOutfile = OptionsInfo["DuplicatesOutfile"]
  91 
  92     CountMode = OptionsInfo["CountMode"]
  93     UseChirality = OptionsInfo["UseChirality"]
  94 
  95     # Setup a molecule reader...
  96     MiscUtil.PrintInfo("\nProcessing file %s..." % Infile)
  97     Mols = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"])
  98 
  99     # Set up a molecule writer...
 100     Writer = None
 101     DuplicatesWriter = None
 102     if not CountMode:
 103         Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 104         if Writer is None:
 105             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 106         DuplicatesWriter = RDKitUtil.MoleculesWriter(DuplicatesOutfile, **OptionsInfo["OutfileParams"])
 107         if DuplicatesWriter is None:
 108             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % DuplicatesOutfile)
 109 
 110         MiscUtil.PrintInfo("Generating files %s and %s..." % (Outfile, DuplicatesOutfile))
 111 
 112     # Process molecules...
 113     MolCount = 0
 114     ValidMolCount = 0
 115 
 116     UniqueMolCount = 0
 117     DuplicateMolCount = 0
 118 
 119     CanonicalSMILESMap = {}
 120     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 121     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 122 
 123     FirstMol = True
 124     for Mol in Mols:
 125         MolCount += 1
 126 
 127         if Mol is None:
 128             continue
 129 
 130         if RDKitUtil.IsMolEmpty(Mol):
 131             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 132             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 133             continue
 134 
 135         ValidMolCount += 1
 136 
 137         if FirstMol:
 138             FirstMol = False
 139             if not CountMode:
 140                 if SetSMILESMolProps:
 141                     RDKitUtil.SetWriterMolProps(Writer, Mol)
 142                     RDKitUtil.SetWriterMolProps(DuplicatesWriter, Mol)
 143 
 144         CanonicalSMILES = Chem.MolToSmiles(Mol, isomericSmiles=UseChirality, canonical=True)
 145 
 146         if Compute2DCoords:
 147             if not CountMode:
 148                 AllChem.Compute2DCoords(Mol)
 149 
 150         if CanonicalSMILES in CanonicalSMILESMap:
 151             DuplicateMolCount += 1
 152             if not CountMode:
 153                 DuplicatesWriter.write(Mol)
 154         else:
 155             UniqueMolCount += 1
 156             CanonicalSMILESMap[CanonicalSMILES] = CanonicalSMILES
 157             if not CountMode:
 158                 Writer.write(Mol)
 159 
 160     if Writer is not None:
 161         Writer.close()
 162 
 163     if DuplicatesWriter is not None:
 164         DuplicatesWriter.close()
 165 
 166     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 167     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 168     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 169 
 170     MiscUtil.PrintInfo("\nTotal number of unique molecules: %d" % UniqueMolCount)
 171     MiscUtil.PrintInfo("Total number of duplicate molecules: %d" % DuplicateMolCount)
 172 
 173 
 174 def ProcessOptions():
 175     """Process and validate command line arguments and options."""
 176 
 177     MiscUtil.PrintInfo("Processing options...")
 178 
 179     # Validate options...
 180     ValidateOptions()
 181 
 182     OptionsInfo["Infile"] = Options["--infile"]
 183     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters(
 184         "--infileParams", Options["--infileParams"], Options["--infile"]
 185     )
 186 
 187     OptionsInfo["Outfile"] = Options["--outfile"]
 188     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters(
 189         "--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]
 190     )
 191 
 192     OptionsInfo["Overwrite"] = Options["--overwrite"]
 193 
 194     OptionsInfo["CountMode"] = False
 195     if re.match("^count$", Options["--mode"], re.I):
 196         OptionsInfo["CountMode"] = True
 197 
 198     # Setup outfile for writing out duplicates...
 199     OptionsInfo["DuplicatesOutfile"] = ""
 200     if not OptionsInfo["CountMode"]:
 201         FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"])
 202         OptionsInfo["DuplicatesOutfile"] = "%sDuplicates.%s" % (FileName, FileExt)
 203 
 204     OptionsInfo["UseChirality"] = False
 205     if re.match("^yes$", Options["--useChirality"], re.I):
 206         OptionsInfo["UseChirality"] = True
 207 
 208 
 209 def RetrieveOptions():
 210     """Retrieve command line arguments and options."""
 211 
 212     # Get options...
 213     global Options
 214     Options = docopt(_docoptUsage_)
 215 
 216     # Set current working directory to the specified directory...
 217     WorkingDir = Options["--workingdir"]
 218     if WorkingDir:
 219         os.chdir(WorkingDir)
 220 
 221     # Handle examples option...
 222     if "--examples" in Options and Options["--examples"]:
 223         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 224         sys.exit(0)
 225 
 226 
 227 def ValidateOptions():
 228     """Validate option values."""
 229 
 230     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 231     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 232 
 233     if Options["--outfile"]:
 234         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 235         MiscUtil.ValidateOptionsOutputFileOverwrite(
 236             "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
 237         )
 238         MiscUtil.ValidateOptionsDistinctFileNames(
 239             "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]
 240         )
 241 
 242     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count")
 243     if re.match("^remove$", Options["--mode"], re.I):
 244         if not Options["--outfile"]:
 245             MiscUtil.PrintError(
 246                 'The outfile must be specified using "-o, --outfile" during "remove" value of "-m, --mode" option'
 247             )
 248 
 249     MiscUtil.ValidateOptionTextValue("--useChirality", Options["--useChirality"], "yes no")
 250 
 251 
 252 # Setup a usage string for docopt...
 253 _docoptUsage_ = """
 254 RDKitRemoveDuplicateMolecules.py - Remove duplicate molecules
 255 
 256 Usage:
 257     RDKitRemoveDuplicateMolecules.py  [--infileParams <Name,Value,...>]
 258                               [--mode <remove or count>] [ --outfileParams <Name,Value,...> ] 
 259                               [--overwrite] [--useChirality <yes or no>] [-w <dir>] [-o <outfile>]  -i <infile>
 260     RDKitRemoveDuplicateMolecules.py -h | --help | -e | --examples
 261 
 262 Description:
 263     Identify and remove duplicate molecules based on canonical SMILES strings or
 264     simply count the number of duplicate molecules.
 265 
 266     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt)
 267 
 268     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 269 
 270 Options:
 271     -e, --examples
 272         Print examples.
 273     -h, --help
 274         Print this help message.
 275     -i, --infile <infile>
 276         Input file name.
 277     --infileParams <Name,Value,...>  [default: auto]
 278         A comma delimited list of parameter name and value pairs for reading
 279         molecules from files. The supported parameter names for different file
 280         formats, along with their default values, are shown below:
 281             
 282             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 283             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 284                 smilesTitleLine,auto,sanitize,yes
 285             
 286         Possible values for smilesDelimiter: space, comma or tab.
 287     -m, --mode <remove or count>  [default: remove]
 288         Specify whether to remove duplicate molecules and write out filtered molecules
 289         to output files or or simply count the number of duplicate molecules.
 290     -o, --outfile <outfile>
 291         Output file name.
 292     --outfileParams <Name,Value,...>  [default: auto]
 293         A comma delimited list of parameter name and value pairs for writing
 294         molecules to files. The supported parameter names for different file
 295         formats, along with their default values, are shown below:
 296             
 297             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 298             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 299                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 300             
 301         Default value for compute2DCoords: yes for SMILES input file; no for all other
 302         file types.
 303     --overwrite
 304         Overwrite existing files.
 305     -u, --useChirality <yes or no>  [default: yes]
 306         Use stereochemistry information for generation of canonical SMILES strings
 307         to identify duplicate molecules.
 308     -w, --workingdir <dir>
 309         Location of working directory which defaults to the current directory.
 310 
 311 Examples:
 312     To remove duplicate molecules and generate output files containing unique and
 313     duplicate SMILES strings, type:
 314 
 315         % RDKitRemoveDuplicateMolecules.py -i Sample.smi -o SampleOut.smi
 316 
 317     To remove duplicate molecules without using stereochemistry information for
 318     generation of canonical SMILES and generate output files containing unique and
 319     duplicate SMILES strings, type:
 320 
 321         % RDKitRemoveDuplicateMolecules.py -u no -i Sample.sdf -o SampleOut.sdf
 322 
 323     To count number of unique and duplicate molecules without generating any
 324     output files, type:
 325 
 326         % RDKitRemoveDuplicateMolecules.py -m count -i Sample.sdf
 327 
 328     To remove duplicate molecules from a CSV SMILES file, SMILES strings in
 329     column 1, name in column 2, and generate output SD files containing unique and
 330     duplicate molecules, type:
 331 
 332         % RDKitRemoveDuplicateMolecules.py --infileParams 
 333           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 334           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 335           -i SampleSMILES.csv -o SampleOut.sdf
 336 
 337 Author:
 338     Manish Sud(msud@san.rr.com)
 339 
 340 See also:
 341     RDKitConvertFileFormat.py, RDKitRemoveInvalidMolecules.py, RDKitRemoveSalts,
 342     RDKitSearchFunctionalGroups.py, RDKitSearchSMARTS.py,
 343     RDKitStandardizeMolecules.py
 344 
 345 Copyright:
 346     Copyright (C) 2026 Manish Sud. All rights reserved.
 347 
 348     The functionality available in this script is implemented using RDKit, an
 349     open source toolkit for cheminformatics developed by Greg Landrum.
 350 
 351     This file is part of MayaChemTools.
 352 
 353     MayaChemTools is free software; you can redistribute it and/or modify it under
 354     the terms of the GNU Lesser General Public License as published by the Free
 355     Software Foundation; either version 3 of the License, or (at your option) any
 356     later version.
 357 
 358 """
 359 
# Run the script only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()