MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitRemoveDuplicateMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2025 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem
  42 except ImportError as ErrMsg:
  43     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  44     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  45     sys.exit(1)
  46 
  47 # MayaChemTools imports...
  48 try:
  49     from docopt import docopt
  50     import MiscUtil
  51     import RDKitUtil
  52 except ImportError as ErrMsg:
  53     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  54     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  55     sys.exit(1)
  56 
  57 ScriptName = os.path.basename(sys.argv[0])
  58 Options = {}
  59 OptionsInfo = {}
  60 
  61 def main():
  62     """Start execution of the script."""
  63     
  64     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  65     
  66     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  67     
  68     # Retrieve command line arguments and options...
  69     RetrieveOptions()
  70     
  71     # Process and validate command line arguments and options...
  72     ProcessOptions()
  73     
  74     # Perform actions required by the script...
  75     RemoveDuplicates()
  76     
  77     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  78     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  79 
  80 def RemoveDuplicates():
  81     """Identify and remove duplicate molecules based on canonical SMILES."""
  82     
  83     Infile = OptionsInfo["Infile"]
  84     Outfile = OptionsInfo["Outfile"]
  85     DuplicatesOutfile = OptionsInfo["DuplicatesOutfile"]
  86     
  87     CountMode = OptionsInfo["CountMode"]
  88     UseChirality = OptionsInfo["UseChirality"]
  89     
  90     # Setup a molecule reader...
  91     MiscUtil.PrintInfo("\nProcessing file %s..." % Infile)
  92     Mols  = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"])
  93     
  94     # Set up a molecule writer...
  95     Writer = None
  96     DuplicatesWriter = None
  97     if not CountMode:
  98         Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
  99         if Writer is None:
 100             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 101         DuplicatesWriter = RDKitUtil.MoleculesWriter(DuplicatesOutfile, **OptionsInfo["OutfileParams"])
 102         if DuplicatesWriter is None:
 103             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % DuplicatesOutfile)
 104         
 105         MiscUtil.PrintInfo("Generating files %s and %s..." % (Outfile, DuplicatesOutfile))
 106 
 107     # Process molecules...
 108     MolCount = 0
 109     ValidMolCount = 0
 110     
 111     UniqueMolCount = 0
 112     DuplicateMolCount = 0
 113     
 114     CanonicalSMILESMap = {}
 115     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 116     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 117 
 118     FirstMol = True
 119     for Mol in Mols:
 120         MolCount += 1
 121         
 122         if Mol is None:
 123             continue
 124         
 125         if RDKitUtil.IsMolEmpty(Mol):
 126             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 127             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 128             continue
 129         
 130         ValidMolCount += 1
 131         
 132         if FirstMol:
 133             FirstMol = False
 134             if not CountMode:
 135                 if SetSMILESMolProps:
 136                     RDKitUtil.SetWriterMolProps(Writer, Mol)
 137                     RDKitUtil.SetWriterMolProps(DuplicatesWriter, Mol)
 138         
 139         CanonicalSMILES = Chem.MolToSmiles(Mol, isomericSmiles = UseChirality, canonical = True)
 140         
 141         if Compute2DCoords:
 142             if not CountMode:
 143                 AllChem.Compute2DCoords(Mol)
 144         
 145         if CanonicalSMILES in CanonicalSMILESMap:
 146             DuplicateMolCount += 1
 147             if not CountMode:
 148                 DuplicatesWriter.write(Mol)
 149         else:
 150             UniqueMolCount += 1
 151             CanonicalSMILESMap[CanonicalSMILES] = CanonicalSMILES
 152             if not CountMode:
 153                 Writer.write(Mol)
 154     
 155     if Writer is not None:
 156         Writer.close()
 157     
 158     if DuplicatesWriter is not None:
 159         DuplicatesWriter.close()
 160     
 161     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 162     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 163     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 164 
 165     MiscUtil.PrintInfo("\nTotal number of unique molecules: %d" % UniqueMolCount)
 166     MiscUtil.PrintInfo("Total number of duplicate molecules: %d" % DuplicateMolCount)
 167 
 168 def ProcessOptions():
 169     """Process and validate command line arguments and options."""
 170     
 171     MiscUtil.PrintInfo("Processing options...")
 172     
 173     # Validate options...
 174     ValidateOptions()
 175     
 176     OptionsInfo["Infile"] = Options["--infile"]
 177     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 178     
 179     OptionsInfo["Outfile"] = Options["--outfile"]
 180     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 181     
 182     OptionsInfo["Overwrite"] = Options["--overwrite"]
 183 
 184     OptionsInfo["CountMode"] = False
 185     if re.match("^count$", Options["--mode"], re.I):
 186         OptionsInfo["CountMode"] = True
 187     
 188     # Setup outfile for writing out duplicates...
 189     OptionsInfo["DuplicatesOutfile"] = ""
 190     if not OptionsInfo["CountMode"] :
 191         FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"])
 192         OptionsInfo["DuplicatesOutfile"] = "%sDuplicates.%s" % (FileName, FileExt)
 193 
 194     OptionsInfo["UseChirality"] = False
 195     if re.match("^yes$", Options["--useChirality"], re.I):
 196         OptionsInfo["UseChirality"] = True
 197     
 198 def RetrieveOptions():
 199     """Retrieve command line arguments and options."""
 200     
 201     # Get options...
 202     global Options
 203     Options = docopt(_docoptUsage_)
 204     
 205     # Set current working directory to the specified directory...
 206     WorkingDir = Options["--workingdir"]
 207     if WorkingDir:
 208         os.chdir(WorkingDir)
 209     
 210     # Handle examples option...
 211     if "--examples" in Options and Options["--examples"]:
 212         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 213         sys.exit(0)
 214 
 215 def ValidateOptions():
 216     """Validate option values."""
 217     
 218     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 219     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 220     
 221     if Options["--outfile"]:
 222         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 223         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 224         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 225 
 226     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count")
 227     if re.match("^remove$", Options["--mode"], re.I):
 228         if not Options["--outfile"]:
 229             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"remove\" value of \"-m, --mode\" option")
 230         
 231     MiscUtil.ValidateOptionTextValue("--useChirality", Options["--useChirality"], "yes no")
 232     
 233 # Setup a usage string for docopt...
 234 _docoptUsage_ = """
 235 RDKitRemoveDuplicateMolecules.py - Remove duplicate molecules
 236 
 237 Usage:
 238     RDKitRemoveDuplicateMolecules.py  [--infileParams <Name,Value,...>]
 239                               [--mode <remove or count>] [ --outfileParams <Name,Value,...> ] 
 240                               [--overwrite] [--useChirality <yes or no>] [-w <dir>] [-o <outfile>]  -i <infile>
 241     RDKitRemoveDuplicateMolecules.py -h | --help | -e | --examples
 242 
 243 Description:
 244     Identify and remove duplicate molecules based on canonical SMILES strings or
 245     simply count the number of duplicate molecules.
 246 
 247     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt)
 248 
 249     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 250 
 251 Options:
 252     -e, --examples
 253         Print examples.
 254     -h, --help
 255         Print this help message.
 256     -i, --infile <infile>
 257         Input file name.
 258     --infileParams <Name,Value,...>  [default: auto]
 259         A comma delimited list of parameter name and value pairs for reading
 260         molecules from files. The supported parameter names for different file
 261         formats, along with their default values, are shown below:
 262             
 263             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 264             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 265                 smilesTitleLine,auto,sanitize,yes
 266             
 267         Possible values for smilesDelimiter: space, comma or tab.
 268     -m, --mode <remove or count>  [default: remove]
 269         Specify whether to remove duplicate molecules and write out filtered molecules
 270         to output files or or simply count the number of duplicate molecules.
 271     -o, --outfile <outfile>
 272         Output file name.
 273     --outfileParams <Name,Value,...>  [default: auto]
 274         A comma delimited list of parameter name and value pairs for writing
 275         molecules to files. The supported parameter names for different file
 276         formats, along with their default values, are shown below:
 277             
 278             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 279             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 280                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 281             
 282         Default value for compute2DCoords: yes for SMILES input file; no for all other
 283         file types.
 284     --overwrite
 285         Overwrite existing files.
 286     -u, --useChirality <yes or no>  [default: yes]
 287         Use stereochemistry information for generation of canonical SMILES strings
 288         to identify duplicate molecules.
 289     -w, --workingdir <dir>
 290         Location of working directory which defaults to the current directory.
 291 
 292 Examples:
 293     To remove duplicate molecules and generate output files containing unique and
 294     duplicate SMILES strings, type:
 295 
 296         % RDKitRemoveDuplicateMolecules.py -i Sample.smi -o SampleOut.smi
 297 
 298     To remove duplicate molecules without using stereochemistry information for
 299     generation of canonical SMILES and generate output files containing unique and
 300     duplicate SMILES strings, type:
 301 
 302         % RDKitRemoveDuplicateMolecules.py -u no -i Sample.sdf -o SampleOut.sdf
 303 
 304     To count number of unique and duplicate molecules without generating any
 305     output files, type:
 306 
 307         % RDKitRemoveDuplicateMolecules.py -m count -i Sample.sdf
 308 
 309     To remove duplicate molecules from a CSV SMILES file, SMILES strings in
 310     column 1, name in column 2, and generate output SD files containing unique and
 311     duplicate molecules, type:
 312 
 313         % RDKitRemoveDuplicateMolecules.py --infileParams 
 314           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 315           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 316           -i SampleSMILES.csv -o SampleOut.sdf
 317 
 318 Author:
 319     Manish Sud(msud@san.rr.com)
 320 
 321 See also:
 322     RDKitConvertFileFormat.py, RDKitRemoveInvalidMolecules.py, RDKitRemoveSalts,
 323     RDKitSearchFunctionalGroups.py, RDKitSearchSMARTS.py,
 324     RDKitStandardizeMolecules.py
 325 
 326 Copyright:
 327     Copyright (C) 2025 Manish Sud. All rights reserved.
 328 
 329     The functionality available in this script is implemented using RDKit, an
 330     open source toolkit for cheminformatics developed by Greg Landrum.
 331 
 332     This file is part of MayaChemTools.
 333 
 334     MayaChemTools is free software; you can redistribute it and/or modify it under
 335     the terms of the GNU Lesser General Public License as published by the Free
 336     Software Foundation; either version 3 of the License, or (at your option) any
 337     later version.
 338 
 339 """
 340 
 341 if __name__ == "__main__":
 342     main()