MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitRemoveDuplicateMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem
  42 except ImportError as ErrMsg:
  43     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  44     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  45     sys.exit(1)
  46 
  47 # MayaChemTools imports...
  48 try:
  49     from docopt import docopt
  50     import MiscUtil
  51     import RDKitUtil
  52 except ImportError as ErrMsg:
  53     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  54     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  55     sys.exit(1)
  56 
  57 ScriptName = os.path.basename(sys.argv[0])
  58 Options = {}
  59 OptionsInfo = {}
  60 
  61 def main():
  62     """Start execution of the script"""
  63     
  64     MiscUtil.PrintInfo("\n%s (RDK v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, time.asctime()))
  65     
  66     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  67     
  68     # Retrieve command line arguments and options...
  69     RetrieveOptions()
  70     
  71     # Process and validate command line arguments and options...
  72     ProcessOptions()
  73     
  74     # Perform actions required by the script...
  75     RemoveDuplicates()
  76     
  77     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  78     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  79 
  80 def RemoveDuplicates():
  81     """Identify and remove duplicate molecules based on canonical SMILES"""
  82     
  83     Infile = OptionsInfo["Infile"]
  84     Outfile = OptionsInfo["Outfile"]
  85     DuplicatesOutfile = OptionsInfo["DuplicatesOutfile"]
  86     
  87     CountMode = OptionsInfo["CountMode"]
  88     UseChirality = OptionsInfo["UseChirality"]
  89     
  90     # Setup a molecule reader...
  91     MiscUtil.PrintInfo("\nProcessing file %s..." % Infile)
  92     Mols  = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"])
  93     
  94     # Set up a molecule writer...
  95     Writer = None
  96     DuplicatesWriter = None
  97     if not CountMode:
  98         Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
  99         if Writer is None:
 100             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 101         DuplicatesWriter = RDKitUtil.MoleculesWriter(DuplicatesOutfile, **OptionsInfo["OutfileParams"])
 102         if DuplicatesWriter is None:
 103             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % DuplicatesOutfile)
 104         
 105         MiscUtil.PrintInfo("Generating files %s and %s..." % (Outfile, DuplicatesOutfile))
 106 
 107     # Process molecules...
 108     MolCount = 0
 109     ValidMolCount = 0
 110     
 111     UniqueMolCount = 0
 112     DuplicateMolCount = 0
 113     
 114     CanonicalSMILESMap = {}
 115     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 116 
 117     for Mol in Mols:
 118         MolCount += 1
 119         
 120         if Mol is None:
 121             continue
 122         
 123         if RDKitUtil.IsMolEmpty(Mol):
 124             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 125             MiscUtil.PrintWarning("Ignoring empty molecule: %s" % MolName)
 126             continue
 127         
 128         ValidMolCount += 1
 129         
 130         CanonicalSMILES = Chem.MolToSmiles(Mol, isomericSmiles = UseChirality, canonical = True)
 131         
 132         if Compute2DCoords:
 133             if not CountMode:
 134                 AllChem.Compute2DCoords(Mol)
 135         
 136         if CanonicalSMILES in CanonicalSMILESMap:
 137             DuplicateMolCount += 1
 138             if not CountMode:
 139                 DuplicatesWriter.write(Mol)
 140         else:
 141             UniqueMolCount += 1
 142             CanonicalSMILESMap[CanonicalSMILES] = CanonicalSMILES
 143             if not CountMode:
 144                 Writer.write(Mol)
 145     
 146     if Writer is not None:
 147         Writer.close()
 148     
 149     if DuplicatesWriter is not None:
 150         DuplicatesWriter.close()
 151     
 152     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 153     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 154     MiscUtil.PrintInfo("Number of ignored molecules: %d" % (MolCount - ValidMolCount))
 155 
 156     MiscUtil.PrintInfo("\nTotal number of unique molecules: %d" % UniqueMolCount)
 157     MiscUtil.PrintInfo("Total number of duplicate molecules: %d" % DuplicateMolCount)
 158 
 159 def ProcessOptions():
 160     """Process and validate command line arguments and options"""
 161     
 162     MiscUtil.PrintInfo("Processing options...")
 163     
 164     # Validate options...
 165     ValidateOptions()
 166     
 167     OptionsInfo["Infile"] = Options["--infile"]
 168     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 169     
 170     OptionsInfo["Outfile"] = Options["--outfile"]
 171     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 172     
 173     OptionsInfo["Overwrite"] = Options["--overwrite"]
 174 
 175     OptionsInfo["CountMode"] = False
 176     if re.match("^count$", Options["--mode"], re.I):
 177         OptionsInfo["CountMode"] = True
 178     
 179     # Setup outfile for writing out duplicates...
 180     OptionsInfo["DuplicatesOutfile"] = ""
 181     if not OptionsInfo["CountMode"] :
 182         FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"])
 183         OptionsInfo["DuplicatesOutfile"] = "%sDuplicates.%s" % (FileName, FileExt)
 184 
 185     OptionsInfo["UseChirality"] = False
 186     if re.match("^yes$", Options["--useChirality"], re.I):
 187         OptionsInfo["UseChirality"] = True
 188     
 189 def RetrieveOptions():
 190     """Retrieve command line arguments and options"""
 191     
 192     # Get options...
 193     global Options
 194     Options = docopt(_docoptUsage_)
 195     
 196     # Set current working directory to the specified directory...
 197     WorkingDir = Options["--workingdir"]
 198     if WorkingDir:
 199         os.chdir(WorkingDir)
 200     
 201     # Handle examples option...
 202     if "--examples" in Options and Options["--examples"]:
 203         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 204         sys.exit(0)
 205 
 206 def ValidateOptions():
 207     """Validate option values"""
 208     
 209     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 210     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 211     
 212     if Options["--outfile"]:
 213         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 214         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 215         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 216 
 217     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count")
 218     if re.match("^remove$", Options["--mode"], re.I):
 219         if not Options["--outfile"]:
 220             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"remove\" value of \"-m, --mode\" option")
 221         
 222     MiscUtil.ValidateOptionTextValue("--useChirality", Options["--useChirality"], "yes no")
 223     
 224 # Setup a usage string for docopt...
 225 _docoptUsage_ = """
 226 RDKitRemoveDuplicateMolecules.py - Remove duplicate molecules
 227 
 228 Usage:
 229     RDKitRemoveDuplicateMolecules.py  [--infileParams <Name,Value,...>]
 230                               [--mode <remove or count>] [ --outfileParams <Name,Value,...> ] 
 231                               [--overwrite] [--useChirality <yes or no>] [-w <dir>] [-o <outfile>]  -i <infile>
 232     RDKitRemoveDuplicateMolecules.py -h | --help | -e | --examples
 233 
 234 Description:
 235     Identify and remove duplicate molecules based on canonical SMILES strings or
 236     simply count the number of duplicate molecules.
 237 
 238     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt)
 239 
 240     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 241 
 242 Options:
 243     -e, --examples
 244         Print examples.
 245     -h, --help
 246         Print this help message.
 247     -i, --infile <infile>
 248         Input file name.
 249     --infileParams <Name,Value,...>  [default: auto]
 250         A comma delimited list of parameter name and value pairs for reading
 251         molecules from files. The supported parameter names for different file
 252         formats, along with their default values, are shown below:
 253             
 254             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 255             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 256                 smilesTitleLine,auto,sanitize,yes
 257             
 258         Possible values for smilesDelimiter: space, comma or tab.
 259     -m, --mode <remove or count>  [default: remove]
 260         Specify whether to remove duplicate molecules and write out filtered molecules
 261         to output files or or simply count the number of duplicate molecules.
 262     -o, --outfile <outfile>
 263         Output file name.
 264     --outfileParams <Name,Value,...>  [default: auto]
 265         A comma delimited list of parameter name and value pairs for writing
 266         molecules to files. The supported parameter names for different file
 267         formats, along with their default values, are shown below:
 268             
 269             SD: compute2DCoords,auto,kekulize,no
 270             SMILES: kekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 271                 smilesTitleLine,yes
 272             
 273         Default value for compute2DCoords: yes for SMILES input file; no for all other
 274         file types.
 275     --overwrite
 276         Overwrite existing files.
 277     -u, --useChirality <yes or no>  [default: yes]
 278         Use stereochemistry information for generation of canonical SMILES strings
 279         to identify duplicate molecules.
 280     -w, --workingdir <dir>
 281         Location of working directory which defaults to the current directory.
 282 
 283 Examples:
 284     To remove duplicate molecules and generate output files containing unique and
 285     duplicate SMILES strings, type:
 286 
 287         % RDKitRemoveDuplicateMolecules.py -i Sample.smi -o SampleOut.smi
 288 
 289     To remove duplicate molecules without using stereochemistry information for
 290     generation of canonical SMILES and generate output files containing unique and
 291     duplicate SMILES strings, type:
 292 
 293         % RDKitRemoveDuplicateMolecules.py -u no -i Sample.sdf -o SampleOut.sdf
 294 
 295     To count number of unique and duplicate molecules without generating any
 296     output files, type:
 297 
 298         % RDKitRemoveDuplicateMolecules.py -m count -i Sample.sdf
 299 
 300     To remove duplicate molecules from a CSV SMILES file, SMILES strings in
 301     column 1, name in column 2, and generate output SD files containing unique and
 302     duplicate molecules, type:
 303 
 304         % RDKitRemoveDuplicateMolecules.py --infileParams 
 305           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 306           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 307           -i SampleSMILES.csv -o SampleOut.sdf
 308 
 309 Author:
 310     Manish Sud(msud@san.rr.com)
 311 
 312 See also:
 313     RDKitConvertFileFormat.py, RDKitRemoveSalts, RDKitSearchFunctionalGroups.py,
 314     RDKitSearchSMARTS.py
 315 
 316 Copyright:
 317     Copyright (C) 2019 Manish Sud. All rights reserved.
 318 
 319     The functionality available in this script is implemented using RDKit, an
 320     open source toolkit for cheminformatics developed by Greg Landrum.
 321 
 322     This file is part of MayaChemTools.
 323 
 324     MayaChemTools is free software; you can redistribute it and/or modify it under
 325     the terms of the GNU Lesser General Public License as published by the Free
 326     Software Foundation; either version 3 of the License, or (at your option) any
 327     later version.
 328 
 329 """
 330 
 331 if __name__ == "__main__":
 332     main()