MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitRemoveInvalidMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 # Add local python path to the global path and import standard library modules...
  32 import os
  33 import sys;  sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  34 import time
  35 import re
  36 
  37 # RDKit imports...
  38 try:
  39     from rdkit import rdBase
  40     from rdkit import Chem
  41     from rdkit.Chem import AllChem
  42 except ImportError as ErrMsg:
  43     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  44     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  45     sys.exit(1)
  46 
  47 # MayaChemTools imports...
  48 try:
  49     from docopt import docopt
  50     import MiscUtil
  51     import RDKitUtil
  52 except ImportError as ErrMsg:
  53     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  54     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  55     sys.exit(1)
  56 
  57 ScriptName = os.path.basename(sys.argv[0])
  58 Options = {}
  59 OptionsInfo = {}
  60 
  61 def main():
  62     """Start execution of the script."""
  63     
  64     MiscUtil.PrintInfo("\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n" % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime()))
  65     
  66     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  67     
  68     # Retrieve command line arguments and options...
  69     RetrieveOptions()
  70     
  71     # Process and validate command line arguments and options...
  72     ProcessOptions()
  73     
  74     # Perform actions required by the script...
  75     RemoveInvalidMolecules()
  76     
  77     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  78     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  79 
  80 def RemoveInvalidMolecules():
  81     """Identify and remove invalid molecules."""
  82     
  83     Infile = OptionsInfo["Infile"]
  84     Outfile = OptionsInfo["Outfile"]
  85     InvalidMolsOutfile = OptionsInfo["InvalidMolsOutfile"]
  86     
  87     CountMode = OptionsInfo["CountMode"]
  88     
  89     # Setup a molecule reader...
  90     MiscUtil.PrintInfo("\nProcessing file %s..." % Infile)
  91     Mols  = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"])
  92     
  93     Writer = None
  94     MolNumWriter = None
  95     if not CountMode:
  96         # Set up a molecule writer...
  97         Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
  98         if Writer is None:
  99             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 100     
 101         # Set up a invalid molecule number writer...
 102         InvalidMolsWriter = open(InvalidMolsOutfile, "w")
 103         if InvalidMolsWriter is None:
 104             MiscUtil.PrintError("Failed to open output fie %s " % InvalidMolsOutfile)
 105         InvalidMolsWriter.write("MolName\n")
 106     
 107         MiscUtil.PrintInfo("Generating files %s and %s..." % (Outfile, InvalidMolsOutfile))
 108 
 109     # Process molecules...
 110     MolCount = 0
 111     ValidMolCount = 0
 112     
 113     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 114     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 115 
 116     FirstMol = True
 117     for Mol in Mols:
 118         MolCount += 1
 119         
 120         if Mol is None:
 121             MolName = "Mol%s" % MolCount
 122             MiscUtil.PrintWarning("Ignoring invalid molecule:  %s" % MolName)
 123             if not CountMode:
 124                 InvalidMolsWriter.write("%s\n" % MolName)
 125             continue
 126         
 127         if RDKitUtil.IsMolEmpty(Mol):
 128             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 129             MiscUtil.PrintWarning("Ignoring invalid empty molecule: %s" % MolName)
 130             if not CountMode:
 131                 InvalidMolsWriter.write("%s\n" % MolName)
 132             continue
 133         ValidMolCount += 1
 134         
 135         if FirstMol:
 136             FirstMol = False
 137             if not CountMode:
 138                 if SetSMILESMolProps:
 139                     RDKitUtil.SetWriterMolProps(Writer, Mol)
 140         
 141         if Compute2DCoords:
 142             if not CountMode:
 143                 AllChem.Compute2DCoords(Mol)
 144         
 145         if not CountMode:
 146             Writer.write(Mol)
 147     
 148     if Writer is not None:
 149         Writer.close()
 150     
 151     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 152     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 153     MiscUtil.PrintInfo("Number of invalid molecules: %d" % (MolCount - ValidMolCount))
 154 
 155 def ProcessOptions():
 156     """Process and validate command line arguments and options."""
 157     
 158     MiscUtil.PrintInfo("Processing options...")
 159     
 160     # Validate options...
 161     ValidateOptions()
 162     
 163     OptionsInfo["Infile"] = Options["--infile"]
 164     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters("--infileParams", Options["--infileParams"], Options["--infile"])
 165     
 166     OptionsInfo["Outfile"] = Options["--outfile"]
 167     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters("--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"])
 168     
 169     OptionsInfo["Overwrite"] = Options["--overwrite"]
 170 
 171     OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False
 172     
 173     # Setup outfile for writing out molecule number for invalid molecules...
 174     OptionsInfo["InvalidMolsOutfile"] = ""
 175     if not OptionsInfo["CountMode"] :
 176         FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"])
 177         OptionsInfo["InvalidMolsOutfile"] = "%sInvalidMols.csv" % (FileName)
 178 
 179 def RetrieveOptions():
 180     """Retrieve command line arguments and options."""
 181     
 182     # Get options...
 183     global Options
 184     Options = docopt(_docoptUsage_)
 185     
 186     # Set current working directory to the specified directory...
 187     WorkingDir = Options["--workingdir"]
 188     if WorkingDir:
 189         os.chdir(WorkingDir)
 190     
 191     # Handle examples option...
 192     if "--examples" in Options and Options["--examples"]:
 193         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 194         sys.exit(0)
 195 
 196 def ValidateOptions():
 197     """Validate option values."""
 198     
 199     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 200     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 201     
 202     if Options["--outfile"]:
 203         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 204         MiscUtil.ValidateOptionsOutputFileOverwrite("-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"])
 205         MiscUtil.ValidateOptionsDistinctFileNames("-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"])
 206 
 207     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count")
 208     if re.match("^remove$", Options["--mode"], re.I):
 209         if not Options["--outfile"]:
 210             MiscUtil.PrintError("The outfile must be specified using \"-o, --outfile\" during \"remove\" value of \"-m, --mode\" option")
 211     
 212 # Setup a usage string for docopt...
 213 _docoptUsage_ = """
 214 RDKitRemoveInvalidMolecules.py - Remove invalid molecules
 215 
 216 Usage:
 217     RDKitRemoveInvalidMolecules.py  [--infileParams <Name,Value,...>]
 218                               [--mode <remove or count>] [ --outfileParams <Name,Value,...> ] 
 219                               [--overwrite] [-w <dir>] [-o <outfile>]  -i <infile>
 220     RDKitRemoveInvalidMolecules.py -h | --help | -e | --examples
 221 
 222 Description:
 223     Identify and remove invalid molecules based on success or failure of RDKit molecule
 224     readers or simply count the number of invalid molecules.
 225 
 226     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt)
 227 
 228     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 229 
 230 Options:
 231     -e, --examples
 232         Print examples.
 233     -h, --help
 234         Print this help message.
 235     -i, --infile <infile>
 236         Input file name.
 237     --infileParams <Name,Value,...>  [default: auto]
 238         A comma delimited list of parameter name and value pairs for reading
 239         molecules from files. The supported parameter names for different file
 240         formats, along with their default values, are shown below:
 241             
 242             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 243             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 244                 smilesTitleLine,auto,sanitize,yes
 245             
 246         Possible values for smilesDelimiter: space, comma or tab.
 247     -m, --mode <remove or count>  [default: remove]
 248         Specify whether to remove invalid molecules and write out filtered molecules
 249         to output file or or simply count the number of invalid molecules.
 250     -o, --outfile <outfile>
 251         Output file name.
 252     --outfileParams <Name,Value,...>  [default: auto]
 253         A comma delimited list of parameter name and value pairs for writing
 254         molecules to files. The supported parameter names for different file
 255         formats, along with their default values, are shown below:
 256             
 257             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 258             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 259                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 260             
 261         Default value for compute2DCoords: yes for SMILES input file; no for all other
 262         file types.
 263     --overwrite
 264         Overwrite existing files.
 265     -w, --workingdir <dir>
 266         Location of working directory which defaults to the current directory.
 267 
 268 Examples:
 269     To remove invalid molecules and generate an output file SMILES file
 270     containing valid molecules, type:
 271 
 272         % RDKitRemoveInvalidMolecules.py -i Sample.smi -o SampleOut.smi
 273 
 274     To count number of valid and invaid molecules without generating any
 275     output file, type:
 276 
 277         % RDKitRemoveInvalidMolecules.py -m count -i Sample.sdf
 278 
 279     To remove invalid molecules from a CSV SMILES file, SMILES strings in
 280     column 1, name in column 2, and generate output SD file containing valid
 281     molecules, type:
 282 
 283         % RDKitRemoveInvalidMolecules.py --infileParams 
 284           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 285           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 286           -i SampleSMILES.csv -o SampleOut.sdf
 287 
 288 Author:
 289     Manish Sud(msud@san.rr.com)
 290 
 291 See also:
 292     RDKitConvertFileFormat.py, RDKitRemoveDuplicateMolecules.py,
 293     RDKitRemoveSalts, RDKitSearchFunctionalGroups.py,
 294     RDKitSearchSMARTS.py, RDKitStandardizeMolecules.py
 295 
 296 Copyright:
 297     Copyright (C) 2024 Manish Sud. All rights reserved.
 298 
 299     The functionality available in this script is implemented using RDKit, an
 300     open source toolkit for cheminformatics developed by Greg Landrum.
 301 
 302     This file is part of MayaChemTools.
 303 
 304     MayaChemTools is free software; you can redistribute it and/or modify it under
 305     the terms of the GNU Lesser General Public License as published by the Free
 306     Software Foundation; either version 3 of the License, or (at your option) any
 307     later version.
 308 
 309 """
 310 
 311 if __name__ == "__main__":
 312     main()