MayaChemTools

   1 #!/bin/env python
   2 #
   3 # File: RDKitRemoveInvalidMolecules.py
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2025 Manish Sud. All rights reserved.
   7 #
   8 # The functionality available in this script is implemented using RDKit, an
   9 # open source toolkit for cheminformatics developed by Greg Landrum.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 from __future__ import print_function
  30 
  31 import os
  32 import sys
  33 import time
  34 import re
  35 
  36 # RDKit imports...
  37 try:
  38     from rdkit import rdBase
  39     from rdkit.Chem import AllChem
  40 except ImportError as ErrMsg:
  41     sys.stderr.write("\nFailed to import RDKit module/package: %s\n" % ErrMsg)
  42     sys.stderr.write("Check/update your RDKit environment and try again.\n\n")
  43     sys.exit(1)
  44 
  45 # MayaChemTools imports...
  46 sys.path.insert(0, os.path.join(os.path.dirname(sys.argv[0]), "..", "lib", "Python"))
  47 try:
  48     from docopt import docopt
  49     import MiscUtil
  50     import RDKitUtil
  51 except ImportError as ErrMsg:
  52     sys.stderr.write("\nFailed to import MayaChemTools module/package: %s\n" % ErrMsg)
  53     sys.stderr.write("Check/update your MayaChemTools environment and try again.\n\n")
  54     sys.exit(1)
  55 
  56 ScriptName = os.path.basename(sys.argv[0])
  57 Options = {}
  58 OptionsInfo = {}
  59 
  60 
  61 def main():
  62     """Start execution of the script."""
  63 
  64     MiscUtil.PrintInfo(
  65         "\n%s (RDKit v%s; MayaChemTools v%s; %s): Starting...\n"
  66         % (ScriptName, rdBase.rdkitVersion, MiscUtil.GetMayaChemToolsVersion(), time.asctime())
  67     )
  68 
  69     (WallClockTime, ProcessorTime) = MiscUtil.GetWallClockAndProcessorTime()
  70 
  71     # Retrieve command line arguments and options...
  72     RetrieveOptions()
  73 
  74     # Process and validate command line arguments and options...
  75     ProcessOptions()
  76 
  77     # Perform actions required by the script...
  78     RemoveInvalidMolecules()
  79 
  80     MiscUtil.PrintInfo("\n%s: Done...\n" % ScriptName)
  81     MiscUtil.PrintInfo("Total time: %s" % MiscUtil.GetFormattedElapsedTime(WallClockTime, ProcessorTime))
  82 
  83 
  84 def RemoveInvalidMolecules():
  85     """Identify and remove invalid molecules."""
  86 
  87     Infile = OptionsInfo["Infile"]
  88     Outfile = OptionsInfo["Outfile"]
  89     InvalidMolsOutfile = OptionsInfo["InvalidMolsOutfile"]
  90 
  91     CountMode = OptionsInfo["CountMode"]
  92 
  93     # Setup a molecule reader...
  94     MiscUtil.PrintInfo("\nProcessing file %s..." % Infile)
  95     Mols = RDKitUtil.ReadMolecules(Infile, **OptionsInfo["InfileParams"])
  96 
  97     Writer = None
  98     if not CountMode:
  99         # Set up a molecule writer...
 100         Writer = RDKitUtil.MoleculesWriter(Outfile, **OptionsInfo["OutfileParams"])
 101         if Writer is None:
 102             MiscUtil.PrintError("Failed to setup a writer for output fie %s " % Outfile)
 103 
 104         # Set up a invalid molecule number writer...
 105         InvalidMolsWriter = open(InvalidMolsOutfile, "w")
 106         if InvalidMolsWriter is None:
 107             MiscUtil.PrintError("Failed to open output fie %s " % InvalidMolsOutfile)
 108         InvalidMolsWriter.write("MolName\n")
 109 
 110         MiscUtil.PrintInfo("Generating files %s and %s..." % (Outfile, InvalidMolsOutfile))
 111 
 112     # Process molecules...
 113     MolCount = 0
 114     ValidMolCount = 0
 115 
 116     Compute2DCoords = OptionsInfo["OutfileParams"]["Compute2DCoords"]
 117     SetSMILESMolProps = OptionsInfo["OutfileParams"]["SetSMILESMolProps"]
 118 
 119     FirstMol = True
 120     for Mol in Mols:
 121         MolCount += 1
 122 
 123         if Mol is None:
 124             MolName = "Mol%s" % MolCount
 125             MiscUtil.PrintWarning("Ignoring invalid molecule:  %s" % MolName)
 126             if not CountMode:
 127                 InvalidMolsWriter.write("%s\n" % MolName)
 128             continue
 129 
 130         if RDKitUtil.IsMolEmpty(Mol):
 131             MolName = RDKitUtil.GetMolName(Mol, MolCount)
 132             MiscUtil.PrintWarning("Ignoring invalid empty molecule: %s" % MolName)
 133             if not CountMode:
 134                 InvalidMolsWriter.write("%s\n" % MolName)
 135             continue
 136         ValidMolCount += 1
 137 
 138         if FirstMol:
 139             FirstMol = False
 140             if not CountMode:
 141                 if SetSMILESMolProps:
 142                     RDKitUtil.SetWriterMolProps(Writer, Mol)
 143 
 144         if Compute2DCoords:
 145             if not CountMode:
 146                 AllChem.Compute2DCoords(Mol)
 147 
 148         if not CountMode:
 149             Writer.write(Mol)
 150 
 151     if Writer is not None:
 152         Writer.close()
 153 
 154     MiscUtil.PrintInfo("\nTotal number of molecules: %d" % MolCount)
 155     MiscUtil.PrintInfo("Number of valid molecules: %d" % ValidMolCount)
 156     MiscUtil.PrintInfo("Number of invalid molecules: %d" % (MolCount - ValidMolCount))
 157 
 158 
 159 def ProcessOptions():
 160     """Process and validate command line arguments and options."""
 161 
 162     MiscUtil.PrintInfo("Processing options...")
 163 
 164     # Validate options...
 165     ValidateOptions()
 166 
 167     OptionsInfo["Infile"] = Options["--infile"]
 168     OptionsInfo["InfileParams"] = MiscUtil.ProcessOptionInfileParameters(
 169         "--infileParams", Options["--infileParams"], Options["--infile"]
 170     )
 171 
 172     OptionsInfo["Outfile"] = Options["--outfile"]
 173     OptionsInfo["OutfileParams"] = MiscUtil.ProcessOptionOutfileParameters(
 174         "--outfileParams", Options["--outfileParams"], Options["--infile"], Options["--outfile"]
 175     )
 176 
 177     OptionsInfo["Overwrite"] = Options["--overwrite"]
 178 
 179     OptionsInfo["CountMode"] = True if re.match("^count$", Options["--mode"], re.I) else False
 180 
 181     # Setup outfile for writing out molecule number for invalid molecules...
 182     OptionsInfo["InvalidMolsOutfile"] = ""
 183     if not OptionsInfo["CountMode"]:
 184         FileDir, FileName, FileExt = MiscUtil.ParseFileName(OptionsInfo["Outfile"])
 185         OptionsInfo["InvalidMolsOutfile"] = "%sInvalidMols.csv" % (FileName)
 186 
 187 
 188 def RetrieveOptions():
 189     """Retrieve command line arguments and options."""
 190 
 191     # Get options...
 192     global Options
 193     Options = docopt(_docoptUsage_)
 194 
 195     # Set current working directory to the specified directory...
 196     WorkingDir = Options["--workingdir"]
 197     if WorkingDir:
 198         os.chdir(WorkingDir)
 199 
 200     # Handle examples option...
 201     if "--examples" in Options and Options["--examples"]:
 202         MiscUtil.PrintInfo(MiscUtil.GetExamplesTextFromDocOptText(_docoptUsage_))
 203         sys.exit(0)
 204 
 205 
 206 def ValidateOptions():
 207     """Validate option values."""
 208 
 209     MiscUtil.ValidateOptionFilePath("-i, --infile", Options["--infile"])
 210     MiscUtil.ValidateOptionFileExt("-i, --infile", Options["--infile"], "sdf sd smi txt csv tsv")
 211 
 212     if Options["--outfile"]:
 213         MiscUtil.ValidateOptionFileExt("-o, --outfile", Options["--outfile"], "sdf sd smi")
 214         MiscUtil.ValidateOptionsOutputFileOverwrite(
 215             "-o, --outfile", Options["--outfile"], "--overwrite", Options["--overwrite"]
 216         )
 217         MiscUtil.ValidateOptionsDistinctFileNames(
 218             "-i, --infile", Options["--infile"], "-o, --outfile", Options["--outfile"]
 219         )
 220 
 221     MiscUtil.ValidateOptionTextValue("-m, --mode", Options["--mode"], "remove count")
 222     if re.match("^remove$", Options["--mode"], re.I):
 223         if not Options["--outfile"]:
 224             MiscUtil.PrintError(
 225                 'The outfile must be specified using "-o, --outfile" during "remove" value of "-m, --mode" option'
 226             )
 227 
 228 
 229 # Setup a usage string for docopt...
 230 _docoptUsage_ = """
 231 RDKitRemoveInvalidMolecules.py - Remove invalid molecules
 232 
 233 Usage:
 234     RDKitRemoveInvalidMolecules.py  [--infileParams <Name,Value,...>]
 235                               [--mode <remove or count>] [ --outfileParams <Name,Value,...> ] 
 236                               [--overwrite] [-w <dir>] [-o <outfile>]  -i <infile>
 237     RDKitRemoveInvalidMolecules.py -h | --help | -e | --examples
 238 
 239 Description:
 240     Identify and remove invalid molecules based on success or failure of RDKit molecule
 241     readers or simply count the number of invalid molecules.
 242 
 243     The supported input file formats are: SD (.sdf, .sd), SMILES (.smi., csv, .tsv, .txt)
 244 
 245     The supported output file formats are: SD (.sdf, .sd), SMILES (.smi)
 246 
 247 Options:
 248     -e, --examples
 249         Print examples.
 250     -h, --help
 251         Print this help message.
 252     -i, --infile <infile>
 253         Input file name.
 254     --infileParams <Name,Value,...>  [default: auto]
 255         A comma delimited list of parameter name and value pairs for reading
 256         molecules from files. The supported parameter names for different file
 257         formats, along with their default values, are shown below:
 258             
 259             SD: removeHydrogens,yes,sanitize,yes,strictParsing,yes
 260             SMILES: smilesColumn,1,smilesNameColumn,2,smilesDelimiter,space,
 261                 smilesTitleLine,auto,sanitize,yes
 262             
 263         Possible values for smilesDelimiter: space, comma or tab.
 264     -m, --mode <remove or count>  [default: remove]
 265         Specify whether to remove invalid molecules and write out filtered molecules
 266         to output file or or simply count the number of invalid molecules.
 267     -o, --outfile <outfile>
 268         Output file name.
 269     --outfileParams <Name,Value,...>  [default: auto]
 270         A comma delimited list of parameter name and value pairs for writing
 271         molecules to files. The supported parameter names for different file
 272         formats, along with their default values, are shown below:
 273             
 274             SD: compute2DCoords,auto,kekulize,yes,forceV3000,no
 275             SMILES: smilesKekulize,no,smilesDelimiter,space, smilesIsomeric,yes,
 276                 smilesTitleLine,yes,smilesMolName,yes,smilesMolProps,no
 277             
 278         Default value for compute2DCoords: yes for SMILES input file; no for all other
 279         file types.
 280     --overwrite
 281         Overwrite existing files.
 282     -w, --workingdir <dir>
 283         Location of working directory which defaults to the current directory.
 284 
 285 Examples:
 286     To remove invalid molecules and generate an output file SMILES file
 287     containing valid molecules, type:
 288 
 289         % RDKitRemoveInvalidMolecules.py -i Sample.smi -o SampleOut.smi
 290 
 291     To count number of valid and invaid molecules without generating any
 292     output file, type:
 293 
 294         % RDKitRemoveInvalidMolecules.py -m count -i Sample.sdf
 295 
 296     To remove invalid molecules from a CSV SMILES file, SMILES strings in
 297     column 1, name in column 2, and generate output SD file containing valid
 298     molecules, type:
 299 
 300         % RDKitRemoveInvalidMolecules.py --infileParams 
 301           "smilesDelimiter,comma,smilesTitleLine,yes,smilesColumn,1,
 302           smilesNameColumn,2" --outfileParams "compute2DCoords,yes"
 303           -i SampleSMILES.csv -o SampleOut.sdf
 304 
 305 Author:
 306     Manish Sud(msud@san.rr.com)
 307 
 308 See also:
 309     RDKitConvertFileFormat.py, RDKitRemoveDuplicateMolecules.py,
 310     RDKitRemoveSalts, RDKitSearchFunctionalGroups.py,
 311     RDKitSearchSMARTS.py, RDKitStandardizeMolecules.py
 312 
 313 Copyright:
 314     Copyright (C) 2025 Manish Sud. All rights reserved.
 315 
 316     The functionality available in this script is implemented using RDKit, an
 317     open source toolkit for cheminformatics developed by Greg Landrum.
 318 
 319     This file is part of MayaChemTools.
 320 
 321     MayaChemTools is free software; you can redistribute it and/or modify it under
 322     the terms of the GNU Lesser General Public License as published by the Free
 323     Software Foundation; either version 3 of the License, or (at your option) any
 324     later version.
 325 
 326 """
 327 
 328 if __name__ == "__main__":
 329     main()