MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: FilterSDFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Benchmark;
  31 use SDFileUtil;
  32 use FileUtil;
  33 
  # File-scope state; %Options and $ScriptName are also used by the subroutines below.
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

  # Autoflush STDOUT
  $| = 1;

  # Announce the start and take the starting timestamp...
  $ScriptName = basename($0);
  print "\n$ScriptName:Starting...\n\n";
  $StartTime = Benchmark->new;

  # Retrieve the options; show usage on request or when no input file is named...
  SetupScriptUsage();
  die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || !@ARGV);

  my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

  # Process options...
  print "Processing options...\n";
  my(%OptionsInfo);
  ProcessOptions();

  print "Checking input SD file(s)...\n";
  my(%SDFilesInfo);
  RetrieveSDFilesInfo();

  # Generate output files for every input file marked okay...
  my(%FilteredSDFileInfo);
  print "\nProcessing SD files...\n" if (@SDFilesList > 1);
  for my $FileIndex (0 .. $#SDFilesList) {
    next unless $SDFilesInfo{FileOkay}[$FileIndex];
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    FilterSDFile($FileIndex);
  }
  print "\n$ScriptName:Done...\n\n";

  # Report the elapsed time...
  $EndTime = Benchmark->new;
  $TotalTime = timediff($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
  78 
  79 ###############################################################################
  80 
  # Filter SD file...
  #
  # Reads every compound from the input SD file selected by $Index and writes it
  # either to the filtered output file or, when it is filtered out, to the
  # "Ignored" output file (only written with --keep).
  #
  # Parameters:
  #   $Index - index into @SDFilesList; the same index selects the output file
  #            names in $SDFilesInfo{OutFile} and $SDFilesInfo{OutFileKeep}.
  #
  # Side effects: resets and updates %FilteredSDFileInfo, (re)creates the output
  # file(s) and prints progress plus summary counts to STDOUT.
  #
  # Dies when a file can't be opened or an output file can't be closed.
  sub FilterSDFile {
    my($Index) = @_;
    my($SDFile, $NewSDFile, $NewKeepSDFile, $CtabLinesCount, $CmpdString, $PrintCmpdCounterHeader, @CmpdLines);

    $SDFile = $SDFilesList[$Index];
    $NewSDFile = $SDFilesInfo{OutFile}[$Index];
    $NewKeepSDFile = $SDFilesInfo{OutFileKeep}[$Index];

    # Three-argument open: the file name is never interpreted as a mode or a pipe.
    # The bareword handles NEWSDFILE and NEWKEEPSDFILE are kept because
    # WriteOutCmpdString prints to them by name.
    open(NEWSDFILE, '>', $NewSDFile) or die "Error: Couldn't open $NewSDFile: $! \n";
    if ($OptionsInfo{Keep}) {
      open(NEWKEEPSDFILE, '>', $NewKeepSDFile) or die "Error: Couldn't open $NewKeepSDFile: $! \n";
    }
    open(SDFILE, '<', $SDFile) or die "Error: Can't open $SDFile: $! \n";

    print "\nGenerating SD file $NewSDFile...\n";
    if ($OptionsInfo{Keep}) {
      print "Generating file $NewKeepSDFile...\n";
    }

    %FilteredSDFileInfo = (
      CmpdCount => 0,
      FilterCmpd => 0,
      FilteredCmpdCount => 0,
      KeepCmpdCount => 0,
    );

    $PrintCmpdCounterHeader = 1;

    while ($CmpdString = ReadCmpdString(\*SDFILE)) {
      $FilteredSDFileInfo{CmpdCount} += 1;

      # WriteOutCmpdString reads this flag to pick the destination file.
      $FilteredSDFileInfo{FilterCmpd} = 0;

      # Progress indicator: one entry per 5000 compounds, header printed once.
      if (($FilteredSDFileInfo{CmpdCount} % 5000) == 0) {
        if ($PrintCmpdCounterHeader) {
          $PrintCmpdCounterHeader = 0;
          print "\nProcessing compounds:";
        }
        print "$FilteredSDFileInfo{CmpdCount}...";
      }

      @CmpdLines = split "\n", $CmpdString;
      $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

      if ($CtabLinesCount <= 0) {
        # No ctab lines reported: always filtered, regardless of options.
        $FilteredSDFileInfo{FilterCmpd} = 1;
      }
      else {
        my($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]);

        if ($CtabLinesCount != ($AtomCount + $BondCount)) {
          # Ctab line count disagrees with the counts line: filtered only on
          # request; otherwise written unchanged with no further checks.
          if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
            $FilteredSDFileInfo{FilterCmpd} = 1;
          }
        }
        elsif (($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) && (GetUnknownAtoms(\@CmpdLines))[0]) {
          # First value returned by GetUnknownAtoms is the unknown atom count.
          $FilteredSDFileInfo{FilterCmpd} = 1;
        }
        elsif ($OptionsInfo{All} || $OptionsInfo{CleanSalts} || $OptionsInfo{Salts}) {
          my($FragmentsCount, undef, $WashedCmpdString) = WashCmpd(\@CmpdLines);
          if ($FragmentsCount > 1) {
            if ($OptionsInfo{All} || $OptionsInfo{CleanSalts}) {
              # Keep the compound, but write its washed form.
              $CmpdString = $WashedCmpdString;
            }
            else {
              # --salts alone: filter multi-fragment compounds out.
              $FilteredSDFileInfo{FilterCmpd} = 1;
            }
          }
        }
      }
      WriteOutCmpdString($CmpdString);
    }
    if (!$PrintCmpdCounterHeader) {
      print "\n";
    }

    # Buffered write errors surface at close, so check the output handles.
    close(NEWSDFILE) or die "Error: Couldn't close $NewSDFile: $! \n";
    if ($OptionsInfo{Keep}) {
      close(NEWKEEPSDFILE) or die "Error: Couldn't close $NewKeepSDFile: $! \n";
    }
    close SDFILE;

    print "\nTotal Number of compounds: $FilteredSDFileInfo{CmpdCount}\n";
    print "Number of compounds left after filtering: $FilteredSDFileInfo{FilteredCmpdCount}\n";
    print "Number of compounds ignored: $FilteredSDFileInfo{KeepCmpdCount}\n";
  }
 172 
 # Write out the compound data...
 #
 # Routes one compound record according to $FilteredSDFileInfo{FilterCmpd}:
 # a compound that is not filtered goes to NEWSDFILE; a filtered one is counted
 # as ignored and goes to NEWKEEPSDFILE only when the keep option is set.
 # A newline is appended to the record in both cases.
 sub WriteOutCmpdString {
   my($CmpdString) = @_;
   my $Record = "$CmpdString\n";

   if (!$FilteredSDFileInfo{FilterCmpd}) {
     $FilteredSDFileInfo{FilteredCmpdCount}++;
     print NEWSDFILE $Record;
   }
   else {
     $FilteredSDFileInfo{KeepCmpdCount}++;
     print NEWKEEPSDFILE $Record if $OptionsInfo{Keep};
   }
 }
 188 
 # Retrieve information about input SD files...
 #
 # For every entry in @SDFilesList this checks the input file, works out the
 # output file names and records the result in %SDFilesInfo, indexed like
 # @SDFilesList:
 #   FileOkay    - 1 when the file is to be processed, 0 when it is skipped
 #   OutFile     - name of the filtered output file
 #   OutFileKeep - name of the "Ignored" output file (written only with --keep)
 #
 # A skipped file gets a warning, FileOkay 0 and empty output names.
 sub RetrieveSDFilesInfo {
   %SDFilesInfo = ();
   @{$SDFilesInfo{FileOkay}} = ();
   @{$SDFilesInfo{OutFile}} = ();
   @{$SDFilesInfo{OutFileKeep}} = ();

   FILELIST: for my $Index (0 .. $#SDFilesList) {
     my $SDFile = $SDFilesList[$Index];

     # Treat the file as skipped until every check below has passed.
     $SDFilesInfo{FileOkay}[$Index] = 0;
     $SDFilesInfo{OutFile}[$Index] = '';
     $SDFilesInfo{OutFileKeep}[$Index] = '';

     if (!(-e $SDFile)) {
       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
       next FILELIST;
     }
     if (!CheckFileType($SDFile, "sd sdf")) {
       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
       next FILELIST;
     }

     # Setup new file names...
     my(undef, $FileName, $FileExt) = ParseFileName($SDFile);
     my($NewSDFile, $NewKeepSDFile);
     if ($Options{root} && (@SDFilesList == 1)) {
       # The root option is honoured only for a single input file. When
       # ParseFileName reports both a name and an extension for the root value,
       # only the name part is used; the input file's extension is appended below.
       my(undef, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
       $NewSDFile = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
       $NewKeepSDFile = $NewSDFile;
     }
     else {
       $NewSDFile = $FileName . "Filtered";
       $NewKeepSDFile = $FileName;
     }
     $NewSDFile .= ".$FileExt";
     $NewKeepSDFile .= "Ignored" . ".$FileExt";

     # Never write over the input file. These checks run before the "already
     # exists" checks so the specific reason is reported, and they also cover
     # the keep file: with --overwrite it would otherwise be opened for writing
     # and truncate the input before it is read.
     if (lc($NewSDFile) eq lc($SDFile)) {
       warn "Warning: Ignoring file $SDFile: Same output, $NewSDFile, and input file name\n";
       print "Specify a different name using \"-r --root\" option or use default name.\n";
       next FILELIST;
     }
     if ($Options{keep} && (lc($NewKeepSDFile) eq lc($SDFile))) {
       warn "Warning: Ignoring file $SDFile: Same output, $NewKeepSDFile, and input file name\n";
       print "Specify a different name using \"-r --root\" option or use default name.\n";
       next FILELIST;
     }

     if (!$Options{overwrite}) {
       if (-e $NewSDFile) {
         warn "Warning: Ignoring file $SDFile: New SD file, $NewSDFile, already exists\n";
         next FILELIST;
       }
       if ($Options{keep} && (-e $NewKeepSDFile)) {
         warn "Warning: Ignoring file $SDFile: New SD file, $NewKeepSDFile, already exists\n";
         next FILELIST;
       }
     }

     $SDFilesInfo{FileOkay}[$Index] = 1;
     $SDFilesInfo{OutFile}[$Index] = $NewSDFile;
     $SDFilesInfo{OutFileKeep}[$Index] = $NewKeepSDFile;
   }
 }
 256 
 # Process option values...
 #
 # Rebuilds %OptionsInfo from the raw switches in %Options. Each switch is
 # copied under its capitalized key; a switch that is unset or false is stored
 # as undef.
 sub ProcessOptions {
   # %OptionsInfo key => %Options key
   my %SwitchFor = (
     All => 'all',
     CleanSalts => 'cleansalts',
     Empty => 'empty',
     Keep => 'keep',
     Mismatch => 'mismatch',
     Overwrite => 'overwrite',
     Salts => 'salts',
     UnknownAtoms => 'unknownatoms',
   );

   %OptionsInfo = ();
   for my $InfoKey (sort keys %SwitchFor) {
     my $Value = $Options{$SwitchFor{$InfoKey}};
     $OptionsInfo{$InfoKey} = $Value ? $Value : undef;
   }
 }
 271 
 # Setup script usage and retrieve command line arguments specified using various options...
 #
 # Fills %Options via GetOptions and dies with a hint about "-h" when the
 # command line can't be parsed. When a working directory is specified, it must
 # be an existing directory and becomes the current directory.
 sub SetupScriptUsage {
   my @OptionSpecs = ("all|a", "cleansalts|c", "empty|e", "help|h", "keep|k", "mismatch|m", "overwrite|o", "root|r=s", "salts|s", "unknownatoms|u", "workingdir|w=s");

   # Retrieve all the options...
   %Options = ();
   GetOptions(\%Options, @OptionSpecs)
     or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

   if ($Options{workingdir}) {
     my $WorkingDir = $Options{workingdir};
     unless (-d $WorkingDir) {
       die "Error: The value specified, $WorkingDir, for option \"-w --workingdir\" is not a directory name.\n";
     }
     chdir $WorkingDir or die "Error: Couldn't chdir $WorkingDir: $! \n";
   }
 }
 287