#!/usr/bin/perl -w
#
# File: FilterSDFiles.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Benchmark;
use SDFileUtil;
use FileUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files...
my($FileIndex, %FilteredSDFileInfo);

# Output handles for the file currently being filtered. They are file-scoped
# because FilterSDFile() opens/closes them and WriteOutCmpdString() writes to them.
my($NewSDFileHandle, $NewKeepSDFileHandle);

if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    FilterSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Filter SD file...
#
# Reads the SD file at position $Index of @SDFilesList one compound at a time
# and routes each compound, via WriteOutCmpdString(), either to the filtered
# output file or, when it is rejected, to the "ignored" tally (and to the keep
# file when the keep option is set).
#
# A compound is rejected when:
#   . its ctab line count is <= 0 (checked for every compound, independent of
#     any option);
#   . the ctab line count differs from atom count + bond count and the all or
#     mismatch option is set;
#   . it contains unknown atoms and the all or unknownatoms option is set;
#   . it has more than one fragment and only the salts option is set.
#
# A compound with more than one fragment is replaced by its washed form, and
# kept, when the all or cleansalts option is set.
#
# Dies when a file can't be opened or an output file can't be closed cleanly.
#
sub FilterSDFile {
  my($Index) = @_;
  my($SDFile, $NewSDFile, $NewKeepSDFile, $SDFileHandle, $CtabLinesCount, $CmpdString, $PrintCmpdCounterHeader, @CmpdLines);

  $SDFile = $SDFilesList[$Index];
  $NewSDFile = $SDFilesInfo{OutFile}[$Index];
  $NewKeepSDFile = $SDFilesInfo{OutFileKeep}[$Index];

  # Three-argument open: the file name is used literally, the same way the -e
  # tests in RetrieveSDFilesInfo() see it; characters such as '>', '|' or
  # surrounding white space in a name are never taken as part of the mode.
  open $NewSDFileHandle, ">", $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
  if ($OptionsInfo{Keep}) {
    open $NewKeepSDFileHandle, ">", $NewKeepSDFile or die "Error: Couldn't open $NewKeepSDFile: $! \n";
  }
  open $SDFileHandle, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";

  print "\nGenerating SD file $NewSDFile...\n";
  if ($OptionsInfo{Keep}) {
    print "Generating file $NewKeepSDFile...\n";
  }

  # Reset per-file counters; FilterCmpd is the per-compound reject flag read by
  # WriteOutCmpdString().
  %FilteredSDFileInfo = ();

  $FilteredSDFileInfo{CmpdCount} = 0; $FilteredSDFileInfo{FilterCmpd} = 0;
  $FilteredSDFileInfo{FilteredCmpdCount} = 0; $FilteredSDFileInfo{KeepCmpdCount} = 0;

  $PrintCmpdCounterHeader = 1;

  CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $FilteredSDFileInfo{CmpdCount} += 1;
    $FilteredSDFileInfo{FilterCmpd} = 0;

    # Progress message after every 5000 compounds...
    if (($FilteredSDFileInfo{CmpdCount} % 5000) == 0) {
      if ($PrintCmpdCounterHeader) {
        $PrintCmpdCounterHeader = 0;
        print "\nProcessing compounds:";
      }
      print "$FilteredSDFileInfo{CmpdCount}...";
    }

    @CmpdLines = split "\n", $CmpdString;
    $CtabLinesCount = GetCtabLinesCount(\@CmpdLines);

    # Empty structure data: always rejected...
    if ($CtabLinesCount <= 0) {
      $FilteredSDFileInfo{FilterCmpd} = 1;
      WriteOutCmpdString($CmpdString);
      next CMPDSTRING;
    }

    # Fourth line of the compound data is passed as the counts line...
    my ($AtomCount, $BondCount) = ParseCmpdCountsLine($CmpdLines[3]);

    if ($OptionsInfo{All} || $OptionsInfo{Mismatch}) {
      if ($CtabLinesCount != ($AtomCount + $BondCount)) {
        $FilteredSDFileInfo{FilterCmpd} = 1;
        WriteOutCmpdString($CmpdString);
        next CMPDSTRING;
      }
    }

    # Atom and fragment checks are performed only for a consistent ctab block...
    if ($CtabLinesCount == ($AtomCount + $BondCount)) {
      if ($OptionsInfo{All} || $OptionsInfo{UnknownAtoms}) {
        my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines) = GetUnknownAtoms(\@CmpdLines);
        if ($UnknownAtomCount) {
          $FilteredSDFileInfo{FilterCmpd} = 1;
          WriteOutCmpdString($CmpdString);
          next CMPDSTRING;
        }
      }
      if ($OptionsInfo{All} || $OptionsInfo{CleanSalts} || $OptionsInfo{Salts}) {
        my ($FragmentsCount, $Fragments, $WashedCmpdString) = WashCmpd(\@CmpdLines);
        if ($FragmentsCount > 1) {
          if ($OptionsInfo{All} || $OptionsInfo{CleanSalts}) {
            # Keep the compound but write out its washed form...
            $CmpdString = $WashedCmpdString;
          }
          else {
            $FilteredSDFileInfo{FilterCmpd} = 1;
          }
          WriteOutCmpdString($CmpdString);
          next CMPDSTRING;
        }
      }
    }
    WriteOutCmpdString($CmpdString);
  }
  if (!$PrintCmpdCounterHeader) {
    print "\n";
  }

  # Buffered write errors (e.g. a full disk) are reported by close; check it so
  # that a truncated output file is never silently reported as a success.
  close $NewSDFileHandle or die "Error: Couldn't close $NewSDFile: $! \n";
  if ($OptionsInfo{Keep}) {
    close $NewKeepSDFileHandle or die "Error: Couldn't close $NewKeepSDFile: $! \n";
  }
  close $SDFileHandle;

  print "\nTotal Number of compounds: $FilteredSDFileInfo{CmpdCount}\n";
  print "Number of compounds left after filtering: $FilteredSDFileInfo{FilteredCmpdCount}\n";
  print "Number of compounds ignored: $FilteredSDFileInfo{KeepCmpdCount}\n";
}

# Write out the compound data...
#
# Uses the FilterCmpd flag set by FilterSDFile() for the current compound:
#   . flag set: the compound is counted as ignored and written to the keep
#     file only when the keep option is set;
#   . flag not set: the compound is counted as filtered output and written to
#     the new SD file.
#
# A newline is appended after the compound string in both cases.
#
sub WriteOutCmpdString {
  my($CmpdString) = @_;

  if ($FilteredSDFileInfo{FilterCmpd}) {
    $FilteredSDFileInfo{KeepCmpdCount} += 1;
    if ($OptionsInfo{Keep}) {
      print $NewKeepSDFileHandle "$CmpdString\n";
    }
  }
  else {
    $FilteredSDFileInfo{FilteredCmpdCount} += 1;
    print $NewSDFileHandle "$CmpdString\n";
  }
}

# Retrieve information about input SD files...
#
# Fills %SDFilesInfo with three arrays indexed like @SDFilesList:
#   FileOkay    - 1 when the file is to be processed, 0 otherwise
#   OutFile     - name of the filtered output file
#   OutFileKeep - name of the file for ignored compounds
#
# A file is skipped, with a warning, when it doesn't exist, isn't a SD file,
# an output file already exists and the overwrite option isn't set, or the
# output name matches the input name (compared case-insensitively).
#
sub RetrieveSDFilesInfo {
  my($Index, $SDFile, $FileDir, $FileName, $FileExt, $NewSDFile, $NewKeepSDFile);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFile}} = ();
  @{$SDFilesInfo{OutFileKeep}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Start out pessimistic; the entry is marked okay only after all checks pass...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFile}[$Index] = '';
    $SDFilesInfo{OutFileKeep}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup new file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    if ($Options{root} && (@SDFilesList == 1)) {
      # The root option is honored only for a single input file...
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $NewSDFile = $RootFileName;
      }
      else {
        $NewSDFile = $Options{root};
      }
      $NewKeepSDFile = $NewSDFile;
    }
    else {
      $NewSDFile = $FileName . "Filtered";
      $NewKeepSDFile = $FileName;
    }
    # Output files always carry the extension of the input file...
    $NewSDFile .= ".$FileExt";
    $NewKeepSDFile .= "Ignored" . ".$FileExt";

    if (!$Options{overwrite}) {
      if (-e $NewSDFile) {
        warn "Warning: Ignoring file $SDFile: New SD file, $NewSDFile, already exists\n";
        next FILELIST;
      }
      if ($Options{keep}) {
        if (-e $NewKeepSDFile) {
          warn "Warning: Ignoring file $SDFile: New SD file, $NewKeepSDFile, already exists\n";
          next FILELIST;
        }
      }
    }
    # Checked even with the overwrite option: never write over the input file...
    if (lc($NewSDFile) eq lc($SDFile)) {
      warn "Warning: Ignoring file $SDFile: Same output, $NewSDFile, and input file name\n";
      print "Specify a different name using \"-r --root\" option or use default name.\n";
      next FILELIST;
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{OutFile}[$Index] = $NewSDFile;
    $SDFilesInfo{OutFileKeep}[$Index] = $NewKeepSDFile;
  }
}

# Process option values...
#
# Copies the command line switches from %Options into %OptionsInfo; a switch
# which wasn't specified is stored as undef.
#
# Note: Empty and Overwrite are stored here but aren't read by any other
# subroutine in this script: FilterSDFile() rejects compounds with empty
# structure data regardless of the empty option and RetrieveSDFilesInfo()
# reads $Options{overwrite} directly.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{All} = $Options{all} ? $Options{all} : undef;
  $OptionsInfo{CleanSalts} = $Options{cleansalts} ? $Options{cleansalts} : undef;
  $OptionsInfo{Empty} = $Options{empty} ? $Options{empty} : undef;
  $OptionsInfo{Keep} = $Options{keep} ? $Options{keep} : undef;
  $OptionsInfo{Mismatch} = $Options{mismatch} ? $Options{mismatch} : undef;
  $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;
  $OptionsInfo{Salts} = $Options{salts} ? $Options{salts} : undef;
  $OptionsInfo{UnknownAtoms} = $Options{unknownatoms} ? $Options{unknownatoms} : undef;

}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Parses the command line into %Options. Dies when an invalid option is
# specified, when the working directory value isn't a directory, or when
# changing to the working directory fails.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  if (!GetOptions(\%Options, "all|a", "cleansalts|c", "empty|e", "help|h", "keep|k", "mismatch|m", "overwrite|o", "root|r=s", "salts|s", "unknownatoms|u", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    # Input file names given on the command line are resolved after this chdir...
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
}