1 #!/usr/bin/perl -w 2 # 3 # File: InfoSequenceFiles.pl 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use FindBin; use lib "$FindBin::Bin/../lib"; 28 use Getopt::Long; 29 use File::Basename; 30 use Text::ParseWords; 31 use Benchmark; 32 use FileUtil; 33 use TextUtil; 34 use SequenceFileUtil; 35 use StatisticsUtil; 36 37 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 38 39 # Autoflush STDOUT 40 $| = 1; 41 42 # Starting message... 43 $ScriptName = basename($0); 44 print "\n$ScriptName: Starting...\n\n"; 45 $StartTime = new Benchmark; 46 47 # Get the options and setup script... 48 SetupScriptUsage(); 49 if ($Options{help} || @ARGV < 1) { 50 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 51 } 52 53 my(@SequenceFilesList); 54 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir"); 55 56 print "Processing options...\n"; 57 my(%OptionsInfo); 58 ProcessOptions(); 59 60 print "Checking input sequence file(s)...\n"; 61 my(%SequenceFilesInfo); 62 RetrieveSequenceFilesInfo(); 63 64 my($FileIndex); 65 if (@SequenceFilesList > 1) { 66 print "\nProcessing sequence files...\n"; 67 } 68 for $FileIndex (0 .. $#SequenceFilesList) { 69 if ($SequenceFilesInfo{FileOkay}[$FileIndex]) { 70 print "\nProcessing file $SequenceFilesList[$FileIndex]...\n"; 71 ListSequenceFileInfo($FileIndex); 72 } 73 } 74 ListTotalSizeOfFiles(); 75 76 print "\n$ScriptName:Done...\n\n"; 77 78 $EndTime = new Benchmark; 79 $TotalTime = timediff ($EndTime, $StartTime); 80 print "Total time: ", timestr($TotalTime), "\n"; 81 82 ############################################################################### 83 84 # List appropriate information... 85 sub ListSequenceFileInfo { 86 my($Index) = @_; 87 my($SequenceFile, $SequenceDataRef); 88 89 $SequenceFile = $SequenceFilesList[$Index]; 90 91 $SequenceDataRef = ReadSequenceFile($SequenceFile); 92 93 my($SequencesCount) = $SequenceDataRef->{Count}; 94 print "\nNumber of sequences: $SequencesCount\n"; 95 96 if ($OptionsInfo{ListShortestSequence} && ($SequencesCount > 1)) { 97 my($ShortestSeqID, $ShortestSeq, $ShortestSeqLen, $Description) = GetShortestSequence($SequenceDataRef, $OptionsInfo{IgnoreGaps}); 98 print "\nShortest sequence information:\nID: $ShortestSeqID; Length:$ShortestSeqLen\n"; 99 if ($OptionsInfo{DetailLevel} >= 2) { 100 print "Description: $Description\n"; 101 } 102 if ($OptionsInfo{DetailLevel} >= 3) { 103 print "Sequence: $ShortestSeq\n"; 104 } 105 } 106 if ($OptionsInfo{ListLongestSequence} && ($SequencesCount > 1)) { 107 my($LongestSeqID, $LongestSeq, $LongestSeqLen, $Description) = GetLongestSequence($SequenceDataRef, $OptionsInfo{IgnoreGaps}); 108 print "\nLongest sequence information:\nID: $LongestSeqID; Length: $LongestSeqLen\n"; 109 if ($OptionsInfo{DetailLevel} >= 2) { 110 print "Description: $Description\n"; 111 } 112 if ($OptionsInfo{DetailLevel} >= 3) { 113 print "Sequence: $LongestSeq\n"; 114 } 115 } 116 if ($OptionsInfo{FrequencyAnalysis} && ($SequencesCount > 1)) { 117 PerformLengthFrequencyAnalysis($SequenceDataRef); 118 } 119 if ($OptionsInfo{ListSequenceLengths}) { 120 ListSequenceLengths($SequenceDataRef); 121 } 122 123 # File size and modification information... 124 print "\nFile size: ", FormatFileSize($SequenceFilesInfo{FileSize}[$Index]), " \n"; 125 print "Last modified: ", $SequenceFilesInfo{FileLastModified}[$Index], " \n"; 126 } 127 128 # List information about sequence lengths... 129 sub ListSequenceLengths { 130 my($SequenceDataRef) = @_; 131 my($ID, $SeqLen, $Sequence, $Description); 132 133 print "\nSequence lengths information:\n"; 134 for $ID (@{$SequenceDataRef->{IDs}}) { 135 $Sequence = $SequenceDataRef->{Sequence}{$ID}; 136 $Description = $SequenceDataRef->{Description}{$ID}; 137 $SeqLen = GetSequenceLength($Sequence, $OptionsInfo{IgnoreGaps}); 138 if ($OptionsInfo{IgnoreGaps}) { 139 $Sequence = RemoveSequenceGaps($Sequence); 140 } 141 print "ID: $ID; Length:$SeqLen\n"; 142 if ($OptionsInfo{DetailLevel} >= 2) { 143 print "Description: $Description\n"; 144 } 145 if ($OptionsInfo{DetailLevel} >= 3) { 146 print "Sequence: $Sequence\n"; 147 } 148 if ($OptionsInfo{DetailLevel} >= 2) { 149 print "\n"; 150 } 151 } 152 } 153 154 # Total size of all the fiels... 155 sub ListTotalSizeOfFiles { 156 my($FileOkayCount, $TotalSize, $Index); 157 158 $FileOkayCount = 0; 159 $TotalSize = 0; 160 161 for $Index (0 .. $#SequenceFilesList) { 162 if ($SequenceFilesInfo{FileOkay}[$Index]) { 163 $FileOkayCount++; 164 $TotalSize += $SequenceFilesInfo{FileSize}[$Index]; 165 } 166 } 167 if ($FileOkayCount > 1) { 168 print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n"; 169 } 170 } 171 172 173 # Perform frequency analysis of sequence lengths 174 sub PerformLengthFrequencyAnalysis { 175 my($SequenceDataRef, $SequenceLengthsRef) = @_; 176 my ($ID, $SeqLen, $Sequence, $SequenceLenBin, $LenBin, $SequenceLenCount, @SequenceLengths, %SequenceLenFrequency); 177 178 @SequenceLengths = (); 179 %SequenceLenFrequency = (); 180 for $ID (@{$SequenceDataRef->{IDs}}) { 181 $Sequence = $SequenceDataRef->{Sequence}{$ID}; 182 $SeqLen = GetSequenceLength($Sequence, $OptionsInfo{IgnoreGaps}); 183 push @SequenceLengths, $SeqLen; 184 } 185 if (@{$OptionsInfo{BinRange}}) { 186 %SequenceLenFrequency = Frequency(\@SequenceLengths, \@{$OptionsInfo{BinRange}}); 187 } 188 else { 189 %SequenceLenFrequency = Frequency(\@SequenceLengths, $OptionsInfo{NumOfBins}); 190 } 191 print "\nDistribution of sequence lengths (LengthBin => Count):\n"; 192 for $SequenceLenBin (sort { $a <=> $b} keys %SequenceLenFrequency) { 193 $SequenceLenCount = $SequenceLenFrequency{$SequenceLenBin}; 194 $LenBin = sprintf("%.1f", $SequenceLenBin) + 0; 195 print "$LenBin => $SequenceLenCount; "; 196 } 197 print "\n"; 198 } 199 200 # Retrieve information about sequence files... 201 sub RetrieveSequenceFilesInfo { 202 my($Index, $SequenceFile, $FileSupported, $FileFormat, $ModifiedTimeString, $ModifiedDateString); 203 204 %SequenceFilesInfo = (); 205 @{$SequenceFilesInfo{FileOkay}} = (); 206 @{$SequenceFilesInfo{FileFormat}} = (); 207 @{$SequenceFilesInfo{FileSize}} = (); 208 @{$SequenceFilesInfo{FileLastModified}} = (); 209 210 FILELIST: for $Index (0 .. $#SequenceFilesList) { 211 $SequenceFile = $SequenceFilesList[$Index]; 212 213 if (! open SEQUENCEFILE, "$SequenceFile") { 214 warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n"; 215 next FILELIST; 216 } 217 close SEQUENCEFILE; 218 219 $SequenceFilesInfo{FileOkay}[$Index] = 0; 220 $SequenceFilesInfo{FileFormat}[$Index] = 'NotSupported'; 221 $SequenceFilesInfo{FileSize}[$Index] = 0; 222 $SequenceFilesInfo{FileLastModified}[$Index] = ''; 223 224 ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile); 225 if (!$FileSupported) { 226 warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n"; 227 next FILELIST; 228 } 229 230 $SequenceFilesInfo{FileOkay}[$Index] = 1; 231 $SequenceFilesInfo{FileFormat}[$Index] = $FileFormat; 232 $SequenceFilesInfo{FileSize}[$Index] = FileSize($SequenceFile); 233 234 ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SequenceFile); 235 $SequenceFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString"; 236 } 237 } 238 239 # Process option values... 240 sub ProcessOptions { 241 242 $OptionsInfo{All} = defined $Options{all} ? $Options{all} : undef; 243 244 $OptionsInfo{Count} = defined $Options{count} ? $Options{count} : undef; 245 $OptionsInfo{DetailLevel} = $Options{detail}; 246 $OptionsInfo{Frequency} = defined $Options{frequency} ? $Options{frequency} : undef; 247 $OptionsInfo{FrequencyBins} = defined $Options{frequencybins} ? $Options{frequencybins} : undef; 248 $OptionsInfo{IgnoreGaps} = defined $Options{ignoregaps} ? $Options{ignoregaps} : undef; 249 $OptionsInfo{Longest} = defined $Options{longest} ? $Options{longest} : undef; 250 $OptionsInfo{Shortest} = defined $Options{shortest} ? $Options{shortest} : undef; 251 $OptionsInfo{SequenceLengths} = defined $Options{sequencelengths} ? $Options{sequencelengths} : undef; 252 253 $OptionsInfo{FrequencyAnalysis} = ($Options{all} || $Options{frequency}) ? 1 : 0; 254 $OptionsInfo{ListLongestSequence} = ($Options{all} || $Options{longest}) ? 1 : 0; 255 $OptionsInfo{ListShortestSequence} = ($Options{all} || $Options{shortest}) ? 1 : 0; 256 $OptionsInfo{ListSequenceLengths} = ($Options{all} || $Options{sequencelengths}) ? 1 : 0; 257 $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0; 258 259 # Setup frequency bin values... 260 $OptionsInfo{NumOfBins} = 4; 261 @{$OptionsInfo{BinRange}} = (); 262 263 if ($Options{frequencybins} =~ /\,/) { 264 my($BinValue, @SpecifiedBinRange); 265 @SpecifiedBinRange = split /\,/, $Options{frequencybins}; 266 if (@SpecifiedBinRange < 2) { 267 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n"; 268 } 269 for $BinValue (@SpecifiedBinRange) { 270 if (!IsNumerical($BinValue)) { 271 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n"; 272 } 273 } 274 my($Index1, $Index2); 275 for $Index1 (0 .. $#SpecifiedBinRange) { 276 for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) { 277 if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) { 278 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n"; 279 } 280 } 281 } 282 push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange; 283 } 284 else { 285 $OptionsInfo{NumOfBins} = $Options{frequencybins}; 286 if (!IsPositiveInteger($OptionsInfo{NumOfBins})) { 287 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n"; 288 } 289 } 290 } 291 292 # Setup script usage and retrieve command line arguments specified using various options... 293 sub SetupScriptUsage { 294 295 # Retrieve all the options... 296 %Options = (); 297 $Options{detail} = 1; 298 $Options{ignoregaps} = 'no'; 299 $Options{frequencybins} = 10; 300 301 if (!GetOptions(\%Options, "all|a", "count|c", "detail|d=i", "frequency|f", "frequencybins=s", "help|h", "ignoregaps|i=s", "longest|l", "shortest|s", "sequencelengths", "workingdir|w=s")) { 302 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 303 } 304 if ($Options{workingdir}) { 305 if (! -d $Options{workingdir}) { 306 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 307 } 308 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 309 } 310 if (!IsPositiveInteger($Options{detail})) { 311 die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"; 312 } 313 if ($Options{ignoregaps} !~ /^(yes|no)$/i) { 314 die "Error: The value specified, $Options{ignoregaps}, for option \"-i --IgnoreGaps\" is not valid. Allowed values: yes or no\n"; 315 } 316 } 317