MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: InfoSequenceFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use SequenceFileUtil;
  35 use StatisticsUtil;
  36 
  37 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  38 
  39 # Autoflush STDOUT
  40 $| = 1;
  41 
  42 # Starting message...
  43 $ScriptName = basename($0);
  44 print "\n$ScriptName: Starting...\n\n";
  45 $StartTime = new Benchmark;
  46 
  47 # Get the options and setup script...
  48 SetupScriptUsage();
  49 if ($Options{help} || @ARGV < 1) {
  50   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  51 }
  52 
  53 my(@SequenceFilesList);
  54 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");
  55 
  56 print "Processing options...\n";
  57 my(%OptionsInfo);
  58 ProcessOptions();
  59 
  60 print "Checking input sequence file(s)...\n";
  61 my(%SequenceFilesInfo);
  62 RetrieveSequenceFilesInfo();
  63 
  64 my($FileIndex);
  65 if (@SequenceFilesList > 1) {
  66   print "\nProcessing sequence files...\n";
  67 }
  68 for $FileIndex (0 .. $#SequenceFilesList) {
  69   if ($SequenceFilesInfo{FileOkay}[$FileIndex]) {
  70     print "\nProcessing file $SequenceFilesList[$FileIndex]...\n";
  71     ListSequenceFileInfo($FileIndex);
  72   }
  73 }
  74 ListTotalSizeOfFiles();
  75 
  76 print "\n$ScriptName:Done...\n\n";
  77 
  78 $EndTime = new Benchmark;
  79 $TotalTime = timediff ($EndTime, $StartTime);
  80 print "Total time: ", timestr($TotalTime), "\n";
  81 
  82 ###############################################################################
  83 
  84 # List appropriate information...
  85 sub ListSequenceFileInfo {
  86   my($Index) = @_;
  87   my($SequenceFile, $SequenceDataRef);
  88 
  89   $SequenceFile = $SequenceFilesList[$Index];
  90 
  91   $SequenceDataRef = ReadSequenceFile($SequenceFile);
  92 
  93   my($SequencesCount) = $SequenceDataRef->{Count};
  94   print "\nNumber of sequences: $SequencesCount\n";
  95 
  96   if ($OptionsInfo{ListShortestSequence} && ($SequencesCount > 1)) {
  97     my($ShortestSeqID, $ShortestSeq, $ShortestSeqLen, $Description) = GetShortestSequence($SequenceDataRef, $OptionsInfo{IgnoreGaps});
  98     print "\nShortest sequence information:\nID: $ShortestSeqID; Length:$ShortestSeqLen\n";
  99     if ($OptionsInfo{DetailLevel} >= 2) {
 100       print "Description: $Description\n";
 101     }
 102     if ($OptionsInfo{DetailLevel} >= 3) {
 103       print "Sequence: $ShortestSeq\n";
 104     }
 105   }
 106   if ($OptionsInfo{ListLongestSequence} && ($SequencesCount > 1)) {
 107     my($LongestSeqID, $LongestSeq, $LongestSeqLen, $Description) = GetLongestSequence($SequenceDataRef, $OptionsInfo{IgnoreGaps});
 108     print "\nLongest sequence information:\nID: $LongestSeqID; Length: $LongestSeqLen\n";
 109     if ($OptionsInfo{DetailLevel} >= 2) {
 110       print "Description: $Description\n";
 111     }
 112     if ($OptionsInfo{DetailLevel} >= 3) {
 113       print "Sequence: $LongestSeq\n";
 114     }
 115   }
 116   if ($OptionsInfo{FrequencyAnalysis} && ($SequencesCount > 1)) {
 117     PerformLengthFrequencyAnalysis($SequenceDataRef);
 118   }
 119   if ($OptionsInfo{ListSequenceLengths}) {
 120     ListSequenceLengths($SequenceDataRef);
 121   }
 122 
 123   # File size and modification information...
 124   print "\nFile size: ", FormatFileSize($SequenceFilesInfo{FileSize}[$Index]), " \n";
 125   print "Last modified: ", $SequenceFilesInfo{FileLastModified}[$Index], " \n";
 126 }
 127 
 128 # List information about sequence lengths...
 129 sub ListSequenceLengths {
 130   my($SequenceDataRef) = @_;
 131   my($ID, $SeqLen, $Sequence, $Description);
 132 
 133   print "\nSequence lengths information:\n";
 134   for $ID (@{$SequenceDataRef->{IDs}}) {
 135     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 136     $Description = $SequenceDataRef->{Description}{$ID};
 137     $SeqLen = GetSequenceLength($Sequence, $OptionsInfo{IgnoreGaps});
 138     if ($OptionsInfo{IgnoreGaps}) {
 139       $Sequence = RemoveSequenceGaps($Sequence);
 140     }
 141     print "ID: $ID; Length:$SeqLen\n";
 142     if ($OptionsInfo{DetailLevel} >= 2) {
 143       print "Description: $Description\n";
 144     }
 145     if ($OptionsInfo{DetailLevel} >= 3) {
 146       print "Sequence: $Sequence\n";
 147     }
 148     if ($OptionsInfo{DetailLevel} >= 2) {
 149       print "\n";
 150     }
 151   }
 152 }
 153 
 154 # Total size of all the fiels...
 155 sub ListTotalSizeOfFiles {
 156   my($FileOkayCount, $TotalSize, $Index);
 157 
 158   $FileOkayCount = 0;
 159   $TotalSize = 0;
 160 
 161   for $Index (0 .. $#SequenceFilesList) {
 162     if ($SequenceFilesInfo{FileOkay}[$Index]) {
 163       $FileOkayCount++;
 164       $TotalSize += $SequenceFilesInfo{FileSize}[$Index];
 165     }
 166   }
 167   if ($FileOkayCount > 1) {
 168     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 169   }
 170 }
 171 
 172 
 173 # Perform frequency analysis of sequence lengths
 174 sub PerformLengthFrequencyAnalysis {
 175   my($SequenceDataRef, $SequenceLengthsRef) = @_;
 176   my ($ID, $SeqLen, $Sequence, $SequenceLenBin, $LenBin, $SequenceLenCount, @SequenceLengths, %SequenceLenFrequency);
 177 
 178   @SequenceLengths = ();
 179   %SequenceLenFrequency = ();
 180   for $ID (@{$SequenceDataRef->{IDs}}) {
 181     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 182     $SeqLen = GetSequenceLength($Sequence, $OptionsInfo{IgnoreGaps});
 183     push @SequenceLengths, $SeqLen;
 184   }
 185   if (@{$OptionsInfo{BinRange}}) {
 186     %SequenceLenFrequency = Frequency(\@SequenceLengths, \@{$OptionsInfo{BinRange}});
 187   }
 188   else {
 189     %SequenceLenFrequency = Frequency(\@SequenceLengths, $OptionsInfo{NumOfBins});
 190   }
 191   print "\nDistribution of sequence lengths (LengthBin => Count):\n";
 192   for $SequenceLenBin (sort { $a <=> $b} keys %SequenceLenFrequency) {
 193     $SequenceLenCount = $SequenceLenFrequency{$SequenceLenBin};
 194     $LenBin = sprintf("%.1f", $SequenceLenBin) + 0;
 195     print "$LenBin => $SequenceLenCount; ";
 196   }
 197   print "\n";
 198 }
 199 
 200 # Retrieve information about sequence files...
 201 sub RetrieveSequenceFilesInfo {
 202   my($Index, $SequenceFile, $FileSupported, $FileFormat, $ModifiedTimeString, $ModifiedDateString);
 203 
 204   %SequenceFilesInfo = ();
 205   @{$SequenceFilesInfo{FileOkay}} = ();
 206   @{$SequenceFilesInfo{FileFormat}} = ();
 207   @{$SequenceFilesInfo{FileSize}} = ();
 208   @{$SequenceFilesInfo{FileLastModified}} = ();
 209 
 210   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 211     $SequenceFile = $SequenceFilesList[$Index];
 212 
 213     if (! open SEQUENCEFILE, "$SequenceFile") {
 214       warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
 215       next FILELIST;
 216     }
 217     close SEQUENCEFILE;
 218 
 219     $SequenceFilesInfo{FileOkay}[$Index] = 0;
 220     $SequenceFilesInfo{FileFormat}[$Index] = 'NotSupported';
 221     $SequenceFilesInfo{FileSize}[$Index] = 0;
 222     $SequenceFilesInfo{FileLastModified}[$Index] = '';
 223 
 224     ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
 225     if (!$FileSupported) {
 226       warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
 227       next FILELIST;
 228     }
 229 
 230     $SequenceFilesInfo{FileOkay}[$Index] = 1;
 231     $SequenceFilesInfo{FileFormat}[$Index] = $FileFormat;
 232     $SequenceFilesInfo{FileSize}[$Index] = FileSize($SequenceFile);
 233 
 234     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($SequenceFile);
 235     $SequenceFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 236   }
 237 }
 238 
 239 # Process option values...
 240 sub ProcessOptions {
 241 
 242   $OptionsInfo{All} = defined $Options{all} ? $Options{all} : undef;
 243 
 244   $OptionsInfo{Count} = defined $Options{count} ? $Options{count} : undef;
 245   $OptionsInfo{DetailLevel} = $Options{detail};
 246   $OptionsInfo{Frequency} = defined $Options{frequency} ? $Options{frequency} : undef;
 247   $OptionsInfo{FrequencyBins} = defined $Options{frequencybins} ? $Options{frequencybins} : undef;
 248   $OptionsInfo{IgnoreGaps} = defined $Options{ignoregaps} ? $Options{ignoregaps} : undef;
 249   $OptionsInfo{Longest} = defined $Options{longest} ? $Options{longest} : undef;
 250   $OptionsInfo{Shortest} = defined $Options{shortest} ? $Options{shortest} : undef;
 251   $OptionsInfo{SequenceLengths} = defined $Options{sequencelengths} ? $Options{sequencelengths} : undef;
 252 
 253   $OptionsInfo{FrequencyAnalysis} = ($Options{all} || $Options{frequency}) ? 1 : 0;
 254   $OptionsInfo{ListLongestSequence} = ($Options{all} || $Options{longest}) ? 1 : 0;
 255   $OptionsInfo{ListShortestSequence} = ($Options{all} || $Options{shortest}) ? 1 : 0;
 256   $OptionsInfo{ListSequenceLengths} = ($Options{all} || $Options{sequencelengths}) ? 1 : 0;
 257   $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;
 258 
 259   # Setup frequency bin values...
 260   $OptionsInfo{NumOfBins} = 4;
 261   @{$OptionsInfo{BinRange}} = ();
 262 
 263   if ($Options{frequencybins} =~ /\,/) {
 264     my($BinValue, @SpecifiedBinRange);
 265     @SpecifiedBinRange = split /\,/,  $Options{frequencybins};
 266     if (@SpecifiedBinRange < 2) {
 267       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
 268     }
 269     for $BinValue (@SpecifiedBinRange) {
 270       if (!IsNumerical($BinValue)) {
 271         die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
 272       }
 273     }
 274     my($Index1, $Index2);
 275     for $Index1 (0 .. $#SpecifiedBinRange) {
 276       for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) {
 277         if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) {
 278           die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
 279         }
 280       }
 281     }
 282     push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange;
 283   }
 284   else {
 285     $OptionsInfo{NumOfBins} = $Options{frequencybins};
 286     if (!IsPositiveInteger($OptionsInfo{NumOfBins})) {
 287       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
 288     }
 289   }
 290 }
 291 
 292 # Setup script usage  and retrieve command line arguments specified using various options...
 293 sub SetupScriptUsage {
 294 
 295   # Retrieve all the options...
 296   %Options = ();
 297   $Options{detail} = 1;
 298   $Options{ignoregaps} = 'no';
 299   $Options{frequencybins} = 10;
 300 
 301   if (!GetOptions(\%Options, "all|a", "count|c", "detail|d=i", "frequency|f", "frequencybins=s", "help|h", "ignoregaps|i=s", "longest|l", "shortest|s", "sequencelengths", "workingdir|w=s")) {
 302     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 303   }
 304   if ($Options{workingdir}) {
 305     if (! -d $Options{workingdir}) {
 306       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 307     }
 308     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 309   }
 310   if (!IsPositiveInteger($Options{detail})) {
 311     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 312   }
 313   if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
 314     die "Error: The value specified, $Options{ignoregaps}, for option \"-i --IgnoreGaps\" is not valid. Allowed values: yes or no\n";
 315   }
 316 }
 317