MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: AnalyzeSequenceFilesData.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2019 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use SequenceFileUtil;
  35 use AminoAcids;
  36 use NucleicAcids;
  37 
  38 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  39 
  40 # Autoflush STDOUT
  41 $| = 1;
  42 
  43 # Starting message...
  44 $ScriptName = basename($0);
  45 print "\n$ScriptName: Starting...\n\n";
  46 $StartTime = new Benchmark;
  47 
  48 # Setup script usage message...
  49 SetupScriptUsage();
  50 if ($Options{help} || @ARGV < 1) {
  51   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  52 }
  53 
  54 # Expand wild card file names...
  55 my(@SequenceFilesList);
  56 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");
  57 
  58 print "Processing options...\n";
  59 my(%OptionsInfo);
  60 ProcessOptions();
  61 
  62 # Set up information about input files...
  63 print "Checking input sequence file(s)...\n";
  64 my(%SequenceFilesInfo);
  65 RetrieveSequenceFilesInfo();
  66 SetupSequenceRegionsData();
  67 
  68 # Process input files..
  69 my($FileIndex);
  70 if (@SequenceFilesList > 1) {
  71   print "\nProcessing sequence files...\n";
  72 }
  73 for $FileIndex (0 .. $#SequenceFilesList) {
  74   if ($SequenceFilesInfo{FilesOkay}[$FileIndex]) {
  75     print "\nProcessing file $SequenceFilesList[$FileIndex]...\n";
  76     AnalyzeSequenceFileData($FileIndex);
  77   }
  78 }
  79 print "\n$ScriptName:Done...\n\n";
  80 
  81 $EndTime = new Benchmark;
  82 $TotalTime = timediff ($EndTime, $StartTime);
  83 print "Total time: ", timestr($TotalTime), "\n";
  84 
  85 ###############################################################################
  86 
  87 # Analyze sequence file...
  88 sub AnalyzeSequenceFileData {
  89   my($FileIndex) = @_;
  90   my($SequenceFile, $SequenceDataRef);
  91 
  92   $SequenceFile = $SequenceFilesList[$FileIndex];
  93 
  94   open SEQUENCEFILE, "$SequenceFile" or die "Error: Can't open $SequenceFile: $! \n";
  95   $SequenceDataRef = ReadSequenceFile($SequenceFile);
  96   close SEQUENCEFILE;
  97 
  98   if ($OptionsInfo{CalculatePercentIdentityMatrix}) {
  99     CalculatePercentIdentityMatrix($FileIndex, $SequenceDataRef);
 100   }
 101   if ($OptionsInfo{PerformResidueFrequencyAnalysis}) {
 102     PerformResidueFrequencyAnalysis($FileIndex, $SequenceDataRef);
 103   }
 104 }
 105 
 106 # Calculate percent identity matrix...
 107 sub CalculatePercentIdentityMatrix {
 108   my($FileIndex, $SequenceDataRef) = @_;
 109   my($PercentIdentity, $PercentIdentityMatrixFile, $PercentIdentityMatrixRef, $RowID, $ColID, $Line, @LineWords);
 110 
 111   $PercentIdentityMatrixFile = $SequenceFilesInfo{OutFileRoot}[$FileIndex] . 'PercentIdentityMatrix.' . $SequenceFilesInfo{OutFileExt}[$FileIndex];
 112   $PercentIdentityMatrixRef = CalculatePercentSequenceIdentityMatrix($SequenceDataRef, $OptionsInfo{IgnoreGaps}, $OptionsInfo{Precision});
 113 
 114   print "Generating percent identity matrix file $PercentIdentityMatrixFile...\n";
 115   open OUTFILE, ">$PercentIdentityMatrixFile" or die "Can't open $PercentIdentityMatrixFile: $!\n";
 116 
 117   # Write out column labels...
 118   @LineWords = ();
 119   push @LineWords, '';
 120   for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) {
 121     push @LineWords, $ColID;
 122   }
 123   $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 124   print OUTFILE "$Line\n";
 125 
 126   # Write out rows...
 127   for $RowID (@{$PercentIdentityMatrixRef->{IDs}}) {
 128     @LineWords = ();
 129     push @LineWords, $RowID;
 130     for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) {
 131       $PercentIdentity = $PercentIdentityMatrixRef->{PercentIdentity}{$RowID}{$ColID};
 132       push @LineWords, $PercentIdentity;
 133     }
 134     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 135     print OUTFILE "$Line\n";
 136   }
 137   close OUTFILE;
 138 }
 139 
 140 # Perform frequency analysis...
 141 sub PerformResidueFrequencyAnalysis {
 142   my($FileIndex, $SequenceDataRef) = @_;
 143 
 144   CountResiduesInRegions($FileIndex, $SequenceDataRef);
 145   CalculatePercentResidueFrequencyInRegions($FileIndex, $SequenceDataRef);
 146   GeneratePercentResidueFrequencyOutFilesForRegions($FileIndex, $SequenceDataRef);
 147 }
 148 
 149 # Count residues...
 150 sub CountResiduesInRegions {
 151   my($FileIndex, $SequenceDataRef) = @_;
 152 
 153   # Setup rerfernce sequence data...
 154   my($RefereceSequenceID, $RefereceSequence);
 155   $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex];
 156   $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex];
 157 
 158   # Count residues...
 159   my($RegionID, $StartResNum, $EndResNum, $ResNum, $ResIndex, $ID, $Sequence, $Residue);
 160   for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
 161     $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
 162     $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
 163     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 164       $ResIndex = $ResNum - 1;
 165       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 166         next RESNUM;
 167       }
 168       # Go over residues in column $ResNum in all the sequences...
 169       ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 170         $Sequence = $SequenceDataRef->{Sequence}{$ID};
 171         $Residue = substr($Sequence, $ResIndex, 1);
 172         if (IsGapResidue($Residue)) {
 173           $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{Gap} += 1;
 174         }
 175         else {
 176           if (exists $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}) {
 177             $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue} += 1;
 178           }
 179           else {
 180             # Internal error...
 181             print "Warning: Ignoring residue $Residue in sequence $ID during ResidueFrequencyAnalysis calculation: Unknown residue...\n";
 182           }
 183         }
 184       }
 185     }
 186   }
 187 }
 188 
 189 # Calculate percent frequency for various residues in the sequence regions...
 190 sub CalculatePercentResidueFrequencyInRegions {
 191   my($FileIndex, $SequenceDataRef) = @_;
 192   my($RegionID, $StartResNum, $EndResNum, $ResNum, $Residue, $Count, $PercentCount, $MaxResiduesCount, $Precision);
 193 
 194   $MaxResiduesCount = $SequenceDataRef->{Count};
 195   $Precision = $OptionsInfo{Precision};
 196   for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
 197     $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
 198     $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
 199     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 200       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 201         next RESNUM;
 202       }
 203       for $Residue (keys %{$SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}}) {
 204         $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
 205         $PercentCount = ($Count / $MaxResiduesCount) * 100;
 206         $PercentCount = sprintf("%.${Precision}f", $PercentCount) + 0;
 207         $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue} = $PercentCount;
 208       }
 209     }
 210   }
 211 }
 212 
 213 # Generate output files...
 214 sub GeneratePercentResidueFrequencyOutFilesForRegions {
 215   my($FileIndex, $SequenceDataRef) = @_;
 216 
 217   # Setup rerfernce sequence data...
 218   my($RefereceSequenceID, $RefereceSequence);
 219   $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex];
 220   $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex];
 221 
 222   my($RegionID, $StartResNum, $EndResNum, $ResNum, $Count, $PercentCount, $Residue, $RegionNum, $RegionOutFile, $PercentRegionOutFile, $OutFileRoot, $OutFileExt, $Line, @LineWords, @PercentLineWords);
 223 
 224   $OutFileRoot = $SequenceFilesInfo{OutFileRoot}[$FileIndex];
 225   $OutFileExt = $SequenceFilesInfo{OutFileExt}[$FileIndex];
 226   $RegionNum = 0;
 227   for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) {
 228     $RegionNum++;
 229     $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum};
 230     $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum};
 231 
 232     $RegionOutFile = "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}";
 233     $PercentRegionOutFile = "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}";
 234 
 235     print "Generating $RegionOutFile and $PercentRegionOutFile...\n";
 236     open REGIONOUTFILE, ">$RegionOutFile" or die "Error: Can't open $RegionOutFile: $! \n";
 237     open PERCENTREGIONOUTFILE, ">$PercentRegionOutFile" or die "Error: Can't open $PercentRegionOutFile: $! \n";
 238 
 239     # Write out reference residue positions as column values....
 240     @LineWords = ();
 241     push @LineWords, '';
 242     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 243       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 244         next RESNUM;
 245       }
 246       push @LineWords, $ResNum;
 247     }
 248     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 249     print REGIONOUTFILE "$Line\n";
 250     print PERCENTREGIONOUTFILE "$Line\n";
 251 
 252 
 253     # Write out row data for each residue; Gap residue is written last...
 254     RESIDUE: for $Residue (sort @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}) {
 255       if ($Residue =~ /^Gap$/i) {
 256         next RESIDUE;
 257       }
 258       @LineWords = ();
 259       push @LineWords, $Residue;
 260       @PercentLineWords = ();
 261       push @PercentLineWords, $Residue;
 262 
 263       RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 264         if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 265           next RESNUM;
 266         }
 267         $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
 268         push @LineWords, $Count;
 269         $PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue};
 270         push @PercentLineWords, $PercentCount;
 271       }
 272       $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 273       print REGIONOUTFILE "$Line\n";
 274 
 275       $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 276       print PERCENTREGIONOUTFILE "$Line\n";
 277     }
 278 
 279     # Write out data for gap...
 280     $Residue = 'Gap';
 281     @LineWords = ();
 282     push @LineWords, $Residue;
 283     @PercentLineWords = ();
 284     push @PercentLineWords, $Residue;
 285 
 286     RESNUM: for $ResNum ($StartResNum .. $EndResNum) {
 287       if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) {
 288         next RESNUM;
 289       }
 290       $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue};
 291       push @LineWords, $Count;
 292 
 293       $PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue};
 294       push @PercentLineWords, $PercentCount;
 295     }
 296     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 297     print REGIONOUTFILE "$Line\n";
 298 
 299     $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 300     print PERCENTREGIONOUTFILE "$Line\n";
 301 
 302     close REGIONOUTFILE;
 303     close PERCENTREGIONOUTFILE;
 304   }
 305 }
 306 
 307 # Process option values...
 308 sub ProcessOptions {
 309   %OptionsInfo = ();
 310 
 311   # Setup analysis mode...
 312   $OptionsInfo{CalculatePercentIdentityMatrix} = ($Options{mode} =~ /^(PercentIdentityMatrix|All)$/i) ? 1 : 0;
 313   $OptionsInfo{PerformResidueFrequencyAnalysis} = ($Options{mode} =~ /^(ResidueFrequencyAnalysis|All)$/i) ? 1 : 0;
 314 
 315   # Setup delimiter and quotes...
 316   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
 317   $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;
 318 
 319   # Setup reference sequence and regions for residue frequence analysis...
 320   $OptionsInfo{SpecifiedRefereceSequence} = $Options{referencesequence};
 321   $OptionsInfo{SpecifiedRegion} = $Options{region};
 322   @{$OptionsInfo{SpecifiedRegions}} = ();
 323 
 324   my(@SpecifiedRegions);
 325   @SpecifiedRegions = ();
 326   if ($Options{region} =~ /\,/) {
 327     @SpecifiedRegions = split /\,/, $OptionsInfo{SpecifiedRegion};
 328     if (@SpecifiedRegions % 2) {
 329       die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n";
 330     }
 331     # Make sure EndResNum > StartResNum...
 332     my($StartResNum, $EndResNum, $Index, $RegionNum);
 333     $RegionNum = 0;
 334     for ($Index = 0; $Index <= $#SpecifiedRegions; $Index += 2) {
 335       $StartResNum = $SpecifiedRegions[$Index];
 336       $EndResNum = $SpecifiedRegions[$Index + 1];
 337       $RegionNum++;
 338       if (!IsPositiveInteger($StartResNum)) {
 339         die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be a positive integer for region $RegionNum.\n";
 340       }
 341       if (!IsPositiveInteger($EndResNum)) {
 342         die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $EndResNum, must be a positive integer for region $RegionNum.\n";
 343       }
 344       if ($StartResNum >= $EndResNum) {
 345         die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller than end residue number, $EndResNum, for region $RegionNum.\n";
 346       }
 347     }
 348   }
 349   else {
 350     if ($Options{region} !~ /^UseCompleteSequence$/i) {
 351       die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n";
 352     }
 353   }
 354   push @{$OptionsInfo{SpecifiedRegions}}, @SpecifiedRegions;
 355 
 356   # Miscellaneous options...
 357   $OptionsInfo{Precision} = $Options{precision};
 358   $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;
 359   $OptionsInfo{RegionResiduesMode} = $Options{regionresiduesmode};
 360 
 361   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 362   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 363 }
 364 
 365 # Retrieve information about sequence files...
 366 sub RetrieveSequenceFilesInfo {
 367   my($Index, $SequenceFile, $FileSupported, $FileFormat, $SequenceCount, $RefereceSequence, $RefereceSequenceID, $RefereceSequenceLen, $RefereceSequenceWithNoGaps, $RefereceSequenceWithNoGapsLen, $RefereceSequenceRegionCount, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt, $SequenceDataRef, $SpecifiedRefereceSequence, @SpecifiedRegions, @RefereceSequenceRegions);
 368 
 369   %SequenceFilesInfo = ();
 370   @{$SequenceFilesInfo{FilesOkay}} = ();
 371   @{$SequenceFilesInfo{OutFileRoot}} = ();
 372   @{$SequenceFilesInfo{OutFileExt}} = ();
 373   @{$SequenceFilesInfo{Format}} = ();
 374   @{$SequenceFilesInfo{SequenceCount}} = ();
 375   @{$SequenceFilesInfo{RefereceSequenceID}} = ();
 376   @{$SequenceFilesInfo{RefereceSequence}} = ();
 377   @{$SequenceFilesInfo{RefereceSequenceLen}} = ();
 378   @{$SequenceFilesInfo{RefereceSequenceWithNoGaps}} = ();
 379   @{$SequenceFilesInfo{RefereceSequenceWithNoGapsLen}} = ();
 380   @{$SequenceFilesInfo{RefereceSequenceRegions}} = ();
 381   @{$SequenceFilesInfo{RefereceSequenceRegionCount}} = ();
 382   @{$SequenceFilesInfo{ResidueCodes}} = ();
 383 
 384   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 385     $SequenceFile = $SequenceFilesList[$Index];
 386     $SequenceFilesInfo{FilesOkay}[$Index] = 0;
 387     $SequenceFilesInfo{OutFileRoot}[$Index] = '';
 388     $SequenceFilesInfo{OutFileExt}[$Index] = '';
 389     $SequenceFilesInfo{Format}[$Index] = 'NotSupported';
 390     $SequenceFilesInfo{SequenceCount}[$Index] = 0;
 391     $SequenceFilesInfo{RefereceSequenceID}[$Index] = '';
 392     $SequenceFilesInfo{RefereceSequence}[$Index] = '';
 393     $SequenceFilesInfo{RefereceSequenceLen}[$Index] = '';
 394     $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = '';
 395     $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = '';
 396     @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]} = ();
 397     $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = 0;
 398     @{$SequenceFilesInfo{ResidueCodes}[$Index]} = ();
 399 
 400     if (! open SEQUENCEFILE, "$SequenceFile") {
 401       warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
 402       next FILELIST;
 403     }
 404     close SEQUENCEFILE;
 405 
 406     ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
 407     if (!$FileSupported) {
 408       warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
 409       next FILELIST;
 410     }
 411 
 412     $SequenceDataRef = ReadSequenceFile($SequenceFile);
 413 
 414     $SequenceCount = $SequenceDataRef->{Count};
 415     if (!$SequenceCount) {
 416       warn "Warning: Ignoring file $SequenceFile: Sequence data is missing.\n";
 417       next FILELIST;
 418     }
 419 
 420     # Make sure all sequence lengths are identical...
 421     if (!AreSequenceLengthsIdentical($SequenceDataRef)) {
 422       warn "Warning: Ignoring file $SequenceFile: Sequence legths are not identical.\n";
 423       next FILELIST;
 424     }
 425     $SpecifiedRefereceSequence = $OptionsInfo{SpecifiedRefereceSequence};
 426     # Make sure reference sequence ID is valid...
 427     if ($SpecifiedRefereceSequence =~ /^UseFirstSequenceID$/i) {
 428       $RefereceSequenceID = $SequenceDataRef->{IDs}[0];
 429     }
 430     else {
 431       if (!exists($SequenceDataRef->{Sequence}{$SpecifiedRefereceSequence})) {
 432         warn "Warning: Ignoring file $SequenceFile: Rreference sequence ID, $SpecifiedRefereceSequence, specified using option \"--referencesequence\" is missing.\n";
 433         next FILELIST;
 434       }
 435       $RefereceSequenceID = $SpecifiedRefereceSequence;
 436     }
 437 
 438     # Make sure sequence regions corresponding to reference sequence are valid...
 439     @RefereceSequenceRegions = ();
 440     $RefereceSequenceRegionCount = 0;
 441     $RefereceSequence = $SequenceDataRef->{Sequence}{$RefereceSequenceID};
 442     $RefereceSequenceLen = length $RefereceSequence;
 443 
 444     $RefereceSequenceWithNoGaps = RemoveSequenceGaps($RefereceSequence);
 445     $RefereceSequenceWithNoGapsLen = length $RefereceSequenceWithNoGaps;
 446 
 447     @SpecifiedRegions = ();
 448     push @SpecifiedRegions, @{$OptionsInfo{SpecifiedRegions}};
 449     if (@SpecifiedRegions) {
 450       # Make sure specified regions are valid...
 451       my($StartResNum, $EndResNum, $RegionIndex, $RegionNum);
 452       $RegionNum = 0;
 453       for ($RegionIndex = 0; $RegionIndex <= $#SpecifiedRegions; $RegionIndex += 2) {
 454         $StartResNum = $SpecifiedRegions[$RegionIndex];
 455         $EndResNum = $SpecifiedRegions[$RegionIndex + 1];
 456         $RegionNum++;
 457         if ($OptionsInfo{IgnoreGaps}) {
 458           if ($StartResNum > $RefereceSequenceWithNoGapsLen) {
 459             warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n";
 460             next FILELIST;
 461           }
 462           if ($EndResNum > $RefereceSequenceWithNoGapsLen) {
 463             warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n";
 464             next FILELIST;
 465           }
 466         }
 467         else {
 468           if ($StartResNum > $RefereceSequenceLen) {
 469             warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum.\n";
 470             next FILELIST;
 471           }
 472           if ($EndResNum > $RefereceSequenceLen) {
 473             warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID,  $RefereceSequenceID, in region $RegionNum.\n";
 474             next FILELIST;
 475           }
 476         }
 477         push @RefereceSequenceRegions, ($StartResNum, $EndResNum);
 478       }
 479       $RefereceSequenceRegionCount = $RegionNum;
 480     }
 481     else {
 482       # Use complete sequence corresponding to reference sequence ID...
 483       if ($OptionsInfo{IgnoreGaps}) {
 484         push @RefereceSequenceRegions, (1, $RefereceSequenceWithNoGapsLen);
 485       }
 486       else {
 487         push @RefereceSequenceRegions, (1, $RefereceSequenceLen);
 488       }
 489       $RefereceSequenceRegionCount = 1;
 490     }
 491     # Setup output file names...
 492     $FileDir = ""; $FileName = ""; $FileExt = "";
 493     ($FileDir, $FileName, $FileExt) = ParseFileName($SequenceFile);
 494     $FileExt = "csv";
 495     if ($Options{outdelim} =~ /^tab$/i) {
 496       $FileExt = "tsv";
 497     }
 498     $OutFileExt = $FileExt;
 499     if ($OptionsInfo{OutFileRoot} && (@SequenceFilesList == 1)) {
 500       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 501       if ($RootFileName && $RootFileExt) {
 502         $FileName = $RootFileName;
 503       }
 504       else {
 505         $FileName = $OptionsInfo{OutFileRoot};
 506       }
 507       $OutFileRoot = $FileName;
 508     }
 509     else {
 510       $OutFileRoot = $FileName;
 511     }
 512     if (!$OptionsInfo{OverwriteFiles}) {
 513       if ($OptionsInfo{CalculatePercentIdentityMatrix}) {
 514         if (-e "${OutFileRoot}PercentIdentityMatrix.${OutFileExt}") {
 515           warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentIdentityMatrix.${OutFileExt} already exists\n";
 516           next FILELIST;
 517         }
 518       }
 519       if ($OptionsInfo{PerformResidueFrequencyAnalysis}) {
 520         my($RegionNum);
 521         for $RegionNum (1 .. $RefereceSequenceRegionCount) {
 522           if (-e "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") {
 523             warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n";
 524             next FILELIST;
 525           }
 526           if (-e "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") {
 527             warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n";
 528             next FILELIST;
 529           }
 530         }
 531       }
 532     }
 533 
 534     $SequenceFilesInfo{FilesOkay}[$Index] = 1;
 535     $SequenceFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 536     $SequenceFilesInfo{OutFileExt}[$Index] = $OutFileExt;
 537 
 538     $SequenceFilesInfo{Format}[$Index] = $FileFormat;
 539     $SequenceFilesInfo{SequenceCount}[$Index] = $SequenceCount;
 540     $SequenceFilesInfo{RefereceSequenceID}[$Index] = $RefereceSequenceID;
 541     $SequenceFilesInfo{RefereceSequence}[$Index] = $RefereceSequence;
 542     $SequenceFilesInfo{RefereceSequenceLen}[$Index] = $RefereceSequenceLen;
 543     $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = $RefereceSequenceWithNoGaps;
 544     $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = $RefereceSequenceWithNoGapsLen;
 545     push @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]}, @RefereceSequenceRegions;
 546     $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = $RefereceSequenceRegionCount;
 547 
 548     # Setup residue codes...
 549     SetupSequenceFileResidueCodes($SequenceDataRef, $Index);
 550   }
 551 }
 552 
 553 sub SetupSequenceFileResidueCodes {
 554   my($SequenceDataRef, $FileIndex) = @_;
 555   my($Residue, @ResidueCodesList, %ResidueCodesMap);
 556 
 557   # Initialize
 558   @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]} = ();
 559 
 560   %ResidueCodesMap = ();
 561   @ResidueCodesList = ();
 562 
 563   # Setup standard amino acids and nucleic acids one letter codes...
 564   if ($OptionsInfo{RegionResiduesMode} =~ /^AminoAcids$/i) {
 565     @ResidueCodesList = AminoAcids::GetAminoAcids('OneLetterCode');
 566   }
 567   elsif ($OptionsInfo{RegionResiduesMode} =~ /^NucleicAcids$/i) {
 568     push @ResidueCodesList, ('A', 'G', 'T', 'U', 'C');
 569   }
 570   push @ResidueCodesList, 'Gap';
 571   for $Residue (@ResidueCodesList) {
 572     $ResidueCodesMap{$Residue} = $Residue;
 573   }
 574 
 575   # Go over all the residues in all the sequences and add missing ones to the list...
 576   my($ID, $Sequence, $ResIndex);
 577   for $ID (@{$SequenceDataRef->{IDs}}) {
 578     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 579     RES: for $ResIndex (0 .. (length($Sequence) - 1)) {
 580       $Residue = substr($Sequence, $ResIndex, 1);
 581       if (IsGapResidue($Residue)) {
 582         next RES;
 583       }
 584       if (exists $ResidueCodesMap{$Residue}) {
 585         next RES;
 586       }
 587       push @ResidueCodesList, $Residue;
 588       $ResidueCodesMap{$Residue} = $Residue;
 589     }
 590   }
 591   push @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}, @ResidueCodesList;
 592 }
 593 
 594 # Setup regions data for performing residue frequency analysis...
 595 sub SetupSequenceRegionsData {
 596   my($Index, $RefereceSequence, $RefereceSequenceLen, $RegionID, $StartResNum, $EndResNum, $RegionIndex, $RegionNum, $NoGapResNum, $ResNum, $ResIndex, $Residue, $ResidueCode, @RefereceSequenceRegions);
 597 
 598 
 599   @{$SequenceFilesInfo{RefereceSequenceResNums}} = ();
 600   @{$SequenceFilesInfo{RegionsData}} = ();
 601 
 602   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 603     %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}} = ();
 604     %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}} = ();
 605     %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}} = ();
 606     %{$SequenceFilesInfo{RegionsData}[$Index]} = ();
 607     @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}} = ();
 608 
 609     if (!$SequenceFilesInfo{FilesOkay}[$Index]) {
 610       next FILELIST;
 611     }
 612     if (!$OptionsInfo{PerformResidueFrequencyAnalysis}) {
 613       next FILELIST;
 614     }
 615 
 616     $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$Index];
 617     $RefereceSequenceLen = $SequenceFilesInfo{RefereceSequenceLen}[$Index];
 618 
 619     # Setup residue number mapping and gap status for refernece sequence...
 620     $NoGapResNum = 0;
 621     $ResNum = 0;
 622     for $ResIndex (0 .. ($RefereceSequenceLen - 1)) {
 623       $ResNum++;
 624       $Residue = substr($RefereceSequence, $ResIndex, 1);
 625       $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 1;
 626       if (!IsGapResidue($Residue)) {
 627         $NoGapResNum++;
 628         $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 0;
 629         $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$NoGapResNum} = $ResNum;
 630         $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}{$ResNum} = $NoGapResNum;
 631       }
 632     }
 633     # Map residue numbers for specified regions to the reference residue in input sequence/alignment files
 634     $RegionNum = 0;
 635     @RefereceSequenceRegions = ();
 636     push @RefereceSequenceRegions, @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]};
 637     for ($RegionIndex = 0; $RegionIndex <= $#RefereceSequenceRegions; $RegionIndex += 2) {
 638       $StartResNum = $RefereceSequenceRegions[$RegionIndex];
 639       $EndResNum = $RefereceSequenceRegions[$RegionIndex + 1];
 640       $RegionNum++;
 641       $RegionID = "Region${RegionNum}";
 642       if ($OptionsInfo{IgnoreGaps}) {
 643         # Map residue numbers to the reference sequence residue numbers to account for any ignored gaps...
 644         $StartResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$StartResNum};
 645         $EndResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$EndResNum};
 646       }
 647       push @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}}, $RegionID;
 648       $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{StartResNum} = $StartResNum;
 649       $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{EndResNum} = $EndResNum;
 650 
 651       # Initialize data for residue codes...
 652       for $ResNum ($StartResNum .. $EndResNum) {
 653         for $ResidueCode (@{$SequenceFilesInfo{ResidueCodes}[$Index]}) {
 654           $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{Count}{$ResNum}{$ResidueCode} = 0;
 655         }
 656       }
 657     }
 658   }
 659 }
 660 
 661 # Setup script usage  and retrieve command line arguments specified using various options...
 662 sub SetupScriptUsage {
 663 
 664   # Retrieve all the options...
 665   %Options = ();
 666   $Options{ignoregaps} = 'yes';
 667   $Options{regionresiduesmode} = 'None';
 668   $Options{mode} = 'PercentIdentityMatrix';
 669   $Options{outdelim} = 'comma';
 670   $Options{precision} = 2;
 671   $Options{quote} = 'yes';
 672   $Options{referencesequence} = 'UseFirstSequenceID';
 673   $Options{region} = 'UseCompleteSequence';
 674 
 675   if (!GetOptions(\%Options, "help|h", "ignoregaps|i=s", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "referencesequence=s", "region=s", "regionresiduesmode=s", "root|r=s", "workingdir|w=s")) {
 676     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 677   }
 678   if ($Options{workingdir}) {
 679     if (! -d $Options{workingdir}) {
 680       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 681     }
 682     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 683   }
 684   if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
 685     die "Error: The value specified, $Options{ignoregaps}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 686   }
 687   if ($Options{regionresiduesmode} !~ /^(AminoAcids|NucleicAcids|None)$/i) {
 688     die "Error: The value specified, $Options{regionresiduesmode}, for option \"--regionresiduesmode\" is not valid. Allowed values: AminoAcids, NucleicAcids or None\n";
 689   }
 690   if ($Options{mode} !~ /^(PercentIdentityMatrix|ResidueFrequencyAnalysis|All)$/i) {
 691     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: PercentIdentityMatrix, ResidueFrequencyAnalysis  or All\n";
 692   }
 693   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 694     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 695   }
 696   if ($Options{quote} !~ /^(yes|no)$/i) {
 697     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 698   }
 699   if (!IsPositiveInteger($Options{precision})) {
 700     die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
 701   }
 702 }
 703