1 #!/usr/bin/perl -w 2 # 3 # File: AnalyzeSequenceFilesData.pl 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use FindBin; use lib "$FindBin::Bin/../lib"; 28 use Getopt::Long; 29 use File::Basename; 30 use Text::ParseWords; 31 use Benchmark; 32 use FileUtil; 33 use TextUtil; 34 use SequenceFileUtil; 35 use AminoAcids; 36 use NucleicAcids; 37 38 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 39 40 # Autoflush STDOUT 41 $| = 1; 42 43 # Starting message... 44 $ScriptName = basename($0); 45 print "\n$ScriptName: Starting...\n\n"; 46 $StartTime = new Benchmark; 47 48 # Setup script usage message... 49 SetupScriptUsage(); 50 if ($Options{help} || @ARGV < 1) { 51 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 52 } 53 54 # Expand wild card file names... 55 my(@SequenceFilesList); 56 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir"); 57 58 print "Processing options...\n"; 59 my(%OptionsInfo); 60 ProcessOptions(); 61 62 # Set up information about input files... 63 print "Checking input sequence file(s)...\n"; 64 my(%SequenceFilesInfo); 65 RetrieveSequenceFilesInfo(); 66 SetupSequenceRegionsData(); 67 68 # Process input files.. 69 my($FileIndex); 70 if (@SequenceFilesList > 1) { 71 print "\nProcessing sequence files...\n"; 72 } 73 for $FileIndex (0 .. $#SequenceFilesList) { 74 if ($SequenceFilesInfo{FilesOkay}[$FileIndex]) { 75 print "\nProcessing file $SequenceFilesList[$FileIndex]...\n"; 76 AnalyzeSequenceFileData($FileIndex); 77 } 78 } 79 print "\n$ScriptName:Done...\n\n"; 80 81 $EndTime = new Benchmark; 82 $TotalTime = timediff ($EndTime, $StartTime); 83 print "Total time: ", timestr($TotalTime), "\n"; 84 85 ############################################################################### 86 87 # Analyze sequence file... 88 sub AnalyzeSequenceFileData { 89 my($FileIndex) = @_; 90 my($SequenceFile, $SequenceDataRef); 91 92 $SequenceFile = $SequenceFilesList[$FileIndex]; 93 94 open SEQUENCEFILE, "$SequenceFile" or die "Error: Can't open $SequenceFile: $! \n"; 95 $SequenceDataRef = ReadSequenceFile($SequenceFile); 96 close SEQUENCEFILE; 97 98 if ($OptionsInfo{CalculatePercentIdentityMatrix}) { 99 CalculatePercentIdentityMatrix($FileIndex, $SequenceDataRef); 100 } 101 if ($OptionsInfo{PerformResidueFrequencyAnalysis}) { 102 PerformResidueFrequencyAnalysis($FileIndex, $SequenceDataRef); 103 } 104 } 105 106 # Calculate percent identity matrix... 107 sub CalculatePercentIdentityMatrix { 108 my($FileIndex, $SequenceDataRef) = @_; 109 my($PercentIdentity, $PercentIdentityMatrixFile, $PercentIdentityMatrixRef, $RowID, $ColID, $Line, @LineWords); 110 111 $PercentIdentityMatrixFile = $SequenceFilesInfo{OutFileRoot}[$FileIndex] . 'PercentIdentityMatrix.' . $SequenceFilesInfo{OutFileExt}[$FileIndex]; 112 $PercentIdentityMatrixRef = CalculatePercentSequenceIdentityMatrix($SequenceDataRef, $OptionsInfo{IgnoreGaps}, $OptionsInfo{Precision}); 113 114 print "Generating percent identity matrix file $PercentIdentityMatrixFile...\n"; 115 open OUTFILE, ">$PercentIdentityMatrixFile" or die "Can't open $PercentIdentityMatrixFile: $!\n"; 116 117 # Write out column labels... 118 @LineWords = (); 119 push @LineWords, ''; 120 for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) { 121 push @LineWords, $ColID; 122 } 123 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 124 print OUTFILE "$Line\n"; 125 126 # Write out rows... 127 for $RowID (@{$PercentIdentityMatrixRef->{IDs}}) { 128 @LineWords = (); 129 push @LineWords, $RowID; 130 for $ColID (@{$PercentIdentityMatrixRef->{IDs}}) { 131 $PercentIdentity = $PercentIdentityMatrixRef->{PercentIdentity}{$RowID}{$ColID}; 132 push @LineWords, $PercentIdentity; 133 } 134 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 135 print OUTFILE "$Line\n"; 136 } 137 close OUTFILE; 138 } 139 140 # Perform frequency analysis... 141 sub PerformResidueFrequencyAnalysis { 142 my($FileIndex, $SequenceDataRef) = @_; 143 144 CountResiduesInRegions($FileIndex, $SequenceDataRef); 145 CalculatePercentResidueFrequencyInRegions($FileIndex, $SequenceDataRef); 146 GeneratePercentResidueFrequencyOutFilesForRegions($FileIndex, $SequenceDataRef); 147 } 148 149 # Count residues... 150 sub CountResiduesInRegions { 151 my($FileIndex, $SequenceDataRef) = @_; 152 153 # Setup rerfernce sequence data... 154 my($RefereceSequenceID, $RefereceSequence); 155 $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex]; 156 $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex]; 157 158 # Count residues... 159 my($RegionID, $StartResNum, $EndResNum, $ResNum, $ResIndex, $ID, $Sequence, $Residue); 160 for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) { 161 $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum}; 162 $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum}; 163 RESNUM: for $ResNum ($StartResNum .. $EndResNum) { 164 $ResIndex = $ResNum - 1; 165 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) { 166 next RESNUM; 167 } 168 # Go over residues in column $ResNum in all the sequences... 169 ID: for $ID (@{$SequenceDataRef->{IDs}}) { 170 $Sequence = $SequenceDataRef->{Sequence}{$ID}; 171 $Residue = substr($Sequence, $ResIndex, 1); 172 if (IsGapResidue($Residue)) { 173 $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{Gap} += 1; 174 } 175 else { 176 if (exists $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}) { 177 $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue} += 1; 178 } 179 else { 180 # Internal error... 181 print "Warning: Ignoring residue $Residue in sequence $ID during ResidueFrequencyAnalysis calculation: Unknown residue...\n"; 182 } 183 } 184 } 185 } 186 } 187 } 188 189 # Calculate percent frequency for various residues in the sequence regions... 190 sub CalculatePercentResidueFrequencyInRegions { 191 my($FileIndex, $SequenceDataRef) = @_; 192 my($RegionID, $StartResNum, $EndResNum, $ResNum, $Residue, $Count, $PercentCount, $MaxResiduesCount, $Precision); 193 194 $MaxResiduesCount = $SequenceDataRef->{Count}; 195 $Precision = $OptionsInfo{Precision}; 196 for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) { 197 $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum}; 198 $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum}; 199 RESNUM: for $ResNum ($StartResNum .. $EndResNum) { 200 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) { 201 next RESNUM; 202 } 203 for $Residue (keys %{$SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}}) { 204 $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}; 205 $PercentCount = ($Count / $MaxResiduesCount) * 100; 206 $PercentCount = sprintf("%.${Precision}f", $PercentCount) + 0; 207 $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue} = $PercentCount; 208 } 209 } 210 } 211 } 212 213 # Generate output files... 214 sub GeneratePercentResidueFrequencyOutFilesForRegions { 215 my($FileIndex, $SequenceDataRef) = @_; 216 217 # Setup rerfernce sequence data... 218 my($RefereceSequenceID, $RefereceSequence); 219 $RefereceSequenceID = $SequenceFilesInfo{RefereceSequenceID}[$FileIndex]; 220 $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$FileIndex]; 221 222 my($RegionID, $StartResNum, $EndResNum, $ResNum, $Count, $PercentCount, $Residue, $RegionNum, $RegionOutFile, $PercentRegionOutFile, $OutFileRoot, $OutFileExt, $Line, @LineWords, @PercentLineWords); 223 224 $OutFileRoot = $SequenceFilesInfo{OutFileRoot}[$FileIndex]; 225 $OutFileExt = $SequenceFilesInfo{OutFileExt}[$FileIndex]; 226 $RegionNum = 0; 227 for $RegionID (@{$SequenceFilesInfo{RegionsData}[$FileIndex]{RegionIDs}}) { 228 $RegionNum++; 229 $StartResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{StartResNum}; 230 $EndResNum = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{EndResNum}; 231 232 $RegionOutFile = "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}"; 233 $PercentRegionOutFile = "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}"; 234 235 print "Generating $RegionOutFile and $PercentRegionOutFile...\n"; 236 open REGIONOUTFILE, ">$RegionOutFile" or die "Error: Can't open $RegionOutFile: $! \n"; 237 open PERCENTREGIONOUTFILE, ">$PercentRegionOutFile" or die "Error: Can't open $PercentRegionOutFile: $! \n"; 238 239 # Write out reference residue positions as column values.... 240 @LineWords = (); 241 push @LineWords, ''; 242 RESNUM: for $ResNum ($StartResNum .. $EndResNum) { 243 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) { 244 next RESNUM; 245 } 246 push @LineWords, $ResNum; 247 } 248 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 249 print REGIONOUTFILE "$Line\n"; 250 print PERCENTREGIONOUTFILE "$Line\n"; 251 252 253 # Write out row data for each residue; Gap residue is written last... 254 RESIDUE: for $Residue (sort @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}) { 255 if ($Residue =~ /^Gap$/i) { 256 next RESIDUE; 257 } 258 @LineWords = (); 259 push @LineWords, $Residue; 260 @PercentLineWords = (); 261 push @PercentLineWords, $Residue; 262 263 RESNUM: for $ResNum ($StartResNum .. $EndResNum) { 264 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) { 265 next RESNUM; 266 } 267 $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}; 268 push @LineWords, $Count; 269 $PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue}; 270 push @PercentLineWords, $PercentCount; 271 } 272 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 273 print REGIONOUTFILE "$Line\n"; 274 275 $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 276 print PERCENTREGIONOUTFILE "$Line\n"; 277 } 278 279 # Write out data for gap... 280 $Residue = 'Gap'; 281 @LineWords = (); 282 push @LineWords, $Residue; 283 @PercentLineWords = (); 284 push @PercentLineWords, $Residue; 285 286 RESNUM: for $ResNum ($StartResNum .. $EndResNum) { 287 if ($OptionsInfo{IgnoreGaps} && $SequenceFilesInfo{RefereceSequenceResNums}[$FileIndex]{IsGap}{$ResNum}) { 288 next RESNUM; 289 } 290 $Count = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{Count}{$ResNum}{$Residue}; 291 push @LineWords, $Count; 292 293 $PercentCount = $SequenceFilesInfo{RegionsData}[$FileIndex]{$RegionID}{PercentCount}{$ResNum}{$Residue}; 294 push @PercentLineWords, $PercentCount; 295 } 296 $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 297 print REGIONOUTFILE "$Line\n"; 298 299 $Line = JoinWords(\@PercentLineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 300 print PERCENTREGIONOUTFILE "$Line\n"; 301 302 close REGIONOUTFILE; 303 close PERCENTREGIONOUTFILE; 304 } 305 } 306 307 # Process option values... 308 sub ProcessOptions { 309 %OptionsInfo = (); 310 311 # Setup analysis mode... 312 $OptionsInfo{CalculatePercentIdentityMatrix} = ($Options{mode} =~ /^(PercentIdentityMatrix|All)$/i) ? 1 : 0; 313 $OptionsInfo{PerformResidueFrequencyAnalysis} = ($Options{mode} =~ /^(ResidueFrequencyAnalysis|All)$/i) ? 1 : 0; 314 315 # Setup delimiter and quotes... 316 $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,"); 317 $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0; 318 319 # Setup reference sequence and regions for residue frequence analysis... 320 $OptionsInfo{SpecifiedRefereceSequence} = $Options{referencesequence}; 321 $OptionsInfo{SpecifiedRegion} = $Options{region}; 322 @{$OptionsInfo{SpecifiedRegions}} = (); 323 324 my(@SpecifiedRegions); 325 @SpecifiedRegions = (); 326 if ($Options{region} =~ /\,/) { 327 @SpecifiedRegions = split /\,/, $OptionsInfo{SpecifiedRegion}; 328 if (@SpecifiedRegions % 2) { 329 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n"; 330 } 331 # Make sure EndResNum > StartResNum... 332 my($StartResNum, $EndResNum, $Index, $RegionNum); 333 $RegionNum = 0; 334 for ($Index = 0; $Index <= $#SpecifiedRegions; $Index += 2) { 335 $StartResNum = $SpecifiedRegions[$Index]; 336 $EndResNum = $SpecifiedRegions[$Index + 1]; 337 $RegionNum++; 338 if (!IsPositiveInteger($StartResNum)) { 339 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be a positive integer for region $RegionNum.\n"; 340 } 341 if (!IsPositiveInteger($EndResNum)) { 342 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $EndResNum, must be a positive integer for region $RegionNum.\n"; 343 } 344 if ($StartResNum >= $EndResNum) { 345 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller than end residue number, $EndResNum, for region $RegionNum.\n"; 346 } 347 } 348 } 349 else { 350 if ($Options{region} !~ /^UseCompleteSequence$/i) { 351 die "Error: The value specified, $Options{region}, for option \"--region\" is not valid. Allowed values: \"StartResNum,EndResNum,[StartResNum,EndResNum...]\" or UseCompleteSequence\n"; 352 } 353 } 354 push @{$OptionsInfo{SpecifiedRegions}}, @SpecifiedRegions; 355 356 # Miscellaneous options... 357 $OptionsInfo{Precision} = $Options{precision}; 358 $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0; 359 $OptionsInfo{RegionResiduesMode} = $Options{regionresiduesmode}; 360 361 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0; 362 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0; 363 } 364 365 # Retrieve information about sequence files... 366 sub RetrieveSequenceFilesInfo { 367 my($Index, $SequenceFile, $FileSupported, $FileFormat, $SequenceCount, $RefereceSequence, $RefereceSequenceID, $RefereceSequenceLen, $RefereceSequenceWithNoGaps, $RefereceSequenceWithNoGapsLen, $RefereceSequenceRegionCount, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt, $SequenceDataRef, $SpecifiedRefereceSequence, @SpecifiedRegions, @RefereceSequenceRegions); 368 369 %SequenceFilesInfo = (); 370 @{$SequenceFilesInfo{FilesOkay}} = (); 371 @{$SequenceFilesInfo{OutFileRoot}} = (); 372 @{$SequenceFilesInfo{OutFileExt}} = (); 373 @{$SequenceFilesInfo{Format}} = (); 374 @{$SequenceFilesInfo{SequenceCount}} = (); 375 @{$SequenceFilesInfo{RefereceSequenceID}} = (); 376 @{$SequenceFilesInfo{RefereceSequence}} = (); 377 @{$SequenceFilesInfo{RefereceSequenceLen}} = (); 378 @{$SequenceFilesInfo{RefereceSequenceWithNoGaps}} = (); 379 @{$SequenceFilesInfo{RefereceSequenceWithNoGapsLen}} = (); 380 @{$SequenceFilesInfo{RefereceSequenceRegions}} = (); 381 @{$SequenceFilesInfo{RefereceSequenceRegionCount}} = (); 382 @{$SequenceFilesInfo{ResidueCodes}} = (); 383 384 FILELIST: for $Index (0 .. $#SequenceFilesList) { 385 $SequenceFile = $SequenceFilesList[$Index]; 386 $SequenceFilesInfo{FilesOkay}[$Index] = 0; 387 $SequenceFilesInfo{OutFileRoot}[$Index] = ''; 388 $SequenceFilesInfo{OutFileExt}[$Index] = ''; 389 $SequenceFilesInfo{Format}[$Index] = 'NotSupported'; 390 $SequenceFilesInfo{SequenceCount}[$Index] = 0; 391 $SequenceFilesInfo{RefereceSequenceID}[$Index] = ''; 392 $SequenceFilesInfo{RefereceSequence}[$Index] = ''; 393 $SequenceFilesInfo{RefereceSequenceLen}[$Index] = ''; 394 $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = ''; 395 $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = ''; 396 @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]} = (); 397 $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = 0; 398 @{$SequenceFilesInfo{ResidueCodes}[$Index]} = (); 399 400 if (! open SEQUENCEFILE, "$SequenceFile") { 401 warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n"; 402 next FILELIST; 403 } 404 close SEQUENCEFILE; 405 406 ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile); 407 if (!$FileSupported) { 408 warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n"; 409 next FILELIST; 410 } 411 412 $SequenceDataRef = ReadSequenceFile($SequenceFile); 413 414 $SequenceCount = $SequenceDataRef->{Count}; 415 if (!$SequenceCount) { 416 warn "Warning: Ignoring file $SequenceFile: Sequence data is missing.\n"; 417 next FILELIST; 418 } 419 420 # Make sure all sequence lengths are identical... 421 if (!AreSequenceLengthsIdentical($SequenceDataRef)) { 422 warn "Warning: Ignoring file $SequenceFile: Sequence legths are not identical.\n"; 423 next FILELIST; 424 } 425 $SpecifiedRefereceSequence = $OptionsInfo{SpecifiedRefereceSequence}; 426 # Make sure reference sequence ID is valid... 427 if ($SpecifiedRefereceSequence =~ /^UseFirstSequenceID$/i) { 428 $RefereceSequenceID = $SequenceDataRef->{IDs}[0]; 429 } 430 else { 431 if (!exists($SequenceDataRef->{Sequence}{$SpecifiedRefereceSequence})) { 432 warn "Warning: Ignoring file $SequenceFile: Rreference sequence ID, $SpecifiedRefereceSequence, specified using option \"--referencesequence\" is missing.\n"; 433 next FILELIST; 434 } 435 $RefereceSequenceID = $SpecifiedRefereceSequence; 436 } 437 438 # Make sure sequence regions corresponding to reference sequence are valid... 439 @RefereceSequenceRegions = (); 440 $RefereceSequenceRegionCount = 0; 441 $RefereceSequence = $SequenceDataRef->{Sequence}{$RefereceSequenceID}; 442 $RefereceSequenceLen = length $RefereceSequence; 443 444 $RefereceSequenceWithNoGaps = RemoveSequenceGaps($RefereceSequence); 445 $RefereceSequenceWithNoGapsLen = length $RefereceSequenceWithNoGaps; 446 447 @SpecifiedRegions = (); 448 push @SpecifiedRegions, @{$OptionsInfo{SpecifiedRegions}}; 449 if (@SpecifiedRegions) { 450 # Make sure specified regions are valid... 451 my($StartResNum, $EndResNum, $RegionIndex, $RegionNum); 452 $RegionNum = 0; 453 for ($RegionIndex = 0; $RegionIndex <= $#SpecifiedRegions; $RegionIndex += 2) { 454 $StartResNum = $SpecifiedRegions[$RegionIndex]; 455 $EndResNum = $SpecifiedRegions[$RegionIndex + 1]; 456 $RegionNum++; 457 if ($OptionsInfo{IgnoreGaps}) { 458 if ($StartResNum > $RefereceSequenceWithNoGapsLen) { 459 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n"; 460 next FILELIST; 461 } 462 if ($EndResNum > $RefereceSequenceWithNoGapsLen) { 463 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceWithNoGapsLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum. The reference sequence residue numbers correspond to the sequence with no gaps. Specify \"No\" value for \"-i, --ignoregaps\" option to use residue numbers corresponding to reference sequence including gaps.\n"; 464 next FILELIST; 465 } 466 } 467 else { 468 if ($StartResNum > $RefereceSequenceLen) { 469 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The start residue number, $StartResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum.\n"; 470 next FILELIST; 471 } 472 if ($EndResNum > $RefereceSequenceLen) { 473 warn "Warning: Ignoring file $SequenceFile: The value specified, $Options{region}, for option \"--region\" is not valid: The end residue number, $EndResNum, must be smaller the sequence length, $RefereceSequenceLen, of reference sequence ID, $RefereceSequenceID, in region $RegionNum.\n"; 474 next FILELIST; 475 } 476 } 477 push @RefereceSequenceRegions, ($StartResNum, $EndResNum); 478 } 479 $RefereceSequenceRegionCount = $RegionNum; 480 } 481 else { 482 # Use complete sequence corresponding to reference sequence ID... 483 if ($OptionsInfo{IgnoreGaps}) { 484 push @RefereceSequenceRegions, (1, $RefereceSequenceWithNoGapsLen); 485 } 486 else { 487 push @RefereceSequenceRegions, (1, $RefereceSequenceLen); 488 } 489 $RefereceSequenceRegionCount = 1; 490 } 491 # Setup output file names... 492 $FileDir = ""; $FileName = ""; $FileExt = ""; 493 ($FileDir, $FileName, $FileExt) = ParseFileName($SequenceFile); 494 $FileExt = "csv"; 495 if ($Options{outdelim} =~ /^tab$/i) { 496 $FileExt = "tsv"; 497 } 498 $OutFileExt = $FileExt; 499 if ($OptionsInfo{OutFileRoot} && (@SequenceFilesList == 1)) { 500 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); 501 if ($RootFileName && $RootFileExt) { 502 $FileName = $RootFileName; 503 } 504 else { 505 $FileName = $OptionsInfo{OutFileRoot}; 506 } 507 $OutFileRoot = $FileName; 508 } 509 else { 510 $OutFileRoot = $FileName; 511 } 512 if (!$OptionsInfo{OverwriteFiles}) { 513 if ($OptionsInfo{CalculatePercentIdentityMatrix}) { 514 if (-e "${OutFileRoot}PercentIdentityMatrix.${OutFileExt}") { 515 warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentIdentityMatrix.${OutFileExt} already exists\n"; 516 next FILELIST; 517 } 518 } 519 if ($OptionsInfo{PerformResidueFrequencyAnalysis}) { 520 my($RegionNum); 521 for $RegionNum (1 .. $RefereceSequenceRegionCount) { 522 if (-e "${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") { 523 warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}ResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n"; 524 next FILELIST; 525 } 526 if (-e "${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt}") { 527 warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}PercentResidueFrequencyAnalysisRegion${RegionNum}.${OutFileExt} already exists\n"; 528 next FILELIST; 529 } 530 } 531 } 532 } 533 534 $SequenceFilesInfo{FilesOkay}[$Index] = 1; 535 $SequenceFilesInfo{OutFileRoot}[$Index] = $OutFileRoot; 536 $SequenceFilesInfo{OutFileExt}[$Index] = $OutFileExt; 537 538 $SequenceFilesInfo{Format}[$Index] = $FileFormat; 539 $SequenceFilesInfo{SequenceCount}[$Index] = $SequenceCount; 540 $SequenceFilesInfo{RefereceSequenceID}[$Index] = $RefereceSequenceID; 541 $SequenceFilesInfo{RefereceSequence}[$Index] = $RefereceSequence; 542 $SequenceFilesInfo{RefereceSequenceLen}[$Index] = $RefereceSequenceLen; 543 $SequenceFilesInfo{RefereceSequenceWithNoGaps}[$Index] = $RefereceSequenceWithNoGaps; 544 $SequenceFilesInfo{RefereceSequenceWithNoGapsLen}[$Index] = $RefereceSequenceWithNoGapsLen; 545 push @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]}, @RefereceSequenceRegions; 546 $SequenceFilesInfo{RefereceSequenceRegionCount}[$Index] = $RefereceSequenceRegionCount; 547 548 # Setup residue codes... 549 SetupSequenceFileResidueCodes($SequenceDataRef, $Index); 550 } 551 } 552 553 sub SetupSequenceFileResidueCodes { 554 my($SequenceDataRef, $FileIndex) = @_; 555 my($Residue, @ResidueCodesList, %ResidueCodesMap); 556 557 # Initialize 558 @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]} = (); 559 560 %ResidueCodesMap = (); 561 @ResidueCodesList = (); 562 563 # Setup standard amino acids and nucleic acids one letter codes... 564 if ($OptionsInfo{RegionResiduesMode} =~ /^AminoAcids$/i) { 565 @ResidueCodesList = AminoAcids::GetAminoAcids('OneLetterCode'); 566 } 567 elsif ($OptionsInfo{RegionResiduesMode} =~ /^NucleicAcids$/i) { 568 push @ResidueCodesList, ('A', 'G', 'T', 'U', 'C'); 569 } 570 push @ResidueCodesList, 'Gap'; 571 for $Residue (@ResidueCodesList) { 572 $ResidueCodesMap{$Residue} = $Residue; 573 } 574 575 # Go over all the residues in all the sequences and add missing ones to the list... 576 my($ID, $Sequence, $ResIndex); 577 for $ID (@{$SequenceDataRef->{IDs}}) { 578 $Sequence = $SequenceDataRef->{Sequence}{$ID}; 579 RES: for $ResIndex (0 .. (length($Sequence) - 1)) { 580 $Residue = substr($Sequence, $ResIndex, 1); 581 if (IsGapResidue($Residue)) { 582 next RES; 583 } 584 if (exists $ResidueCodesMap{$Residue}) { 585 next RES; 586 } 587 push @ResidueCodesList, $Residue; 588 $ResidueCodesMap{$Residue} = $Residue; 589 } 590 } 591 push @{$SequenceFilesInfo{ResidueCodes}[$FileIndex]}, @ResidueCodesList; 592 } 593 594 # Setup regions data for performing residue frequency analysis... 595 sub SetupSequenceRegionsData { 596 my($Index, $RefereceSequence, $RefereceSequenceLen, $RegionID, $StartResNum, $EndResNum, $RegionIndex, $RegionNum, $NoGapResNum, $ResNum, $ResIndex, $Residue, $ResidueCode, @RefereceSequenceRegions); 597 598 599 @{$SequenceFilesInfo{RefereceSequenceResNums}} = (); 600 @{$SequenceFilesInfo{RegionsData}} = (); 601 602 FILELIST: for $Index (0 .. $#SequenceFilesList) { 603 %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}} = (); 604 %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}} = (); 605 %{$SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}} = (); 606 %{$SequenceFilesInfo{RegionsData}[$Index]} = (); 607 @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}} = (); 608 609 if (!$SequenceFilesInfo{FilesOkay}[$Index]) { 610 next FILELIST; 611 } 612 if (!$OptionsInfo{PerformResidueFrequencyAnalysis}) { 613 next FILELIST; 614 } 615 616 $RefereceSequence = $SequenceFilesInfo{RefereceSequence}[$Index]; 617 $RefereceSequenceLen = $SequenceFilesInfo{RefereceSequenceLen}[$Index]; 618 619 # Setup residue number mapping and gap status for refernece sequence... 620 $NoGapResNum = 0; 621 $ResNum = 0; 622 for $ResIndex (0 .. ($RefereceSequenceLen - 1)) { 623 $ResNum++; 624 $Residue = substr($RefereceSequence, $ResIndex, 1); 625 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 1; 626 if (!IsGapResidue($Residue)) { 627 $NoGapResNum++; 628 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{IsGap}{$ResNum} = 0; 629 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$NoGapResNum} = $ResNum; 630 $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{GapToNoGap}{$ResNum} = $NoGapResNum; 631 } 632 } 633 # Map residue numbers for specified regions to the reference residue in input sequence/alignment files 634 $RegionNum = 0; 635 @RefereceSequenceRegions = (); 636 push @RefereceSequenceRegions, @{$SequenceFilesInfo{RefereceSequenceRegions}[$Index]}; 637 for ($RegionIndex = 0; $RegionIndex <= $#RefereceSequenceRegions; $RegionIndex += 2) { 638 $StartResNum = $RefereceSequenceRegions[$RegionIndex]; 639 $EndResNum = $RefereceSequenceRegions[$RegionIndex + 1]; 640 $RegionNum++; 641 $RegionID = "Region${RegionNum}"; 642 if ($OptionsInfo{IgnoreGaps}) { 643 # Map residue numbers to the reference sequence residue numbers to account for any ignored gaps... 644 $StartResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$StartResNum}; 645 $EndResNum = $SequenceFilesInfo{RefereceSequenceResNums}[$Index]{NoGapToGap}{$EndResNum}; 646 } 647 push @{$SequenceFilesInfo{RegionsData}[$Index]{RegionIDs}}, $RegionID; 648 $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{StartResNum} = $StartResNum; 649 $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{EndResNum} = $EndResNum; 650 651 # Initialize data for residue codes... 652 for $ResNum ($StartResNum .. $EndResNum) { 653 for $ResidueCode (@{$SequenceFilesInfo{ResidueCodes}[$Index]}) { 654 $SequenceFilesInfo{RegionsData}[$Index]{$RegionID}{Count}{$ResNum}{$ResidueCode} = 0; 655 } 656 } 657 } 658 } 659 } 660 661 # Setup script usage and retrieve command line arguments specified using various options... 662 sub SetupScriptUsage { 663 664 # Retrieve all the options... 665 %Options = (); 666 $Options{ignoregaps} = 'yes'; 667 $Options{regionresiduesmode} = 'None'; 668 $Options{mode} = 'PercentIdentityMatrix'; 669 $Options{outdelim} = 'comma'; 670 $Options{precision} = 2; 671 $Options{quote} = 'yes'; 672 $Options{referencesequence} = 'UseFirstSequenceID'; 673 $Options{region} = 'UseCompleteSequence'; 674 675 if (!GetOptions(\%Options, "help|h", "ignoregaps|i=s", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "referencesequence=s", "region=s", "regionresiduesmode=s", "root|r=s", "workingdir|w=s")) { 676 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 677 } 678 if ($Options{workingdir}) { 679 if (! -d $Options{workingdir}) { 680 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 681 } 682 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 683 } 684 if ($Options{ignoregaps} !~ /^(yes|no)$/i) { 685 die "Error: The value specified, $Options{ignoregaps}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"; 686 } 687 if ($Options{regionresiduesmode} !~ /^(AminoAcids|NucleicAcids|None)$/i) { 688 die "Error: The value specified, $Options{regionresiduesmode}, for option \"--regionresiduesmode\" is not valid. Allowed values: AminoAcids, NucleicAcids or None\n"; 689 } 690 if ($Options{mode} !~ /^(PercentIdentityMatrix|ResidueFrequencyAnalysis|All)$/i) { 691 die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: PercentIdentityMatrix, ResidueFrequencyAnalysis or All\n"; 692 } 693 if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { 694 die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n"; 695 } 696 if ($Options{quote} !~ /^(yes|no)$/i) { 697 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"; 698 } 699 if (!IsPositiveInteger($Options{precision})) { 700 die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n"; 701 } 702 } 703