1 package SequenceFileUtil; 2 # 3 # File: SequenceFileUtil.pm 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use Exporter; 28 use Text::ParseWords; 29 use TextUtil; 30 use FileUtil; 31 32 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 33 34 @ISA = qw(Exporter); 35 @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile); 36 @EXPORT_OK = qw(); 37 38 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 39 40 # Compare lengths of all sequences... 41 sub AreSequenceLengthsIdentical { 42 my($SequencesDataRef) = @_; 43 my($Status, $ID, $FirstID, $FirstSeqLen, $FirstDifferentLenID, $SeqLen); 44 45 $Status = 1; 46 $FirstID = ''; 47 $FirstDifferentLenID = ''; 48 49 ID: for $ID (@{$SequencesDataRef->{IDs}}) { 50 if (!$FirstID) { 51 $FirstID = $ID; 52 $FirstSeqLen = length($SequencesDataRef->{Sequence}{$ID}); 53 next ID; 54 } 55 $SeqLen = length($SequencesDataRef->{Sequence}{$ID}); 56 if ($SeqLen != $FirstSeqLen) { 57 $Status = 0; 58 $FirstDifferentLenID = $ID; 59 last ID; 60 } 61 } 62 return ($Status); 63 } 64 65 # Calculate percent identity between two sequences. By default, gaps are ignored. 66 sub CalcuatePercentSequenceIdentity { 67 my($Sequence1, $Sequence2, $PercentIdentity, $IgnoreGaps, $Precision); 68 69 $PercentIdentity = ''; 70 $Precision = 1; 71 $IgnoreGaps = 1; 72 if (@_ == 4) { 73 ($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_; 74 } 75 elsif (@_ == 3) { 76 ($Sequence1, $Sequence2, $IgnoreGaps) = @_; 77 } 78 elsif (@_ == 2) { 79 ($Sequence1, $Sequence2) = @_; 80 } 81 else { 82 return $PercentIdentity; 83 } 84 if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) { 85 return $PercentIdentity; 86 } 87 my($Index, $Identity, $Sequence1Len, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount); 88 89 $Sequence1Len = length($Sequence1); 90 $Sequence2Len = length($Sequence2); 91 92 $ResMatchCount = 0; 93 $ResCount = 0; 94 RESIDUE: for $Index (0 .. ($Sequence1Len - 1)) { 95 $Residue1 = substr($Sequence1, $Index, 1); 96 $Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : ''; 97 if ($IgnoreGaps) { 98 if ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i) { 99 next RESIDUE; 100 } 101 } 102 if ($Residue1 eq $Residue2) { 103 $ResMatchCount++; 104 } 105 $ResCount++; 106 } 107 $Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0; 108 $PercentIdentity = sprintf("%.${Precision}f", ($Identity * 100)); 109 110 return $PercentIdentity; 111 } 112 113 # Calculate pairwise identify matrix for all the sequences and return a reference 114 # to a hash with the following keys: 115 # 116 # {IDs} - Sequence IDs 117 # {Count} - Number of IDs 118 # {PercentIdentity}{$RowID}{$ColID} - Percent identify for a pair of sequences 119 # 120 sub CalculatePercentSequenceIdentityMatrix { 121 my($SequencesDataRef, $IgnoreGaps, , $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData); 122 123 $IgnoreGaps = 1; 124 $Precision = 1; 125 if (@_ == 3) { 126 ($SequencesDataRef, $IgnoreGaps, $Precision) = @_; 127 } 128 elsif (@_ == 2) { 129 ($SequencesDataRef, $IgnoreGaps) = @_; 130 } 131 else { 132 ($SequencesDataRef) = @_; 133 } 134 135 %IdentityMatrixData = (); 136 @{$IdentityMatrixData{IDs}} = (); 137 %{$IdentityMatrixData{PercentIdentity}} = (); 138 $IdentityMatrixData{Count} = 0; 139 140 for $ID (@{$SequencesDataRef->{IDs}}) { 141 push @{$IdentityMatrixData{IDs}}, $ID; 142 $IdentityMatrixData{Count} += 1; 143 } 144 # Initialize and calculate percent identity data values... 145 for $RowID (@{$SequencesDataRef->{IDs}}) { 146 %{$IdentityMatrixData{PercentIdentity}{$RowID}} = (); 147 $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID}; 148 for $ColID (@{$SequencesDataRef->{IDs}}) { 149 $IdentityMatrixData{$RowID}{$ColID} = ''; 150 $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID}; 151 $PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision); 152 $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity; 153 } 154 } 155 return \%IdentityMatrixData; 156 } 157 158 # Retrieve information about shortest sequence... 159 sub GetShortestSequence { 160 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description); 161 162 $IgnoreGaps = 1; 163 if (@_ == 2) { 164 ($SequencesDataRef, $IgnoreGaps) = @_; 165 } 166 else { 167 ($SequencesDataRef) = @_; 168 } 169 170 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps); 171 return ($ID, $Sequence, $SeqLen, $Description); 172 } 173 174 # Retrieve information about longest sequence.. 175 sub GetLongestSequence { 176 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description); 177 178 $IgnoreGaps = 1; 179 if (@_ == 2) { 180 ($SequencesDataRef, $IgnoreGaps) = @_; 181 } 182 else { 183 ($SequencesDataRef) = @_; 184 } 185 186 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps); 187 return ($ID, $Sequence, $SeqLen, $Description); 188 } 189 190 # Get sequence length... 191 sub GetSequenceLength { 192 my($Seq, $SeqLen, $IgnoreGaps); 193 194 $SeqLen = ''; $IgnoreGaps = 1; 195 if (@_ == 2) { 196 ($Seq, $IgnoreGaps) = @_; 197 } 198 else { 199 ($Seq) = @_; 200 } 201 if ($IgnoreGaps) { 202 my($Index, $Residue); 203 $SeqLen = 0; 204 for $Index (0 .. (length($Seq) - 1)) { 205 $Residue = substr($Seq, $Index, 1); 206 if ($Residue =~ /[A-Z]/i) { 207 $SeqLen++; 208 } 209 } 210 } 211 else { 212 $SeqLen = length($Seq); 213 } 214 215 return $SeqLen; 216 } 217 218 # Is it a gap residue... 219 sub IsGapResidue { 220 my($Residue) = @_; 221 my($Status); 222 223 $Status = ($Residue !~ /[A-Z]/i ) ? 1 : 0; 224 225 return $Status; 226 } 227 228 # Is it a supported sequence file? 229 # 230 # Supported seqence formats are: 231 # 232 # ALN/ClustalW .aln 233 # GCG/MSF .msf 234 # PILEUP/MSF .msf 235 # Fasts(Pearson) .fasta, .fta 236 # NBRF/PIR .pir 237 # 238 sub IsSupportedSequenceFile { 239 my($SequenceFile) = @_; 240 my($Status, $SequenceFormat); 241 $Status = 0; $SequenceFormat = 'NotSupported'; 242 243 SEQFORMAT: { 244 if (IsClustalWSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'ClustalW'; last SEQFORMAT} 245 if (IsPearsonFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'Pearson'; last SEQFORMAT} 246 if (IsPIRFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'PIR'; last SEQFORMAT} 247 if (IsMSFSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'MSF'; last SEQFORMAT} 248 $Status = 0; $SequenceFormat = 'NotSupported'; 249 } 250 return ($Status, $SequenceFormat); 251 } 252 253 # Is it a ClustalW multiple sequence sequence file... 254 sub IsClustalWSequenceFile { 255 my($SequenceFile) = @_; 256 my($Status, $Line); 257 258 $Status = 0; 259 260 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n"; 261 $Line = GetTextLine(\*SEQUENCEFILE); 262 $Status = ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0; 263 close SEQUENCEFILE; 264 265 return $Status; 266 } 267 268 # Is it a valid Pearson fasta sequence or alignment file? 269 # 270 sub IsPearsonFastaSequenceFile { 271 my($FastaFile, $Line, $Status); 272 273 ($FastaFile) = @_; 274 $Status = 0; 275 276 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n"; 277 $Line = GetTextLine(\*FASTAFILE); 278 279 # First line starts with > and the fourth character is not ';'; otherwise, it's 280 # PIR FASTA format. 281 if ($Line =~ /^>/) { 282 my($FourthChar); 283 $FourthChar = substr($Line, 3, 1); 284 $Status = ($FourthChar !~ /\;/) ? 1 : 0; 285 } 286 close FASTAFILE; 287 288 return $Status; 289 } 290 291 # Is it a valid NBRF/PIR fasta sequence or alignment file? 292 # 293 sub IsPIRFastaSequenceFile { 294 my($FastaFile, $Line, $Status); 295 296 ($FastaFile) = @_; 297 $Status = 0; 298 299 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n"; 300 $Line = GetTextLine(\*FASTAFILE); 301 302 # First line starts with > and the fourth character is ';'; otherwise, it's 303 # a Pearson FASTA format. 304 if ($Line =~ /^>/) { 305 my($FourthChar); 306 $FourthChar = substr($Line, 3, 1); 307 $Status = ($FourthChar =~ /\;/) ? 1 : 0; 308 } 309 close FASTAFILE; 310 311 return $Status; 312 } 313 314 # Is it a valid MSF sequence or alignment file? 315 # 316 sub IsMSFSequenceFile { 317 my($MSFFile) = @_; 318 319 open MSFFILE, "$MSFFile" or die "Couldn't open $MSFFile: $!\n"; 320 321 my($Line, $Status); 322 323 $Status = 0; 324 # Find a line that contains MSF: keyword and ends with '..' 325 LINE: while ($Line = GetTextLine(\*MSFFILE)) { 326 $Line = RemoveLeadingWhiteSpaces($Line); 327 if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) { 328 $Status = 1; 329 last LINE; 330 } 331 elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) { 332 # Pileup MSF... 333 $Status = 1; 334 last LINE; 335 } 336 } 337 close MSFFILE; 338 339 return $Status; 340 } 341 342 # Read sequence or sequence alignment file... 343 sub ReadSequenceFile { 344 my($SequenceFile) = @_; 345 346 if (IsPearsonFastaSequenceFile($SequenceFile)) { 347 return ReadPearsonFastaSequenceFile($SequenceFile); 348 } 349 elsif (IsPIRFastaSequenceFile($SequenceFile)) { 350 return ReadPIRFastaSequenceFile($SequenceFile); 351 } 352 elsif (IsMSFSequenceFile($SequenceFile)) { 353 return ReadMSFSequenceFile($SequenceFile); 354 } 355 elsif (IsClustalWSequenceFile($SequenceFile)) { 356 return ReadClustalWSequenceFile($SequenceFile); 357 } 358 else { 359 return undef; 360 } 361 } 362 363 # Read file and setup alignment data... 364 sub ReadClustalWSequenceFile { 365 my($SequenceFile) = @_; 366 367 return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW'); 368 } 369 370 # Read file and setup alignment data... 371 sub ReadPearsonFastaSequenceFile { 372 my($SequenceFile) = @_; 373 374 return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson'); 375 } 376 377 # Read file and setup alignment data... 378 sub ReadPIRFastaSequenceFile { 379 my($SequenceFile) = @_; 380 381 return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR'); 382 } 383 384 385 # Read file and setup sequence data... 386 sub ReadMSFSequenceFile { 387 my($SequenceFile) = @_; 388 389 return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF'); 390 } 391 392 # Write out a Pearson FASTA file... 393 sub WritePearsonFastaSequenceFile { 394 my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence); 395 396 $MaxLength = 80; 397 if (@_ == 3) { 398 ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_; 399 } 400 elsif (@_ == 2) { 401 ($SequenceFileName, $SequenceDataRef) = @_; 402 } 403 open SEQUENCEFILE, ">$SequenceFileName" or die "Can't open $SequenceFileName: $!\n"; 404 for $ID (@{$SequenceDataRef->{IDs}}) { 405 $Description = $SequenceDataRef->{Description}{$ID}; 406 $Sequence = $SequenceDataRef->{Sequence}{$ID}; 407 $WrappedSequence = WrapText($Sequence, $MaxLength, "\n"); 408 409 # Description also contains ID... 410 print SEQUENCEFILE ">$Description\n"; 411 print SEQUENCEFILE "$WrappedSequence\n"; 412 } 413 close SEQUENCEFILE; 414 } 415 416 # Get ID, Sequence and Length for smallest or longest sequence 417 sub _GetShortestOrLongestSequence { 418 my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_; 419 my($ID, $Seq, $SeqLen, $Description, $FirstID, $FirstSeqLen, $CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); 420 421 ($ID, $Seq, $SeqLen) = ('', '', ''); 422 $FirstID = ''; 423 424 ID: for $CurrentID (@{$SequencesDataRef->{IDs}}) { 425 $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID}; 426 $CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps); 427 $CurrentDescription = $SequencesDataRef->{Description}{$CurrentID}; 428 if (!$FirstID) { 429 $FirstID = $ID; $FirstSeqLen = $CurrentSeqLen; 430 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); 431 next ID; 432 } 433 if ($CurrentSeqLen != $SeqLen) { 434 if (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) { 435 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); 436 } 437 elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen) ) { 438 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); 439 } 440 } 441 } 442 return ($ID, $Seq, $SeqLen, $Description); 443 } 444 445 # Remove gaps in the sequence and return new sequence... 446 sub RemoveSequenceGaps { 447 my($Seq) = @_; 448 my($SeqWithoutGaps, $SeqLen, $Index, $Residue); 449 450 $SeqWithoutGaps = ''; 451 $SeqLen = length($Seq); 452 for $Index (0 .. ($SeqLen - 1)) { 453 $Residue = substr($Seq, $Index, 1); 454 if ($Residue =~ /[A-Z]/i) { 455 $SeqWithoutGaps .= $Residue; 456 } 457 } 458 459 return $SeqWithoutGaps; 460 } 461 462 # Using input alignment data map ref containing following keys, generate 463 # a new hash with same set of keys after residue columns containg only 464 # gaps have been removed: 465 # 466 # {IDs} : Array of IDs in order as they appear in file 467 # {Count}: ID count... 468 # {Description}{$ID} : Description data... 469 # {Sequence}{$ID} : Sequence data... 470 # 471 sub RemoveSequenceAlignmentGapColumns { 472 my($ID, $AlignmentDataMapRef, %NewAlignmentDataMap); 473 474 ($AlignmentDataMapRef) = @_; 475 476 %NewAlignmentDataMap = (); 477 @{$NewAlignmentDataMap{IDs}} =(); 478 %{$NewAlignmentDataMap{Description}} =(); 479 %{$NewAlignmentDataMap{Sequence}} =(); 480 $NewAlignmentDataMap{Count} = 0; 481 482 # Transfer ID and count information... 483 for $ID (@{$AlignmentDataMapRef->{IDs}}) { 484 push @{$NewAlignmentDataMap{IDs}}, $ID; 485 $NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID}; 486 $NewAlignmentDataMap{Sequence}{$ID} = ''; 487 $NewAlignmentDataMap{Count} += 1; 488 } 489 490 # Go over residue columns and transfer the data... 491 my($FirstID, $FirstSeq, $FirstSeqLen, $Index, $Res, $GapColumn); 492 493 $FirstID = $AlignmentDataMapRef->{IDs}[0]; 494 $FirstSeq = $AlignmentDataMapRef->{Sequence}{$FirstID}; 495 $FirstSeqLen = length($FirstSeq); 496 497 RES: for $Index (0 .. ($FirstSeqLen - 1)) { 498 # Is this a gap column? 499 $GapColumn = 1; 500 ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) { 501 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1); 502 if ($Res =~ /[A-Z]/i) { 503 $GapColumn = 0; 504 last ID; 505 } 506 } 507 if ($GapColumn) { 508 next RES; 509 } 510 # Transfer this residue... 511 for $ID (@{$AlignmentDataMapRef->{IDs}}) { 512 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1); 513 $NewAlignmentDataMap{Sequence}{$ID} .= $Res; 514 } 515 } 516 517 return (\%NewAlignmentDataMap); 518 } 519 520 # 521 # Read sequences file and return a reference to hash with the following keys: 522 # 523 # {IDs} - Array of sequence IDs 524 # {Count} - Number of sequences 525 # {Description}{$ID} - Sequence description 526 # {Sequence}{$ID} - Sequence for a specific ID 527 # {InputFileType} - Sequence file format 528 # {ConservedAnnotation} - Conserved residue annonation 529 # 530 # Note: 531 # . Conserved residue annotation either exist in the input sequence alignment file or set 532 # for a file containing same number of residues for all the sequence using the following 533 # notation: * - Residue conserved; ' ' - Residue not conserved. 534 # 535 sub _ReadFileAndSetupSequencesData { 536 my($SequenceFile, $SequenceType) = @_; 537 my($SequenceDataMapRef); 538 539 $SequenceDataMapRef = undef; 540 541 # Read sequence file... 542 $SequenceDataMapRef = ''; 543 if ($SequenceType =~ /^ClustalW$/i) { 544 $SequenceDataMapRef = _ReadClustalWFile($SequenceFile); 545 } 546 elsif ($SequenceType =~ /^Pearson$/i) { 547 $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile); 548 } 549 elsif ($SequenceType =~ /^PIR$/i) { 550 $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile); 551 } 552 elsif ($SequenceType =~ /^MSF$/i) { 553 $SequenceDataMapRef = _ReadMSFFile($SequenceFile); 554 } 555 else { 556 return $SequenceDataMapRef; 557 } 558 559 if (exists $SequenceDataMapRef->{ConservedAnnotation}) { 560 return ($SequenceDataMapRef); 561 } 562 if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) { 563 return ($SequenceDataMapRef); 564 } 565 566 # Use the first sequence to setup an empty ConservedAnnotation key... 567 # And mark fully conserved residues... 568 # 569 my($ID, $Sequence, $FirstSequence, $FirstSeqLen, $Res, $FirstRes, $ResConserved, $Index); 570 $ID = $SequenceDataMapRef->{IDs}[0]; 571 $FirstSequence = $SequenceDataMapRef->{Sequence}{$ID}; 572 $FirstSeqLen = length($FirstSequence); 573 $SequenceDataMapRef->{ConservedAnnotation} = ''; 574 for $Index (0 .. ($FirstSeqLen - 1)) { 575 $FirstRes = ''; 576 $ResConserved = 1; 577 ID: for $ID (@{$SequenceDataMapRef->{IDs}}) { 578 $Sequence = $SequenceDataMapRef->{Sequence}{$ID}; 579 $Res = substr($Sequence, $Index, 1); 580 if (!$FirstRes) { 581 $FirstRes = $Res; 582 next ID; 583 } 584 if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) { 585 $ResConserved = 0; 586 last ID; 587 } 588 } 589 if ($ResConserved) { 590 $SequenceDataMapRef->{ConservedAnnotation} .= '*'; 591 } 592 else { 593 $SequenceDataMapRef->{ConservedAnnotation} .= ' '; 594 } 595 } 596 597 return ($SequenceDataMapRef); 598 } 599 600 # Read sequence data in ClustalW multiple sequence alignment file and 601 # return a reference to hash with these keys and values: 602 # 603 # {IDs} - Array of sequence IDs 604 # {Count} - Number of sequences 605 # {Description}{$ID} - Sequence description 606 # {Sequence}{$ID} - Sequence for a specific ID 607 # {InputFileType} - Sequence file format 608 # {ConservedAnnotation} - Conserved residue annonations: space, *, : , . 609 # 610 # 611 # 612 # And based on ClustalW/X manual, here is what the ConservedAnnonations mean: 613 # 614 # '*' indicates positions which have a single, fully conserved residue 615 # 616 # ':' indicates that one of the following 'strong' groups is fully conserved: STA 617 # NEQK NHQK NDEQ QHRK MILV MILF HY FYW 618 619 # '.' indicates that one of the following 'weaker' groups is fully conserved: 620 # CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY 621 # 622 # These are all the positively scoring groups that occur in the Gonnet Pam250 623 # matrix. The strong and weak groups are defined as strong score >0.5 and weak 624 # score =<0.5 respectively. 625 # 626 sub _ReadClustalWFile { 627 my($SequenceFile) = @_; 628 my(%SequencesDataMap); 629 630 # Initialize data... 631 %SequencesDataMap = (); 632 @{$SequencesDataMap{IDs}} = (); 633 %{$SequencesDataMap{Description}} = (); 634 %{$SequencesDataMap{Sequence}} = (); 635 $SequencesDataMap{Count} = 0; 636 $SequencesDataMap{ConservedAnnotation} = ''; 637 $SequencesDataMap{InputFileType} = 'ClustalW'; 638 639 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n"; 640 641 my($Line, $LineLength, $AnnotationStart, $AnnotationLength, $Annotation, $Sequence, $SequenceLength, $ID, $IDIndex); 642 643 # Ignore the header line... 644 $Line = <SEQUENCEFILE>; 645 646 LINE: while ($Line = GetTextLine(\*SEQUENCEFILE)) { 647 if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) { 648 # Annotation for sequences: fully conserverd, weaker or stronger group conserverd. 649 # Extract it and save... 650 $LineLength = length($Line); 651 $AnnotationStart = $LineLength - $SequenceLength; 652 $AnnotationLength = $SequenceLength; 653 $Annotation = substr($Line, $AnnotationStart, $AnnotationLength); 654 $SequencesDataMap{ConservedAnnotation} .= $Annotation; 655 } 656 else { 657 # Extract ID and sequences... 658 ($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/; 659 $Sequence =~ s/ //g; 660 if (!($ID && $Sequence)) { 661 next LINE; 662 } 663 664 if (exists $SequencesDataMap{Sequence}{$ID}) { 665 # Append to existing alignment value... 666 $SequenceLength = length($Sequence); 667 $SequencesDataMap{Sequence}{$ID} .= $Sequence; 668 } 669 else { 670 # New alignment data... 671 $SequencesDataMap{Count} += 1; 672 push @{$SequencesDataMap{IDs}}, $ID; 673 $SequencesDataMap{Description}{$ID} = $ID; 674 $SequencesDataMap{Sequence}{$ID} = $Sequence; 675 $SequenceLength = length($Sequence); 676 } 677 } 678 } 679 close SEQUENCEFILE; 680 return (\%SequencesDataMap); 681 } 682 683 # Read Pearson fasta file and return a reference to hash with these keys: 684 # 685 # {IDs} - Array of sequence IDs 686 # {Count} - Number of sequences 687 # {Description}{$ID} - Sequence description 688 # {Sequence}{$ID} - Sequence for a specific ID 689 # {InputFileType} - Sequence file format 690 # {ConservedAnnotation} - Conserved residue annonation 691 # 692 sub _ReadPearsonFastaFile { 693 my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap); 694 695 ($FastaFileName) = @_; 696 697 %FastaDataMap = (); 698 @{$FastaDataMap{IDs}} =(); 699 %{$FastaDataMap{Description}} =(); 700 %{$FastaDataMap{Sequence}} =(); 701 $FastaDataMap{Count} = 0; 702 $FastaDataMap{InputFileType} = 'Pearson'; 703 704 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n"; 705 $ID = ''; 706 $IgnoreID = 0; 707 LINE: while ($Line = GetTextLine(\*FASTAFILE)) { 708 if ($Line =~ /^\>/) { 709 # Start of a new ID... 710 $Line =~ s/^\>//; 711 $Line = RemoveLeadingWhiteSpaces($Line); 712 @LineWords = (); 713 @LineWords = split / /, $Line; 714 715 $ID = $LineWords[0]; 716 $ID =~ s/ //g; 717 $Description = $Line; 718 719 $IgnoreID = 0; 720 if (exists $FastaDataMap{Sequence}{$ID}) { 721 $IgnoreID = 1; 722 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n"; 723 next LINE; 724 } 725 push @{$FastaDataMap{IDs}}, $ID; 726 $FastaDataMap{Description}{$ID} = $Description; 727 $FastaDataMap{Count} += 1; 728 next LINE; 729 } 730 if ($IgnoreID) { next LINE; } 731 732 # Remove any spaces in the sequence... 733 $Line =~ s/ //g; 734 # Sequence data for active ID... 735 if (exists $FastaDataMap{Sequence}{$ID}) { 736 $FastaDataMap{Sequence}{$ID} .= $Line; 737 } 738 else { 739 $FastaDataMap{Sequence}{$ID} = $Line; 740 } 741 } 742 close FASTAFILE; 743 return \%FastaDataMap; 744 } 745 746 # Read PIR fasta file and return a reference to hash with these keys: 747 # 748 # {IDs} - Array of sequence IDs 749 # {Count} - Number of sequences 750 # {Description}{$ID} - Sequence description 751 # {Sequence}{$ID} - Sequence for a specific ID 752 # {InputFileType} - Sequence file format 753 # {ConservedAnnotation} - Conserved residue annonation 754 # 755 # Format: 756 # A sequence in PIR format consists of: 757 # One line starting with 758 # a ">" (greater-than) sign, followed by 759 # a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by 760 # a semicolon, followed by 761 # the sequence identification code (the database ID-code). 762 # One line containing a textual description of the sequence. 763 # One or more lines containing the sequence itself. The end of the 764 # sequence is marked by a "*" (asterisk) character. 765 # 766 # A file in PIR format may comprise more than one sequence. 767 # 768 # The PIR format is also often referred to as the NBRF format. 769 # 770 # Code SequenceType 771 # P1 Protein (complete) 772 # F1 Protein (fragment) 773 # DL DNA (linear) 774 # DC DNA (circular) 775 # RL RNA (linear) 776 # RC RNA (circular) 777 # N3 tRNA 778 # N1 Other functional RNA 779 # 780 781 sub _ReadPIRFastaFile { 782 my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap); 783 784 ($FastaFileName) = @_; 785 786 %FastaDataMap = (); 787 @{$FastaDataMap{IDs}} =(); 788 %{$FastaDataMap{Description}} =(); 789 %{$FastaDataMap{Sequence}} =(); 790 %{$FastaDataMap{SequenceTypeCode}} =(); 791 $FastaDataMap{Count} = 0; 792 $FastaDataMap{InputFileType} = 'PIR'; 793 794 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n"; 795 $ID = ''; 796 $ReadingSequenceData = 0; 797 LINE: while ($Line = GetTextLine(\*FASTAFILE)) { 798 if ($Line =~ /^\>/) { 799 # Start of a new ID... 800 $Line =~ s/^\>//; 801 $Line = RemoveLeadingWhiteSpaces($Line); 802 ($SequenceTypeCode, $ID) = /^\>(.*?)\;(.*?)$/; 803 804 # Use next line to retrieve sequence description... 805 $Line = GetTextLine(\*FASTAFILE); 806 $Line = RemoveLeadingWhiteSpaces($Line); 807 $Description = $Line; 808 809 if (exists $FastaDataMap{Sequence}{$ID}) { 810 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n"; 811 next LINE; 812 } 813 $ReadingSequenceData = 1; 814 push @{$FastaDataMap{IDs}}, $ID; 815 $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode; 816 $FastaDataMap{Description}{$ID} = $Description; 817 $FastaDataMap{Count} += 1; 818 next LINE; 819 } 820 if (!$ReadingSequenceData) { next LINE; } 821 822 # Remove any spaces in the sequence... 823 $Line =~ s/ //g; 824 if ($Line =~ /[\*]$/) { 825 # End of sequence... 826 $ReadingSequenceData = 0; 827 $Line =~ s/[\*]$//; 828 } 829 # Sequence data for active ID... 830 if (exists $FastaDataMap{Sequence}{$ID}) { 831 $FastaDataMap{Sequence}{$ID} .= $Line; 832 } 833 else { 834 $FastaDataMap{Sequence}{$ID} = $Line; 835 } 836 } 837 close FASTAFILE; 838 return \%FastaDataMap; 839 } 840 841 # Read MSF file and return a reference to hash with these keys: 842 # 843 # {IDs} : Array of IDs in order as they appear in file 844 # {Count}: ID count... 845 # {Description}{$ID} : Description data... 846 # {Sequence}{$ID} : Sequence data... 847 # 848 sub _ReadMSFFile { 849 my($MSFFileName, $Line, @LineWords, %MSFDataMap); 850 851 ($MSFFileName) = @_; 852 853 %MSFDataMap = (); 854 @{$MSFDataMap{IDs}} =(); 855 %{$MSFDataMap{Description}} =(); 856 %{$MSFDataMap{Sequence}} =(); 857 $MSFDataMap{Count} = 0; 858 $MSFDataMap{InputFileType} = 'MSF'; 859 860 open MSFFILE, "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n"; 861 862 # Collect sequences and IDs... 863 # 864 # '//' after the name fields indicates end of header list and start of sequence data. 865 # 866 my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap); 867 %MSFIDsMap = (); 868 $NameFieldsFound = 0; 869 LINE: while ($Line = GetTextLine(\*MSFFILE)) { 870 if ($Line =~ /Name:/) { 871 $NameFieldsFound++; 872 ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/; 873 if ($ID =~ / /) { 874 ($ID) = $ID =~ /^(.*?)[ ]+/ 875 } 876 if (exists $MSFIDsMap{$ID}) { 877 warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n"; 878 next LINE; 879 } 880 $MSFIDsMap{$ID} = $ID; 881 push @{$MSFDataMap{IDs}}, $ID; 882 $MSFDataMap{Description}{$ID} = $ID; 883 $MSFDataMap{Count} += 1; 884 } 885 elsif ( /\/\// && $NameFieldsFound) { 886 # End of header list... 887 last LINE; 888 } 889 } 890 # Collect all sequences... 891 # 892 my($FirstField, $SecondField); 893 while ($Line = GetTextLine(\*MSFFILE)) { 894 ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/; 895 if (exists $MSFIDsMap{$FirstField}) { 896 # It's ID and sequence data... 897 $ID = $FirstField; 898 $Sequence = $SecondField; 899 # Take out spaces and leave the gap characters... 900 $Sequence =~ s/ //g; 901 if ($MSFDataMap{Sequence}{$ID}) { 902 $MSFDataMap{Sequence}{$ID} .= $Sequence; 903 } 904 else { 905 $MSFDataMap{Sequence}{$ID} = $Sequence; 906 } 907 } 908 } 909 910 close MSFFILE; 911 return \%MSFDataMap; 912 } 913 914