MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: InfoPDBFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use PDBFileUtil;
  35 
  36 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  37 
  38 # Autoflush STDOUT
  39 $| = 1;
  40 
  41 # Starting message...
  42 $ScriptName = basename($0);
  43 print "\n$ScriptName: Starting...\n\n";
  44 $StartTime = new Benchmark;
  45 
  46 # Get the options and setup script...
  47 SetupScriptUsage();
  48 if ($Options{help} || @ARGV < 1) {
  49   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  50 }
  51 
  52 my(@PDBFilesList);
  53 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb");
  54 
  55 # Process options...
  56 print "Processing options...\n";
  57 my(%OptionsInfo);
  58 ProcessOptions();
  59 
  60 # Setup information about input files...
  61 my(%PDBFilesInfo);
  62 print "Checking input PDB file(s)...\n";
  63 RetrievePDBFilesInfo();
  64 
  65 # Process input files..
  66 my($FileIndex);
  67 if (@PDBFilesList > 1) {
  68   print "\nProcessing PDB files...\n";
  69 }
  70 for $FileIndex (0 .. $#PDBFilesList) {
  71   if ($PDBFilesInfo{FileOkay}[$FileIndex]) {
  72     print "\nProcessing file $PDBFilesList[$FileIndex]...\n";
  73     ListPDBFileInfo($FileIndex);
  74   }
  75 }
  76 ListTotalSizeOfFiles();
  77 
  78 print "\n$ScriptName:Done...\n\n";
  79 
  80 $EndTime = new Benchmark;
  81 $TotalTime = timediff ($EndTime, $StartTime);
  82 print "Total time: ", timestr($TotalTime), "\n";
  83 
  84 ###############################################################################
  85 
  86 # List appropriate information...
  87 sub ListPDBFileInfo {
  88   my($Index) = @_;
  89   my($PDBFile, $PDBRecordLinesRef);
  90 
  91   $PDBFile = $PDBFilesList[$Index];
  92   $PDBRecordLinesRef = ReadPDBFile($PDBFile);
  93 
  94   # Header informaton...
  95   if ($OptionsInfo{ListHeaderInfo}) {
  96     ListHeaderInfo($PDBRecordLinesRef);
  97   }
  98 
  99   # Experiment informaton...
 100   if ($OptionsInfo{ListExperimentalTechniqueInfo}) {
 101     ListExperimentalTechniqueInfo($PDBRecordLinesRef);
 102   }
 103 
 104   # Total number of records...
 105   my($TotalRecordsCount) = scalar @{$PDBRecordLinesRef};
 106   print "\nTotal number of records: $TotalRecordsCount\n";
 107 
 108   # List record type count information...
 109   ListRecordTypesInfo($PDBRecordLinesRef);
 110 
 111   if ($OptionsInfo{CountChains} || $OptionsInfo{CountResiduesInChains} || $OptionsInfo{ResiduesFrequencyInChains}) {
 112     ListChainsAndResiduesInfo($PDBRecordLinesRef);
 113   }
 114   if ($OptionsInfo{CountResiduesAll} || $OptionsInfo{ResiduesFrequencyAll}) {
 115     ListAllResiduesInfo($PDBRecordLinesRef);
 116   }
 117   if ($OptionsInfo{ResidueNumbersInfo}) {
 118     ListResidueNumbersInfo($PDBRecordLinesRef);
 119   }
 120   if ($OptionsInfo{CalculateBoundingBox}) {
 121     ListBoundingBox($PDBRecordLinesRef);
 122   }
 123 
 124   # File size and modification information...
 125   print "\nFile size: ", FormatFileSize($PDBFilesInfo{FileSize}[$Index]), " \n";
 126   print "Last modified: ", $PDBFilesInfo{FileLastModified}[$Index], " \n";
 127 }
 128 
 129 sub ListHeaderInfo {
 130   my($PDBRecordLinesRef) = @_;
 131   my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode);
 132 
 133   ($Classification, $DepositionDate, $IDCode) = (undef) x 3;
 134   $HeaderRecordLine = $PDBRecordLinesRef->[0];
 135   if (IsHeaderRecordType($HeaderRecordLine)) {
 136     ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine);
 137   }
 138 
 139   $Classification = IsEmpty($Classification) ? 'Not available' : $Classification;
 140   $DepositionDate = IsEmpty($DepositionDate) ? 'Not available' : $DepositionDate;
 141   $IDCode = IsEmpty($IDCode) ? 'Not available' : $IDCode;
 142 
 143   print "\nClassification: $Classification\nID: $IDCode\nDeposition date: $DepositionDate\n";
 144 }
 145 
 146 # List experimental technique information info...
 147 sub ListExperimentalTechniqueInfo {
 148   my($PDBRecordLinesRef) = @_;
 149   my($ExperimentalTechnique, $Resolution, $ResolutionUnits);
 150 
 151   $ExperimentalTechnique = GetExperimentalTechnique($PDBRecordLinesRef);
 152   print "\nExperimental technique: " . ($ExperimentalTechnique ? $ExperimentalTechnique : "Not available") . "\n";
 153 
 154   ($Resolution, $ResolutionUnits) = GetExperimentalTechniqueResolution($PDBRecordLinesRef);
 155   print "Resolution: " . ($Resolution ? "$Resolution $ResolutionUnits" : "Not available") . "\n";
 156 
 157 }
 158 
 159 # List record type info...
 160 sub ListRecordTypesInfo {
 161   my($PDBRecordLinesRef) = @_;
 162   my($RecordType, $RecordCount, $RecordTypesCountRef, @RecordTypeCountInfo);
 163 
 164   $RecordTypesCountRef = GetRecordTypesCount($PDBRecordLinesRef);
 165 
 166   @RecordTypeCountInfo = ();
 167   if ($OptionsInfo{CountRecordType} =~ /^All$/i) {
 168     for $RecordType (@{$RecordTypesCountRef->{RecordTypes}}) {
 169       $RecordCount = $RecordTypesCountRef->{Count}{$RecordType};
 170       push @RecordTypeCountInfo, "$RecordType - $RecordCount";
 171     }
 172   }
 173   else {
 174     for $RecordType (@{$OptionsInfo{SpecifiedRecordTypes}}) {
 175       $RecordCount = (exists $RecordTypesCountRef->{Count}{$RecordType}) ? ($RecordTypesCountRef->{Count}{$RecordType}) : 0;
 176       push @RecordTypeCountInfo, "$RecordType - $RecordCount";
 177     }
 178   }
 179   print "Number of individual records: ", JoinWords(\@RecordTypeCountInfo, '; ', 0), "\n";
 180 
 181   if ($OptionsInfo{CheckMasterRecord}) {
 182     CheckMasterRecord($RecordTypesCountRef, $PDBRecordLinesRef);
 183   }
 184 }
 185 
 186 # List information about residues and chains...
 187 sub ListChainsAndResiduesInfo {
 188   my($PDBRecordLinesRef) = @_;
 189   my($ResidueName, $ResidueCount, $ChainCount, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef);
 190 
 191   $CollectChainResiduesBeyondTER = 1;
 192   $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER);
 193   $ChainCount = @{$ChainsAndResiduesInfoRef->{ChainIDs}};
 194   if ($OptionsInfo{CountChains}) {
 195     print "\nNumber of chains: $ChainCount \n";
 196     my($ChainID, @ChainIDsList);
 197     @ChainIDsList = ();
 198     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 199       push @ChainIDsList, CleanupChainID($ChainID);
 200     }
 201     print "Chain IDs: ", JoinWords(\@ChainIDsList, ', ', 0),"\n";
 202   }
 203 
 204   if ($OptionsInfo{CountResiduesInChains}) {
 205     my($TotalResiduesCount, $ResidueCountInfo, @ResiduesCountInfo);
 206     @ResiduesCountInfo = ();
 207     $TotalResiduesCount = 0;
 208     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 209       $ResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
 210       $TotalResiduesCount += $ResidueCount;
 211       $ResidueCountInfo =  "Chain ${ChainID} - ${ResidueCount}";
 212       push @ResiduesCountInfo, $ResidueCountInfo;
 213     }
 214     print "\nNumber of residues in chain(s): ";
 215     if ($ChainCount > 1) {
 216       print "Total - $TotalResiduesCount; ", JoinWords(\@ResiduesCountInfo, '; ', 0),"\n";
 217     }
 218     else {
 219       print "$TotalResiduesCount\n";
 220     }
 221 
 222     # List of residues in each chain...
 223     if ($OptionsInfo{DetailLevel} >= 3) {
 224       print "List of residues in chain(s): \n";
 225       for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 226         if ($ChainCount > 1) {
 227           print "Chain ", CleanupChainID($ChainID), ": ";
 228         }
 229         print JoinWords(\@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}, ', ', 0),"\n";
 230       }
 231     }
 232   }
 233   if ($OptionsInfo{ResiduesFrequencyInChains}) {
 234     # Setup a hash using residue count as key for sorting the values...
 235     my(%ResiduesCountToNameMap);
 236     %ResiduesCountToNameMap = ();
 237     @{$ResiduesCountToNameMap{ChainIDs}} = ();
 238     %{$ResiduesCountToNameMap{ResidueNames}} = ();
 239 
 240     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 241       push @{$ResiduesCountToNameMap{ChainIDs}}, $ChainID;
 242       %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}} = ();
 243 
 244       for $ResidueName (sort keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) {
 245         $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName};
 246         # Setup count value for each chain...
 247         if (exists $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount}) {
 248           $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} .= "~${ResidueName}";
 249         }
 250         else {
 251           $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} = $ResidueName;
 252         }
 253       }
 254     }
 255     # Collect data for all the residues in all the chains...
 256     my(%AllResiduesNameToCountMap, %AllResiduesCountToNameMap);
 257     %AllResiduesNameToCountMap = ();
 258     %AllResiduesCountToNameMap = ();
 259     if ($ChainCount > 1) {
 260       for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 261         for $ResidueName (keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) {
 262           $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName};
 263           if (exists $AllResiduesNameToCountMap{$ResidueName}) {
 264             $AllResiduesNameToCountMap{$ResidueName} += $ResidueCount;
 265           }
 266           else {
 267             $AllResiduesNameToCountMap{$ResidueName} = $ResidueCount;
 268           }
 269         }
 270       }
 271       for $ResidueName (keys %AllResiduesNameToCountMap) {
 272         $ResidueCount = $AllResiduesNameToCountMap{$ResidueName};
 273         if (exists $AllResiduesCountToNameMap{$ResidueCount}) {
 274           $AllResiduesCountToNameMap{$ResidueCount} .= "~${ResidueName}";
 275         }
 276         else {
 277           $AllResiduesCountToNameMap{$ResidueCount} = $ResidueName;
 278         }
 279       }
 280     }
 281 
 282     # Setup distribution data for individual chains and the grand total as well...
 283     my($ChainResidueCount, $PercentResidueCount, $TotalResidueCount, $ResidueNames, @ResidueNamesList, %ResiduesFrequencyInfoMap);
 284     @{$ResiduesFrequencyInfoMap{ChainIDs}} = ();
 285     %{$ResiduesFrequencyInfoMap{Frequency}} = ();
 286     %{$ResiduesFrequencyInfoMap{PercentFrequency}} = ();
 287 
 288     @{$ResiduesFrequencyInfoMap{AllChainsFrequency}} = ();
 289     @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}} = ();
 290 
 291     $TotalResidueCount = 0;
 292 
 293     for $ChainID (@{$ResiduesCountToNameMap{ChainIDs}}) {
 294       push @{$ResiduesFrequencyInfoMap{ChainIDs}}, $ChainID;
 295       @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}} = ();
 296       @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}} = ();
 297 
 298       $ChainResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
 299       $TotalResidueCount += $ChainResidueCount;
 300 
 301       for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}}) {
 302         $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount};
 303         @ResidueNamesList = split /~/, $ResidueNames;
 304         for $ResidueName (@ResidueNamesList) {
 305           push @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, "${ResidueName} - ${ResidueCount}";
 306           $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$ChainResidueCount)*100)) + 0;
 307           push @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, "${ResidueName} - ${PercentResidueCount}%";
 308         }
 309       }
 310     }
 311     if ($ChainCount > 1) {
 312       for $ResidueCount (sort {$b <=> $a} keys %AllResiduesCountToNameMap) {
 313         $ResidueNames = $AllResiduesCountToNameMap{$ResidueCount};
 314         @ResidueNamesList = split /~/, $ResidueNames;
 315         for $ResidueName (@ResidueNamesList) {
 316           push @{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, "${ResidueName} - ${ResidueCount}";
 317           $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 318           push @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, "${ResidueName} - ${PercentResidueCount}%";
 319         }
 320       }
 321     }
 322 
 323     # List distribution of residues
 324     print "\nDistribution of residues in chain(s): \n";
 325     for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) {
 326       if ($ChainCount > 1) {
 327         print "Chain ", CleanupChainID($ChainID), ": ";
 328       }
 329       print JoinWords(\@{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, '; ', 0), "\n";
 330     }
 331     if ($OptionsInfo{DetailLevel} >= 2) {
 332       print "\nPercent distribution of residues in chain(s): \n";
 333       for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) {
 334         if ($ChainCount > 1) {
 335           print "Chain ", CleanupChainID($ChainID), ": ";
 336         }
 337         print JoinWords(\@{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, '; ', 0), "\n";
 338       }
 339     }
 340     if ($ChainCount > 1) {
 341       print "\nDistribution of residues across all chains: \n";
 342       print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, '; ', 0), "\n";
 343 
 344       if ($OptionsInfo{DetailLevel} >= 2) {
 345         print "\nPercent distribution of residues across all chains: \n";
 346         print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, '; ', 0), "\n";
 347       }
 348     }
 349   }
 350 }
 351 
 352 # List information about all the residues...
 353 sub ListAllResiduesInfo {
 354   my($PDBRecordLinesRef) = @_;
 355   my($TotalResidueCount, $AtomResiduesCount, $HetatmResiduesCount, $ResiduesInfoRef);
 356 
 357   $ResiduesInfoRef = GetAllResidues($PDBRecordLinesRef);
 358   $TotalResidueCount = @{$ResiduesInfoRef->{ResidueNames}};
 359   $AtomResiduesCount = @{$ResiduesInfoRef->{AtomResidueNames}};
 360   $HetatmResiduesCount = @{$ResiduesInfoRef->{HetatmResidueNames}};
 361 
 362   if ($OptionsInfo{CountResiduesAll}) {
 363     print "\nTotal number of residues: Total - $TotalResidueCount; ATOM residues - $AtomResiduesCount; HETATM residues - $HetatmResiduesCount\n";
 364 
 365     if ($OptionsInfo{DetailLevel} >= 3) {
 366       print "List of residues: \n";
 367       if ($AtomResiduesCount) {
 368         print "ATOM residues: ", JoinWords(\@{$ResiduesInfoRef->{AtomResidueNames}}, ', ', 0), "\n";
 369       }
 370       if ($HetatmResiduesCount) {
 371         print "HETATM residues: ", JoinWords(\@{$ResiduesInfoRef->{HetatmResidueNames}}, ', ', 0), "\n";
 372       }
 373     }
 374   }
 375 
 376   if ($OptionsInfo{ResiduesFrequencyAll}) {
 377     my($ResidueName, $ResidueCount);
 378 
 379     # Setup a hash using residue count as key for sorting the values...
 380     my(%ResiduesCountToNameMap, %AtomResiduesCountToNameMap, %HetatmResiduesCountToNameMap);
 381     %ResiduesCountToNameMap = ();
 382     %{$ResiduesCountToNameMap{ResidueNames}} = ();
 383 
 384     %AtomResiduesCountToNameMap = ();
 385     %{$AtomResiduesCountToNameMap{ResidueNames}} = ();
 386 
 387     %HetatmResiduesCountToNameMap = ();
 388     %{$HetatmResiduesCountToNameMap{ResidueNames}} = ();
 389 
 390     for $ResidueName (keys %{$ResiduesInfoRef->{ResidueCount}}) {
 391       $ResidueCount = $ResiduesInfoRef->{ResidueCount}{$ResidueName};
 392       if (exists $ResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 393         $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 394       }
 395       else {
 396         $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 397       }
 398     }
 399 
 400     if ($OptionsInfo{DetailLevel} >= 1) {
 401       for $ResidueName (keys %{$ResiduesInfoRef->{AtomResidueCount}}) {
 402         $ResidueCount = $ResiduesInfoRef->{AtomResidueCount}{$ResidueName};
 403         if (exists $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 404           $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 405         }
 406         else {
 407           $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 408         }
 409       }
 410       for $ResidueName (keys %{$ResiduesInfoRef->{HetatmResidueCount}}) {
 411         $ResidueCount = $ResiduesInfoRef->{HetatmResidueCount}{$ResidueName};
 412         if (exists $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 413           $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 414         }
 415         else {
 416           $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 417         }
 418       }
 419     }
 420 
 421     # Setup distribution of residues info...
 422     my($ResidueNames, $PercentResidueCount, @ResidueNamesList, %ResiduesCountInfoMap, %AtomResiduesCountInfoMap, %HetatmResiduesCountInfoMap);
 423 
 424     @{$ResiduesCountInfoMap{Frequency}} = ();
 425     @{$ResiduesCountInfoMap{PercentFrequency}} = ();
 426     for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}}) {
 427       $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 428       $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 429       @ResidueNamesList = split /~/, $ResidueNames;
 430       for $ResidueName (@ResidueNamesList) {
 431         push @{$ResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 432         push @{$ResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 433       }
 434     }
 435     if ($OptionsInfo{DetailLevel} >= 1) {
 436       @{$AtomResiduesCountInfoMap{Frequency}} = ();
 437       @{$AtomResiduesCountInfoMap{PercentFrequency}} = ();
 438       for $ResidueCount (sort {$b <=> $a} keys %{$AtomResiduesCountToNameMap{ResidueNames}}) {
 439         $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 440         $ResidueNames = $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 441         @ResidueNamesList = split /~/, $ResidueNames;
 442         for $ResidueName (@ResidueNamesList) {
 443           push @{$AtomResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 444           push @{$AtomResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 445         }
 446       }
 447       @{$HetatmResiduesCountInfoMap{Frequency}} = ();
 448       @{$HetatmResiduesCountInfoMap{PercentFrequency}} = ();
 449       for $ResidueCount (sort {$b <=> $a} keys %{$HetatmResiduesCountToNameMap{ResidueNames}}) {
 450         $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 451         $ResidueNames = $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 452         @ResidueNamesList = split /~/, $ResidueNames;
 453         for $ResidueName (@ResidueNamesList) {
 454           push @{$HetatmResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 455           push @{$HetatmResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 456         }
 457       }
 458     }
 459 
 460     # List distribution of residues
 461     print "\nDistribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 462     if ($OptionsInfo{DetailLevel} >= 2) {
 463       print "\nPercent distribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 464     }
 465 
 466     if ($OptionsInfo{DetailLevel} >= 1) {
 467       print "\nDistribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 468       if ($OptionsInfo{DetailLevel} >= 2) {
 469         print "\nPercent distribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 470       }
 471 
 472       print "\nDistribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 473       if ($OptionsInfo{DetailLevel} >= 2) {
 474         print "\nPercent distribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 475       }
 476     }
 477   }
 478 }
 479 
 480 # List information about residue numbers for each chain...
 481 sub ListResidueNumbersInfo {
 482   my($PDBRecordLinesRef) = @_;
 483   my($Index, $ResidueCount, $StartResidueNum, $EndResidueNum, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef, $ResidueNum, $PreviousResidueNum, $ResidueName, $PreviousResidueName, $GapResiduePairsCount, $GapLength, $DescendingOrderResiduePairsCount, @DescendingOrderResiduePairs, @GapResiduePairs);
 484 
 485   $CollectChainResiduesBeyondTER = 0;
 486   $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER);
 487 
 488   print "\nATOM/HETATM residue numbers information for chains:\n";
 489 
 490   for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 491     print "\nChain ID - ",  CleanupChainID($ChainID), "";
 492 
 493     $ResidueCount = @{$ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}};
 494 
 495     # Start and end residue numbers...
 496     $StartResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[0];
 497     $EndResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$ResidueCount - 1];
 498     print "; Number of residues: $ResidueCount; Start residue number - $StartResidueNum; End residue number - $EndResidueNum\n";
 499 
 500     # Identify any gaps in residue numbers or non-ascending order residue numbers...
 501     $GapResiduePairsCount = 0;
 502     $DescendingOrderResiduePairsCount = 0;
 503 
 504     @DescendingOrderResiduePairs = ();
 505     @GapResiduePairs = ();
 506 
 507     RESIDUE: for $Index (1 .. ($ResidueCount - 1)) {
 508       $ResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index];
 509       $PreviousResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index - 1];
 510 
 511       $ResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index];
 512       $PreviousResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index - 1];
 513 
 514       if ($ResidueNum == ($PreviousResidueNum + 1)) {
 515         # All is good...
 516         next RESIDUE;
 517       }
 518 
 519       # Are residue in descending order?
 520       if ($ResidueNum < $PreviousResidueNum) {
 521         $DescendingOrderResiduePairsCount++;
 522         push @DescendingOrderResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}>";
 523       }
 524 
 525       # Track gaps in residue pairs...
 526       $GapResiduePairsCount++;
 527       $GapLength = abs($ResidueNum - $PreviousResidueNum) - 1;
 528 
 529       push @GapResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}; $GapLength>";
 530     }
 531 
 532     # Print gaps information...
 533     print "Gaps in residue numbers: ", $GapResiduePairsCount ? "Yes" : "None";
 534     if ($GapResiduePairsCount) {
 535       print "; Number of gap residue number pairs: $GapResiduePairsCount; Gap residue pairs: <StartRes-EndRes; GapLength> - ", JoinWords(\@GapResiduePairs, "; ", 0);
 536     }
 537     print "\n";
 538 
 539     # Print descending residue order information...
 540     print "Residue numbers in descending order: ", $DescendingOrderResiduePairsCount ? "Yes" : "None";
 541     if ($DescendingOrderResiduePairsCount) {
 542       print "; Number of descending residue number pairs: $DescendingOrderResiduePairsCount; Descending residue number pairs: <StartRes-EndRes> ", JoinWords(\@DescendingOrderResiduePairs, "; ", 0);
 543     }
 544     print "\n";
 545   }
 546 }
 547 
 548 # List min/max XYZ coordinates for ATOM/HETATM records...
 549 sub ListBoundingBox {
 550   my($PDBRecordLinesRef) = @_;
 551   my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $XSize, $YSize, $ZSize);
 552 
 553   ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax) = GetMinMaxCoords($PDBRecordLinesRef);
 554   $XSize = abs($XMax - $XMin);
 555   $YSize = abs($YMax - $YMin);
 556   $ZSize = abs($ZMax - $ZMin);
 557 
 558   $XMin = sprintf("%.3f", $XMin) + 0; $XMax = sprintf("%.3f", $XMax) + 0;
 559   $YMin = sprintf("%.3f", $YMin) + 0; $YMax = sprintf("%.3f", $YMax) + 0;
 560   $ZMin = sprintf("%.3f", $ZMin) + 0; $ZMax = sprintf("%.3f", $ZMax) + 0;
 561 
 562   $XSize = sprintf("%.3f", $XSize) + 0;
 563   $YSize = sprintf("%.3f", $YSize) + 0;
 564   $ZSize = sprintf("%.3f", $ZSize) + 0;
 565 
 566   print "\nBounding box coordinates: <XMin, XMax> - <$XMin, $XMax>; <YMin, YMax> - <$YMin, $YMax>; <ZMin, ZMax> - <$ZMin, $ZMax>;\n";
 567   print "Bounding box size in angstroms: XSize - $XSize; YSize - $YSize; ZSize - $ZSize\n";
 568 
 569 }
 570 
 571 # Check master record counts against actual record counts...
 572 sub CheckMasterRecord {
 573   my($RecordTypesCountRef, $PDBRecordLinesRef) = @_;
 574 
 575   # Get master record information...
 576   my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = (undef) x 11;
 577   my($RecordLine, $MasterRecordFound);
 578   $MasterRecordFound = 0;
 579 
 580   LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 581       if (IsMasterRecordType($RecordLine)) {
 582         ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = ParseMasterRecordLine($RecordLine);
 583         $MasterRecordFound = 1;
 584         last LINE;
 585       }
 586   }
 587   if (!$MasterRecordFound) {
 588     print "\nWarning: MASTER record is missing.\n";
 589     return;
 590   }
 591   my(@MasterRecordValidationInfo);
 592   @MasterRecordValidationInfo = ();
 593   $NumOfRemarkRecords += 0;
 594   if (exists($RecordTypesCountRef->{Count}{REMARK}) && $NumOfRemarkRecords != $RecordTypesCountRef->{Count}{REMARK}) {
 595     push @MasterRecordValidationInfo, "Number of REMARK records, $NumOfRemarkRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}.";
 596   }
 597   $NumOfHetRecords += 0;
 598   if (exists($RecordTypesCountRef->{Count}{HET}) && $NumOfHetRecords != $RecordTypesCountRef->{Count}{HET}) {
 599     push @MasterRecordValidationInfo, "Number of HET records, $NumOfHetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HET}.";
 600   }
 601   $NumOfHelixRecords += 0;
 602   if (exists($RecordTypesCountRef->{Count}{HELIX}) && $NumOfHelixRecords != $RecordTypesCountRef->{Count}{HELIX}) {
 603     push @MasterRecordValidationInfo, "Number of HELIX records, $NumOfHelixRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HELIX}.";
 604   }
 605   $NumOfSheetRecords += 0;
 606   if (exists($RecordTypesCountRef->{Count}{SHEET}) && $NumOfSheetRecords != $RecordTypesCountRef->{Count}{SHEET}) {
 607     push @MasterRecordValidationInfo, "Number of SHEET records, $NumOfSheetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SHEET}.";
 608   }
 609   $NumOfTurnRecords += 0;
 610   if (exists($RecordTypesCountRef->{Count}{TURN}) && $NumOfTurnRecords != $RecordTypesCountRef->{Count}{TURN}) {
 611     push @MasterRecordValidationInfo, "Number of TURN records, $NumOfTurnRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}.";
 612   }
 613   $NumOfSiteRecords += 0;
 614   if (exists($RecordTypesCountRef->{Count}{SITE}) && $NumOfSiteRecords != $RecordTypesCountRef->{Count}{SITE}) {
 615     push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSiteRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SITE}.";
 616   }
 617 
 618   $NumOfTransformationsRecords += 0;
 619   my($RecordsCount, $ID, $RecordID, $RecordLabel);
 620   $RecordsCount = 0;
 621   for $RecordLabel ('ORIGX', 'SCALE', 'MTRIX') {
 622     for $ID (1 .. 3) {
 623       $RecordID = "${RecordLabel}${ID}";
 624       if (exists $RecordTypesCountRef->{Count}{$RecordID}) {
 625         $RecordsCount += $RecordTypesCountRef->{Count}{$RecordID};
 626       }
 627     }
 628   }
 629   if ($NumOfTransformationsRecords != $RecordsCount) {
 630     push @MasterRecordValidationInfo, "Number of transformation records (ORIGXn+SCALEn+MTRIXn), $NumOfTransformationsRecords, specified in MASTER record doen't match its explict count, $RecordsCount.";
 631   }
 632 
 633   $RecordsCount = 0;
 634   for $RecordLabel ('ATOM', 'HETATM') {
 635       if (exists $RecordTypesCountRef->{Count}{$RecordLabel}) {
 636         $RecordsCount += $RecordTypesCountRef->{Count}{$RecordLabel};
 637       }
 638   }
 639   $NumOfAtomAndHetatmRecords += 0;
 640   if ($NumOfAtomAndHetatmRecords != $RecordsCount) {
 641     push @MasterRecordValidationInfo, "Number of ATOM + HETATM records, $NumOfAtomAndHetatmRecords, specified in MASTER record doen't match its explict count, $RecordsCount.";
 642   }
 643   $NumOfTerRecords += 0;
 644   if (exists($RecordTypesCountRef->{Count}{TER}) && $NumOfTerRecords != $RecordTypesCountRef->{Count}{TER}) {
 645     push @MasterRecordValidationInfo, "Number of TER records, $NumOfTerRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{TER}.";
 646   }
 647   $NumOfConectRecords += 0;
 648   if (exists($RecordTypesCountRef->{Count}{CONECT}) && $NumOfConectRecords != $RecordTypesCountRef->{Count}{CONECT}) {
 649     push @MasterRecordValidationInfo, "Number of CONECT records, $NumOfConectRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{CONECT}.";
 650   }
 651   $NumOfSeqresRecords += 0;
 652   if (exists($RecordTypesCountRef->{Count}{SEQRES}) && $NumOfSeqresRecords != $RecordTypesCountRef->{Count}{SEQRES}) {
 653     push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSeqresRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SEQRES}.";
 654   }
 655 
 656   if (@MasterRecordValidationInfo) {
 657     print "\nMASTER record validation: Count mismatches found:\n";
 658     print JoinWords(\@MasterRecordValidationInfo, "\n", 0), "\n";
 659   }
 660   else {
 661     print "\nMASTER record validation: Count values match with the explicit count of the corresponding records.\n";
 662   }
 663 }
 664 
 665 # Total size of all the files...
 666 sub ListTotalSizeOfFiles {
 667   my($FileOkayCount, $TotalSize, $Index);
 668 
 669   $FileOkayCount = 0;
 670   $TotalSize = 0;
 671 
 672   for $Index (0 .. $#PDBFilesList) {
 673     if ($PDBFilesInfo{FileOkay}[$Index]) {
 674       $FileOkayCount++;
 675       $TotalSize += $PDBFilesInfo{FileSize}[$Index];
 676     }
 677   }
 678   if ($FileOkayCount > 1) {
 679     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 680   }
 681 
 682 }
 683 
 684 # Empty chain IDs are replaced with "None[1-9]". But for displaying purposes, take out any
 685 # numbers from label...
 686 sub CleanupChainID {
 687   my($ChainID) = @_;
 688 
 689   if ($ChainID =~ /^None/i) {
 690     return 'None';
 691   }
 692   return $ChainID;
 693 }
 694 
 695 # Process option values...
 696 sub ProcessOptions {
 697   %OptionsInfo = ();
 698 
 699   # Setup record types to count...
 700   if ($Options{count}) {
 701     $OptionsInfo{CountRecordType} = $Options{count};
 702   }
 703   else {
 704     $OptionsInfo{CountRecordType} = $Options{all} ? 'All' : 'ATOM,HETATM';
 705   }
 706   @{$OptionsInfo{SpecifiedRecordTypes}} =();
 707   if ($OptionsInfo{CountRecordType} !~ /^All$/i) {
 708     my(@RecordTypes);
 709     @RecordTypes = split /\,/, $OptionsInfo{CountRecordType};
 710     push @{$OptionsInfo{SpecifiedRecordTypes}}, @RecordTypes;
 711   }
 712   $OptionsInfo{CountChains} = ($Options{chains} || $Options{all}) ? 1 : 0;
 713   $OptionsInfo{CheckMasterRecord} = ($Options{mastercheck} || $Options{all}) ? 1 : 0;
 714 
 715   # Residue count is the default. So $Options{residues} is simply ignored.
 716   my($CountResidues) = 1;
 717   $OptionsInfo{CountResiduesInChains} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0;
 718   $OptionsInfo{CountResiduesAll} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0;
 719 
 720   $OptionsInfo{ResiduesFrequencyInChains} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0;
 721   $OptionsInfo{ResiduesFrequencyAll} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0;
 722 
 723   $OptionsInfo{ResidueNumbersInfo} = ($Options{residuenumbers} || $Options{all})  ? 1 : 0;
 724 
 725   $OptionsInfo{CalculateBoundingBox} = ($Options{boundingbox} || $Options{all}) ? 1 : 0;
 726 
 727   $OptionsInfo{ListHeaderInfo} = ($Options{header} || $Options{all}) ? 1 : 0;
 728   $OptionsInfo{DetailLevel} = $Options{detail};
 729 
 730   $OptionsInfo{ListExperimentalTechniqueInfo} = ($Options{experiment} || $Options{all}) ? 1 : 0;
 731 
 732 }
 733 
 734 # Retrieve information about PDB files...
 735 sub RetrievePDBFilesInfo {
 736   my($Index, $PDBFile, $ModifiedTimeString, $ModifiedDateString);
 737 
 738   %PDBFilesInfo = ();
 739   @{$PDBFilesInfo{FileOkay}} = ();
 740   @{$PDBFilesInfo{FileSize}} = ();
 741   @{$PDBFilesInfo{FileLastModified}} = ();
 742 
 743   FILELIST: for $Index (0 .. $#PDBFilesList) {
 744     $PDBFilesInfo{FileOkay}[$Index] = 0;
 745     $PDBFilesInfo{FileSize}[$Index] = 0;
 746     $PDBFilesInfo{FileLastModified}[$Index] = '';
 747 
 748     $PDBFile = $PDBFilesList[$Index];
 749     if (!(-e $PDBFile)) {
 750       warn "Warning: Ignoring file $PDBFile: It doesn't exist\n";
 751       next FILELIST;
 752     }
 753     if (!CheckFileType($PDBFile, "pdb")) {
 754       warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n";
 755       next FILELIST;
 756     }
 757     if (! open PDBFILE, "$PDBFile") {
 758       warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n";
 759       next FILELIST;
 760     }
 761     close PDBFILE;
 762 
 763     $PDBFilesInfo{FileOkay}[$Index] = 1;
 764     $PDBFilesInfo{FileSize}[$Index] = FileSize($PDBFile);
 765     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($PDBFile);
 766     $PDBFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 767   }
 768 }
 769 
 770 
 771 # Setup script usage  and retrieve command line arguments specified using various options...
 772 sub SetupScriptUsage {
 773 
 774   # Retrieve all the options...
 775   %Options = ();
 776   $Options{count} = '';
 777   $Options{detail} = 1;
 778   $Options{residuesmode} = 'Both';
 779 
 780   if (!GetOptions(\%Options, "all|a", "boundingbox|b", "count|c=s", "chains", "detail|d=i", "experiment|e", "frequency|f", "mastercheck|m", "header", "help|h", "residues", "residuesmode=s", "residuenumbers", "workingdir|w=s")) {
 781     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 782   }
 783   if ($Options{workingdir}) {
 784     if (! -d $Options{workingdir}) {
 785       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 786     }
 787     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 788   }
 789   if (!IsPositiveInteger($Options{detail})) {
 790     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 791   }
 792   if ($Options{residuesmode} !~ /^(InChains|All|Both)$/i) {
 793     die "Error: The value specified, $Options{residuesmode}, for option \"--ResiduesMode\" is not valid. Allowed values: InChains, All, or Both\n";
 794   }
 795 }
 796