1 #!/usr/bin/perl -w 2 # 3 # File: InfoPDBFiles.pl 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use FindBin; use lib "$FindBin::Bin/../lib"; 28 use Getopt::Long; 29 use File::Basename; 30 use Text::ParseWords; 31 use Benchmark; 32 use FileUtil; 33 use TextUtil; 34 use PDBFileUtil; 35 36 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 37 38 # Autoflush STDOUT 39 $| = 1; 40 41 # Starting message... 42 $ScriptName = basename($0); 43 print "\n$ScriptName: Starting...\n\n"; 44 $StartTime = new Benchmark; 45 46 # Get the options and setup script... 47 SetupScriptUsage(); 48 if ($Options{help} || @ARGV < 1) { 49 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 50 } 51 52 my(@PDBFilesList); 53 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb"); 54 55 # Process options... 56 print "Processing options...\n"; 57 my(%OptionsInfo); 58 ProcessOptions(); 59 60 # Setup information about input files... 61 my(%PDBFilesInfo); 62 print "Checking input PDB file(s)...\n"; 63 RetrievePDBFilesInfo(); 64 65 # Process input files.. 66 my($FileIndex); 67 if (@PDBFilesList > 1) { 68 print "\nProcessing PDB files...\n"; 69 } 70 for $FileIndex (0 .. $#PDBFilesList) { 71 if ($PDBFilesInfo{FileOkay}[$FileIndex]) { 72 print "\nProcessing file $PDBFilesList[$FileIndex]...\n"; 73 ListPDBFileInfo($FileIndex); 74 } 75 } 76 ListTotalSizeOfFiles(); 77 78 print "\n$ScriptName:Done...\n\n"; 79 80 $EndTime = new Benchmark; 81 $TotalTime = timediff ($EndTime, $StartTime); 82 print "Total time: ", timestr($TotalTime), "\n"; 83 84 ############################################################################### 85 86 # List appropriate information... 87 sub ListPDBFileInfo { 88 my($Index) = @_; 89 my($PDBFile, $PDBRecordLinesRef); 90 91 $PDBFile = $PDBFilesList[$Index]; 92 $PDBRecordLinesRef = ReadPDBFile($PDBFile); 93 94 # Header informaton... 95 if ($OptionsInfo{ListHeaderInfo}) { 96 ListHeaderInfo($PDBRecordLinesRef); 97 } 98 99 # Experiment informaton... 100 if ($OptionsInfo{ListExperimentalTechniqueInfo}) { 101 ListExperimentalTechniqueInfo($PDBRecordLinesRef); 102 } 103 104 # Total number of records... 105 my($TotalRecordsCount) = scalar @{$PDBRecordLinesRef}; 106 print "\nTotal number of records: $TotalRecordsCount\n"; 107 108 # List record type count information... 109 ListRecordTypesInfo($PDBRecordLinesRef); 110 111 if ($OptionsInfo{CountChains} || $OptionsInfo{CountResiduesInChains} || $OptionsInfo{ResiduesFrequencyInChains}) { 112 ListChainsAndResiduesInfo($PDBRecordLinesRef); 113 } 114 if ($OptionsInfo{CountResiduesAll} || $OptionsInfo{ResiduesFrequencyAll}) { 115 ListAllResiduesInfo($PDBRecordLinesRef); 116 } 117 if ($OptionsInfo{ResidueNumbersInfo}) { 118 ListResidueNumbersInfo($PDBRecordLinesRef); 119 } 120 if ($OptionsInfo{CalculateBoundingBox}) { 121 ListBoundingBox($PDBRecordLinesRef); 122 } 123 124 # File size and modification information... 125 print "\nFile size: ", FormatFileSize($PDBFilesInfo{FileSize}[$Index]), " \n"; 126 print "Last modified: ", $PDBFilesInfo{FileLastModified}[$Index], " \n"; 127 } 128 129 sub ListHeaderInfo { 130 my($PDBRecordLinesRef) = @_; 131 my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode); 132 133 ($Classification, $DepositionDate, $IDCode) = (undef) x 3; 134 $HeaderRecordLine = $PDBRecordLinesRef->[0]; 135 if (IsHeaderRecordType($HeaderRecordLine)) { 136 ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine); 137 } 138 139 $Classification = IsEmpty($Classification) ? 'Not available' : $Classification; 140 $DepositionDate = IsEmpty($DepositionDate) ? 'Not available' : $DepositionDate; 141 $IDCode = IsEmpty($IDCode) ? 'Not available' : $IDCode; 142 143 print "\nClassification: $Classification\nID: $IDCode\nDeposition date: $DepositionDate\n"; 144 } 145 146 # List experimental technique information info... 147 sub ListExperimentalTechniqueInfo { 148 my($PDBRecordLinesRef) = @_; 149 my($ExperimentalTechnique, $Resolution, $ResolutionUnits); 150 151 $ExperimentalTechnique = GetExperimentalTechnique($PDBRecordLinesRef); 152 print "\nExperimental technique: " . ($ExperimentalTechnique ? $ExperimentalTechnique : "Not available") . "\n"; 153 154 ($Resolution, $ResolutionUnits) = GetExperimentalTechniqueResolution($PDBRecordLinesRef); 155 print "Resolution: " . ($Resolution ? "$Resolution $ResolutionUnits" : "Not available") . "\n"; 156 157 } 158 159 # List record type info... 160 sub ListRecordTypesInfo { 161 my($PDBRecordLinesRef) = @_; 162 my($RecordType, $RecordCount, $RecordTypesCountRef, @RecordTypeCountInfo); 163 164 $RecordTypesCountRef = GetRecordTypesCount($PDBRecordLinesRef); 165 166 @RecordTypeCountInfo = (); 167 if ($OptionsInfo{CountRecordType} =~ /^All$/i) { 168 for $RecordType (@{$RecordTypesCountRef->{RecordTypes}}) { 169 $RecordCount = $RecordTypesCountRef->{Count}{$RecordType}; 170 push @RecordTypeCountInfo, "$RecordType - $RecordCount"; 171 } 172 } 173 else { 174 for $RecordType (@{$OptionsInfo{SpecifiedRecordTypes}}) { 175 $RecordCount = (exists $RecordTypesCountRef->{Count}{$RecordType}) ? ($RecordTypesCountRef->{Count}{$RecordType}) : 0; 176 push @RecordTypeCountInfo, "$RecordType - $RecordCount"; 177 } 178 } 179 print "Number of individual records: ", JoinWords(\@RecordTypeCountInfo, '; ', 0), "\n"; 180 181 if ($OptionsInfo{CheckMasterRecord}) { 182 CheckMasterRecord($RecordTypesCountRef, $PDBRecordLinesRef); 183 } 184 } 185 186 # List information about residues and chains... 187 sub ListChainsAndResiduesInfo { 188 my($PDBRecordLinesRef) = @_; 189 my($ResidueName, $ResidueCount, $ChainCount, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef); 190 191 $CollectChainResiduesBeyondTER = 1; 192 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER); 193 $ChainCount = @{$ChainsAndResiduesInfoRef->{ChainIDs}}; 194 if ($OptionsInfo{CountChains}) { 195 print "\nNumber of chains: $ChainCount \n"; 196 my($ChainID, @ChainIDsList); 197 @ChainIDsList = (); 198 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 199 push @ChainIDsList, CleanupChainID($ChainID); 200 } 201 print "Chain IDs: ", JoinWords(\@ChainIDsList, ', ', 0),"\n"; 202 } 203 204 if ($OptionsInfo{CountResiduesInChains}) { 205 my($TotalResiduesCount, $ResidueCountInfo, @ResiduesCountInfo); 206 @ResiduesCountInfo = (); 207 $TotalResiduesCount = 0; 208 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 209 $ResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 210 $TotalResiduesCount += $ResidueCount; 211 $ResidueCountInfo = "Chain ${ChainID} - ${ResidueCount}"; 212 push @ResiduesCountInfo, $ResidueCountInfo; 213 } 214 print "\nNumber of residues in chain(s): "; 215 if ($ChainCount > 1) { 216 print "Total - $TotalResiduesCount; ", JoinWords(\@ResiduesCountInfo, '; ', 0),"\n"; 217 } 218 else { 219 print "$TotalResiduesCount\n"; 220 } 221 222 # List of residues in each chain... 223 if ($OptionsInfo{DetailLevel} >= 3) { 224 print "List of residues in chain(s): \n"; 225 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 226 if ($ChainCount > 1) { 227 print "Chain ", CleanupChainID($ChainID), ": "; 228 } 229 print JoinWords(\@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}, ', ', 0),"\n"; 230 } 231 } 232 } 233 if ($OptionsInfo{ResiduesFrequencyInChains}) { 234 # Setup a hash using residue count as key for sorting the values... 235 my(%ResiduesCountToNameMap); 236 %ResiduesCountToNameMap = (); 237 @{$ResiduesCountToNameMap{ChainIDs}} = (); 238 %{$ResiduesCountToNameMap{ResidueNames}} = (); 239 240 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 241 push @{$ResiduesCountToNameMap{ChainIDs}}, $ChainID; 242 %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}} = (); 243 244 for $ResidueName (sort keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) { 245 $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName}; 246 # Setup count value for each chain... 247 if (exists $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount}) { 248 $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} .= "~${ResidueName}"; 249 } 250 else { 251 $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} = $ResidueName; 252 } 253 } 254 } 255 # Collect data for all the residues in all the chains... 256 my(%AllResiduesNameToCountMap, %AllResiduesCountToNameMap); 257 %AllResiduesNameToCountMap = (); 258 %AllResiduesCountToNameMap = (); 259 if ($ChainCount > 1) { 260 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 261 for $ResidueName (keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) { 262 $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName}; 263 if (exists $AllResiduesNameToCountMap{$ResidueName}) { 264 $AllResiduesNameToCountMap{$ResidueName} += $ResidueCount; 265 } 266 else { 267 $AllResiduesNameToCountMap{$ResidueName} = $ResidueCount; 268 } 269 } 270 } 271 for $ResidueName (keys %AllResiduesNameToCountMap) { 272 $ResidueCount = $AllResiduesNameToCountMap{$ResidueName}; 273 if (exists $AllResiduesCountToNameMap{$ResidueCount}) { 274 $AllResiduesCountToNameMap{$ResidueCount} .= "~${ResidueName}"; 275 } 276 else { 277 $AllResiduesCountToNameMap{$ResidueCount} = $ResidueName; 278 } 279 } 280 } 281 282 # Setup distribution data for individual chains and the grand total as well... 283 my($ChainResidueCount, $PercentResidueCount, $TotalResidueCount, $ResidueNames, @ResidueNamesList, %ResiduesFrequencyInfoMap); 284 @{$ResiduesFrequencyInfoMap{ChainIDs}} = (); 285 %{$ResiduesFrequencyInfoMap{Frequency}} = (); 286 %{$ResiduesFrequencyInfoMap{PercentFrequency}} = (); 287 288 @{$ResiduesFrequencyInfoMap{AllChainsFrequency}} = (); 289 @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}} = (); 290 291 $TotalResidueCount = 0; 292 293 for $ChainID (@{$ResiduesCountToNameMap{ChainIDs}}) { 294 push @{$ResiduesFrequencyInfoMap{ChainIDs}}, $ChainID; 295 @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}} = (); 296 @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}} = (); 297 298 $ChainResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 299 $TotalResidueCount += $ChainResidueCount; 300 301 for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}}) { 302 $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount}; 303 @ResidueNamesList = split /~/, $ResidueNames; 304 for $ResidueName (@ResidueNamesList) { 305 push @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, "${ResidueName} - ${ResidueCount}"; 306 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$ChainResidueCount)*100)) + 0; 307 push @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, "${ResidueName} - ${PercentResidueCount}%"; 308 } 309 } 310 } 311 if ($ChainCount > 1) { 312 for $ResidueCount (sort {$b <=> $a} keys %AllResiduesCountToNameMap) { 313 $ResidueNames = $AllResiduesCountToNameMap{$ResidueCount}; 314 @ResidueNamesList = split /~/, $ResidueNames; 315 for $ResidueName (@ResidueNamesList) { 316 push @{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, "${ResidueName} - ${ResidueCount}"; 317 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 318 push @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, "${ResidueName} - ${PercentResidueCount}%"; 319 } 320 } 321 } 322 323 # List distribution of residues 324 print "\nDistribution of residues in chain(s): \n"; 325 for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) { 326 if ($ChainCount > 1) { 327 print "Chain ", CleanupChainID($ChainID), ": "; 328 } 329 print JoinWords(\@{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, '; ', 0), "\n"; 330 } 331 if ($OptionsInfo{DetailLevel} >= 2) { 332 print "\nPercent distribution of residues in chain(s): \n"; 333 for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) { 334 if ($ChainCount > 1) { 335 print "Chain ", CleanupChainID($ChainID), ": "; 336 } 337 print JoinWords(\@{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, '; ', 0), "\n"; 338 } 339 } 340 if ($ChainCount > 1) { 341 print "\nDistribution of residues across all chains: \n"; 342 print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, '; ', 0), "\n"; 343 344 if ($OptionsInfo{DetailLevel} >= 2) { 345 print "\nPercent distribution of residues across all chains: \n"; 346 print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, '; ', 0), "\n"; 347 } 348 } 349 } 350 } 351 352 # List information about all the residues... 353 sub ListAllResiduesInfo { 354 my($PDBRecordLinesRef) = @_; 355 my($TotalResidueCount, $AtomResiduesCount, $HetatmResiduesCount, $ResiduesInfoRef); 356 357 $ResiduesInfoRef = GetAllResidues($PDBRecordLinesRef); 358 $TotalResidueCount = @{$ResiduesInfoRef->{ResidueNames}}; 359 $AtomResiduesCount = @{$ResiduesInfoRef->{AtomResidueNames}}; 360 $HetatmResiduesCount = @{$ResiduesInfoRef->{HetatmResidueNames}}; 361 362 if ($OptionsInfo{CountResiduesAll}) { 363 print "\nTotal number of residues: Total - $TotalResidueCount; ATOM residues - $AtomResiduesCount; HETATM residues - $HetatmResiduesCount\n"; 364 365 if ($OptionsInfo{DetailLevel} >= 3) { 366 print "List of residues: \n"; 367 if ($AtomResiduesCount) { 368 print "ATOM residues: ", JoinWords(\@{$ResiduesInfoRef->{AtomResidueNames}}, ', ', 0), "\n"; 369 } 370 if ($HetatmResiduesCount) { 371 print "HETATM residues: ", JoinWords(\@{$ResiduesInfoRef->{HetatmResidueNames}}, ', ', 0), "\n"; 372 } 373 } 374 } 375 376 if ($OptionsInfo{ResiduesFrequencyAll}) { 377 my($ResidueName, $ResidueCount); 378 379 # Setup a hash using residue count as key for sorting the values... 380 my(%ResiduesCountToNameMap, %AtomResiduesCountToNameMap, %HetatmResiduesCountToNameMap); 381 %ResiduesCountToNameMap = (); 382 %{$ResiduesCountToNameMap{ResidueNames}} = (); 383 384 %AtomResiduesCountToNameMap = (); 385 %{$AtomResiduesCountToNameMap{ResidueNames}} = (); 386 387 %HetatmResiduesCountToNameMap = (); 388 %{$HetatmResiduesCountToNameMap{ResidueNames}} = (); 389 390 for $ResidueName (keys %{$ResiduesInfoRef->{ResidueCount}}) { 391 $ResidueCount = $ResiduesInfoRef->{ResidueCount}{$ResidueName}; 392 if (exists $ResiduesCountToNameMap{ResidueNames}{$ResidueCount}) { 393 $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}"; 394 } 395 else { 396 $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName; 397 } 398 } 399 400 if ($OptionsInfo{DetailLevel} >= 1) { 401 for $ResidueName (keys %{$ResiduesInfoRef->{AtomResidueCount}}) { 402 $ResidueCount = $ResiduesInfoRef->{AtomResidueCount}{$ResidueName}; 403 if (exists $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount}) { 404 $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}"; 405 } 406 else { 407 $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName; 408 } 409 } 410 for $ResidueName (keys %{$ResiduesInfoRef->{HetatmResidueCount}}) { 411 $ResidueCount = $ResiduesInfoRef->{HetatmResidueCount}{$ResidueName}; 412 if (exists $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount}) { 413 $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}"; 414 } 415 else { 416 $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName; 417 } 418 } 419 } 420 421 # Setup distribution of residues info... 422 my($ResidueNames, $PercentResidueCount, @ResidueNamesList, %ResiduesCountInfoMap, %AtomResiduesCountInfoMap, %HetatmResiduesCountInfoMap); 423 424 @{$ResiduesCountInfoMap{Frequency}} = (); 425 @{$ResiduesCountInfoMap{PercentFrequency}} = (); 426 for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}}) { 427 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 428 $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ResidueCount}; 429 @ResidueNamesList = split /~/, $ResidueNames; 430 for $ResidueName (@ResidueNamesList) { 431 push @{$ResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}"; 432 push @{$ResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}"; 433 } 434 } 435 if ($OptionsInfo{DetailLevel} >= 1) { 436 @{$AtomResiduesCountInfoMap{Frequency}} = (); 437 @{$AtomResiduesCountInfoMap{PercentFrequency}} = (); 438 for $ResidueCount (sort {$b <=> $a} keys %{$AtomResiduesCountToNameMap{ResidueNames}}) { 439 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 440 $ResidueNames = $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount}; 441 @ResidueNamesList = split /~/, $ResidueNames; 442 for $ResidueName (@ResidueNamesList) { 443 push @{$AtomResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}"; 444 push @{$AtomResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}"; 445 } 446 } 447 @{$HetatmResiduesCountInfoMap{Frequency}} = (); 448 @{$HetatmResiduesCountInfoMap{PercentFrequency}} = (); 449 for $ResidueCount (sort {$b <=> $a} keys %{$HetatmResiduesCountToNameMap{ResidueNames}}) { 450 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 451 $ResidueNames = $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount}; 452 @ResidueNamesList = split /~/, $ResidueNames; 453 for $ResidueName (@ResidueNamesList) { 454 push @{$HetatmResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}"; 455 push @{$HetatmResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}"; 456 } 457 } 458 } 459 460 # List distribution of residues 461 print "\nDistribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{Frequency}},'; ', 0), "\n"; 462 if ($OptionsInfo{DetailLevel} >= 2) { 463 print "\nPercent distribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n"; 464 } 465 466 if ($OptionsInfo{DetailLevel} >= 1) { 467 print "\nDistribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{Frequency}},'; ', 0), "\n"; 468 if ($OptionsInfo{DetailLevel} >= 2) { 469 print "\nPercent distribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n"; 470 } 471 472 print "\nDistribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{Frequency}},'; ', 0), "\n"; 473 if ($OptionsInfo{DetailLevel} >= 2) { 474 print "\nPercent distribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n"; 475 } 476 } 477 } 478 } 479 480 # List information about residue numbers for each chain... 481 sub ListResidueNumbersInfo { 482 my($PDBRecordLinesRef) = @_; 483 my($Index, $ResidueCount, $StartResidueNum, $EndResidueNum, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef, $ResidueNum, $PreviousResidueNum, $ResidueName, $PreviousResidueName, $GapResiduePairsCount, $GapLength, $DescendingOrderResiduePairsCount, @DescendingOrderResiduePairs, @GapResiduePairs); 484 485 $CollectChainResiduesBeyondTER = 0; 486 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER); 487 488 print "\nATOM/HETATM residue numbers information for chains:\n"; 489 490 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 491 print "\nChain ID - ", CleanupChainID($ChainID), ""; 492 493 $ResidueCount = @{$ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}}; 494 495 # Start and end residue numbers... 496 $StartResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[0]; 497 $EndResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$ResidueCount - 1]; 498 print "; Number of residues: $ResidueCount; Start residue number - $StartResidueNum; End residue number - $EndResidueNum\n"; 499 500 # Identify any gaps in residue numbers or non-ascending order residue numbers... 501 $GapResiduePairsCount = 0; 502 $DescendingOrderResiduePairsCount = 0; 503 504 @DescendingOrderResiduePairs = (); 505 @GapResiduePairs = (); 506 507 RESIDUE: for $Index (1 .. ($ResidueCount - 1)) { 508 $ResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index]; 509 $PreviousResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index - 1]; 510 511 $ResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index]; 512 $PreviousResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index - 1]; 513 514 if ($ResidueNum == ($PreviousResidueNum + 1)) { 515 # All is good... 516 next RESIDUE; 517 } 518 519 # Are residue in descending order? 520 if ($ResidueNum < $PreviousResidueNum) { 521 $DescendingOrderResiduePairsCount++; 522 push @DescendingOrderResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}>"; 523 } 524 525 # Track gaps in residue pairs... 526 $GapResiduePairsCount++; 527 $GapLength = abs($ResidueNum - $PreviousResidueNum) - 1; 528 529 push @GapResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}; $GapLength>"; 530 } 531 532 # Print gaps information... 533 print "Gaps in residue numbers: ", $GapResiduePairsCount ? "Yes" : "None"; 534 if ($GapResiduePairsCount) { 535 print "; Number of gap residue number pairs: $GapResiduePairsCount; Gap residue pairs: <StartRes-EndRes; GapLength> - ", JoinWords(\@GapResiduePairs, "; ", 0); 536 } 537 print "\n"; 538 539 # Print descending residue order information... 540 print "Residue numbers in descending order: ", $DescendingOrderResiduePairsCount ? "Yes" : "None"; 541 if ($DescendingOrderResiduePairsCount) { 542 print "; Number of descending residue number pairs: $DescendingOrderResiduePairsCount; Descending residue number pairs: <StartRes-EndRes> ", JoinWords(\@DescendingOrderResiduePairs, "; ", 0); 543 } 544 print "\n"; 545 } 546 } 547 548 # List min/max XYZ coordinates for ATOM/HETATM records... 549 sub ListBoundingBox { 550 my($PDBRecordLinesRef) = @_; 551 my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $XSize, $YSize, $ZSize); 552 553 ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax) = GetMinMaxCoords($PDBRecordLinesRef); 554 $XSize = abs($XMax - $XMin); 555 $YSize = abs($YMax - $YMin); 556 $ZSize = abs($ZMax - $ZMin); 557 558 $XMin = sprintf("%.3f", $XMin) + 0; $XMax = sprintf("%.3f", $XMax) + 0; 559 $YMin = sprintf("%.3f", $YMin) + 0; $YMax = sprintf("%.3f", $YMax) + 0; 560 $ZMin = sprintf("%.3f", $ZMin) + 0; $ZMax = sprintf("%.3f", $ZMax) + 0; 561 562 $XSize = sprintf("%.3f", $XSize) + 0; 563 $YSize = sprintf("%.3f", $YSize) + 0; 564 $ZSize = sprintf("%.3f", $ZSize) + 0; 565 566 print "\nBounding box coordinates: <XMin, XMax> - <$XMin, $XMax>; <YMin, YMax> - <$YMin, $YMax>; <ZMin, ZMax> - <$ZMin, $ZMax>;\n"; 567 print "Bounding box size in angstroms: XSize - $XSize; YSize - $YSize; ZSize - $ZSize\n"; 568 569 } 570 571 # Check master record counts against actual record counts... 572 sub CheckMasterRecord { 573 my($RecordTypesCountRef, $PDBRecordLinesRef) = @_; 574 575 # Get master record information... 576 my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = (undef) x 11; 577 my($RecordLine, $MasterRecordFound); 578 $MasterRecordFound = 0; 579 580 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 581 if (IsMasterRecordType($RecordLine)) { 582 ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = ParseMasterRecordLine($RecordLine); 583 $MasterRecordFound = 1; 584 last LINE; 585 } 586 } 587 if (!$MasterRecordFound) { 588 print "\nWarning: MASTER record is missing.\n"; 589 return; 590 } 591 my(@MasterRecordValidationInfo); 592 @MasterRecordValidationInfo = (); 593 $NumOfRemarkRecords += 0; 594 if (exists($RecordTypesCountRef->{Count}{REMARK}) && $NumOfRemarkRecords != $RecordTypesCountRef->{Count}{REMARK}) { 595 push @MasterRecordValidationInfo, "Number of REMARK records, $NumOfRemarkRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}."; 596 } 597 $NumOfHetRecords += 0; 598 if (exists($RecordTypesCountRef->{Count}{HET}) && $NumOfHetRecords != $RecordTypesCountRef->{Count}{HET}) { 599 push @MasterRecordValidationInfo, "Number of HET records, $NumOfHetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HET}."; 600 } 601 $NumOfHelixRecords += 0; 602 if (exists($RecordTypesCountRef->{Count}{HELIX}) && $NumOfHelixRecords != $RecordTypesCountRef->{Count}{HELIX}) { 603 push @MasterRecordValidationInfo, "Number of HELIX records, $NumOfHelixRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HELIX}."; 604 } 605 $NumOfSheetRecords += 0; 606 if (exists($RecordTypesCountRef->{Count}{SHEET}) && $NumOfSheetRecords != $RecordTypesCountRef->{Count}{SHEET}) { 607 push @MasterRecordValidationInfo, "Number of SHEET records, $NumOfSheetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SHEET}."; 608 } 609 $NumOfTurnRecords += 0; 610 if (exists($RecordTypesCountRef->{Count}{TURN}) && $NumOfTurnRecords != $RecordTypesCountRef->{Count}{TURN}) { 611 push @MasterRecordValidationInfo, "Number of TURN records, $NumOfTurnRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}."; 612 } 613 $NumOfSiteRecords += 0; 614 if (exists($RecordTypesCountRef->{Count}{SITE}) && $NumOfSiteRecords != $RecordTypesCountRef->{Count}{SITE}) { 615 push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSiteRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SITE}."; 616 } 617 618 $NumOfTransformationsRecords += 0; 619 my($RecordsCount, $ID, $RecordID, $RecordLabel); 620 $RecordsCount = 0; 621 for $RecordLabel ('ORIGX', 'SCALE', 'MTRIX') { 622 for $ID (1 .. 3) { 623 $RecordID = "${RecordLabel}${ID}"; 624 if (exists $RecordTypesCountRef->{Count}{$RecordID}) { 625 $RecordsCount += $RecordTypesCountRef->{Count}{$RecordID}; 626 } 627 } 628 } 629 if ($NumOfTransformationsRecords != $RecordsCount) { 630 push @MasterRecordValidationInfo, "Number of transformation records (ORIGXn+SCALEn+MTRIXn), $NumOfTransformationsRecords, specified in MASTER record doen't match its explict count, $RecordsCount."; 631 } 632 633 $RecordsCount = 0; 634 for $RecordLabel ('ATOM', 'HETATM') { 635 if (exists $RecordTypesCountRef->{Count}{$RecordLabel}) { 636 $RecordsCount += $RecordTypesCountRef->{Count}{$RecordLabel}; 637 } 638 } 639 $NumOfAtomAndHetatmRecords += 0; 640 if ($NumOfAtomAndHetatmRecords != $RecordsCount) { 641 push @MasterRecordValidationInfo, "Number of ATOM + HETATM records, $NumOfAtomAndHetatmRecords, specified in MASTER record doen't match its explict count, $RecordsCount."; 642 } 643 $NumOfTerRecords += 0; 644 if (exists($RecordTypesCountRef->{Count}{TER}) && $NumOfTerRecords != $RecordTypesCountRef->{Count}{TER}) { 645 push @MasterRecordValidationInfo, "Number of TER records, $NumOfTerRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{TER}."; 646 } 647 $NumOfConectRecords += 0; 648 if (exists($RecordTypesCountRef->{Count}{CONECT}) && $NumOfConectRecords != $RecordTypesCountRef->{Count}{CONECT}) { 649 push @MasterRecordValidationInfo, "Number of CONECT records, $NumOfConectRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{CONECT}."; 650 } 651 $NumOfSeqresRecords += 0; 652 if (exists($RecordTypesCountRef->{Count}{SEQRES}) && $NumOfSeqresRecords != $RecordTypesCountRef->{Count}{SEQRES}) { 653 push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSeqresRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SEQRES}."; 654 } 655 656 if (@MasterRecordValidationInfo) { 657 print "\nMASTER record validation: Count mismatches found:\n"; 658 print JoinWords(\@MasterRecordValidationInfo, "\n", 0), "\n"; 659 } 660 else { 661 print "\nMASTER record validation: Count values match with the explicit count of the corresponding records.\n"; 662 } 663 } 664 665 # Total size of all the files... 666 sub ListTotalSizeOfFiles { 667 my($FileOkayCount, $TotalSize, $Index); 668 669 $FileOkayCount = 0; 670 $TotalSize = 0; 671 672 for $Index (0 .. $#PDBFilesList) { 673 if ($PDBFilesInfo{FileOkay}[$Index]) { 674 $FileOkayCount++; 675 $TotalSize += $PDBFilesInfo{FileSize}[$Index]; 676 } 677 } 678 if ($FileOkayCount > 1) { 679 print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n"; 680 } 681 682 } 683 684 # Empty chain IDs are replaced with "None[1-9]". But for displaying purposes, take out any 685 # numbers from label... 686 sub CleanupChainID { 687 my($ChainID) = @_; 688 689 if ($ChainID =~ /^None/i) { 690 return 'None'; 691 } 692 return $ChainID; 693 } 694 695 # Process option values... 696 sub ProcessOptions { 697 %OptionsInfo = (); 698 699 # Setup record types to count... 700 if ($Options{count}) { 701 $OptionsInfo{CountRecordType} = $Options{count}; 702 } 703 else { 704 $OptionsInfo{CountRecordType} = $Options{all} ? 'All' : 'ATOM,HETATM'; 705 } 706 @{$OptionsInfo{SpecifiedRecordTypes}} =(); 707 if ($OptionsInfo{CountRecordType} !~ /^All$/i) { 708 my(@RecordTypes); 709 @RecordTypes = split /\,/, $OptionsInfo{CountRecordType}; 710 push @{$OptionsInfo{SpecifiedRecordTypes}}, @RecordTypes; 711 } 712 $OptionsInfo{CountChains} = ($Options{chains} || $Options{all}) ? 1 : 0; 713 $OptionsInfo{CheckMasterRecord} = ($Options{mastercheck} || $Options{all}) ? 1 : 0; 714 715 # Residue count is the default. So $Options{residues} is simply ignored. 716 my($CountResidues) = 1; 717 $OptionsInfo{CountResiduesInChains} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0; 718 $OptionsInfo{CountResiduesAll} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0; 719 720 $OptionsInfo{ResiduesFrequencyInChains} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0; 721 $OptionsInfo{ResiduesFrequencyAll} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0; 722 723 $OptionsInfo{ResidueNumbersInfo} = ($Options{residuenumbers} || $Options{all}) ? 1 : 0; 724 725 $OptionsInfo{CalculateBoundingBox} = ($Options{boundingbox} || $Options{all}) ? 1 : 0; 726 727 $OptionsInfo{ListHeaderInfo} = ($Options{header} || $Options{all}) ? 1 : 0; 728 $OptionsInfo{DetailLevel} = $Options{detail}; 729 730 $OptionsInfo{ListExperimentalTechniqueInfo} = ($Options{experiment} || $Options{all}) ? 1 : 0; 731 732 } 733 734 # Retrieve information about PDB files... 735 sub RetrievePDBFilesInfo { 736 my($Index, $PDBFile, $ModifiedTimeString, $ModifiedDateString); 737 738 %PDBFilesInfo = (); 739 @{$PDBFilesInfo{FileOkay}} = (); 740 @{$PDBFilesInfo{FileSize}} = (); 741 @{$PDBFilesInfo{FileLastModified}} = (); 742 743 FILELIST: for $Index (0 .. $#PDBFilesList) { 744 $PDBFilesInfo{FileOkay}[$Index] = 0; 745 $PDBFilesInfo{FileSize}[$Index] = 0; 746 $PDBFilesInfo{FileLastModified}[$Index] = ''; 747 748 $PDBFile = $PDBFilesList[$Index]; 749 if (!(-e $PDBFile)) { 750 warn "Warning: Ignoring file $PDBFile: It doesn't exist\n"; 751 next FILELIST; 752 } 753 if (!CheckFileType($PDBFile, "pdb")) { 754 warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n"; 755 next FILELIST; 756 } 757 if (! open PDBFILE, "$PDBFile") { 758 warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n"; 759 next FILELIST; 760 } 761 close PDBFILE; 762 763 $PDBFilesInfo{FileOkay}[$Index] = 1; 764 $PDBFilesInfo{FileSize}[$Index] = FileSize($PDBFile); 765 ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($PDBFile); 766 $PDBFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString"; 767 } 768 } 769 770 771 # Setup script usage and retrieve command line arguments specified using various options... 772 sub SetupScriptUsage { 773 774 # Retrieve all the options... 775 %Options = (); 776 $Options{count} = ''; 777 $Options{detail} = 1; 778 $Options{residuesmode} = 'Both'; 779 780 if (!GetOptions(\%Options, "all|a", "boundingbox|b", "count|c=s", "chains", "detail|d=i", "experiment|e", "frequency|f", "mastercheck|m", "header", "help|h", "residues", "residuesmode=s", "residuenumbers", "workingdir|w=s")) { 781 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 782 } 783 if ($Options{workingdir}) { 784 if (! -d $Options{workingdir}) { 785 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 786 } 787 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 788 } 789 if (!IsPositiveInteger($Options{detail})) { 790 die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"; 791 } 792 if ($Options{residuesmode} !~ /^(InChains|All|Both)$/i) { 793 die "Error: The value specified, $Options{residuesmode}, for option \"--ResiduesMode\" is not valid. Allowed values: InChains, All, or Both\n"; 794 } 795 } 796