1 #!/usr/bin/perl -w 2 # 3 # File: ExtractFromPDBFiles.pl 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use FindBin; use lib "$FindBin::Bin/../lib"; 28 use Getopt::Long; 29 use File::Basename; 30 use Text::ParseWords; 31 use Benchmark; 32 use FileUtil; 33 use TextUtil; 34 use PDBFileUtil; 35 use AminoAcids; 36 use SequenceFileUtil; 37 38 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 39 40 # Autoflush STDOUT 41 $| = 1; 42 43 # Starting message... 44 $ScriptName = basename($0); 45 print "\n$ScriptName: Starting...\n\n"; 46 $StartTime = new Benchmark; 47 48 # Get the options and setup script... 49 SetupScriptUsage(); 50 if ($Options{help} || @ARGV < 1) { 51 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 52 } 53 54 my(@PDBFilesList); 55 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb"); 56 57 # Process options... 58 print "Processing options...\n"; 59 my(%OptionsInfo); 60 ProcessOptions(); 61 62 # Setup information about input files... 63 print "Checking input PDB file(s)...\n"; 64 my(%PDBFilesInfo); 65 RetrievePDBFilesInfo(); 66 67 # Process input files.. 68 my($FileIndex); 69 if (@PDBFilesList > 1) { 70 print "\nProcessing PDB files...\n"; 71 } 72 for $FileIndex (0 .. $#PDBFilesList) { 73 if ($PDBFilesInfo{FileOkay}[$FileIndex]) { 74 print "\nProcessing file $PDBFilesList[$FileIndex]...\n"; 75 ExtractFromPDBFiles($FileIndex); 76 } 77 } 78 print "\n$ScriptName:Done...\n\n"; 79 80 $EndTime = new Benchmark; 81 $TotalTime = timediff ($EndTime, $StartTime); 82 print "Total time: ", timestr($TotalTime), "\n"; 83 84 ############################################################################### 85 86 # Extract appropriate information... 87 sub ExtractFromPDBFiles { 88 my($FileIndex) = @_; 89 my($PDBFile, $PDBRecordLinesRef); 90 91 # Get PDB data... 92 $PDBFile = $PDBFilesList[$FileIndex]; 93 $PDBRecordLinesRef = ReadPDBFile($PDBFile); 94 95 if ($OptionsInfo{Mode} =~ /Chains/i) { 96 ExtractChains($FileIndex, $PDBRecordLinesRef); 97 } 98 elsif ($OptionsInfo{Mode} =~ /Sequences/i) { 99 ExtractSequences($FileIndex, $PDBRecordLinesRef); 100 } 101 elsif ($OptionsInfo{Mode} =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i) { 102 ExtractByAtoms($FileIndex, $PDBRecordLinesRef); 103 } 104 elsif ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange|ResidueNames)$/i) { 105 ExtractByResidues($FileIndex, $PDBRecordLinesRef); 106 } 107 elsif ($OptionsInfo{Mode} =~ /Distance/i) { 108 ExtractByDistance($FileIndex, $PDBRecordLinesRef); 109 } 110 elsif ($OptionsInfo{Mode} =~ /NonWater/i) { 111 ExtractNonWaterRecords($FileIndex, $PDBRecordLinesRef); 112 } 113 elsif ($OptionsInfo{Mode} =~ /NonHydrogens/i) { 114 ExtractNonHydrogenRecords($FileIndex, $PDBRecordLinesRef); 115 } 116 } 117 118 # Extract chains and generate new PDB files... 119 # 120 sub ExtractChains { 121 my($FileIndex, $PDBRecordLinesRef) = @_; 122 my($ChainIndex, $ChainID, $ChainLabel, $PDBFileName, $RecordLine, $ChainsAndResiduesInfoRef, $AtomNumber, $AtomName, $ResidueName, $AtomChainID, $ResidueNumber, $AlternateLocation, $InsertionCode, $ConectRecordLinesRef, $CollectChainResiduesBeyondTER, %ChainAtomNumbersMap); 123 124 $CollectChainResiduesBeyondTER = ($OptionsInfo{ChainsRecordMode} =~ /^AcrossTER$/i) ? 1 : 0; 125 126 # Get chains and residues data... 127 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER, 1); 128 129 if ($OptionsInfo{CombineChains}) { 130 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 131 print "Generating PDBFileName file $PDBFileName...\n"; 132 133 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 134 135 # Write out header and other older records... 136 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 137 } 138 139 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 140 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 141 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex]; 142 143 if (!$OptionsInfo{CombineChains}) { 144 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex]; 145 print "Generating PDBFileName file $PDBFileName...\n"; 146 147 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 148 149 # Write out header and other older recors... 150 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 151 } 152 153 # Write out ATOM/HETATM line for chain and collect all ATOM/HETATM serial numbers 154 # for writing out appropriate CONECT records... 155 %ChainAtomNumbersMap = (); 156 for $RecordLine (@{$ChainsAndResiduesInfoRef->{Lines}{$ChainID}}) { 157 if (CheckRecordType($RecordLine)) { 158 print OUTFILE "$RecordLine\n"; 159 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode) = ParseAtomRecordLine($RecordLine); 160 $AtomNumber = int $AtomNumber; 161 $ChainAtomNumbersMap{$AtomNumber} = $AtomName; 162 } 163 } 164 # Write out TER record using information from last chain record... 165 $AtomNumber += 1; 166 print OUTFILE GenerateTerRecordLine($AtomNumber, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode), "\n"; 167 168 # Write out CONECT records... 169 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%ChainAtomNumbersMap); 170 171 for $RecordLine (@{$ConectRecordLinesRef}) { 172 print OUTFILE "$RecordLine\n"; 173 } 174 175 if (!$OptionsInfo{CombineChains}) { 176 # Write out END record... 177 print OUTFILE GenerateEndRecordLine(), "\n"; 178 179 close OUTFILE; 180 } 181 } 182 183 if ($OptionsInfo{CombineChains}) { 184 # Write out END record... 185 print OUTFILE GenerateEndRecordLine(), "\n"; 186 187 close OUTFILE; 188 } 189 190 } 191 192 # Extract sequences for individual chains or combine all the chains... 193 sub ExtractSequences { 194 my($FileIndex, $PDBRecordLinesRef) = @_; 195 my($ChainIndex, $ChainID, $ChainLabel, $SequenceFileName, $Residue, $ResidueCode, $StandardResidue, $ChainSequence, $WrappedChainSequence, $ChainSequenceID, $ChainsAndResiduesInfoRef, $ChainResiduesRef, %ChainSequencesDataMap); 196 197 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) { 198 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes'); 199 } 200 else { 201 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef); 202 } 203 204 # Generate sequence data for all the chains... 205 %ChainSequencesDataMap = (); 206 @{$ChainSequencesDataMap{IDs}} = (); 207 %{$ChainSequencesDataMap{Sequence}} = (); 208 %{$ChainSequencesDataMap{Description}} = (); 209 210 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 211 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 212 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex]; 213 214 # Setup sequence ID... 215 $ChainSequenceID = $PDBFilesInfo{ChainSequenceIDs}[$FileIndex][$ChainIndex]; 216 push @{$ChainSequencesDataMap{IDs}}, $ChainSequenceID; 217 $ChainSequencesDataMap{Description}{$ChainID} = $ChainSequenceID; 218 219 # Collect sequence data for the chain... 220 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes/i) { 221 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 222 } 223 else { 224 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 225 } 226 # Setup sequence data... 227 $ChainSequence = ''; 228 RESIDUE: for $Residue (@{$ChainResiduesRef}) { 229 ($ResidueCode, $StandardResidue) = GetResidueCode($Residue); 230 if (!$StandardResidue) { 231 if ($OptionsInfo{KeepNonStandardSequences}) { 232 $ResidueCode = $OptionsInfo{NonStandardSequenceCode}; 233 warn "Warning: Keeping nonstandard residue $Residue in $ChainLabel...\n"; 234 } 235 else { 236 warn "Warning: Ignoring nonstandard residue $Residue in $ChainLabel...\n"; 237 next RESIDUE; 238 } 239 } 240 $ChainSequence .= $ResidueCode; 241 } 242 $ChainSequencesDataMap{Sequence}{$ChainID} = $ChainSequence; 243 244 } 245 246 # Write out the sequence files... 247 my($SequenceID, $SequenceDescription, $Sequence, %SequencesDataMap ); 248 if ($OptionsInfo{CombineChainSequences}) { 249 # Combine all the chain sequences... 250 $Sequence = ''; 251 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 252 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 253 254 $Sequence .= $ChainSequencesDataMap{Sequence}{$ChainID}; 255 } 256 $SequenceID = $PDBFilesInfo{ChainSequenceIDsPrefix}[$FileIndex][0] . "_CombinedChains|PDB";; 257 $SequenceDescription = $SequenceID; 258 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 259 260 print "Generating sequence file $SequenceFileName...\n"; 261 %SequencesDataMap = (); 262 @{$SequencesDataMap{IDs}} = (); 263 %{$SequencesDataMap{Sequence}} = (); 264 %{$SequencesDataMap{Description}} = (); 265 266 push @{$SequencesDataMap{IDs}}, $SequenceID; 267 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription; 268 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence; 269 270 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength}); 271 } 272 else { 273 # For each specifed chain, write out the sequences... 274 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 275 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 276 277 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex]; 278 279 $SequenceID = $ChainSequencesDataMap{IDs}[$ChainIndex]; 280 $SequenceDescription = $ChainSequencesDataMap{Description}{$ChainID}; 281 $Sequence = $ChainSequencesDataMap{Sequence}{$ChainID}; 282 283 print "Generating sequence file $SequenceFileName...\n"; 284 %SequencesDataMap = (); 285 @{$SequencesDataMap{IDs}} = (); 286 %{$SequencesDataMap{Sequence}} = (); 287 %{$SequencesDataMap{Description}} = (); 288 289 push @{$SequencesDataMap{IDs}}, $SequenceID; 290 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription; 291 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence; 292 293 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength}); 294 } 295 } 296 } 297 298 # Extract atoms... 299 sub ExtractByAtoms { 300 my($FileIndex, $PDBRecordLinesRef) = @_; 301 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $IgnoreRecord, $ConectRecordLinesRef, %AtomNumbersMap); 302 303 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 304 print "Generating PDBFileName file $PDBFileName...\n"; 305 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 306 307 # Write out header and other older recors... 308 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 309 310 # Write out all ATOM records along with TER and model records to indicate 311 # chains and multiple models.. 312 %AtomNumbersMap = (); 313 $ChainRecordCount = 0; 314 for $RecordLine (@{$PDBRecordLinesRef}) { 315 if (CheckRecordType($RecordLine)) { 316 ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine); 317 318 # Check atoms... 319 $IgnoreRecord = 1; 320 if ($OptionsInfo{Mode} =~ /^Atoms$/i) { 321 $IgnoreRecord = 0; 322 } 323 elsif ($OptionsInfo{Mode} =~ /^(CAlphas|AtomNames)$/i) { 324 if (exists $OptionsInfo{SpecifiedAtomNamesMap}{lc $AtomName}) { 325 $IgnoreRecord = 0; 326 } 327 } 328 elsif ($OptionsInfo{Mode} =~ /^AtomNums$/i) { 329 if (exists $OptionsInfo{SpecifiedAtomNumsMap}{$AtomNumber}) { 330 $IgnoreRecord = 0; 331 } 332 } 333 elsif ($OptionsInfo{Mode} =~ /^AtomsRange$/i) { 334 if ($AtomNumber >= $OptionsInfo{SpecifiedStartAtomNum} && $AtomNumber <= $OptionsInfo{SpecifiedEndAtomNum}) { 335 $IgnoreRecord = 0; 336 } 337 } 338 339 if (!$IgnoreRecord) { 340 $ChainRecordCount++; 341 print OUTFILE "$RecordLine\n"; 342 343 $AtomNumber = int $AtomNumber; 344 $AtomNumbersMap{$AtomNumber} = $AtomName; 345 } 346 } 347 elsif (IsTerRecordType($RecordLine)) { 348 if ($ChainRecordCount) { 349 print OUTFILE GenerateTerRecordLine(), "\n"; 350 } 351 $ChainRecordCount = 0; 352 } 353 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 354 print OUTFILE "$RecordLine\n"; 355 } 356 } 357 358 # Write out appropriate CONECT records... 359 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 360 for $RecordLine (@{$ConectRecordLinesRef}) { 361 print OUTFILE "$RecordLine\n"; 362 } 363 364 # Write out END record... 365 print OUTFILE GenerateEndRecordLine(), "\n"; 366 367 close OUTFILE; 368 } 369 370 # Extract residues... 371 sub ExtractByResidues { 372 my($FileIndex, $PDBRecordLinesRef) = @_; 373 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $ConectRecordLinesRef, $IgnoreRecord, %AtomNumbersMap); 374 375 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 376 print "Generating PDBFileName file $PDBFileName...\n"; 377 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 378 379 # Write out header and other older recors... 380 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 381 382 # Write out all ATOM records for specified residues with TER and model records to indicate 383 # chains and multiple models... 384 %AtomNumbersMap = (); 385 $ChainRecordCount = 0; 386 for $RecordLine (@{$PDBRecordLinesRef}) { 387 if (CheckRecordType($RecordLine)) { 388 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber) = ParseAtomRecordLine($RecordLine); 389 390 # Check residues... 391 $IgnoreRecord = 1; 392 if ($OptionsInfo{Mode} =~ /^ResidueNums$/i) { 393 if (exists $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNumber}) { 394 $IgnoreRecord = 0; 395 } 396 } 397 elsif ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) { 398 if ($ResidueNumber >= $OptionsInfo{SpecifiedStartResidueNum} && $ResidueNumber <= $OptionsInfo{SpecifiedEndResidueNum}) { 399 $IgnoreRecord = 0; 400 } 401 } 402 elsif ($OptionsInfo{Mode} =~ /^ResidueNames$/i) { 403 if (exists $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName}) { 404 $IgnoreRecord = 0; 405 } 406 } 407 if (!$IgnoreRecord) { 408 $ChainRecordCount++; 409 print OUTFILE "$RecordLine\n"; 410 $AtomNumber = int $AtomNumber; 411 $AtomNumbersMap{$AtomNumber} = $AtomName; 412 } 413 } 414 elsif (IsTerRecordType($RecordLine)) { 415 if ($ChainRecordCount) { 416 print OUTFILE GenerateTerRecordLine(), "\n"; 417 } 418 $ChainRecordCount = 0; 419 } 420 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 421 print OUTFILE "$RecordLine\n"; 422 } 423 } 424 425 # Write out appropriate CONECT records... 426 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 427 for $RecordLine (@{$ConectRecordLinesRef}) { 428 print OUTFILE "$RecordLine\n"; 429 } 430 # Write out END record... 431 print OUTFILE GenerateEndRecordLine(), "\n"; 432 433 close OUTFILE; 434 } 435 436 # Extract non water records... 437 sub ExtractNonWaterRecords { 438 my($FileIndex, $PDBRecordLinesRef) = @_; 439 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ConectRecordLinesRef, %AtomNumbersMap); 440 441 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 442 print "Generating PDBFileName file $PDBFileName...\n"; 443 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 444 445 # Write out header and other older recors... 446 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 447 448 # Write out all ATOM/HETATM non water records along with TER and model records to indicate 449 # chains and multiple models.. 450 %AtomNumbersMap = (); 451 $ChainRecordCount = 0; 452 for $RecordLine (@{$PDBRecordLinesRef}) { 453 if (CheckRecordType($RecordLine)) { 454 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName) = ParseAtomRecordLine($RecordLine); 455 if (! exists $OptionsInfo{SpecifiedWaterResiduesMap}{$ResidueName} ) { 456 $ChainRecordCount++; 457 print OUTFILE "$RecordLine\n"; 458 $AtomNumber = int $AtomNumber; 459 $AtomNumbersMap{$AtomNumber} = $AtomName; 460 } 461 } 462 elsif (IsTerRecordType($RecordLine)) { 463 if ($ChainRecordCount) { 464 print OUTFILE GenerateTerRecordLine(), "\n"; 465 } 466 $ChainRecordCount = 0; 467 } 468 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 469 print OUTFILE "$RecordLine\n"; 470 } 471 } 472 473 # Write out appropriate CONECT records... 474 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 475 for $RecordLine (@{$ConectRecordLinesRef}) { 476 print OUTFILE "$RecordLine\n"; 477 } 478 # Write out END record... 479 print OUTFILE GenerateEndRecordLine(), "\n"; 480 481 close OUTFILE; 482 } 483 484 # Extract non hydrogen records... 485 sub ExtractNonHydrogenRecords { 486 my($FileIndex, $PDBRecordLinesRef) = @_; 487 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $ConectRecordLinesRef, %AtomNumbersMap); 488 489 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 490 print "Generating PDBFileName file $PDBFileName...\n"; 491 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 492 493 # Write out header and other older recors... 494 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 495 496 # Write out all ATOM/HETATM non hydrogen records along with TER and model records to indicate 497 # chains and multiple models.. 498 %AtomNumbersMap = (); 499 $ChainRecordCount = 0; 500 for $RecordLine (@{$PDBRecordLinesRef}) { 501 if (CheckRecordType($RecordLine)) { 502 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 503 if ($ElementSymbol !~ /^H$/i) { 504 $ChainRecordCount++; 505 print OUTFILE "$RecordLine\n"; 506 $AtomNumber = int $AtomNumber; 507 $AtomNumbersMap{$AtomNumber} = $AtomName; 508 } 509 } 510 elsif (IsTerRecordType($RecordLine)) { 511 if ($ChainRecordCount) { 512 print OUTFILE GenerateTerRecordLine(), "\n"; 513 } 514 $ChainRecordCount = 0; 515 } 516 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 517 print OUTFILE "$RecordLine\n"; 518 } 519 } 520 521 # Write out appropriate CONECT records... 522 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 523 for $RecordLine (@{$ConectRecordLinesRef}) { 524 print OUTFILE "$RecordLine\n"; 525 } 526 # Write out END record... 527 print OUTFILE GenerateEndRecordLine(), "\n"; 528 529 close OUTFILE; 530 } 531 532 # Extract ATOM/HETATM records by distance... 533 sub ExtractByDistance { 534 my($FileIndex, $PDBRecordLinesRef) = @_; 535 my($PDBFileName, $RecordLine, $RecordLineNum, $ChainRecordCount, $ConectRecordLinesRef, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $IgnoreRecord, $ResidueID, @OriginCoords, @Coords, %AtomNumbersMap, %ResiduesDataMap); 536 537 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 538 print "Generating PDBFileName file $PDBFileName...\n"; 539 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 540 541 # Write out header and other older recors... 542 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 543 544 # Setup coordinates of origin to calculate distance... 545 @OriginCoords = (); 546 push @OriginCoords, @{$PDBFilesInfo{DistanceOrigin}[$FileIndex]}; 547 548 # Write out all ATOM records for which meet specified criteria along with TER and model records to indicate 549 # chains and multiple models... 550 %AtomNumbersMap = (); 551 552 %ResiduesDataMap = (); 553 %{$ResiduesDataMap{ID}} = (); 554 %{$ResiduesDataMap{Status}} = (); 555 556 $ChainRecordCount = 0; 557 $RecordLineNum = 0; 558 559 for $RecordLine (@{$PDBRecordLinesRef}) { 560 $RecordLineNum++; 561 if (CheckRecordType($RecordLine)) { 562 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 563 @Coords = (); push @Coords, ($X, $Y, $Z); 564 565 $IgnoreRecord = 1; 566 if ($OptionsInfo{DistanceSelectionMode} =~ /^ByResidue$/i) { 567 $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}"; 568 if (exists $ResiduesDataMap{ID}{$ResidueID}) { 569 # Residue data has been processed; check its selection status... 570 if ($ResiduesDataMap{Status}{$ResidueID}) { 571 $IgnoreRecord = 0; 572 } 573 } 574 else { 575 # Residue hasn't been processed... 576 $ResiduesDataMap{ID}{$ResidueID} = $ResidueID; 577 $ResiduesDataMap{Status}{$ResidueID} = 0; 578 if (CheckResidueDistance($ResidueID, $RecordLineNum, $PDBRecordLinesRef, \@OriginCoords)) { 579 $IgnoreRecord = 0; 580 $ResiduesDataMap{Status}{$ResidueID} = 1; 581 } 582 } 583 } 584 elsif ($OptionsInfo{DistanceSelectionMode} =~ /^ByAtom$/i) { 585 if (CheckDistance(\@Coords, \@OriginCoords)) { 586 $IgnoreRecord = 0; 587 } 588 } 589 590 if (!$IgnoreRecord) { 591 $ChainRecordCount++; 592 print OUTFILE "$RecordLine\n"; 593 $AtomNumber = int $AtomNumber; 594 $AtomNumbersMap{$AtomNumber} = $AtomName; 595 } 596 } 597 elsif (IsTerRecordType($RecordLine)) { 598 if ($ChainRecordCount) { 599 print OUTFILE GenerateTerRecordLine(), "\n"; 600 } 601 $ChainRecordCount = 0; 602 } 603 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 604 print OUTFILE "$RecordLine\n"; 605 } 606 } 607 608 # Write out appropriate CONECT records... 609 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 610 for $RecordLine (@{$ConectRecordLinesRef}) { 611 print OUTFILE "$RecordLine\n"; 612 } 613 614 # Write out END record... 615 print OUTFILE GenerateEndRecordLine(), "\n"; 616 617 close OUTFILE; 618 } 619 620 # Does record type correspond to the specified record type? 621 sub CheckRecordType { 622 my($RecordLine) = @_; 623 my($Status); 624 625 $Status = 0; 626 if ($OptionsInfo{RecordMode} =~ /^Atom$/i) { 627 $Status = IsAtomRecordType($RecordLine) ? 1 : 0; 628 } 629 elsif ($OptionsInfo{RecordMode} =~ /^Hetatm$/i) { 630 $Status = IsHetatmRecordType($RecordLine) ? 1 : 0; 631 } 632 elsif ($OptionsInfo{RecordMode} =~ /^AtomAndHetatm$/i) { 633 $Status = (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) ? 1 : 0; 634 } 635 636 return $Status; 637 } 638 639 # Does record meets distance citerion specified by the user? 640 sub CheckResidueDistance { 641 my($SpecifiedResidueID, $StartingLineNum, $PDBRecordLinesRef, $OriginCoordsRef) = @_; 642 my($Status, $RecordLine, $RecordLineIndex, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $ResidueID, @Coords); 643 644 $Status = 0; 645 646 RECORDLINE: for $RecordLineIndex (($StartingLineNum - 1) .. $#{$PDBRecordLinesRef}) { 647 $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex]; 648 if (!CheckRecordType($RecordLine)) { 649 next RECORDLINE; 650 } 651 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 652 $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}"; 653 654 if ($ResidueID !~ /^$SpecifiedResidueID$/i) { 655 # It's a new residue line... 656 last RECORDLINE; 657 } 658 659 # Check distance... 660 @Coords = (); push @Coords, ($X, $Y, $Z); 661 if (CheckDistance(\@Coords, $OriginCoordsRef)) { 662 # Distance criterion is met for at least one record in the residue... 663 $Status = 1; 664 last RECORDLINE; 665 } 666 } 667 return $Status; 668 } 669 670 # Does record meets distance citerion specified by the user? 671 sub CheckDistance { 672 my($CoordsRef, $OriginCoordsRef) = @_; 673 my($Status, $Index, $Distance, $DistanceSquare); 674 675 $Status = 0; 676 677 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 678 # Go over coordinates of all the atoms in the residue... 679 my($ResidueCoordsCount) = scalar @{$OriginCoordsRef}; 680 INDEX: for ($Index = 0; $Index < $ResidueCoordsCount; $Index += 3) { 681 $DistanceSquare = ($CoordsRef->[0] - $OriginCoordsRef->[$Index])**2 + ($CoordsRef->[1] - $OriginCoordsRef->[$Index + 1])**2 + ($CoordsRef->[2] - $OriginCoordsRef->[$Index + 2])**2; 682 $Distance = sqrt $DistanceSquare; 683 if ($Distance <= $OptionsInfo{MaxExtractionDistance}) { 684 $Status = 1; 685 last INDEX; 686 } 687 } 688 } 689 else { 690 $DistanceSquare = 0; 691 for $Index (0 .. 2) { 692 $DistanceSquare += ($CoordsRef->[$Index] - $OriginCoordsRef->[$Index])**2; 693 } 694 $Distance = sqrt $DistanceSquare; 695 $Status = ($Distance <= $OptionsInfo{MaxExtractionDistance}) ? 1 : 0; 696 } 697 698 return $Status; 699 } 700 701 # Write out modifed header and other older records... 702 sub WriteHeaderAndOlderRecords { 703 my($OutFileRef, $PDBRecordLinesRef) = @_; 704 705 if ($OptionsInfo{ModifyHeaderRecord}) { 706 # Write out modified HEADER record... 707 my($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef); 708 $Classification = 'Data extracted using MayaChemTools'; 709 print $OutFileRef GenerateHeaderRecordLine($IDCode, $Classification), "\n"; 710 } 711 else { 712 print $OutFileRef $PDBRecordLinesRef->[0], "\n"; 713 } 714 715 # Write out any old records... 716 if ($OptionsInfo{KeepOldRecords}) { 717 my($RecordLineIndex, $RecordLine); 718 # Skip HEADER record and write out older records all the way upto first MODEL/ATOM/HETATM records from input file... 719 RECORDLINE: for $RecordLineIndex (1 .. $#{$PDBRecordLinesRef}) { 720 $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex]; 721 if (IsModelRecordType($RecordLine) || IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 722 last RECORDLINE; 723 } 724 print $OutFileRef "$RecordLine\n"; 725 } 726 } 727 } 728 729 # Get header record information assuming it's the first record... 730 sub GetHeaderRecordInformation { 731 my($PDBRecordLinesRef) = @_; 732 my($Classification, $DepositionDate, $IDCode, $HeaderRecordLine); 733 734 ($Classification, $DepositionDate, $IDCode) = ('') x 3; 735 $HeaderRecordLine = $PDBRecordLinesRef->[0]; 736 if (IsHeaderRecordType($HeaderRecordLine)) { 737 ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine); 738 } 739 return ($Classification, $DepositionDate, $IDCode); 740 } 741 742 # Get one letter residue code... 743 sub GetResidueCode { 744 my($ResidueName) = @_; 745 my($ResidueCode, $StandardResidue); 746 747 $ResidueCode = $OptionsInfo{NonStandardSequenceCode}; 748 $StandardResidue = 0; 749 750 if (length($ResidueName) == 3) { 751 # Assume it's an amino acid... 752 if (AminoAcids::IsAminoAcid($ResidueName)) { 753 # Standard amino acid... 754 $ResidueCode = AminoAcids::GetAminoAcidOneLetterCode($ResidueName); 755 $StandardResidue = 1; 756 } 757 } 758 elsif (length($ResidueName) == 1) { 759 # Assume it's a nucleic acid... 760 if ($ResidueName =~ /^(A|G|T|U|C)$/i) { 761 $ResidueCode = $ResidueName; 762 $StandardResidue = 1; 763 } 764 } 765 766 return ($ResidueCode, $StandardResidue); 767 } 768 769 # Process option values... 770 sub ProcessOptions { 771 %OptionsInfo = (); 772 $OptionsInfo{Mode} = $Options{mode}; 773 774 my(@SpecifiedChains) = (); 775 if ($Options{chains} =~ /^(First|All)$/i) { 776 $OptionsInfo{ChainsToExtract} = $Options{chains}; 777 } 778 else { 779 @SpecifiedChains = split /\,/, $Options{chains}; 780 $OptionsInfo{ChainsToExtract} = 'Specified'; 781 } 782 @{$OptionsInfo{SpecifiedChains}} = (); 783 push @{$OptionsInfo{SpecifiedChains}}, @SpecifiedChains; 784 785 $OptionsInfo{ChainsRecordMode} = $Options{chainsrecordmode}; 786 787 $OptionsInfo{CombineChains} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0; 788 789 $OptionsInfo{CombineChainSequences} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0; 790 791 ProcessResiduesOptions(); 792 ProcessAtomsOptions(); 793 ProcessDistanceOptions(); 794 795 $OptionsInfo{WaterResidueNames} = $Options{waterresiduenames}; 796 @{$OptionsInfo{SpecifiedWaterResiduesList}} = (); 797 %{$OptionsInfo{SpecifiedWaterResiduesMap}} = (); 798 799 my(@SpecifiedWaterResiduesList); 800 @SpecifiedWaterResiduesList = (); 801 802 if ($OptionsInfo{Mode} =~ /^NonWater$/i) { 803 my($WaterResidueName); 804 if ($OptionsInfo{WaterResidueNames} =~ /Automatic/i) { 805 push @SpecifiedWaterResiduesList, ('HOH', 'WAT', 'H2O'); 806 } 807 else { 808 @SpecifiedWaterResiduesList = split /\,/, $Options{waterresiduenames}; 809 } 810 for $WaterResidueName (@SpecifiedWaterResiduesList) { 811 $OptionsInfo{SpecifiedWaterResiduesMap}{$WaterResidueName} = $WaterResidueName; 812 } 813 } 814 push @{$OptionsInfo{SpecifiedWaterResiduesList}}, @SpecifiedWaterResiduesList; 815 816 $OptionsInfo{RecordMode} = $Options{recordmode} ? $Options{recordmode} : ($Options{mode} =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i ? "Atom" : "AtomAndHetatm"); 817 818 $OptionsInfo{KeepOldRecords} = ($Options{keepoldrecords} =~ /^Yes$/i) ? 1 : 0; 819 820 $OptionsInfo{ModifyHeaderRecord} = ($Options{modifyheader} =~ /^Yes$/i) ? 1 : 0; 821 822 $OptionsInfo{KeepNonStandardSequences} = ($Options{nonstandardkeep} =~ /^Yes$/i) ? 1 : 0; 823 $OptionsInfo{NonStandardSequenceCode} = $Options{nonstandardcode}; 824 $OptionsInfo{MaxSequenceLength} = $Options{sequencelength}; 825 $OptionsInfo{SequenceRecordSource} = $Options{sequencerecords}; 826 $OptionsInfo{SequenceIDPrefixSource} = $Options{sequenceidprefix}; 827 828 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0; 829 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0; 830 } 831 832 # Process specified residue options... 833 sub ProcessResiduesOptions { 834 my($ResidueNum, $StartResidueNum, $EndResNum, $ResidueName, @SpecifiedResidueNumsList, @SpecifiedResidueNamesList); 835 836 @SpecifiedResidueNumsList = (); 837 ($StartResidueNum, $EndResNum) = (0, 0); 838 839 @SpecifiedResidueNamesList = (); 840 841 if ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange|ResidueNames)$/i) { 842 if (!$Options{residues}) { 843 die "Error: You must specify a value for \"--Residues\" option in \"ResidueNums, ResiduesRange, or ResidueNames\" \"-m, --mode\". \n"; 844 } 845 $OptionsInfo{Residues} = $Options{residues}; 846 $OptionsInfo{Residues} =~ s/ //g; 847 848 if ($OptionsInfo{Mode} =~ /^ResidueNames$/i) { 849 @SpecifiedResidueNamesList = split /\,/, $OptionsInfo{Residues}; 850 } 851 else { 852 @SpecifiedResidueNumsList = split /\,/, $OptionsInfo{Residues}; 853 for $ResidueNum (@SpecifiedResidueNumsList) { 854 if (!IsPositiveInteger($ResidueNum)) { 855 die "Error: Invalid residue number value, $ResidueNum, for \"--Residues\" option during \"ResidueNumes\" or \"ResiduesRange\"value of \"-m --mode\" option: Residue number must be a positive integer.\n"; 856 } 857 } 858 if ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) { 859 if (@SpecifiedResidueNumsList != 2) { 860 die "Error: Invalid number of residue number values, ", scalar(@SpecifiedResidueNumsList), ", for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end residue numbers.\n"; 861 } 862 if ($SpecifiedResidueNumsList[0] > $SpecifiedResidueNumsList[1]) { 863 die "Error: Invalid residue number values, @SpecifiedResidueNumsList, for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The start residue number must be less than end residue number.\n"; 864 } 865 ($StartResidueNum, $EndResNum) = @SpecifiedResidueNumsList; 866 } 867 } 868 } 869 870 @{$OptionsInfo{SpecifiedResidueNumsList}} = (); 871 push @{$OptionsInfo{SpecifiedResidueNumsList}}, @SpecifiedResidueNumsList; 872 873 $OptionsInfo{SpecifiedStartResidueNum} = $StartResidueNum; 874 $OptionsInfo{SpecifiedEndResidueNum} = $EndResNum; 875 876 @{$OptionsInfo{SpecifiedResidueNamesList}} = (); 877 push @{$OptionsInfo{SpecifiedResidueNamesList}}, @SpecifiedResidueNamesList; 878 879 # Set up a specified residue numbers map... 880 %{$OptionsInfo{SpecifiedResidueNumsMap}} = (); 881 for $ResidueNum (@{$OptionsInfo{SpecifiedResidueNumsList}}) { 882 $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNum} = $ResidueNum; 883 } 884 885 # Set up a specified residue names map... 886 %{$OptionsInfo{SpecifiedResidueNamesMap}} = (); 887 for $ResidueName (@{$OptionsInfo{SpecifiedResidueNamesList}}) { 888 $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName} = lc $ResidueName; 889 } 890 891 } 892 893 # Process specified atom options... 894 sub ProcessAtomsOptions { 895 my($AtomNum, $StartAtomNum, $EndAtomNum, $AtomName, @SpecifiedAtomNumsList, @SpecifiedAtomNamesList); 896 897 @SpecifiedAtomNumsList = (); 898 ($StartAtomNum, $EndAtomNum) = (0, 0); 899 900 @SpecifiedAtomNamesList = (); 901 902 if ($OptionsInfo{Mode} =~ /^(AtomNums|AtomsRange|AtomNames)$/i) { 903 if (!$Options{atoms}) { 904 die "Error: You must specify a value for \"--Atoms\" option in \"AtomNums, AtomsRange, or AtomNames\" \"-m, --mode\". \n"; 905 } 906 $OptionsInfo{Atoms} = $Options{atoms}; 907 $OptionsInfo{Atoms} =~ s/ //g; 908 909 if ($OptionsInfo{Mode} =~ /^AtomNames$/i) { 910 @SpecifiedAtomNamesList = split /\,/, $OptionsInfo{Atoms}; 911 } 912 else { 913 @SpecifiedAtomNumsList = split /\,/, $OptionsInfo{Atoms}; 914 for $AtomNum (@SpecifiedAtomNumsList) { 915 if (!IsPositiveInteger($AtomNum)) { 916 die "Error: Invalid atom number value, $AtomNum, for \"--Atoms\" option during \"AtomNums\" or \"AtomsRange\"value of \"-m --mode\" option: Atom number must be a positive integer.\n"; 917 } 918 } 919 if ($OptionsInfo{Mode} =~ /^AtomsRange$/i) { 920 if (@SpecifiedAtomNumsList != 2) { 921 die "Error: Invalid number of atom number values, ", scalar(@SpecifiedAtomNumsList), ", for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end atom numbers.\n"; 922 } 923 if ($SpecifiedAtomNumsList[0] > $SpecifiedAtomNumsList[1]) { 924 die "Error: Invalid atom number values, @SpecifiedAtomNumsList, for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The start atom number must be less than end atom number.\n"; 925 } 926 ($StartAtomNum, $EndAtomNum) = @SpecifiedAtomNumsList; 927 } 928 } 929 } 930 elsif ($OptionsInfo{Mode} =~ /^CAlphas$/i) { 931 @SpecifiedAtomNamesList = ("CA"); 932 } 933 934 @{$OptionsInfo{SpecifiedAtomNumsList}} = (); 935 push @{$OptionsInfo{SpecifiedAtomNumsList}}, @SpecifiedAtomNumsList; 936 937 $OptionsInfo{SpecifiedStartAtomNum} = $StartAtomNum; 938 $OptionsInfo{SpecifiedEndAtomNum} = $EndAtomNum; 939 940 @{$OptionsInfo{SpecifiedAtomNamesList}} = (); 941 push @{$OptionsInfo{SpecifiedAtomNamesList}}, @SpecifiedAtomNamesList; 942 943 # Set up a specified residue numbers map... 944 %{$OptionsInfo{SpecifiedAtomNumsMap}} = (); 945 for $AtomNum (@{$OptionsInfo{SpecifiedAtomNumsList}}) { 946 $OptionsInfo{SpecifiedAtomNumsMap}{$AtomNum} = $AtomNum; 947 } 948 949 # Set up a specified residue names map... 950 %{$OptionsInfo{SpecifiedAtomNamesMap}} = (); 951 for $AtomName (@{$OptionsInfo{SpecifiedAtomNamesList}}) { 952 $OptionsInfo{SpecifiedAtomNamesMap}{lc $AtomName} = lc $AtomName; 953 } 954 955 } 956 957 # Process specified distance options... 958 sub ProcessDistanceOptions { 959 my(@SpecifiedDistanceOrigin) = (); 960 961 $OptionsInfo{MaxExtractionDistance} = $Options{distance}; 962 $OptionsInfo{ExtractionDistanceMode} = $Options{distancemode}; 963 $OptionsInfo{ExtractionDistanceOrigin} = $Options{distanceorigin} ? $Options{distanceorigin} : ''; 964 $OptionsInfo{DistanceSelectionMode} = $Options{distanceselectionmode}; 965 966 if ($OptionsInfo{Mode} =~ /^Distance$/i) { 967 if (!$Options{distanceorigin}) { 968 die "Error: You must specify a value for \"--distanceorigin\" option in \"Distance\" \"-m, --mode\". \n"; 969 } 970 @SpecifiedDistanceOrigin = split /\,/, $Options{distanceorigin}; 971 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) { 972 if (@SpecifiedDistanceOrigin != 2) { 973 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\" : The number of values must be 2.\n"; 974 } 975 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 976 die "Error: Invalid atom number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\". Allowed values: > 0\n"; 977 } 978 } 979 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) { 980 if (@SpecifiedDistanceOrigin != 2) { 981 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\" : The number of values must be 2.\n"; 982 } 983 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 984 die "Error: Invalid hetatm number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\". Allowed values: > 0\n"; 985 } 986 } 987 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 988 if (!(@SpecifiedDistanceOrigin == 2 || @SpecifiedDistanceOrigin == 3)) { 989 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\" : The number of values must be either 2 or 3.\n"; 990 } 991 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 992 die "Error: Invalid residue number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\". Allowed values: > 0\n"; 993 } 994 } 995 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { 996 if (@SpecifiedDistanceOrigin != 3) { 997 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\" : The number of values must be 3.\n"; 998 } 999 my($Value); 1000 for $Value (@SpecifiedDistanceOrigin) { 1001 if (!IsNumerical($Value)) { 1002 die "Error: Invalid coordinate value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\". Allowed values: numerical\n"; 1003 } 1004 } 1005 } 1006 } 1007 @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} = (); 1008 push @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}, @SpecifiedDistanceOrigin; 1009 1010 } 1011 1012 # Retrieve information about PDB files... 1013 sub RetrievePDBFilesInfo { 1014 my($Index, $PDBFile, $PDBRecordLinesRef, $ChainID, $ChainLabel, $ChainsAndResiduesInfoRef, $Mode, $FileDir, $FileName, $FileExt, $OutFileName, $OutFileRoot, @SpecifiedChains, @DistanceOrigin, @OutFileNames, @ChainLabels, @ChainSequenceIDs, @ChainSequenceIDsPrefix); 1015 1016 %PDBFilesInfo = (); 1017 @{$PDBFilesInfo{FileOkay}} = (); 1018 @{$PDBFilesInfo{OutFileRoot}} = (); 1019 @{$PDBFilesInfo{OutFileNames}} = (); 1020 @{$PDBFilesInfo{ChainLabels}} = (); 1021 @{$PDBFilesInfo{ChainSequenceIDs}} = (); 1022 @{$PDBFilesInfo{ChainSequenceIDsPrefix}} = (); 1023 @{$PDBFilesInfo{SpecifiedChains}} = (); 1024 @{$PDBFilesInfo{DistanceOrigin}} = (); 1025 1026 FILELIST: for $Index (0 .. $#PDBFilesList) { 1027 $PDBFilesInfo{FileOkay}[$Index] = 0; 1028 1029 $PDBFilesInfo{OutFileRoot}[$Index] = ''; 1030 @{$PDBFilesInfo{OutFileNames}[$Index]} = (); 1031 @{$PDBFilesInfo{OutFileNames}[$Index]} = (); 1032 @{$PDBFilesInfo{ChainLabels}[$Index]} = (); 1033 @{$PDBFilesInfo{ChainSequenceIDs}[$Index]} = (); 1034 @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]} = (); 1035 @{$PDBFilesInfo{SpecifiedChains}[$Index]} = (); 1036 @{$PDBFilesInfo{DistanceOrigin}[$Index]} = (); 1037 1038 $PDBFile = $PDBFilesList[$Index]; 1039 if (!(-e $PDBFile)) { 1040 warn "Warning: Ignoring file $PDBFile: It doesn't exist\n"; 1041 next FILELIST; 1042 } 1043 if (!CheckFileType($PDBFile, "pdb")) { 1044 warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n"; 1045 next FILELIST; 1046 } 1047 if (! open PDBFILE, "$PDBFile") { 1048 warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n"; 1049 next FILELIST; 1050 } 1051 close PDBFILE; 1052 1053 # Get PDB data... 1054 $PDBRecordLinesRef = ReadPDBFile($PDBFile); 1055 if ($OptionsInfo{Mode} =~ /^Sequences$/i && $OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) { 1056 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes'); 1057 } 1058 else { 1059 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef); 1060 } 1061 if (!scalar @{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 1062 warn "Warning: Ignoring file $PDBFile: No chains found \n"; 1063 next FILELIST; 1064 } 1065 1066 # Make sure specified chains exist in PDB file... 1067 @SpecifiedChains = (); 1068 if ($OptionsInfo{ChainsToExtract} =~ /^Specified$/i) { 1069 for $ChainID (@{$OptionsInfo{SpecifiedChains}}) { 1070 if (exists $ChainsAndResiduesInfoRef->{Residues}{$ChainID}) { 1071 push @SpecifiedChains, $ChainID; 1072 } 1073 else { 1074 warn "Warning: Ignoring file $PDBFile: Specified chain, $ChainID, in \"-c, --chains\" option doesn't exist.\n"; 1075 next FILELIST; 1076 } 1077 } 1078 } 1079 elsif ($OptionsInfo{ChainsToExtract} =~ /^First$/i) { 1080 push @SpecifiedChains, $ChainsAndResiduesInfoRef->{ChainIDs}[0]; 1081 } 1082 elsif ($OptionsInfo{ChainsToExtract} =~ /^All$/i) { 1083 push @SpecifiedChains, @{$ChainsAndResiduesInfoRef->{ChainIDs}}; 1084 } 1085 # Setup chain labels to use for sequence IDs and generating output files... 1086 @ChainLabels = (); 1087 for $ChainID (@SpecifiedChains) { 1088 $ChainLabel = $ChainID; $ChainLabel =~ s/^None//ig; 1089 $ChainLabel = "Chain${ChainLabel}"; 1090 push @ChainLabels, $ChainLabel; 1091 } 1092 1093 # Make sure specified distance origin is valid... 1094 @DistanceOrigin = (); 1095 if ($OptionsInfo{Mode} =~ /^Distance$/i) { 1096 if ($OptionsInfo{ExtractionDistanceMode} =~ /^(Atom|Hetatm)$/i) { 1097 my($RecordType, $SpecifiedAtomName, $SpecifiedAtomNumber, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine); 1098 $RecordType = $OptionsInfo{ExtractionDistanceMode}; 1099 ($SpecifiedAtomNumber, $SpecifiedAtomName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 1100 $RecordFound = 0; 1101 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 1102 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 1103 next LINE; 1104 } 1105 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 1106 $AtomName = RemoveLeadingAndTrailingWhiteSpaces($AtomName); 1107 if (($RecordType =~ /^Atom$/i && IsAtomRecordType($RecordLine)) || ($RecordType =~ /^Hetatm$/i && IsHetatmRecordType($RecordLine))) { 1108 if ($AtomNumber == $SpecifiedAtomNumber && $AtomName eq $SpecifiedAtomName) { 1109 $RecordFound = 1; 1110 last LINE; 1111 } 1112 } 1113 } 1114 if (!$RecordFound) { 1115 warn "Warning: Ignoring file $PDBFile: ", uc($RecordType), " record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n"; 1116 next FILELIST; 1117 } 1118 push @DistanceOrigin, ($X, $Y, $Z); 1119 } 1120 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 1121 my($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine); 1122 $SpecifiedChainID = ''; 1123 if (@{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} == 3) { 1124 ($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 1125 } 1126 else { 1127 ($SpecifiedResidueNumber, $SpecifiedResidueName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 1128 } 1129 $RecordFound = 0; 1130 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 1131 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 1132 next LINE; 1133 } 1134 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 1135 $ResidueName = RemoveLeadingAndTrailingWhiteSpaces($ResidueName); 1136 $ChainID = RemoveLeadingAndTrailingWhiteSpaces($ChainID); 1137 if ($SpecifiedChainID && ($SpecifiedChainID ne $ChainID)) { 1138 next LINE; 1139 } 1140 if ($ResidueNumber == $SpecifiedResidueNumber && $ResidueName eq $SpecifiedResidueName) { 1141 # Store coordinates for all the atoms... 1142 $RecordFound = 1; 1143 push @DistanceOrigin, ($X, $Y, $Z); 1144 next LINE; 1145 } 1146 } 1147 if (!$RecordFound) { 1148 warn "Warning: Ignoring file $PDBFile: ATOM/HETATM record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n"; 1149 next FILELIST; 1150 } 1151 } 1152 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { 1153 push @DistanceOrigin, @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 1154 } 1155 } 1156 # Setup output file names... 1157 @OutFileNames = (); 1158 $FileDir = ""; $FileName = ""; $FileExt = ""; 1159 ($FileDir, $FileName, $FileExt) = ParseFileName($PDBFile); 1160 if ($OptionsInfo{OutFileRoot} && (@PDBFilesList == 1)) { 1161 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); 1162 if ($RootFileName && $RootFileExt) { 1163 $FileName = $RootFileName; 1164 } 1165 else { 1166 $FileName = $OptionsInfo{OutFileRoot}; 1167 } 1168 $OutFileRoot = $FileName; 1169 } 1170 else { 1171 $OutFileRoot = $FileName; 1172 } 1173 $Mode = $OptionsInfo{Mode}; 1174 if ($Mode =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResiduesRange|ResidueNames|Distance|NonWater|NonHydrogens)$/i) { 1175 $OutFileName = ''; 1176 if ($Mode =~ /^CAlphas$/i) { 1177 $OutFileName = "${OutFileRoot}CAlphas.pdb"; 1178 } 1179 elsif ($Mode =~ /^Atoms$/i) { 1180 $OutFileName = "${OutFileRoot}Atoms.pdb"; 1181 } 1182 elsif ($Mode =~ /^AtomNums$/i) { 1183 $OutFileName = "${OutFileRoot}AtomNums.pdb"; 1184 } 1185 elsif ($Mode =~ /^AtomsRange$/i) { 1186 $OutFileName = "${OutFileRoot}AtomsRange.pdb"; 1187 } 1188 elsif ($Mode =~ /^AtomNames$/i) { 1189 $OutFileName = "${OutFileRoot}AtomNames.pdb"; 1190 } 1191 elsif ($Mode =~ /^ResidueNums$/i) { 1192 $OutFileName = "${OutFileRoot}ResidueNums.pdb"; 1193 } 1194 elsif ($Mode =~ /^ResiduesRange$/i) { 1195 $OutFileName = "${OutFileRoot}ResiduesRange.pdb"; 1196 } 1197 elsif ($Mode =~ /^ResidueNames$/i) { 1198 $OutFileName = "${OutFileRoot}ResidueNames.pdb"; 1199 } 1200 elsif ($Mode =~ /^NonWater$/i) { 1201 $OutFileName = "${OutFileRoot}NonWater.pdb"; 1202 } 1203 elsif ($Mode =~ /^NonHydrogens$/i) { 1204 $OutFileName = "${OutFileRoot}NonHydrogens.pdb"; 1205 } 1206 elsif ($Mode =~ /^Distance$/i) { 1207 my($DistanceMode) = ''; 1208 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) { 1209 $DistanceMode = 'Atom'; 1210 } 1211 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) { 1212 $DistanceMode = 'Hetatm'; 1213 } 1214 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 1215 $DistanceMode = 'Residue'; 1216 } 1217 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { 1218 $DistanceMode = 'XYZ'; 1219 } 1220 $OutFileName = "${OutFileRoot}DistanceBy${DistanceMode}.pdb"; 1221 } 1222 push @OutFileNames, $OutFileName; 1223 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { 1224 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; 1225 next FILELIST; 1226 } 1227 } 1228 elsif ($Mode =~ /^(Chains|Sequences)$/i) { 1229 if ($OptionsInfo{CombineChainSequences}) { 1230 $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}ExtractedChains.pdb" : "${OutFileRoot}SequencesChainsCombined.fasta"; 1231 push @OutFileNames, $OutFileName; 1232 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { 1233 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; 1234 next FILELIST; 1235 } 1236 } 1237 else { 1238 for $ChainLabel (@ChainLabels) { 1239 $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}${ChainLabel}.pdb" : "${OutFileRoot}Sequences${ChainLabel}.fasta"; 1240 push @OutFileNames, $OutFileName; 1241 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { 1242 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; 1243 next FILELIST; 1244 } 1245 } 1246 } 1247 } 1248 @ChainSequenceIDs = (); 1249 @ChainSequenceIDsPrefix = (); 1250 if ($Mode =~ /^Sequences$/i) { 1251 my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode, $IDPrefix); 1252 ($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef); 1253 1254 if ($OptionsInfo{SequenceIDPrefixSource} =~ /^FileName$/i) { 1255 $IDPrefix = $FileName; 1256 } 1257 elsif ($OptionsInfo{SequenceIDPrefixSource} =~ /^HeaderRecord$/i) { 1258 $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : ''; 1259 } 1260 else { 1261 $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : $FileName; 1262 } 1263 1264 for $ChainLabel (@ChainLabels) { 1265 push @ChainSequenceIDsPrefix, $IDPrefix; 1266 push @ChainSequenceIDs, "${IDPrefix}_${ChainLabel}|PDB"; 1267 } 1268 } 1269 1270 $PDBFilesInfo{FileOkay}[$Index] = 1; 1271 $PDBFilesInfo{OutFileRoot}[$Index] = $OutFileRoot; 1272 1273 push @{$PDBFilesInfo{OutFileNames}[$Index]}, @OutFileNames; 1274 push @{$PDBFilesInfo{ChainLabels}[$Index]}, @ChainLabels; 1275 push @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]}, @ChainSequenceIDsPrefix; 1276 push @{$PDBFilesInfo{ChainSequenceIDs}[$Index]}, @ChainSequenceIDs; 1277 push @{$PDBFilesInfo{SpecifiedChains}[$Index]}, @SpecifiedChains; 1278 push @{$PDBFilesInfo{DistanceOrigin}[$Index]}, @DistanceOrigin; 1279 } 1280 } 1281 1282 1283 # Setup script usage and retrieve command line arguments specified using various options... 1284 sub SetupScriptUsage { 1285 1286 # Retrieve all the options... 1287 %Options = (); 1288 $Options{chains} = 'First'; 1289 $Options{chainsrecordmode} = 'NotAcrossTER'; 1290 $Options{combinechains} = 'no'; 1291 $Options{distance} = 10.0; 1292 $Options{distancemode} = 'XYZ'; 1293 $Options{distanceselectionmode} = 'ByAtom'; 1294 $Options{keepoldrecords} = 'no'; 1295 $Options{mode} = 'NonWater'; 1296 $Options{modifyheader} = 'yes'; 1297 $Options{nonstandardkeep} = 'yes'; 1298 $Options{nonstandardcode} = 'X'; 1299 $Options{sequencelength} = 80; 1300 $Options{sequenceidprefix} = 'Automatic'; 1301 $Options{sequencerecords} = 'Atom'; 1302 $Options{waterresiduenames} = 'Automatic'; 1303 1304 if (!GetOptions(\%Options, "atoms|a=s", "chains|c=s", "chainsrecordmode=s", "combinechains=s", "distance|d=f", "distancemode=s", "distanceorigin=s", "distanceselectionmode=s", "help|h", "keepoldrecords|k=s", "mode|m=s", "modifyheader=s", "nonstandardkeep=s", "nonstandardcode=s", "overwrite|o", "root|r=s", "recordmode=s", "residues=s", "sequencelength=i", "sequenceidprefix=s", "sequencerecords=s", "waterresiduenames=s", "workingdir|w=s")) { 1305 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 1306 } 1307 if ($Options{workingdir}) { 1308 if (! -d $Options{workingdir}) { 1309 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 1310 } 1311 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 1312 } 1313 if ($Options{combinechains} !~ /^(yes|no)$/i) { 1314 die "Error: The value specified, $Options{combinechains}, for option \"--CombineChains\" is not valid. Allowed values: yes or no\n"; 1315 } 1316 if ($Options{chainsrecordmode} !~ /^(AcrossTER|NotAcrossTER)$/i) { 1317 die "Error: The value specified, $Options{chainsrecordmode}, for option \"--ChainsRecordMode\" is not valid. Allowed values: AcrossTER or NotAcrossTER\n"; 1318 } 1319 if ($Options{distancemode} !~ /^(Atom|Hetatm|Residue|XYZ)$/i) { 1320 die "Error: The value specified, $Options{distancemode}, for option \"--DistanceMode\" is not valid. Allowed values: Atom, Hetatm, Residue, or XYZ\n"; 1321 } 1322 if ($Options{distanceselectionmode} !~ /^(ByAtom|ByResidue)$/i) { 1323 die "Error: The value specified, $Options{distanceselectionmode}, for option \"--DistanceSelectionMode\" is not valid. Allowed values: ByAtom or ByResidue\n"; 1324 } 1325 if ($Options{keepoldrecords} !~ /^(yes|no)$/i) { 1326 die "Error: The value specified, $Options{keepoldrecords}, for option \"--KeepOldRecords\" is not valid. Allowed values: yes or no\n"; 1327 } 1328 if ($Options{mode} !~ /^(Chains|Sequences|Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResidueNames|ResiduesRange|Distance|NonWater|NonHydrogens)$/i) { 1329 die "Error: The value specified, $Options{mode}, for option \"m, --mode\" is not valid. Allowed values: Chains, Sequences, Atoms, CAlphas, AtomNums, AtomsRange, AtomNames, ResidueNums, ResiduesRange, ResidueNames, Distance, NonWater, NonHydrogens\n"; 1330 } 1331 if ($Options{modifyheader} !~ /^(yes|no)$/i) { 1332 die "Error: The value specified, $Options{modifyheader}, for option \"--ModifyHeader\" is not valid. Allowed values: yes or no\n"; 1333 } 1334 if ($Options{nonstandardkeep} !~ /^(yes|no)$/i) { 1335 die "Error: The value specified, $Options{nonstandardkeep}, for option \"--NonStandardKeep\" is not valid. Allowed values: yes or no\n"; 1336 } 1337 if ($Options{nonstandardcode} !~ /^(\?|\-|X)$/i) { 1338 die "Error: The value specified, $Options{nonstandardcode}, for option \"--NonStandardCode\" is not valid. Allowed values: ?, -, or X\n"; 1339 } 1340 if ($Options{recordmode} && $Options{recordmode} !~ /^(Atom|Hetatm|AtomAndHetatm)$/i) { 1341 die "Error: The value specified, $Options{recordmode}, for option \"--RecordMode\" is not valid. Allowed values: Atom, Hetatm, AtomAndHetatm\n"; 1342 } 1343 if (!IsPositiveInteger($Options{sequencelength})) { 1344 die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n"; 1345 } 1346 if ($Options{sequencerecords} !~ /^(Atom|SeqRes)$/i) { 1347 die "Error: The value specified, $Options{sequencerecords}, for option \"--SequenceRecords\" is not valid. Allowed values: Atom or SeqRes\n"; 1348 } 1349 if ($Options{sequenceidprefix} !~ /^(FileName|HeaderRecord|Automatic)$/i) { 1350 die "Error: The value specified, $Options{sequenceidprefix}, for option \"--SequenceIDPrefix\" is not valid. Allowed values: FileName, HeaderRecord, or AutomaticAtom\n"; 1351 } 1352 } 1353