#!/usr/bin/perl -w
#
# File: DownloadPDBFiles.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use File::Fetch;
use File::Copy;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use PDBFileUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Process options...
print "Processing options...\n";
my(%OptionsInfo, %PDBIDsFileInfo);
ProcessOptions();

# Collect PDB IDs and download corresponding files...
my(%PDBFilesInfo);
SetupPDBFilesInfo();
DownloadPDBFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Download appropriate PDB files...
#
# For every PDB ID listed in $PDBFilesInfo{PDBIDs}:
#
#   . IDs containing a period are reported and skipped.
#   . The PDB format file is tried first, when enabled for the ID; the CIF
#     format file is tried only when enabled and the PDB download didn't succeed.
#   . When density map download is turned on, the downloaded file is inspected
#     for cryo-EM data and the x-ray ED map and cryo-EM map download routines
#     are called; each of them skips the IDs it doesn't apply to.
#
# Takes no arguments and returns nothing useful; it prints a summary of the
# counts collected during the run.
#
sub DownloadPDBFiles {
  my($PDBIDCount, $PDBIDOkayCount, $PDBIDFailedCount, $PDBIDsIgnoredCount, $PDBID, $PDBStatus, $CIFStatus, $DownloadDensityMap, $CryoEMDataStatus, $EMDBID, $TotalEDMapOkayCount, $TotalEDMapFailedCount, $EDMapOkayCount, $EDMapFailedCount, $TotalCryoEMMapOkayCount, $TotalCryoEMMapFailedCount, $CryoEMMapOkayCount, $CryoEMMapFailedCount);

  print "\nDownloading PDB files...\n";

  ($PDBIDCount, $PDBIDOkayCount, $PDBIDFailedCount, $PDBIDsIgnoredCount, $TotalEDMapOkayCount, $TotalEDMapFailedCount, $TotalCryoEMMapOkayCount, $TotalCryoEMMapFailedCount) = (0) x 8;

  $DownloadDensityMap = $OptionsInfo{DensityMap};

  # Turn off warnings from File::Fetch
  $File::Fetch::WARN = 0;

  PDBID: for $PDBID (@{$PDBFilesInfo{PDBIDs}}) {
    $PDBIDCount++;

    print "\nProcessing PDB ID $PDBID...\n";

    if ($PDBID =~ /\./) {
      $PDBIDsIgnoredCount++;
      warn "Warning: Ignoring invalid PDB ID $PDBID\n";
      next PDBID;
    }

    # Download PDB format file...
    $PDBStatus = 0;
    if ($PDBFilesInfo{DownloadPDB}{$PDBID}) {
      print "Downloading PDB file: $PDBFilesInfo{RemoteFilePDBFormat}{$PDBID}\n";
      $PDBStatus = DownloadFile($PDBFilesInfo{RemoteFilePDBFormat}{$PDBID}, $PDBFilesInfo{LocalFilePDBFormat}{$PDBID});
    }

    # Try downloading CIF format file...
    $CIFStatus = 0;
    if ($PDBFilesInfo{DownloadCIF}{$PDBID} && !$PDBStatus) {
      print "Downloading PDB file: $PDBFilesInfo{RemoteFileCIFFormat}{$PDBID}\n";
      $CIFStatus = DownloadFile($PDBFilesInfo{RemoteFileCIFFormat}{$PDBID}, $PDBFilesInfo{LocalFileCIFFormat}{$PDBID});
    }

    # Check status of download...
    if (!($PDBStatus || $CIFStatus)) {
      $PDBIDFailedCount++;
      next PDBID;
    }
    $PDBIDOkayCount++;

    # Any need to download density files...
    if (!$DownloadDensityMap) {
      next PDBID;
    }

    # Check whether it's cryo-EM data file...
    #
    $CryoEMDataStatus = 0;
    $EMDBID = 0;
    if ($PDBStatus) {
      ($CryoEMDataStatus, $EMDBID) = RetrieveEMDBIDFromPDBFile($PDBFilesInfo{LocalFilePDBFormat}{$PDBID});
    }
    elsif ($CIFStatus) {
      ($CryoEMDataStatus, $EMDBID) = RetrieveEMDBIDFromCIFFile($PDBFilesInfo{LocalFileCIFFormat}{$PDBID});
    }

    # Each statement is terminated explicitly; the counters don't rely on the
    # comma operator to sequence the download calls...
    ($EDMapOkayCount, $EDMapFailedCount) = DownloadEDMapFiles($PDBID, $CryoEMDataStatus);
    $TotalEDMapOkayCount += $EDMapOkayCount;
    $TotalEDMapFailedCount += $EDMapFailedCount;

    ($CryoEMMapOkayCount, $CryoEMMapFailedCount) = DownloadCryoEMMapFiles($PDBID, $CryoEMDataStatus, $EMDBID);
    $TotalCryoEMMapOkayCount += $CryoEMMapOkayCount;
    $TotalCryoEMMapFailedCount += $CryoEMMapFailedCount;
  }

  print "\nTotal number of PDB IDs: $PDBIDCount\n";
  print "Number of PDB IDs ignored: $PDBIDsIgnoredCount\n";

  print "\nNumber of successful downloads: $PDBIDOkayCount\n";
  print "Number of failed downloads: $PDBIDFailedCount\n";

  if ($DownloadDensityMap) {
    print "\nNumber of successful ED map downloads: $TotalEDMapOkayCount\n";
    print "Number of failed ED map downloads: $TotalEDMapFailedCount\n";

    print "\nNumber of successful cryo-EM map downloads: $TotalCryoEMMapOkayCount\n";
    print "Number of failed cryo-EM map downloads: $TotalCryoEMMapFailedCount\n";
  }
}

# Download x-ray electron density files...
# DownloadEDMapFiles($PDBID, $CryoEMDataStatus)
#
# Download the x-ray electron density map files listed for $PDBID in
# %PDBFilesInfo and rename each downloaded file from its local name to its
# final name by way of the temporary name set up for it.
#
# Nothing is downloaded when $CryoEMDataStatus is true.
#
# Returns: ($EDMapOkayCount, $EDMapFailedCount). A file counts as okay as soon
# as its download succeeds; a rename failure is reported as a warning only.
#
sub DownloadEDMapFiles {
  my($PDBID, $CryoEMDataStatus) = @_;
  my($Index, $RemoteEDMapFile, $LocalEDMapFile, $TmpLocalEDMapFile, $FinalLocalEDMapFile, $EDMapFailedCount, $EDMapOkayCount);

  ($EDMapOkayCount, $EDMapFailedCount) = (0) x 2;

  if ($CryoEMDataStatus) {
    print "Skipping download of x-ray electron density files for cryo-EM PDB data...\n";
    return ($EDMapOkayCount, $EDMapFailedCount);
  }

  EDMAPFILE: for $Index (0 .. $#{$PDBFilesInfo{RemoteEDMapFiles}{$PDBID}}) {
    $RemoteEDMapFile = $PDBFilesInfo{RemoteEDMapFiles}{$PDBID}[$Index];
    $LocalEDMapFile = $PDBFilesInfo{LocalEDMapFiles}{$PDBID}[$Index];
    $TmpLocalEDMapFile = $PDBFilesInfo{TmpLocalEDMapFiles}{$PDBID}[$Index];
    $FinalLocalEDMapFile = $PDBFilesInfo{FinalLocalEDMapFiles}{$PDBID}[$Index];

    print "Downloading x-ray electron density map file: $RemoteEDMapFile\n";
    if (!DownloadFile($RemoteEDMapFile, $LocalEDMapFile)) {
      $EDMapFailedCount++;
      next EDMAPFILE;
    }
    $EDMapOkayCount++;

    # Rename downloaded ED file...
    print "Moving file from $LocalEDMapFile to $FinalLocalEDMapFile\n";
    if (!move($LocalEDMapFile, $TmpLocalEDMapFile)) {
      # The temporary file doesn't exist, so the second move can't succeed
      # either; skip it instead of emitting a second, misleading warning...
      warn "Warning: Couldn't move file $LocalEDMapFile to $TmpLocalEDMapFile\n";
      next EDMAPFILE;
    }
    if (!move($TmpLocalEDMapFile, $FinalLocalEDMapFile)) {
      warn "Warning: Couldn't move file $TmpLocalEDMapFile to $FinalLocalEDMapFile\n";
    }
  }

  return ($EDMapOkayCount, $EDMapFailedCount);
}

# Download cryo-EM density files...
# DownloadCryoEMMapFiles($PDBID, $CryoEMDataStatus, $EMDBID)
#
# Download the cryo-EM density files listed for $PDBID in %PDBFilesInfo after
# replacing the "EMDBIDPlaceHolder" text in their names by $EMDBID.
#
# Nothing is downloaded when $CryoEMDataStatus is false. When files are listed
# but no EMDB ID is available, all of them are counted as failed without
# contacting the server.
#
# Returns: ($CryoEMMapOkayCount, $CryoEMMapFailedCount)
#
sub DownloadCryoEMMapFiles {
  my($PDBID, $CryoEMDataStatus, $EMDBID) = @_;
  my($Index, $LastIndex, $RemoteCryoEMMapFile, $LocalCryoEMMapFile, $CryoEMMapFailedCount, $CryoEMMapOkayCount, $FileType);

  ($CryoEMMapOkayCount, $CryoEMMapFailedCount) = (0) x 2;

  if (!$CryoEMDataStatus) {
    print "Skipping download of cryo-EM density files for non cryo-EM PDB data...\n";
    return ($CryoEMMapOkayCount, $CryoEMMapFailedCount);
  }

  $LastIndex = $#{$PDBFilesInfo{RemoteCyroEMMapFiles}{$PDBID}};

  # Without an EMDB ID the placeholder can't be turned into a real location...
  if ($LastIndex >= 0 && !$EMDBID) {
    warn "Warning: Skipping download of cryo-EM density files for PDB ID $PDBID: No EMDB ID found\n";
    $CryoEMMapFailedCount = $LastIndex + 1;
    return ($CryoEMMapOkayCount, $CryoEMMapFailedCount);
  }

  CRYOEMMAPFILE: for $Index (0 .. $LastIndex) {
    $FileType = $PDBFilesInfo{RemoteCyroEMMapFileTypes}{$PDBID}[$Index];
    $RemoteCryoEMMapFile = $PDBFilesInfo{RemoteCyroEMMapFiles}{$PDBID}[$Index];
    $LocalCryoEMMapFile = $PDBFilesInfo{LocalCyroEMMapFiles}{$PDBID}[$Index];

    # Update file names with actual EMDBID...
    $RemoteCryoEMMapFile =~ s/EMDBIDPlaceHolder/$EMDBID/ig;
    $LocalCryoEMMapFile =~ s/EMDBIDPlaceHolder/$EMDBID/ig;

    print "Downloading cryo-EM density $FileType file: $RemoteCryoEMMapFile\n";
    if (!DownloadFile($RemoteCryoEMMapFile, $LocalCryoEMMapFile)) {
      $CryoEMMapFailedCount++;
      next CRYOEMMAPFILE;
    }
    $CryoEMMapOkayCount++;
  }

  return ($CryoEMMapOkayCount, $CryoEMMapFailedCount);
}


# Download specified file...
#
# DownloadFile($RemoteFileURL, $LocalFileName)
#
# Fetch $RemoteFileURL into the current working directory using File::Fetch.
# $LocalFileName is only used for cleanup: when the fetch fails and a file by
# that name exists, it is deleted.
#
# Returns: 1 on success, 0 on failure. Failures are reported as warnings.
#
sub DownloadFile {
  my($RemoteFileURL, $LocalFileName) = @_;
  my($FileFetch, $FetchedFilePath, $ErrorMsg);

  # Setup a fetch object; new() returns a false value for a URI it can't handle...
  $FileFetch = File::Fetch->new(uri => $RemoteFileURL);
  if (!$FileFetch) {
    warn "Warning: Download failed for file $RemoteFileURL: Couldn't set up a fetch object for it\n";
    return 0;
  }

  # Fetch file to the CWD...
  $FetchedFilePath = $FileFetch->fetch();

  if (IsEmpty($FetchedFilePath)) {
    $ErrorMsg = $FileFetch->error();
    if (!defined $ErrorMsg) {
      $ErrorMsg = "";
    }
    warn "Warning: Download failed for file $RemoteFileURL: " . $ErrorMsg . "\n";
    if (-e $LocalFileName) {
      warn "Warning: Deleting empty file $LocalFileName\n";
      unlink $LocalFileName or warn "Warning: Couldn't delete file $LocalFileName\n";
    }
    return 0;
  }

  return 1;
}

# Collect specified PDB IDs along with setting up PDB and ED file names for all
# specified PDB IDs...
#
sub SetupPDBFilesInfo {

  %PDBFilesInfo = ();
  RetrievePDBIDs();
  SetupPDBandEDFileNames();
}

# Retrieve EMDB ID from PDB file...
#
# Scans $PDBFile for an EXPDTA line mentioning ELECTRON MICROSCOPY and for the
# first REMARK line mentioning "DB: EMDB", from which the digits following
# " EMD-" are taken as the EMDB ID.
#
# Returns: ($CryoEMDataType, $EMDBID); both are 0 when nothing is found or the
# file doesn't exist. Dies when an existing file can't be opened.
#
sub RetrieveEMDBIDFromPDBFile {
  my($PDBFile) = @_;
  my($EMDBID, $CryoEMDataType, $Line);

  ($CryoEMDataType, $EMDBID) = (0) x 2;

  # Same two element list as the normal return, so callers assigning to
  # ($Status, $ID) never end up with an undefined ID...
  if (!-e $PDBFile) {
    return ($CryoEMDataType, $EMDBID);
  }

  open(PDBFILE, "<", $PDBFile) or die "Couldn't open $PDBFile: $! \n";
  LINE: while ($Line = GetTextLine(\*PDBFILE)) {
    if ($Line =~ /^EXPDTA/i) {
      if ($Line =~ /ELECTRON MICROSCOPY/i) {
        $CryoEMDataType = 1;
      }
    }
    elsif ($Line =~ /^REMARK/i) {
      if ($Line =~ /DB: EMDB/i) {
        (undef, $EMDBID, undef) = ($Line =~ /^(.*?) EMD-([0-9]+) (.*?)$/);
        if (!defined $EMDBID) {
          $EMDBID = 0;
        }
        last LINE;
      }
    }
  }
  close PDBFILE;

  return ($CryoEMDataType, $EMDBID);
}

# Retrieve EMDB ID from CIF file...
#
# Scans $CIFFile for a line starting with "EMDB EMD-", from which the EMDB ID
# digits are taken, and for an _exptl.method line mentioning ELECTRON
# MICROSCOPY; scanning stops at that method line.
#
# Returns: ($CryoEMDataType, $EMDBID); both are 0 when nothing is found or the
# file doesn't exist. Dies when an existing file can't be opened.
#
sub RetrieveEMDBIDFromCIFFile {
  my($CIFFile) = @_;
  my($EMDBID, $CryoEMDataType, $Line);

  ($CryoEMDataType, $EMDBID) = (0) x 2;

  # Same two element list as the normal return...
  if (!-e $CIFFile) {
    return ($CryoEMDataType, $EMDBID);
  }

  open(CIFFILE, "<", $CIFFile) or die "Couldn't open $CIFFile: $! \n";
  LINE: while ($Line = GetTextLine(\*CIFFILE)) {
    if ($Line =~ /^_exptl.method/i) {
      if ($Line =~ /ELECTRON MICROSCOPY/i) {
        $CryoEMDataType = 1;
        last LINE;
      }
    }
    elsif ($Line =~ /^EMDB EMD-/i) {
      (undef, $EMDBID, undef) = ($Line =~ /^(.*?) EMD-([0-9]+)(.*?)$/);
      if (!defined $EMDBID) {
        $EMDBID = 0;
      }
    }
  }
  close CIFFILE;

  return ($CryoEMDataType, $EMDBID);
}


# Set up PDB and ED file names for downloading....
#
# Fills %PDBFilesInfo, for every ID in $OptionsInfo{PDBIDs}, with the remote
# and local names of the PDB/CIF files and, when density map download is turned
# on, of the x-ray ED map and cryo-EM map files. The cryo-EM names contain the
# text "EMDBIDPlaceHolder" in place of the EMDB ID.
#
sub SetupPDBandEDFileNames {
  my($PDBID, $DownloadDensityMap, $PDBDataLocationURL, $EDMapDataLocationURL, $EDMapType, $EDMapPDBID, $EDMapSuffix, $EDMapFileExt, $CryoEMDataLocationURL, $EMDBID);

  @{$PDBFilesInfo{PDBIDs}} = ();

  %{$PDBFilesInfo{DownloadPDB}} = ();
  %{$PDBFilesInfo{RemoteFilePDBFormat}} = ();
  %{$PDBFilesInfo{LocalFilePDBFormat}} = ();

  %{$PDBFilesInfo{DownloadCIF}} = ();
  %{$PDBFilesInfo{RemoteFileCIFFormat}} = ();
  %{$PDBFilesInfo{LocalFileCIFFormat}} = ();

  # Initialize X-ray electron density file names...
  %{$PDBFilesInfo{DownloadEDMap}} = ();
  %{$PDBFilesInfo{RemoteEDMapFiles}} = ();
  %{$PDBFilesInfo{LocalEDMapFiles}} = ();
  %{$PDBFilesInfo{TmpLocalEDMapFiles}} = ();
  %{$PDBFilesInfo{FinalLocalEDMapFiles}} = ();

  # Initialize cryo-EM density file names...
  %{$PDBFilesInfo{DownloadCryoEMMap}} = ();
  %{$PDBFilesInfo{RemoteCyroEMMapFileTypes}} = ();
  %{$PDBFilesInfo{RemoteCyroEMMapFiles}} = ();
  %{$PDBFilesInfo{LocalCyroEMMapFiles}} = ();

  $DownloadDensityMap = $OptionsInfo{DensityMap};

  # Make sure all location URLs end with a slash...
  $PDBDataLocationURL = $OptionsInfo{DataLocationURL};
  if ($PDBDataLocationURL !~ /\/$/) {
    $PDBDataLocationURL .= "/";
  }

  $EDMapDataLocationURL = $OptionsInfo{DenistyMapLocationURLXRay};
  if ($EDMapDataLocationURL !~ /\/$/) {
    $EDMapDataLocationURL .= "/";
  }

  $CryoEMDataLocationURL = $OptionsInfo{DensityMapLocationURLCryoEM};
  if ($CryoEMDataLocationURL !~ /\/$/) {
    $CryoEMDataLocationURL .= "/";
  }

  PDBID: for $PDBID (@{$OptionsInfo{PDBIDs}}) {
    # Track PDB IDs..
    push @{$PDBFilesInfo{PDBIDs}}, $PDBID;

    # Initialize PDB file names...
    $PDBFilesInfo{DownloadPDB}{$PDBID} = 0;
    $PDBFilesInfo{RemoteFilePDBFormat}{$PDBID} = "";
    $PDBFilesInfo{LocalFilePDBFormat}{$PDBID} = "";

    $PDBFilesInfo{DownloadCIF}{$PDBID} = 0;
    $PDBFilesInfo{RemoteFileCIFFormat}{$PDBID} = "";
    $PDBFilesInfo{LocalFileCIFFormat}{$PDBID} = "";

    if ($OptionsInfo{PDBFormat} =~ /^(PDB|Auto)$/i) {
      $PDBFilesInfo{DownloadPDB}{$PDBID} = 1;
      $PDBFilesInfo{RemoteFilePDBFormat}{$PDBID} = "${PDBDataLocationURL}${PDBID}.pdb";
      $PDBFilesInfo{LocalFilePDBFormat}{$PDBID} = "${PDBID}.pdb";
    }
    if ($OptionsInfo{PDBFormat} =~ /^(CIF|Auto)$/i) {
      $PDBFilesInfo{DownloadCIF}{$PDBID} = 1;
      $PDBFilesInfo{RemoteFileCIFFormat}{$PDBID} = "${PDBDataLocationURL}${PDBID}.cif";
      $PDBFilesInfo{LocalFileCIFFormat}{$PDBID} = "${PDBID}.cif";
    }

    # Initialize x-ray ED map file names...
    $PDBFilesInfo{DownloadEDMap}{$PDBID} = 0;
    @{$PDBFilesInfo{RemoteEDMapFiles}{$PDBID}} = ();
    @{$PDBFilesInfo{LocalEDMapFiles}{$PDBID}} = ();
    @{$PDBFilesInfo{TmpLocalEDMapFiles}{$PDBID}} = ();
    @{$PDBFilesInfo{FinalLocalEDMapFiles}{$PDBID}} = ();

    # Initialize cryo-EM map file names...
    $PDBFilesInfo{DownloadCryoEMMap}{$PDBID} = 0;
    @{$PDBFilesInfo{RemoteCyroEMMapFileTypes}{$PDBID}} = ();
    @{$PDBFilesInfo{RemoteCyroEMMapFiles}{$PDBID}} = ();
    @{$PDBFilesInfo{LocalCyroEMMapFiles}{$PDBID}} = ();

    if (!$DownloadDensityMap) {
      next PDBID;
    }

    # Setup x-ray ED file names...
    if ($OptionsInfo{DensityMapMode} =~ /^(XRayElectronDensity|Auto)$/i) {
      $PDBFilesInfo{DownloadEDMap}{$PDBID} = 1;

      # Remote and local names use the lowercase ID; the final name uses the ID as specified...
      $EDMapPDBID = lc $PDBID;

      for $EDMapType (@{$OptionsInfo{EDMapTypesList}}) {
        $EDMapSuffix = $OptionsInfo{EDMapLocationSuffixesMap}{$EDMapType};
        $EDMapFileExt = $OptionsInfo{EDMapLocationFileExtMap}{$EDMapType};

        push @{$PDBFilesInfo{RemoteEDMapFiles}{$PDBID}}, "${EDMapDataLocationURL}${EDMapPDBID}${EDMapSuffix}.${EDMapFileExt}";
        push @{$PDBFilesInfo{LocalEDMapFiles}{$PDBID}}, "${EDMapPDBID}${EDMapSuffix}.${EDMapFileExt}";

        push @{$PDBFilesInfo{TmpLocalEDMapFiles}{$PDBID}}, "${EDMapPDBID}${EDMapSuffix}Tmp.${EDMapFileExt}";
        push @{$PDBFilesInfo{FinalLocalEDMapFiles}{$PDBID}}, "${PDBID}${EDMapSuffix}.${EDMapFileExt}";
      }
    }
    if ($OptionsInfo{DensityMapMode} =~ /^(CryoEMDensity|Auto)$/i) {
      # Set up cryo-EM map file names using "EMDBIDPlaceHolder" to be replaced later by
      # a valid ID retrieved from PDB or CIF file...
      #
      $EMDBID = "EMDBIDPlaceHolder";
      $PDBFilesInfo{DownloadCryoEMMap}{$PDBID} = 1;

      # Map files...
      push @{$PDBFilesInfo{RemoteCyroEMMapFileTypes}{$PDBID}}, "map";
      push @{$PDBFilesInfo{RemoteCyroEMMapFiles}{$PDBID}}, "${CryoEMDataLocationURL}EMD-${EMDBID}/map/emd_${EMDBID}.map.gz";
      push @{$PDBFilesInfo{LocalCyroEMMapFiles}{$PDBID}}, "emd_${EMDBID}.map.gz";

      # Metadata files...
      push @{$PDBFilesInfo{RemoteCyroEMMapFileTypes}{$PDBID}}, "header";
      push @{$PDBFilesInfo{RemoteCyroEMMapFiles}{$PDBID}}, "${CryoEMDataLocationURL}EMD-${EMDBID}/header/emd-${EMDBID}.xml";
      push @{$PDBFilesInfo{LocalCyroEMMapFiles}{$PDBID}}, "emd-${EMDBID}.xml";

    }
  }
}

# Collect PDB IDs...
#
# RetrievePDBIDs()
#
# Reset $OptionsInfo{PDBIDs} and fill it from the command line or from the PDB
# IDs text file, depending on $OptionsInfo{Mode}.
#
sub RetrievePDBIDs {
  @{$OptionsInfo{PDBIDs}} = ();

  if ($OptionsInfo{Mode} =~ /^IDsOnCmdLine$/i) {
    RetriveCommandLinePDBIDs();
  }
  elsif ($OptionsInfo{Mode} =~ /^IDsInFile$/i) {
    RetriveTextFilePDBIDs();
  }
}

# Collect PDB IDs specified on the command line...
#
sub RetriveCommandLinePDBIDs {
  my($SpecifiedPDBID, @PDBIDs);

  print "\nProcessing PDB ID(s) from command line...\n";

  @PDBIDs = ();
  for $SpecifiedPDBID (@{$OptionsInfo{CmdLinePDBIDs}}) {
    push @PDBIDs, ProcessSpecifiedPDBIDs($SpecifiedPDBID);
  }
  @{$OptionsInfo{PDBIDs}} = @PDBIDs;

  RetrieveUniquePDBIDs();
}

# Collect PDB IDs specified in the text file...
#
# Reads the file named in $PDBIDsFileInfo{Name}, skips its first (label) line
# and collects the PDB IDs found in column $PDBIDsFileInfo{IDsColIndex} of the
# remaining lines. Lines missing that column, or with an empty value in it, are
# reported and ignored. Dies when the file can't be opened.
#
sub RetriveTextFilePDBIDs {
  my($TextFile, $InDelim, $IDsColIndex, $LineCount, $ProcessedLineCount, $IgnoredLineCount, $PDBID, $Line, $LabelLine, @PDBIDs, @LineWords);

  $TextFile = $PDBIDsFileInfo{Name};

  $IDsColIndex = $PDBIDsFileInfo{IDsColIndex};
  $InDelim = $PDBIDsFileInfo{InDelim};

  ($LineCount, $ProcessedLineCount, $IgnoredLineCount) = (0) x 3;

  print "\nProcessing PDB ID(s) from PDB IDs file $TextFile...\n";

  open(TEXTFILE, "<", $TextFile) or die "Couldn't open $TextFile: $! \n";

  # Skip label line; read into a lexical so the global $_ is left alone...
  $LabelLine = <TEXTFILE>;

  @PDBIDs = ();
  LINE: while ($Line = GetTextLine(\*TEXTFILE)) {
    $LineCount++;
    @LineWords = quotewords($InDelim, 0, $Line);

    if ($IDsColIndex >= scalar @LineWords) {
      $IgnoredLineCount++;
      warn "Warning: Ignoring line number $LineCount: PDB IDs column number, ". ($IDsColIndex + 1) . ", doesn't exist in the line containing, " . (scalar @LineWords) . ", columns.\nLine: $Line\n";
      next LINE;
    }
    $PDBID = $LineWords[$IDsColIndex];
    if (IsEmpty($PDBID)) {
      $IgnoredLineCount++;
      warn "Warning: Ignoring line number $LineCount: PDB ID value is empty.\nLine: $Line\n";
      next LINE;
    }
    $ProcessedLineCount++;

    push @PDBIDs, ProcessSpecifiedPDBIDs($PDBID);
  }
  close TEXTFILE;

  @{$OptionsInfo{PDBIDs}} = @PDBIDs;

  print "\nTotal number of lines in PDB IDs text file: $LineCount\n";
  print "Total number of lines processed: $ProcessedLineCount\n";
  print "Total number of lines ignored: $IgnoredLineCount\n";

  RetrieveUniquePDBIDs();
}

# Process specified PDB IDs...
#
# Notes:
#   . Commas and spaces in the specification of PDB IDs are allowed, in any
#     combination; any run of them separates two IDs.
#   . Empty entries, such as those produced by leading, trailing or doubled
#     delimiters, are dropped.
#   . All PDB IDs are turned into uppercase letters.
#
# Returns: A list of PDB IDs; empty for an undefined or blank specification.
#
sub ProcessSpecifiedPDBIDs {
  my($SpecifiedPDBID) = @_;
  my($PDBID, @PDBIDs);

  @PDBIDs = ();
  if (!defined $SpecifiedPDBID) {
    return @PDBIDs;
  }

  # Splitting on both delimiter kinds at once keeps "1ABC,2DEF 3GHI" from
  # being glued into "1ABC2DEF"...
  for $PDBID (split /[\s,]+/, $SpecifiedPDBID) {
    if (!length $PDBID) {
      next;
    }
    push @PDBIDs, uc $PDBID;
  }
  return @PDBIDs;
}

# Collect unique PDB IDs...
# RetrieveUniquePDBIDs()
#
# Replace $OptionsInfo{PDBIDs} by its unique entries, keeping the order of
# first appearance. Each duplicate is reported as a warning and the totals are
# printed.
#
sub RetrieveUniquePDBIDs {
  my($PDBID, $PDBIDsCount, $PDBIDsIgnoredCount, @UniquePDBIDs, %PDBIDsMap);

  %PDBIDsMap = ();
  @UniquePDBIDs = ();

  ($PDBIDsCount, $PDBIDsIgnoredCount) = (0) x 2;

  PDBID: for $PDBID (@{$OptionsInfo{PDBIDs}}) {
    $PDBIDsCount++;
    if (exists $PDBIDsMap{$PDBID}) {
      $PDBIDsIgnoredCount++;
      warn "Warning: Ignoring duplicate PDB ID $PDBID\n";
      next PDBID;
    }
    $PDBIDsMap{$PDBID} = $PDBID;
    push @UniquePDBIDs, $PDBID;
  }
  @{$OptionsInfo{PDBIDs}} = @UniquePDBIDs;

  print "\nTotal number of PDB IDs: $PDBIDsCount\n";
  print "Number of duplicate PDB IDs ignored: $PDBIDsIgnoredCount\n";
}

# Process option values...
#
# Copies the values in %Options into %OptionsInfo, validating the ED map
# location suffixes and ED map types along the way, and collects either the
# PDB IDs from @ARGV or the information about the PDB IDs text file named on
# the command line. Dies with an error message on any invalid value.
#
# ED map type names are accepted in any letter case and are stored using the
# spelling CompositeMap, DifferenceMap and ReflectionMap, which is the spelling
# used for the keys of the suffix and file extension maps.
#
sub ProcessOptions {
  my($EDMapTypes, $EDMapType, $EDLocationSuffixes, $EDMapSuffix, $Index, @EDMapTypesList, @EDLocationSuffixesList, %EDLocationSuffixesMap, %CanonicalEDMapTypes);

  %OptionsInfo = ();
  %PDBIDsFileInfo = ();

  # Lowercase name to the spelling used as hash key everywhere else...
  %CanonicalEDMapTypes = map { (lc($_) => $_) } ("CompositeMap", "DifferenceMap", "ReflectionMap");

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{ColMode} = $Options{colmode};

  $OptionsInfo{DataLocationURL} = $Options{datalocationurl};
  if (IsEmpty($OptionsInfo{DataLocationURL})) {
    die "Error: PDB data location URL specified using \"-d, --dataLocationURL\" is empty. Allowed value: Non empty string\n";
  }

  $OptionsInfo{DensityMap} = $Options{densitymap} =~ /^Yes$/i ? 1 : 0;
  $OptionsInfo{DensityMapMode} = $Options{densitymapmode};

  $OptionsInfo{DensityMapLocationURLCryoEM} = $Options{densitymaplocationurlcryoem};
  $OptionsInfo{DenistyMapLocationURLXRay} = $Options{denistymaplocationurlxray};

  # Process x-ray ED map location file suffixes...
  $EDLocationSuffixes = $Options{edmaplocationsuffixes};
  $EDLocationSuffixes =~ s/ //g;
  %EDLocationSuffixesMap = ();

  @EDLocationSuffixesList = split(",", $EDLocationSuffixes);
  if (@EDLocationSuffixesList % 2) {
    die "Invalid number of values specified using \"--EDMapLocationSuffixes\" option: It must contain even number of valid values.\n";
  }
  for ($Index = 0; $Index < @EDLocationSuffixesList; $Index += 2) {
    $EDMapType = $EDLocationSuffixesList[$Index];
    $EDMapSuffix = $EDLocationSuffixesList[$Index + 1];

    if (!exists $CanonicalEDMapTypes{lc $EDMapType}) {
      die "Error: The value specified, $EDMapType, for option \"--EDMapLocationSuffixes\" is not valid. Allowed values: CompositeMap, DifferenceMap, ReflectionMap\n";
    }
    # Track the type under its canonical spelling so that duplicates differing
    # only in letter case are caught and the default suffix is overridden...
    $EDMapType = $CanonicalEDMapTypes{lc $EDMapType};
    if (exists $EDLocationSuffixesMap{$EDMapType}) {
      die "Error: Duplicate ED map type, $EDMapType, specified for option \"--EDMapLocationSuffixes\"\n";
    }

    # Track suffixes...
    if ($EDMapSuffix =~ /^None$/i) {
      $EDMapSuffix = "";
    }
    $EDLocationSuffixesMap{$EDMapType} = $EDMapSuffix;
  }
  $OptionsInfo{EDMapLocationSuffixes} = $EDLocationSuffixes;
  @{$OptionsInfo{EDMapLocationSuffixesList}} = @EDLocationSuffixesList;

  # Defaults, overridden by whatever was specified...
  %{$OptionsInfo{EDMapLocationSuffixesMap}} = ("CompositeMap" => "", "DifferenceMap" => "_diff", "ReflectionMap" => "_map");
  %{$OptionsInfo{EDMapLocationFileExtMap}} = ("CompositeMap" => "ccp4", "DifferenceMap" => "ccp4", "ReflectionMap" => "mtz");

  for $EDMapType (keys %EDLocationSuffixesMap) {
    $OptionsInfo{EDMapLocationSuffixesMap}{$EDMapType} = $EDLocationSuffixesMap{$EDMapType};
  }

  # Process x-ray ED map types...
  $EDMapTypes = $Options{edmaptypes};
  $EDMapTypes =~ s/ //g;
  @EDMapTypesList = ();
  if ($EDMapTypes =~ /^All$/i) {
    push @EDMapTypesList, ("CompositeMap", "DifferenceMap", "ReflectionMap");
  }
  else {
    for $EDMapType (split(",", $EDMapTypes)) {
      if ($EDMapType =~ /^All$/i) {
        die "Error: The value specified, $EDMapType, for option \"--EDMapTypes\" must be specified alone. It can't be specified with other values.\n";
      }
      if (!exists $CanonicalEDMapTypes{lc $EDMapType}) {
        die "Error: The value specified, $EDMapType, for option \"--EDMapTypes\" is not valid. Allowed values: CompositeMap, DifferenceMap, ReflectionMap, All\n";
      }
      push @EDMapTypesList, $CanonicalEDMapTypes{lc $EDMapType};
    }
  }

  $OptionsInfo{EDMapTypes} = $EDMapTypes;
  @{$OptionsInfo{EDMapTypesList}} = @EDMapTypesList;


  $OptionsInfo{InDelim} = $Options{indelim};

  $OptionsInfo{PDBIDsCol} = defined $Options{pdbidscol} ? $Options{pdbidscol} : '';

  $OptionsInfo{PDBFormat} = $Options{pdbformat};

  @{$OptionsInfo{CmdLinePDBIDs}} = ();
  $OptionsInfo{PDBIDsFile} = "";

  if ($OptionsInfo{Mode} =~ /^IDsOnCmdLine$/i) {
    push @{$OptionsInfo{CmdLinePDBIDs}}, @ARGV;
  }
  elsif ($OptionsInfo{Mode} =~ /^IDsInFile$/i) {
    if (@ARGV != 1) {
      die "Error: Invalid number of PDB IDs text files, ". (scalar @ARGV) . ",specified on the command line for \"IDsInFile\" value of $Options{mode}, for option \"-m --mode\". Allowed value: Only one text file\n";
    }
    $OptionsInfo{PDBIDsFile} = $ARGV[0];

    RetrievePDBIDsTextFileInfo();
  }
  else {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: IDsOnCmdLine or IDsInFile\n";
  }
}

# Retrieve information for PDB IDs text file...
#
# RetrievePDBIDsTextFileInfo()
#
# Fills %PDBIDsFileInfo for the file named in $OptionsInfo{PDBIDsFile}: its
# name, input delimiter, column labels and count, a lowercase label to column
# index map, and the index of the column holding the PDB IDs.
#
# The PDB IDs column comes from $OptionsInfo{PDBIDsCol}, interpreted as a label
# or as a 1-based number according to $OptionsInfo{ColMode}; when it is empty,
# the first column whose label contains PDB_ID or PDBID is used.
#
# Dies when the file doesn't exist, isn't a csv or tsv file, can't be opened,
# or when the PDB IDs column can't be identified.
#
sub RetrievePDBIDsTextFileInfo {
  my($TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, $ColNum, $ColLabel, $PDBIDsColIndex, $ColMode, $PDBIDsCol, $ColCount, $ColIndex, @ColLabels);

  $TextFile = $OptionsInfo{PDBIDsFile};

  %PDBIDsFileInfo = ();
  $PDBIDsFileInfo{Name} = $TextFile;

  $PDBIDsFileInfo{ColCount} = 0;
  @{$PDBIDsFileInfo{ColLabels}} = ();
  %{$PDBIDsFileInfo{ColLabelToNumMap}} = ();
  $PDBIDsFileInfo{InDelim} = "";

  $PDBIDsFileInfo{IDsColIndex} = "";

  if (!-e $TextFile) {
    die "Error: PDBIDs text file, $TextFile, doesn't exist\n";
  }

  if (!CheckFileType($TextFile, "csv tsv")) {
    die "Error: Ignoring file $TextFile: It's not a csv or tsv file\n";
  }

  # Tab for tsv files; comma or semicolon, as specified, for everything else...
  ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
  if ($FileExt =~ /^tsv$/i) {
    $InDelim = "\t";
  }
  else {
    if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
      die "Error: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for textfile\n";
    }
    $InDelim = ($OptionsInfo{InDelim} =~ /^semicolon$/i) ? "\;" : "\,";
  }

  if (!open(TEXTFILE, "<", $TextFile)) {
    die "Error: Ignoring file $TextFile: Couldn't open it: $! \n";
  }

  $Line = GetTextLine(\*TEXTFILE);
  close TEXTFILE;

  # An empty file is treated as having no columns; presumably GetTextLine
  # hands back an undefined value in that case...
  if (!defined $Line) {
    $Line = "";
  }
  @ColLabels = quotewords($InDelim, 0, $Line);
  $ColCount = scalar @ColLabels;

  push @{$PDBIDsFileInfo{ColLabels}}, @ColLabels;
  $PDBIDsFileInfo{ColCount} = $ColCount;
  $PDBIDsFileInfo{InDelim} = $InDelim;

  # Setup collabel to colnum map...
  %{$PDBIDsFileInfo{ColLabelToNumMap}} = ();
  for $ColNum (0 .. $#ColLabels) {
    $ColLabel = $ColLabels[$ColNum];
    $PDBIDsFileInfo{ColLabelToNumMap}{lc $ColLabel} = $ColNum;
  }

  # Identify column containing PDB IDs...
  $PDBIDsColIndex = "";

  $ColMode = $OptionsInfo{ColMode};
  $PDBIDsCol = $OptionsInfo{PDBIDsCol};

  if (IsNotEmpty($PDBIDsCol)) {
    if ($ColMode =~ /^collabel$/i) {
      $ColLabel = lc $PDBIDsCol;
      if (!exists $PDBIDsFileInfo{ColLabelToNumMap}{$ColLabel}) {
        die "Error: Ignoring file $TextFile: The column name, $PDBIDsCol, specified for option \"-p, --PDBIDsCol \" is not valid for text file\n";
      }
      $PDBIDsColIndex = $PDBIDsFileInfo{ColLabelToNumMap}{$ColLabel};
    }
    else {
      $ColNum = $PDBIDsCol;
      if ($ColNum <= 0 || $ColNum > $ColCount) {
        die "Error: Ignoring file $TextFile: The column number, $PDBIDsCol, specified for option \"-p, --PDBIDsCol \" is not valid for text file. It must be > 0 and <= $ColCount\n";
      }
      $PDBIDsColIndex = $ColNum - 1;
    }
  }
  else {
    # Look for column name containing PDB_ID or PDBID text string...
    $PDBIDsCol = "";
    $ColIndex = 0;
    COLLABEL: for $ColLabel (@ColLabels) {
      if ($ColLabel =~ /(PDB_ID|PDBID)/i) {
        $PDBIDsCol = $ColLabel;
        $PDBIDsColIndex = $ColIndex;
        last COLLABEL;
      }
      $ColIndex++;
    }
    if (IsEmpty($PDBIDsCol)) {
      die "Error: Ignoring file $TextFile: Couldn't find PDB IDs default column containing text string PDB_ID or PDBID in its name\n";
    }
  }
  $PDBIDsFileInfo{IDsColIndex} = $PDBIDsColIndex;
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Loads %Options with the default values, lets GetOptions override them from
# the command line, changes into the working directory when one is specified,
# and dies with an error message for any option value that isn't allowed.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{colmode} = "colnum";
  $Options{datalocationurl} = "http://www.rcsb.org/pdb/files/";

  $Options{densitymap} = "no";
  $Options{densitymapmode} = "auto";

  $Options{densitymaplocationurlcryoem} = "ftp://ftp.wwpdb.org/pub/emdb/structures/";
  $Options{denistymaplocationurlxray} = "http://www.ebi.ac.uk/pdbe/coordinates/files/";

  $Options{edmaplocationsuffixes} = "CompositeMap,None,DifferenceMap,_diff,ReflectionMap, _map";
  $Options{edmaptypes} = "CompositeMap,DifferenceMap";

  $Options{indelim} = "comma";
  $Options{mode} = "IDsOnCmdLine";

  $Options{pdbformat} = "Auto";

  if (!GetOptions(\%Options, "colmode|c=s", "datalocationurl|d=s", "densitymap=s", "densitymapmode=s", "densitymaplocationurlcryoem=s", "denistymaplocationurlxray=s", "edmaplocationsuffixes=s", "edmaptypes=s", "help|h", "indelim=s", "mode|m=s", "pdbidscol|p=s", "pdbformat=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{colmode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{colmode}, for option \"-c, --colmode\" is not valid. Allowed values: colnum or collabel\n";
  }
  if ($Options{densitymap} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{densitymap}, for option \"--DensityMap\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{densitymapmode} !~ /^(XRayElectronDensity|CryoEMDensity|Auto)$/i) {
    die "Error: The value specified, $Options{densitymapmode}, for option \"--DensityMapMode\" is not valid. Allowed values: XRayElectronDensity, CryoEMDensity, Auto\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{mode} !~ /^(IDsOnCmdLine|IDsInFile)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: IDsOnCmdLine or IDsInFile\n";
  }
  if ($Options{pdbformat} !~ /^(PDB|CIF|Auto)$/i) {
    # Name the option that was actually checked...
    die "Error: The value specified, $Options{pdbformat}, for option \"--PDBFormat\" is not valid. Allowed values: PDB, CIF or Auto\n";
  }
}