MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: DownloadPDBFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
   16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use File::Fetch;
  31 use File::Copy;
  32 use Text::ParseWords;
  33 use Benchmark;
  34 use FileUtil;
  35 use TextUtil;
  36 use PDBFileUtil;
  37 
  38 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  39 
  40 # Autoflush STDOUT
  41 $| = 1;
  42 
  43 # Starting message...
  44 $ScriptName = basename($0);
  45 print "\n$ScriptName: Starting...\n\n";
  46 $StartTime = new Benchmark;
  47 
  48 # Get the options and setup script...
  49 SetupScriptUsage();
  50 if ($Options{help} || @ARGV < 1) {
  51   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  52 }
  53 
  54 # Process options...
  55 print "Processing options...\n";
  56 my(%OptionsInfo, %PDBIDsFileInfo);
  57 ProcessOptions();
  58 
  59 # Collect PDB IDs and download corresponding files...
  60 my(%PDBFilesInfo);
  61 SetupPDBFilesInfo();
  62 DownloadPDBFiles();
  63 
  64 print "\n$ScriptName:Done...\n\n";
  65 
  66 $EndTime = new Benchmark;
  67 $TotalTime = timediff ($EndTime, $StartTime);
  68 print "Total time: ", timestr($TotalTime), "\n";
  69 
  70 ###############################################################################
  71 
  72 # Download appropriate PDB fies...
  73 sub DownloadPDBFiles {
  74   my($PDBIDCount, $PDBIDOkayCount, $PDBIDFailedCount, $PDBIDsIgnoredCount, $PDBID, $PDBStatus, $CIFStatus, $DownloadDensityMap, $CryoEMDataStatus, $EMDBID, $TotalEDMapOkayCount , $TotalEDMapFailedCount, $EDMapOkayCount, $EDMapFailedCount, $TotalCryoEMMapOkayCount, $TotalCryoEMMapFailedCount, $CryoEMMapOkayCount, $CryoEMMapFailedCount);
  75 
  76   print "\nDownloading PDB files...\n";
  77 
  78   ($PDBIDCount, $PDBIDOkayCount, $PDBIDFailedCount, $PDBIDsIgnoredCount, $TotalEDMapOkayCount , $TotalEDMapFailedCount, $TotalCryoEMMapOkayCount, $TotalCryoEMMapFailedCount) = (0) x 8;
  79 
  80   $DownloadDensityMap = $OptionsInfo{DensityMap};
  81 
  82   # Turn off warnings from File::Fetch
  83   $File::Fetch::WARN = 0;
  84 
  85   PDBID: for $PDBID (@{$PDBFilesInfo{PDBIDs}}) {
  86     $PDBIDCount++;
  87 
  88     print "\nProcessing PDB ID $PDBID...\n";
  89 
  90     if ($PDBID =~ /\./) {
  91       $PDBIDsIgnoredCount++;
  92       warn "Warning: Ignoring invalid PDB ID $PDBID\n";
  93       next PDBID;
  94     }
  95 
  96     $PDBStatus = 0;
  97     # Download PDB format file...
  98     if ($PDBFilesInfo{DownloadPDB}{$PDBID}) {
  99       print "Downloading PDB file: $PDBFilesInfo{RemoteFilePDBFormat}{$PDBID}\n";
 100       $PDBStatus = DownloadFile($PDBFilesInfo{RemoteFilePDBFormat}{$PDBID}, $PDBFilesInfo{LocalFilePDBFormat}{$PDBID});
 101     }
 102 
 103     # Try downloading CIF format file...
 104     $CIFStatus = 0;
 105     if ($PDBFilesInfo{DownloadCIF}{$PDBID} && !$PDBStatus) {
 106       print "Downloading PDB file: $PDBFilesInfo{RemoteFileCIFFormat}{$PDBID}\n";
 107       $CIFStatus = DownloadFile($PDBFilesInfo{RemoteFileCIFFormat}{$PDBID}, $PDBFilesInfo{LocalFileCIFFormat}{$PDBID});
 108     }
 109 
 110     # Check status of download...
 111     if (!($PDBStatus || $CIFStatus)) {
 112       $PDBIDFailedCount++;
 113       next PDBID;
 114     }
 115     $PDBIDOkayCount++;
 116 
 117     # Any need to download density files...
 118     if (!$DownloadDensityMap) {
 119       next PDBID;
 120     }
 121 
 122     # Check whether it's cryo-EM data file...
 123     #
 124     $CryoEMDataStatus = 0;
 125     $EMDBID = 0;
 126     if ($PDBStatus) {
 127       ($CryoEMDataStatus, $EMDBID) = RetrieveEMDBIDFromPDBFile($PDBFilesInfo{LocalFilePDBFormat}{$PDBID});
 128     }
 129     elsif ($CIFStatus) {
 130       ($CryoEMDataStatus, $EMDBID) = RetrieveEMDBIDFromCIFFile($PDBFilesInfo{LocalFileCIFFormat}{$PDBID});
 131     }
 132 
 133     ($EDMapOkayCount, $EDMapFailedCount) = DownloadEDMapFiles($PDBID, $CryoEMDataStatus);
 134     $TotalEDMapOkayCount += $EDMapOkayCount;
 135     $TotalEDMapFailedCount += $EDMapFailedCount,
 136 
 137     ($CryoEMMapOkayCount, $CryoEMMapFailedCount) = DownloadCryoEMMapFiles($PDBID, $CryoEMDataStatus, $EMDBID);
 138     $TotalCryoEMMapOkayCount += $CryoEMMapOkayCount;
 139     $TotalCryoEMMapFailedCount += $CryoEMMapFailedCount,
 140   }
 141 
 142   print "\nTotal number of PDB IDs:  $PDBIDCount\n";
 143   print "Number of PDB IDs ignored:  $PDBIDsIgnoredCount\n";
 144 
 145   print "\nNumber of successful downloads:  $PDBIDOkayCount\n";
 146   print "Number of failed downloads:  $PDBIDFailedCount\n";
 147 
 148   if ($DownloadDensityMap) {
 149     print "\nNumber of successful ED map downloads:  $TotalEDMapOkayCount\n";
 150     print "Number of failed ED map downloads:  $TotalEDMapFailedCount\n";
 151 
 152     print "\nNumber of successful cryo-EM map downloads:  $TotalCryoEMMapOkayCount\n";
 153     print "Number of failed cryo-EM map downloads:  $TotalCryoEMMapFailedCount\n";
 154   }
 155 }
 156 
 157 # Download x-ray electron density files...
 158 sub DownloadEDMapFiles {
 159   my($PDBID, $CryoEMDataStatus) = @_;
 160   my($Index, $RemoteEDMapFile, $LocalEDMapFile, $TmpLocalEDMapFile, $FinalLocalEDMapFile, $EDMapFailedCount, $EDMapOkayCount, $Status);
 161 
 162   ($EDMapOkayCount, $EDMapFailedCount) = (0) x 2;
 163 
 164   if ($CryoEMDataStatus) {
 165     print "Skipping download of x-ray electron density files for cryo-EM PDB data...\n";
 166     return ($EDMapOkayCount, $EDMapFailedCount);
 167   }
 168 
 169  EDMAPFILE: for $Index (0 .. $#{$PDBFilesInfo{RemoteEDMapFiles}{$PDBID}}) {
 170     $RemoteEDMapFile = $PDBFilesInfo{RemoteEDMapFiles}{$PDBID}[$Index];
 171     $LocalEDMapFile = $PDBFilesInfo{LocalEDMapFiles}{$PDBID}[$Index];
 172     $TmpLocalEDMapFile = $PDBFilesInfo{TmpLocalEDMapFiles}{$PDBID}[$Index];
 173     $FinalLocalEDMapFile = $PDBFilesInfo{FinalLocalEDMapFiles}{$PDBID}[$Index];
 174 
 175     print "Downloading x-ray electron density map file: $RemoteEDMapFile\n";
 176     $Status = DownloadFile($RemoteEDMapFile, $LocalEDMapFile);
 177 
 178     if (!$Status) {
 179       $EDMapFailedCount++;
 180       next EDMAPFILE;
 181     }
 182     # Rename downloaded ED file...
 183     print "Moving file from $LocalEDMapFile to $FinalLocalEDMapFile\n";
 184     move $LocalEDMapFile, $TmpLocalEDMapFile or warn "Warning: Couldn't move file $LocalEDMapFile to $TmpLocalEDMapFile\n";
 185     move $TmpLocalEDMapFile, $FinalLocalEDMapFile or warn "Warning: Couldn't move file $TmpLocalEDMapFile to $FinalLocalEDMapFile\n";
 186     $EDMapOkayCount++;
 187   }
 188 
 189   return ($EDMapOkayCount, $EDMapFailedCount);
 190 }
 191 
 192 # Download cryo-EM density files...
 193 sub DownloadCryoEMMapFiles {
 194   my($PDBID, $CryoEMDataStatus, $EMDBID) = @_;
 195   my($Index, $RemoteCryoEMMapFile, $LocalCryoEMMapFile, $CryoEMMapFailedCount, $CryoEMMapOkayCount, $Status, $FileType);
 196 
 197   ($CryoEMMapOkayCount, $CryoEMMapFailedCount) = (0) x 2;
 198 
 199   if (!$CryoEMDataStatus) {
 200     print "Skipping download of cryo-EM density files for non cryo-EM PDB data...\n";
 201     return ($CryoEMMapOkayCount, $CryoEMMapFailedCount);
 202   }
 203 
 204   CRYOEMMAPFILE: for $Index (0 .. $#{$PDBFilesInfo{RemoteCyroEMMapFiles}{$PDBID}}) {
 205     $FileType = $PDBFilesInfo{RemoteCyroEMMapFileTypes}{$PDBID}[$Index];
 206     $RemoteCryoEMMapFile = $PDBFilesInfo{RemoteCyroEMMapFiles}{$PDBID}[$Index];
 207     $LocalCryoEMMapFile = $PDBFilesInfo{LocalCyroEMMapFiles}{$PDBID}[$Index];
 208 
 209     # Update file names with actual EMDBID...
 210     $RemoteCryoEMMapFile =~ s/EMDBIDPlaceHolder/$EMDBID/ig;
 211     $LocalCryoEMMapFile =~ s/EMDBIDPlaceHolder/$EMDBID/ig;
 212 
 213     print "Downloading cryo-EM density $FileType file: $RemoteCryoEMMapFile\n";
 214     $Status = DownloadFile($RemoteCryoEMMapFile, $LocalCryoEMMapFile);
 215 
 216     if (!$Status) {
 217       $CryoEMMapFailedCount++;
 218       next CRYOEMMAPFILE;
 219     }
 220     $CryoEMMapOkayCount++;
 221   }
 222 
 223   return ($CryoEMMapOkayCount, $CryoEMMapFailedCount);
 224 }
 225 
 226 
 227 # Download specified file...
 228 sub DownloadFile {
 229   my($RemoteFileURL, $LocalFileName) = @_;
 230   my($Status, $FileFetch, $FetchedFilePath);
 231 
 232   $Status = 1;
 233 
 234   # Setup a fetch object...
 235   $FileFetch = File::Fetch->new(uri => $RemoteFileURL);
 236 
 237   # Fetch file to the CWD...
 238   $FetchedFilePath = $FileFetch->fetch();
 239 
 240   if (IsEmpty($FetchedFilePath)) {
 241     warn "Warning: Download failed for file $RemoteFileURL: " . $FileFetch->error() . "\n";
 242     if (-e $LocalFileName) {
 243       warn "Warning: Deleting empty file $LocalFileName\n";
 244       unlink $LocalFileName or warn "Warning: Couldn't delete file $LocalFileName\n";
 245     }
 246     $Status = 0;
 247   }
 248   return $Status;
 249 }
 250 
 251 # Collect specified PDB IDs along with settting up PDB and ED file names for all
 252 # specified PDB IDs...
 253 #
 254 sub SetupPDBFilesInfo {
 255 
 256   %PDBFilesInfo = ();
 257   RetrievePDBIDs();
 258   SetupPDBandEDFileNames();
 259 }
 260 
 261 # Retrieve EMDB ID from PDB file...
 262 sub RetrieveEMDBIDFromPDBFile {
 263   my($PDBFile) = @_;
 264   my($EMDBID, $CryoEMDataType, $Line);
 265 
 266   $EMDBID = 0;
 267 
 268   if (!-e $PDBFile) {
 269     return $EMDBID;
 270   }
 271 
 272   $CryoEMDataType = 0;
 273 
 274   open PDBFILE, "$PDBFile" or die "Couldn't open $PDBFile: $! \n";
 275   LINE: while ($Line = GetTextLine(\*PDBFILE)) {
 276     if ($Line =~ /^EXPDTA/i) {
 277       if ($Line =~ /ELECTRON MICROSCOPY/i) {
 278          $CryoEMDataType = 1;
 279       }
 280     }
 281     elsif ($Line =~ /^REMARK/i) {
 282       if ($Line =~ /DB: EMDB/i) {
 283          (undef, $EMDBID, undef) = ($Line =~ /^(.*?) EMD-([0-9]+) (.*?)$/);
 284          if (!defined $EMDBID) {
 285            $EMDBID = 0;
 286          }
 287          last LINE;
 288       }
 289     }
 290   }
 291   close PDBFILE;
 292 
 293   return ($CryoEMDataType, $EMDBID);
 294 }
 295 
 296 # Retrieve EMDB ID from CIF file...
 297 sub RetrieveEMDBIDFromCIFFile {
 298   my($CIFFile) = @_;
 299   my($EMDBID, $CryoEMDataType, $Line);
 300 
 301   $EMDBID = 0;
 302 
 303   if (!-e $CIFFile) {
 304     return $EMDBID;
 305   }
 306 
 307   $CryoEMDataType = 0;
 308 
 309   open CIFFILE, "$CIFFile" or die "Couldn't open $CIFFile: $! \n";
 310   LINE: while ($Line = GetTextLine(\*CIFFILE)) {
 311     if ($Line =~ /^_exptl.method/i) {
 312       if ($Line =~ /ELECTRON MICROSCOPY/i) {
 313          $CryoEMDataType = 1;
 314          last LINE;
 315       }
 316     }
 317     elsif ($Line =~ /^EMDB  EMD-/i) {
 318       (undef, $EMDBID, undef) = ($Line =~ /^(.*?) EMD-([0-9]+)(.*?)$/);
 319        if (!defined $EMDBID) {
 320          $EMDBID = 0;
 321        }
 322     }
 323   }
 324   close CIFFILE;
 325 
 326   return ($CryoEMDataType, $EMDBID);
 327 }
 328 
 329 
 330 # Set up PDB and ED file names for downloading....
 331 sub SetupPDBandEDFileNames {
 332   my($PDBID, $DownloadDensityMap, $PDBDataLocationURL, $EDMapDataLocaltionURL, $EDMapType, $EDMapPDBID, $EDMapSuffix, $EDMapFileExt, $CryoEMDataLocationURL, $EMDBID);
 333 
 334   @{$PDBFilesInfo{PDBIDs}} = ();
 335 
 336   %{$PDBFilesInfo{DownloadPDB}} = ();
 337   %{$PDBFilesInfo{RemoteFilePDBFormat}} = ();
 338   %{$PDBFilesInfo{LocalFilePDBFormat}} = ();
 339 
 340   %{$PDBFilesInfo{DownloadCIF}} = ();
 341   %{$PDBFilesInfo{RemoteFileCIFFormat}} = ();
 342   %{$PDBFilesInfo{LocalFileCIFFormat}} = ();
 343 
 344   # Initilaize X-ray electron density file names...
 345   %{$PDBFilesInfo{DownloadEDMap}} = ();
 346   %{$PDBFilesInfo{RemoteEDMapFiles}} = ();
 347   %{$PDBFilesInfo{LocalEDMapFiles}} = ();
 348   %{$PDBFilesInfo{TmpLocalEDMapFiles}} = ();
 349   %{$PDBFilesInfo{FinalLocalEDMapFiles}} = ();
 350 
 351   # Initilaize cryo-EM  density file names...
 352   %{$PDBFilesInfo{DownloadCryoEMMap}} = ();
 353   %{$PDBFilesInfo{RemoteCyroEMMapFileTypes}} = ();
 354   %{$PDBFilesInfo{RemoteCyroEMMapFiles}} = ();
 355   %{$PDBFilesInfo{LocalCyroEMMapFiles}} = ();
 356 
 357   $DownloadDensityMap = $OptionsInfo{DensityMap};
 358 
 359   $PDBDataLocationURL = $OptionsInfo{DataLocationURL};
 360   if ($PDBDataLocationURL !~ /\/$/) {
 361     $PDBDataLocationURL .= "/";
 362   }
 363 
 364   $EDMapDataLocaltionURL = $OptionsInfo{DenistyMapLocationURLXRay};
 365   if ($EDMapDataLocaltionURL !~ /\/$/) {
 366     $EDMapDataLocaltionURL .= "/";
 367   }
 368 
 369   $CryoEMDataLocationURL = $OptionsInfo{DensityMapLocationURLCryoEM};
 370   if ($CryoEMDataLocationURL !~ /\/$/) {
 371     $CryoEMDataLocationURL .= "/";
 372   }
 373 
 374   PDBID: for $PDBID (@{$OptionsInfo{PDBIDs}}) {
 375     # Track PDB IDs..
 376     push @{$PDBFilesInfo{PDBIDs}}, $PDBID;
 377 
 378     # Intialize PDB file names...
 379     $PDBFilesInfo{DownloadPDB}{$PDBID} = 0;
 380     $PDBFilesInfo{RemoteFilePDBFormat}{$PDBID} = "";
 381     $PDBFilesInfo{LocalFilePDBFormat}{$PDBID} = "";
 382 
 383     $PDBFilesInfo{DownloadCIF}{$PDBID} = 0;
 384     $PDBFilesInfo{RemoteFileCIFFormat}{$PDBID} = "";
 385     $PDBFilesInfo{LocalFileCIFFormat}{$PDBID} = "";
 386 
 387     if ($OptionsInfo{PDBFormat} =~ /^(PDB|Auto)$/i) {
 388       $PDBFilesInfo{DownloadPDB}{$PDBID} = 1;
 389       $PDBFilesInfo{RemoteFilePDBFormat}{$PDBID} = "${PDBDataLocationURL}${PDBID}.pdb";
 390       $PDBFilesInfo{LocalFilePDBFormat}{$PDBID} = "${PDBID}.pdb";
 391     }
 392     if ($OptionsInfo{PDBFormat} =~ /^(CIF|Auto)$/i) {
 393       $PDBFilesInfo{DownloadCIF}{$PDBID} = 1;
 394       $PDBFilesInfo{RemoteFileCIFFormat}{$PDBID} = "${PDBDataLocationURL}${PDBID}.cif";
 395       $PDBFilesInfo{LocalFileCIFFormat}{$PDBID} = "${PDBID}.cif";
 396     }
 397 
 398     # Initialize x-ray ED map file names...
 399     $PDBFilesInfo{DownloadEDMap}{$PDBID} = 0;
 400     @{$PDBFilesInfo{RemoteEDMapFiles}{$PDBID}} = ();
 401     @{$PDBFilesInfo{LocalEDMapFiles}{$PDBID}} = ();
 402     @{$PDBFilesInfo{TmpLocalEDMapFiles}{$PDBID}} = ();
 403     @{$PDBFilesInfo{FinalLocalEDMapFiles}{$PDBID}} = ();
 404 
 405     # Initialize cryo-EM map file names...
 406     $PDBFilesInfo{DownloadCryoEMMap}{$PDBID} = 0;
 407     @{$PDBFilesInfo{RemoteCyroEMMapFileTypes}{$PDBID}} = ();
 408     @{$PDBFilesInfo{RemoteCyroEMMapFiles}{$PDBID}} = ();
 409     @{$PDBFilesInfo{LocalCyroEMMapFiles}{$PDBID}} = ();
 410 
 411     if (!$DownloadDensityMap) {
 412       next PDBID;
 413     }
 414 
 415     # Setup x-ray ED file names...
 416     if ($OptionsInfo{DensityMapMode} =~ /^(XRayElectronDensity|Auto)$/i) {
 417       $PDBFilesInfo{DownloadEDMap}{$PDBID} = 1;
 418 
 419       $EDMapPDBID = lc $PDBID;
 420 
 421       for $EDMapType (@{$OptionsInfo{EDMapTypesList}}) {
 422         $EDMapSuffix = $OptionsInfo{EDMapLocationSuffixesMap}{$EDMapType};
 423         $EDMapFileExt = $OptionsInfo{EDMapLocationFileExtMap}{$EDMapType};
 424 
 425         push @{$PDBFilesInfo{RemoteEDMapFiles}{$PDBID}}, "${EDMapDataLocaltionURL}${EDMapPDBID}${EDMapSuffix}.${EDMapFileExt}";
 426         push @{$PDBFilesInfo{LocalEDMapFiles}{$PDBID}}, "${EDMapPDBID}${EDMapSuffix}.${EDMapFileExt}";
 427 
 428         push @{$PDBFilesInfo{TmpLocalEDMapFiles}{$PDBID}}, "${EDMapPDBID}${EDMapSuffix}Tmp.${EDMapFileExt}";
 429         push @{$PDBFilesInfo{FinalLocalEDMapFiles}{$PDBID}}, "${PDBID}${EDMapSuffix}.${EDMapFileExt}";
 430       }
 431     }
 432     if ($OptionsInfo{DensityMapMode} =~ /^(CryoEMDensity|Auto)$/i) {
 433       # Set up cryo-EM map file names using "EMDBIDPlaceHolder" to be replaced later by
 434       # a valid ID retrieved from PDB or CIF file...
 435       #
 436       $EMDBID = "EMDBIDPlaceHolder";
 437       $PDBFilesInfo{DownloadCryoEMMap}{$PDBID} = 1;
 438 
 439       # Map files...
 440       push @{$PDBFilesInfo{RemoteCyroEMMapFileTypes}{$PDBID}}, "map";
 441       push @{$PDBFilesInfo{RemoteCyroEMMapFiles}{$PDBID}}, "${CryoEMDataLocationURL}EMD-${EMDBID}/map/emd_${EMDBID}.map.gz";
 442       push @{$PDBFilesInfo{LocalCyroEMMapFiles}{$PDBID}}, "emd_${EMDBID}.map.gz";
 443 
 444       # Metadata files...
 445       push @{$PDBFilesInfo{RemoteCyroEMMapFileTypes}{$PDBID}}, "header";
 446       push @{$PDBFilesInfo{RemoteCyroEMMapFiles}{$PDBID}}, "${CryoEMDataLocationURL}EMD-${EMDBID}/header/emd-${EMDBID}.xml";
 447       push @{$PDBFilesInfo{LocalCyroEMMapFiles}{$PDBID}}, "emd-${EMDBID}.xml";
 448 
 449     }
 450   }
 451 }
 452 
 453 # Collect PDB IDs...
 454 #
 455 sub RetrievePDBIDs {
 456   @{$OptionsInfo{PDBIDs}} = ();
 457 
 458   if ($OptionsInfo{Mode} =~ /^IDsOnCmdLine$/i) {
 459     RetriveCommandLinePDBIDs();
 460   }
 461   elsif ($OptionsInfo{Mode} =~ /^IDsInFile$/i) {
 462     RetriveTextFilePDBIDs();
 463   }
 464 }
 465 
 466 # Collect PDB IDs specified on the command line...
 467 #
 468 sub RetriveCommandLinePDBIDs {
 469   my($SpecifiedPDBID, @PDBIDs, @ProcessedPDBIDs);
 470 
 471   print "\nProcessing PDB ID(s) from command line...\n";
 472 
 473   @PDBIDs = ();
 474   for $SpecifiedPDBID (@{$OptionsInfo{CmdLinePDBIDs}}) {
 475     @ProcessedPDBIDs = ProcessSpecifiedPDBIDs($SpecifiedPDBID);
 476     if (@ProcessedPDBIDs) {
 477       push @PDBIDs, @ProcessedPDBIDs;
 478     }
 479   }
 480   @{$OptionsInfo{PDBIDs}} = @PDBIDs;
 481 
 482   RetrieveUniquePDBIDs();
 483 }
 484 
 485 # Collect PDB IDs specified in the text file...
 486 #
 487 sub RetriveTextFilePDBIDs {
 488   my($TextFile, $InDelim, $IDsColIndex, $LineCount, $ProcessedLineCount, $IgnoredLineCount, $PDBID, $Line, @PDBIDs, @ProcessedPDBIDs, @LineWords);
 489 
 490   $TextFile = $PDBIDsFileInfo{Name};
 491 
 492   $IDsColIndex = $PDBIDsFileInfo{IDsColIndex};
 493   $InDelim = $PDBIDsFileInfo{InDelim} ;
 494 
 495   ($LineCount, $ProcessedLineCount, $IgnoredLineCount) = (0) x 3;
 496 
 497   print "\nProcessing PDB ID(s) from PDB IDs  file $TextFile...\n";
 498 
 499   open TEXTFILE, "$TextFile" or die "Couldn't open $TextFile: $! \n";
 500   # Skip label line...
 501   $_ = <TEXTFILE>;
 502 
 503   @PDBIDs = ();
 504   LINE: while ($Line = GetTextLine(\*TEXTFILE)) {
 505     $LineCount++;
 506     @LineWords = quotewords($InDelim, 0, $Line);
 507 
 508     if ($IDsColIndex >= scalar @LineWords) {
 509       $IgnoredLineCount++;
 510       warn "Warning: Ignoring line number $LineCount: PDB IDs column number, ". ($IDsColIndex + 1) . ", doesn't exist in the line containing, " . (scalar @LineWords) .  ", columns.\nLine: $Line\n";
 511       next LINE;
 512     }
 513     $PDBID = $LineWords[$IDsColIndex];
 514     if (IsEmpty($PDBID )) {
 515       $IgnoredLineCount++;
 516       warn "Warning: Ignoring line number $LineCount: PDB ID value is empty.\nLine: $Line\n";
 517       next LINE;
 518     }
 519     $ProcessedLineCount++;
 520 
 521     @ProcessedPDBIDs = ProcessSpecifiedPDBIDs($PDBID);
 522     if (@ProcessedPDBIDs) {
 523       push @PDBIDs, @ProcessedPDBIDs;
 524     }
 525   }
 526   close TEXTFILE;
 527 
 528   @{$OptionsInfo{PDBIDs}} = @PDBIDs;
 529 
 530   print "\nTotal number of lines in PDB IDs text file: $LineCount\n";
 531   print "Total number of lines processed: $ProcessedLineCount\n";
 532   print "Total number of lines ignored: $IgnoredLineCount\n";
 533 
 534   RetrieveUniquePDBIDs();
 535 }
 536 
 537 # Process specified PDB IDs...
 538 #
 539 # Notes:
 540 #   . Commas and spaces in the specification of PBD IDs are allowed.
 541 #   . All PDB IDs are turned into uppercase letters.
 542 #
 543 sub ProcessSpecifiedPDBIDs {
 544   my($SpecifiedPDBID) = @_;
 545   my($PDBID, @PDBIDWords, @PDBIDs);
 546 
 547   $SpecifiedPDBID = RemoveLeadingAndTrailingWhiteSpaces($SpecifiedPDBID);
 548    if ($SpecifiedPDBID =~ / /) {
 549     @PDBIDWords = split " ",  $SpecifiedPDBID;
 550   }
 551   elsif ($SpecifiedPDBID =~ /,/) {
 552     @PDBIDWords = split ",",  $SpecifiedPDBID;
 553   }
 554   else {
 555     push @PDBIDWords, $SpecifiedPDBID;
 556   }
 557 
 558   @PDBIDs = ();
 559   for $PDBID (@PDBIDWords) {
 560     $PDBID =~ s/( |,)//g;
 561     push @PDBIDs, uc $PDBID;
 562   }
 563   return @PDBIDs;
 564 }
 565 
 566 # Collect unique PDB IDs...
 567 sub RetrieveUniquePDBIDs {
 568   my($PDBID, $PDBIDsCount, $PDBIDsIgnoredCount, @UniquePDBIDs, %PDBIDsMap);
 569 
 570   %PDBIDsMap = ();
 571   @UniquePDBIDs = ();
 572 
 573   $PDBIDsCount = 0;
 574   $PDBIDsIgnoredCount = 0;
 575 
 576   PDBID: for $PDBID (@{$OptionsInfo{PDBIDs}}) {
 577    $PDBIDsCount++;
 578     if (exists $PDBIDsMap{$PDBID}) {
 579       $PDBIDsIgnoredCount++;
 580       warn "Warning: Ignoring duplicate PDB ID $PDBID\n";
 581       next PDBID;
 582     }
 583     $PDBIDsMap{$PDBID} = $PDBID;
 584     push @UniquePDBIDs, $PDBID;
 585   }
 586   @{$OptionsInfo{PDBIDs}} = @UniquePDBIDs;
 587   print "\nTotal number of PDB IDs:  $PDBIDsCount\n";
 588   print "Number of duplicate PDB IDs ignored:  $PDBIDsIgnoredCount\n";
 589 }
 590 
 591 # Process option values...
 592 sub ProcessOptions {
 593   my($EDMapTypes, $EDMapType, $EDLocationSuffixes, $EDMapSuffix, $Index, @EDMapTypesList, @EDLocationSuffixesList, %EDLocationSuffixesMap);
 594 
 595   %OptionsInfo = ();
 596   %PDBIDsFileInfo = ();
 597 
 598   $OptionsInfo{Mode} = $Options{mode};
 599   $OptionsInfo{ColMode} = $Options{colmode};
 600 
 601   $OptionsInfo{DataLocationURL} = $Options{datalocationurl};
 602   if (IsEmpty($OptionsInfo{DataLocationURL} )) {
 603     die "Error: PDB data location URL specified using \"-d, --dataLocationURL\" is empty. Allowed value: Non empty string\n";
 604   }
 605 
 606   $OptionsInfo{DensityMap} = $Options{densitymap} =~ /^Yes$/i ? 1 : 0;
 607   $OptionsInfo{DensityMapMode} = $Options{densitymapmode};
 608 
 609   $OptionsInfo{DensityMapLocationURLCryoEM} = $Options{densitymaplocationurlcryoem};
 610   $OptionsInfo{DenistyMapLocationURLXRay} = $Options{denistymaplocationurlxray};
 611 
 612   # Process x-ray ED map location file suffixes...
 613   $EDLocationSuffixes = $Options{edmaplocationsuffixes};
 614   $EDLocationSuffixes =~ s/ //g;
 615   %EDLocationSuffixesMap = ();
 616 
 617   @EDLocationSuffixesList = split(",", $EDLocationSuffixes);
 618   if (@EDLocationSuffixesList % 2) {
 619     die "Invalid number  of values specified using \"--EDMapLocationSuffixes\" option: It must contain even number of valid values.\n";
 620   }
 621   for ($Index = 0; $Index < @EDLocationSuffixesList; $Index += 2) {
 622     $EDMapType = $EDLocationSuffixesList[$Index];
 623     $EDMapSuffix = $EDLocationSuffixesList[$Index + 1];
 624 
 625     if ($EDMapType !~ /^(CompositeMap|DifferenceMap|ReflectionMap)$/i) {
 626       die "Error: The value specified, $EDMapType, for option \"--EDMapLocationSuffixes\" is not valid. Allowed values: CompositeMap, DifferenceMap, ReflectionMap\n";
 627     }
 628     if (exists $EDLocationSuffixesMap{$EDMapType}) {
 629       die "Error: Duplicate ED map type, $EDMapType, specified for option \"--EDMapLocationSuffixes\"\n";
 630     }
 631 
 632     # Track suffixes...
 633     if ($EDMapSuffix =~ /^None$/i) {
 634       $EDMapSuffix = "";
 635     }
 636     $EDLocationSuffixesMap{$EDMapType} = $EDMapSuffix;
 637   }
 638   $OptionsInfo{EDMapLocationSuffixes} = $EDLocationSuffixes;
 639   @{$OptionsInfo{EDMapLocationSuffixesList}} = ();
 640   @{$OptionsInfo{EDMapLocationSuffixesList}} = @EDLocationSuffixesList;
 641 
 642   %{$OptionsInfo{EDMapLocationSuffixesMap}} = ("CompositeMap" => "", "DifferenceMap" => "_diff", "ReflectionMap" => "_map");
 643   %{$OptionsInfo{EDMapLocationFileExtMap}} = ("CompositeMap" => "ccp4", "DifferenceMap" => "ccp4", "ReflectionMap" => "mtz");
 644 
 645   for $EDMapType (keys %EDLocationSuffixesMap) {
 646     $EDMapSuffix = $EDLocationSuffixesMap{$EDMapType};
 647     $OptionsInfo{EDMapLocationSuffixesMap}{$EDMapType} = $EDMapSuffix;
 648   }
 649 
 650   # Process x-ray ED map types...
 651   $EDMapTypes = $Options{edmaptypes};
 652   $EDMapTypes =~ s/ //g;
 653   @EDMapTypesList = ();
 654   if ($EDMapTypes =~ /^All$/i) {
 655     push @EDMapTypesList, ("CompositeMap", "DifferenceMap", "ReflectionMap");
 656   }
 657   else {
 658     @EDMapTypesList = split(",", $EDMapTypes);
 659     for $EDMapType (@EDMapTypesList) {
 660       if ($EDMapType !~ /^(CompositeMap|DifferenceMap|ReflectionMap|All)$/i) {
 661         die "Error: The value specified, $EDMapType, for option \"--EDMapTypes\" is not valid. Allowed values: CompositeMap, DifferenceMap, ReflectionMap, All\n";
 662       }
 663       if ($EDMapType =~ /^All$/i) {
 664         die "Error: The value specified, $EDMapType, for option \"--EDMapTypes\" must be specified alone. It can't be specified with other values.\n";
 665       }
 666     }
 667   }
 668 
 669   $OptionsInfo{EDMapTypes} = $EDMapTypes;
 670   @{$OptionsInfo{EDMapTypesList}} = ();
 671   push @{$OptionsInfo{EDMapTypesList}}, @EDMapTypesList;
 672 
 673 
 674   $OptionsInfo{InDelim} = $Options{indelim};
 675 
 676   $OptionsInfo{PDBIDsCol } = defined $Options{pdbidscol} ? $Options{pdbidscol} : '';
 677 
 678   $OptionsInfo{PDBFormat} = $Options{pdbformat};
 679 
 680   @{$OptionsInfo{CmdLinePDBIDs}} = ();
 681   $OptionsInfo{PDBIDsFile} = "";
 682 
 683   if ($OptionsInfo{Mode} =~ /^IDsOnCmdLine$/i) {
 684     push @{$OptionsInfo{CmdLinePDBIDs}}, @ARGV;
 685   }
 686   elsif ($OptionsInfo{Mode} =~ /^IDsInFile$/i) {
 687     if (@ARGV != 1) {
 688       die "Error: Invalid number of PDB IDs text files, ". (scalar @ARGV) . ",specified on the command line for \"IDsInFile\" value of  $Options{mode}, for option \"-m --mode\". Allowed value: Only one text file\n";
 689     }
 690     $OptionsInfo{PDBIDsFile} = $ARGV[0];
 691 
 692     RetrievePDBIDsTextFileInfo();
 693   }
 694   else {
 695     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: IDsOnCmdLine or IDsInFile\n";
 696   }
 697 }
 698 
 699 # Retrieve information for PDB IDs text file...
 700 #
 701 sub RetrievePDBIDsTextFileInfo {
 702   my($TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, $ColNum, $ColLabel, $PDBIDsColIndex, $ColMode, $PDBIDsCol , $ColCount, $ColIndex, @ColLabels);
 703 
 704   $TextFile = $OptionsInfo{PDBIDsFile};
 705 
 706   %PDBIDsFileInfo = ();
 707   $PDBIDsFileInfo{Name} = $TextFile;
 708 
 709   $PDBIDsFileInfo{ColCount} = 0;
 710   @{$PDBIDsFileInfo{ColLabels}} = ();
 711   %{$PDBIDsFileInfo{ColLabelToNumMap}} = ();
 712   $PDBIDsFileInfo{InDelim} = "";
 713 
 714   $PDBIDsFileInfo{IDsColIndex} = "";
 715 
 716   if (!-e $TextFile) {
 717     die "Error: PDBIDs text file, $TextFile, doesn't exist\n";
 718   }
 719 
 720   if (!CheckFileType($TextFile, "csv tsv")) {
 721     die "Error: Ignoring file $TextFile: It's not a csv or tsv file\n";
 722   }
 723 
 724   ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 725   if ($FileExt =~ /^tsv$/i) {
 726     $InDelim = "\t";
 727   }
 728   else {
 729     $InDelim = "\,";
 730     if (!($OptionsInfo{InDelim} =~ /^(comma|semicolon)$/i)) {
 731       die "Error: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for textfile\n";
 732     }
 733     if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
 734       $InDelim = "\;";
 735     }
 736   }
 737 
 738   if (!open TEXTFILE, "$TextFile") {
 739     die "Error: Ignoring file $TextFile: Couldn't open it: $! \n";
 740   }
 741 
 742   $Line = GetTextLine(\*TEXTFILE);
 743   @ColLabels = quotewords($InDelim, 0, $Line);
 744   close TEXTFILE;
 745   $ColCount = scalar @ColLabels;
 746 
 747   push @{$PDBIDsFileInfo{ColLabels}}, @ColLabels;
 748   $PDBIDsFileInfo{ColCount} = $ColCount ;
 749   $PDBIDsFileInfo{InDelim} = $InDelim;
 750 
 751   # Setup collabel to colnum map...
 752   %{$PDBIDsFileInfo{ColLabelToNumMap}} = ();
 753   for $ColNum (0 .. $#ColLabels) {
 754     $ColLabel = $ColLabels[$ColNum];
 755     $PDBIDsFileInfo{ColLabelToNumMap}{lc $ColLabel} = $ColNum;
 756   }
 757 
 758   # Identify column containing PDB IDs...
 759   $PDBIDsColIndex = "";
 760 
 761   $ColMode = $OptionsInfo{ColMode};
 762   $PDBIDsCol = $OptionsInfo{PDBIDsCol };
 763 
 764   if (IsNotEmpty($PDBIDsCol )) {
 765     if ($ColMode =~ /^collabel$/i) {
 766       $ColLabel = lc $PDBIDsCol;
 767       if (!exists $PDBIDsFileInfo{ColLabelToNumMap}{$ColLabel} ) {
 768         die "Error: Ignoring file $TextFile: The column name, $PDBIDsCol, specified for option \"-p, --PDBIDsCol \" is not valid for text file\n";
 769       }
 770       $PDBIDsColIndex = $PDBIDsFileInfo{ColLabelToNumMap}{$ColLabel};
 771     }
 772     else {
 773       $ColNum = $PDBIDsCol;
 774       if ($ColNum <= 0 || $ColNum > $ColCount) {
 775         die "Error: Ignoring file $TextFile: The column number, $PDBIDsCol, specified for option \"-p, --PDBIDsCol \" is not valid for text file. It must be > 0 and <= $ColCount\n";
 776       }
 777       $PDBIDsColIndex = $ColNum - 1;
 778     }
 779   }
 780   else {
 781     # Look for column name containing PDB_ID or PDBID text string...
 782     $PDBIDsCol = "";
 783     $ColIndex = 0;
 784     COLLABEL: for $ColLabel (@ColLabels) {
 785       if ($ColLabel =~ /(PDB_ID|PDBID)/i) {
 786         $PDBIDsCol = $ColLabel;
 787         $PDBIDsColIndex = $ColIndex;
 788         last COLLABEL;
 789       }
 790       $ColIndex++;
 791     }
 792     if (IsEmpty($PDBIDsCol)) {
 793       die "Error: Ignoring file $TextFile: Couldn't find PDB IDs default column containing text string PDB_ID or PDBID in its name\n";
 794     }
 795   }
 796   $PDBIDsFileInfo{IDsColIndex} = $PDBIDsColIndex;
 797 }
 798 
 799 # Setup script usage  and retrieve command line arguments specified using various options...
 800 sub SetupScriptUsage {
 801 
 802   # Retrieve all the options...
 803   %Options = ();
 804 
 805   $Options{colmode} = "colnum";
 806   $Options{datalocationurl} = "http://www.rcsb.org/pdb/files/";
 807 
 808   $Options{densitymap} = "no";
 809   $Options{densitymapmode} = "auto";
 810 
 811   $Options{densitymaplocationurlcryoem} = "ftp://ftp.wwpdb.org/pub/emdb/structures/";
 812   $Options{denistymaplocationurlxray} = "http://www.ebi.ac.uk/pdbe/coordinates/files/";
 813 
 814   $Options{edmaplocationsuffixes} = "CompositeMap,None,DifferenceMap,_diff,ReflectionMap, _map";
 815   $Options{edmaptypes} = "CompositeMap,DifferenceMap";
 816 
 817   $Options{indelim} = "comma";
 818   $Options{mode} = "IDsOnCmdLine";
 819 
 820   $Options{pdbformat} = "Auto";
 821 
 822   if (!GetOptions(\%Options, "colmode|c=s", "datalocationurl|d=s",  "densitymap=s", "densitymapmode=s", "densitymaplocationurlcryoem=s", "denistymaplocationurlxray=s", "edmaplocationsuffixes=s", "edmaptypes=s",  "help|h",  "indelim=s", "mode|m=s", "pdbidscol|p=s", "pdbformat=s", "workingdir|w=s")) {
 823     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 824   }
 825   if ($Options{workingdir}) {
 826     if (! -d $Options{workingdir}) {
 827       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 828     }
 829     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 830   }
 831   if ($Options{colmode} !~ /^(colnum|collabel)$/i) {
 832     die "Error: The value specified, $Options{colmode}, for option \"-c, --colmode\" is not valid. Allowed values: colnum or collabel\n";
 833   }
 834   if ($Options{densitymap} !~ /^(yes|no)$/i) {
 835     die "Error: The value specified, $Options{densitymap}, for option \"--DensityMap\" is not valid. Allowed values: yes or no\n";
 836   }
 837   if ($Options{densitymapmode} !~ /^(XRayElectronDensity|CryoEMDensity|Auto)$/i) {
 838     die "Error: The value specified, $Options{densitymapmode}, for option \"--DensityMapMode\" is not valid. Allowed values: XRayElectronDensity, CryoEMDensity, Auto\n";
 839   }
 840   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 841     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 842   }
 843   if ($Options{mode} !~ /^(IDsOnCmdLine|IDsInFile)$/i) {
 844     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: IDsOnCmdLine or IDsInFile\n";
 845   }
 846   if ($Options{pdbformat} !~ /^(PDB|CIF|Auto)$/i) {
 847     die "Error: The value specified, $Options{pdbformat}, for option \"--indelim\" is not valid. Allowed values: PDB, CIF or Auto\n";
 848   }
 849 }
 850