MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: ExtractFromSequenceFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 use SequenceFileUtil;
  35 
  36 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  37 
  38 # Autoflush STDOUT
  39 $| = 1;
  40 
  41 # Starting message...
  42 $ScriptName = basename($0);
  43 print "\n$ScriptName: Starting...\n\n";
  44 $StartTime = new Benchmark;
  45 
  46 # Setup script usage message...
  47 SetupScriptUsage();
  48 if ($Options{help} || @ARGV < 1) {
  49   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  50 }
  51 
  52 # Expand wild card file names...
  53 my(@SequenceFilesList);
  54 @SequenceFilesList = ExpandFileNames(\@ARGV, "aln msf fasta fta pir");
  55 
  56 # Process options...
  57 print "Processing options...\n";
  58 my(%OptionsInfo);
  59 ProcessOptions();
  60 
  61 # Set up information about input files...
  62 print "Checking input sequence file(s)...\n";
  63 my(%SequenceFilesInfo);
  64 RetrieveSequenceFilesInfo();
  65 
  66 # Process input files..
  67 my($FileIndex);
  68 if (@SequenceFilesList > 1) {
  69   print "\nProcessing sequence files...\n";
  70 }
  71 for $FileIndex (0 .. $#SequenceFilesList) {
  72   if ($SequenceFilesInfo{FilesOkay}[$FileIndex]) {
  73     print "\nProcessing file $SequenceFilesList[$FileIndex]...\n";
  74     ExtractFromSequenceFiles($FileIndex);
  75   }
  76 }
  77 print "\n$ScriptName:Done...\n\n";
  78 
  79 $EndTime = new Benchmark;
  80 $TotalTime = timediff ($EndTime, $StartTime);
  81 print "Total time: ", timestr($TotalTime), "\n";
  82 
  83 ###############################################################################
  84 
  85 # Extract from sequence files...
  86 sub ExtractFromSequenceFiles {
  87   my($FileIndex) = @_;
  88   my($OutSequenceFile, $SequenceFile, $SequenceDataRef, $SpecifiedSequenceDataRef);
  89 
  90   # Read sequence file...
  91   $SequenceFile = $SequenceFilesList[$FileIndex];
  92   open SEQUENCEFILE, "$SequenceFile" or die "Error: Can't open $SequenceFile: $! \n";
  93   $SequenceDataRef = ReadSequenceFile($SequenceFile);
  94   close SEQUENCEFILE;
  95 
  96   $OutSequenceFile = $SequenceFilesInfo{OutFile}[$FileIndex];
  97   print "Generating sequence file $OutSequenceFile...\n";
  98 
  99   # Retrieve sequence data for specified sequences...
 100   $SpecifiedSequenceDataRef = GetSpecifiedSequenceData($SequenceDataRef);
 101 
 102   # Handle gaps...
 103   if ($OptionsInfo{IgnoreGaps}) {
 104     if (@{$SpecifiedSequenceDataRef->{IDs}} > 1) {
 105       if (AreSequenceLengthsIdentical($SpecifiedSequenceDataRef)) {
 106         $SpecifiedSequenceDataRef = RemoveSequenceAlignmentGapColumns($SpecifiedSequenceDataRef);
 107       }
 108     }
 109     else {
 110       # Remove the gaps from the sequence...
 111       my($ID, $Sequence);
 112       $ID = $SpecifiedSequenceDataRef->{IDs}[0];
 113       $Sequence = $SpecifiedSequenceDataRef->{Sequence}{$ID};
 114       $SpecifiedSequenceDataRef->{Sequence}{$ID} = RemoveSequenceGaps($Sequence);
 115     }
 116   }
 117 
 118   # Write out the file...
 119   WritePearsonFastaSequenceFile($OutSequenceFile, $SpecifiedSequenceDataRef, $OptionsInfo{MaxSequenceLength});
 120 }
 121 
 122 # Get specified sequence data...
 123 sub GetSpecifiedSequenceData {
 124   my($SequenceDataRef) = @_;
 125 
 126   if ($OptionsInfo{Mode} =~ /^SequenceID$/i) {
 127     return GetDataBySequenceIDs($SequenceDataRef);
 128   }
 129   elsif ($Options{mode} =~ /^SequenceNum$/i) {
 130     return GetDataBySequenceNums($SequenceDataRef);
 131   }
 132   elsif ($Options{mode} =~ /^SequenceNumRange$/i) {
 133     return GetDataBySequenceNumRange($SequenceDataRef);
 134   }
 135   else {
 136     return undef;
 137   }
 138 }
 139 
 140 # Get specified sequence data...
 141 sub GetDataBySequenceIDs {
 142   my($SequenceDataRef) = @_;
 143   my($ID, $SequenceCount, $IDMatched, $SpecifiedID, %SpecifiedSequenceDataMap);
 144 
 145   # Go over sequences and collect sequences for writing out a new sequence file...
 146   %SpecifiedSequenceDataMap = ();
 147   @{$SpecifiedSequenceDataMap{IDs}} = ();
 148   %{$SpecifiedSequenceDataMap{Description}} = ();
 149   %{$SpecifiedSequenceDataMap{Sequence}} = ();
 150 
 151   $SequenceCount = 0;
 152   ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 153     if ($OptionsInfo{MatchExactSequenceIDs}) {
 154       if (!exists $OptionsInfo{SpecifiedSequenceIDsMap}{lc($ID)}) {
 155         next ID;
 156       }
 157       if ($SequenceCount >= scalar @{$OptionsInfo{SpecifiedSequenceIDs}}) {
 158         last ID;
 159       }
 160       $SequenceCount++;
 161     }
 162     else {
 163       # Does this ID contains specified ID as substring...
 164       $IDMatched = 0;
 165       SPECIFIEDID: for $SpecifiedID (@{$OptionsInfo{SpecifiedSequenceIDs}}) {
 166         if ($ID =~ /$SpecifiedID/i) {
 167           $IDMatched = 1;
 168           last SPECIFIEDID;
 169         }
 170       }
 171       if (!$IDMatched) {
 172         next ID;
 173       }
 174       $SequenceCount++;
 175     }
 176     # Collect sequence data...
 177     push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
 178     $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
 179     $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
 180   }
 181 
 182   return \%SpecifiedSequenceDataMap;
 183 }
 184 
 185 # Get specified sequence data...
 186 sub GetDataBySequenceNums {
 187   my($SequenceDataRef) = @_;
 188   my($ID, $SequenceNum, $SequenceCount, %SpecifiedSequenceDataMap);
 189 
 190   # Go over sequences and collect sequences for writing out a new sequence file...
 191   %SpecifiedSequenceDataMap = ();
 192   @{$SpecifiedSequenceDataMap{IDs}} = ();
 193   %{$SpecifiedSequenceDataMap{Description}} = ();
 194   %{$SpecifiedSequenceDataMap{Sequence}} = ();
 195 
 196   $SequenceNum = 0;
 197   $SequenceCount = 0;
 198   ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 199     $SequenceNum++;
 200     if (!exists $OptionsInfo{SpecifiedSequenceIDsMap}{$SequenceNum}) {
 201       next ID;
 202     }
 203     if ($SequenceCount >= scalar @{$OptionsInfo{SpecifiedSequenceIDs}}) {
 204       last ID;
 205     }
 206     $SequenceCount++;
 207 
 208     # Collect sequence data...
 209     push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
 210     $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
 211     $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
 212   }
 213 
 214   return \%SpecifiedSequenceDataMap;
 215 }
 216 
 217 # Get specified sequence data...
 218 sub GetDataBySequenceNumRange {
 219   my($SequenceDataRef) = @_;
 220   my($ID, $SequenceNum, $SequenceCount, %SpecifiedSequenceDataMap);
 221 
 222   # Go over sequences and collect sequences for writing out a new sequence file...
 223   %SpecifiedSequenceDataMap = ();
 224   @{$SpecifiedSequenceDataMap{IDs}} = ();
 225   %{$SpecifiedSequenceDataMap{Description}} = ();
 226   %{$SpecifiedSequenceDataMap{Sequence}} = ();
 227 
 228   $SequenceNum = 0;
 229   $SequenceCount = 0;
 230   ID: for $ID (@{$SequenceDataRef->{IDs}}) {
 231     $SequenceNum++;
 232 
 233     if (!($SequenceNum >= $OptionsInfo{SpecifiedSequenceIDs}[0] && $SequenceNum <= $OptionsInfo{SpecifiedSequenceIDs}[1])) {
 234       next ID;
 235     }
 236     if ($SequenceNum > $OptionsInfo{SpecifiedSequenceIDs}[1]) {
 237       last ID;
 238     }
 239     $SequenceCount++;
 240     # Collect sequence data...
 241     push @{$SpecifiedSequenceDataMap{IDs}}, $ID;
 242     $SpecifiedSequenceDataMap{Description}{$ID} = $SequenceDataRef->{Description}{$ID};
 243     $SpecifiedSequenceDataMap{Sequence}{$ID} = $SequenceDataRef->{Sequence}{$ID};
 244   }
 245 
 246   return \%SpecifiedSequenceDataMap;
 247 }
 248 
 249 
 250 # Process option values...
 251 sub ProcessOptions {
 252   %OptionsInfo = ();
 253 
 254   # Miscellaneous options...
 255   $OptionsInfo{IgnoreGaps} = ($Options{ignoregaps} =~ /Yes/i) ? 1 : 0;
 256 
 257   $OptionsInfo{Mode} = $Options{mode};
 258   $OptionsInfo{MatchExactSequenceIDs} = $Options{sequenceidmatch} =~ /Exact/i ? 1 :0;
 259 
 260   # Check specified sequences value...
 261   $OptionsInfo{SpecifiedSequences} = $Options{sequences};
 262   @{$OptionsInfo{SpecifiedSequenceIDs}} = ();
 263   %{$OptionsInfo{SpecifiedSequenceIDsMap}} = ();
 264 
 265   my(@SpecifiedSequenceIDs) = ();
 266   if ($Options{mode} =~ /^SequenceID$/i) {
 267     if (!$Options{sequences}) {
 268       die "Error: No value specified for option \"-s, --Sequences\" during \"SequenceID\" of \"-m, --mode\" option\n";
 269     }
 270     @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
 271   }
 272   elsif ($Options{mode} =~ /^SequenceNum$/i) {
 273     if ($Options{sequences}) {
 274       @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
 275       my($SequenceNum);
 276       for $SequenceNum (@SpecifiedSequenceIDs) {
 277         if (!IsPositiveInteger($SequenceNum)) {
 278           die "Error: The value specified, $SequenceNum, in \"$Options{sequences}\" for option \"-s, --Sequences\" is not valid: Valid values: > 0\n";
 279         }
 280       }
 281     }
 282     else {
 283       push @SpecifiedSequenceIDs, "1";
 284     }
 285   }
 286   elsif ($Options{mode} =~ /^SequenceNumRange$/i) {
 287     if (!$Options{sequences}) {
 288       die "Error: No value specified for option \"-s, --Sequences\" during \"SequenceNumRange\" of \"-m, --mode\" option\n";
 289     }
 290     @SpecifiedSequenceIDs = split /\,/, $Options{sequences};
 291     if (@SpecifiedSequenceIDs != 2) {
 292       die "Error: The number of values", scalar @SpecifiedSequenceIDs, " specified, $Options{sequences}, for option \"-s, --Sequences\" are not valid. Number of values must be 2 to indicate starting and ending sequence number.\n";
 293     }
 294     my($SequenceNum);
 295     for $SequenceNum (@SpecifiedSequenceIDs) {
 296       if (!IsPositiveInteger($SequenceNum)) {
 297         die "Error: The value specified, $SequenceNum, in \"$Options{sequences}\" for option \"-s, --Sequences\" is not valid: Valid values: > 0\n";
 298       }
 299     }
 300     if ($SpecifiedSequenceIDs[0] > $SpecifiedSequenceIDs[1]) {
 301       die "Error: The value specified \"$Options{sequences}\" for option \"-s, --Sequences\" are not valid: Starting sequence number $SpecifiedSequenceIDs[0] must be smaller than ending sequence number $SpecifiedSequenceIDs[1]\n";
 302     }
 303   }
 304   push @{$OptionsInfo{SpecifiedSequenceIDs}}, @SpecifiedSequenceIDs;
 305   my($SequenceID);
 306   for $SequenceID (@SpecifiedSequenceIDs) {
 307     if ($Options{mode} =~ /^SequenceID$/i) {
 308       $OptionsInfo{SpecifiedSequenceIDsMap}{lc($SequenceID)} = $SequenceID;
 309     }
 310     else {
 311       $OptionsInfo{SpecifiedSequenceIDsMap}{$SequenceID} = $SequenceID;
 312     }
 313   }
 314 
 315   $OptionsInfo{MaxSequenceLength} = $Options{sequencelength};
 316   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 317   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 318 }
 319 
 320 # Retrieve information about sequence files...
 321 sub RetrieveSequenceFilesInfo {
 322   my($Index, $SequenceFile, $FileSupported, $FileFormat, $SequenceCount, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFileExt, $OutFileMode, $SequenceDataRef);
 323 
 324   %SequenceFilesInfo = ();
 325   @{$SequenceFilesInfo{FilesOkay}} = ();
 326   @{$SequenceFilesInfo{OutFileRoot}} = ();
 327   @{$SequenceFilesInfo{OutFileExt}} = ();
 328   @{$SequenceFilesInfo{OutFile}} = ();
 329   @{$SequenceFilesInfo{Format}} = ();
 330   @{$SequenceFilesInfo{SequenceCount}} = ();
 331 
 332   FILELIST: for $Index (0 .. $#SequenceFilesList) {
 333     $SequenceFile = $SequenceFilesList[$Index];
 334     $SequenceFilesInfo{FilesOkay}[$Index] = 0;
 335     $SequenceFilesInfo{OutFileRoot}[$Index] = '';
 336     $SequenceFilesInfo{OutFileExt}[$Index] = '';
 337     $SequenceFilesInfo{OutFile}[$Index] = '';
 338     $SequenceFilesInfo{Format}[$Index] = 'NotSupported';
 339     $SequenceFilesInfo{SequenceCount}[$Index] = 0;
 340 
 341     if (! open SEQUENCEFILE, "$SequenceFile") {
 342       warn "Warning: Ignoring file $SequenceFile: Couldn't open it: $! \n";
 343       next FILELIST;
 344     }
 345     close SEQUENCEFILE;
 346 
 347     ($FileSupported, $FileFormat) = IsSupportedSequenceFile($SequenceFile);
 348     if (!$FileSupported) {
 349       warn "Warning: Ignoring file $SequenceFile: Sequence file format is not supported.\n";
 350       next FILELIST;
 351     }
 352     $SequenceDataRef = ReadSequenceFile($SequenceFile);
 353 
 354     $SequenceCount = $SequenceDataRef->{Count};
 355     if (!$SequenceCount) {
 356       warn "Warning: Ignoring file $SequenceFile: Sequence data is missing.\n";
 357       next FILELIST;
 358     }
 359 
 360     # Setup output file names...
 361     $FileDir = ""; $FileName = ""; $FileExt = "";
 362     ($FileDir, $FileName, $FileExt) = ParseFileName($SequenceFile);
 363     $OutFileExt = 'fasta';
 364     if ($OptionsInfo{OutFileRoot} && (@SequenceFilesList == 1)) {
 365       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 366       if ($RootFileName && $RootFileExt) {
 367         $FileName = $RootFileName;
 368       }
 369       else {
 370         $FileName = $OptionsInfo{OutFileRoot};
 371       }
 372       $OutFileRoot = $FileName;
 373     }
 374     else {
 375       $OutFileRoot = $FileName;
 376     }
 377     MODE: {
 378         if ($OptionsInfo{Mode} =~ /^SequenceID$/i) { $OutFileMode = 'SequenceID'; last MODE;}
 379         if ($OptionsInfo{Mode} =~ /^SequenceNum$/i) { $OutFileMode = 'SequenceNum'; last MODE;}
 380         if ($OptionsInfo{Mode} =~ /^SequenceNumRange$/i) { $OutFileMode = 'SequenceNumRange'; last MODE;}
 381         $OutFileMode = '';
 382     }
 383     if (!$OptionsInfo{OverwriteFiles}) {
 384       if (-e "${OutFileRoot}${OutFileMode}.${OutFileExt}") {
 385         warn "Warning: Ignoring file $SequenceFile: The file ${OutFileRoot}${OutFileMode}.${OutFileExt} already exists\n";
 386         next FILELIST;
 387       }
 388     }
 389 
 390     $SequenceFilesInfo{FilesOkay}[$Index] = 1;
 391     $SequenceFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 392     $SequenceFilesInfo{OutFileExt}[$Index] = $OutFileExt;
 393     $SequenceFilesInfo{OutFile}[$Index] = "${OutFileRoot}${OutFileMode}.${OutFileExt}";
 394 
 395     $SequenceFilesInfo{Format}[$Index] = $FileFormat;
 396     $SequenceFilesInfo{SequenceCount}[$Index] = $SequenceCount;
 397   }
 398 }
 399 
 400 # Setup script usage  and retrieve command line arguments specified using various options...
 401 sub SetupScriptUsage {
 402 
 403   # Retrieve all the options...
 404   %Options = ();
 405   $Options{ignoregaps} = 'Yes';
 406   $Options{mode} = 'SequenceNum';
 407   $Options{sequenceidmatch} = 'Relaxed';
 408   $Options{sequencelength} = 80;
 409 
 410   if (!GetOptions(\%Options, "help|h", "ignoregaps|i=s", "mode|m=s", "overwrite|o", "root|r=s", "sequences|s=s", "sequenceidmatch=s", "sequencelength=i", "workingdir|w=s")) {
 411     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 412   }
 413   if ($Options{workingdir}) {
 414     if (! -d $Options{workingdir}) {
 415       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 416     }
 417     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 418   }
 419   if ($Options{ignoregaps} !~ /^(yes|no)$/i) {
 420     die "Error: The value specified, $Options{ignoregaps}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 421   }
 422   if ($Options{mode} !~ /^(SequenceID|SequenceNum|SequenceNumRange)$/i) {
 423     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: SequenceID, SequenceNum, or SequenceNumRange\n";
 424   }
 425   if ($Options{sequenceidmatch} !~ /^(Exact|Relaxed)$/i) {
 426     die "Error: The value specified, $Options{sequenceidmatch}, for option \"--SequenceIDMatch\" is not valid. Allowed values: Exact or Relaxed\n";
 427   }
 428   if (!IsPositiveInteger($Options{sequencelength})) {
 429     die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n";
 430   }
 431 }
 432