MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: ExtractFromTextFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use FileHandle;
  32 use Benchmark;
  33 use FileUtil;
  34 use TextUtil;
  35 
  36 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  37 
  38 # Autoflush STDOUT
  39 $| = 1;
  40 
  41 $StartTime = new Benchmark;
  42 
  43 # Starting message...
  44 $ScriptName = basename $0;
  45 print "\n$ScriptName:Starting...\n\n";
  46 
  47 # Get the options and setup script...
  48 SetupScriptUsage();
  49 if ($Options{help} || @ARGV < 1) {
  50   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  51 }
  52 
  53 my(@TextFilesList);
  54 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  55 
  56 # Process options...
  57 print "Processing options...\n";
  58 my(%OptionsInfo);
  59 ProcessOptions();
  60 
  61 # Collect column information for all the text files...
  62 print "Checking input text file(s)...\n";
  63 my(%TextFilesInfo);
  64 RetrieveTextFilesInfo();
  65 RetrieveColumnsAndRowsInfo();
  66 
  67 # Generate output files...
  68 my($FileIndex);
  69 if (@TextFilesList > 1) {
  70   print "\nProcessing text files...\n";
  71 }
  72 for $FileIndex (0 .. $#TextFilesList) {
  73   if ($TextFilesInfo{FileOkay}[$FileIndex]) {
  74     print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  75     ExtractFromTextFile($FileIndex);
  76   }
  77 }
  78 print "\n$ScriptName:Done...\n\n";
  79 
  80 $EndTime = new Benchmark;
  81 $TotalTime = timediff ($EndTime, $StartTime);
  82 print "Total time: ", timestr($TotalTime), "\n";
  83 
  84 ###############################################################################
  85 
  86 # Extract appropriate data from text file...
  87 sub ExtractFromTextFile {
  88   my($Index) = @_;
  89 
  90   if ($OptionsInfo{Mode} =~ /^categories$/i) {
  91     ExtractCategoryData($Index);
  92   }
  93   elsif ($OptionsInfo{Mode} =~ /^rows$/i){
  94     ExtractRowsData($Index);
  95   }
  96   else {
  97     ExtractColumnData($Index);
  98   }
  99 }
 100 
 101 # Geneate category files...
 102 sub ExtractCategoryData {
 103   my($Index) = @_;
 104   my($TextFile, $CategoryCol, $NewTextFile, $InDelim, @ColLabels);
 105 
 106   $TextFile = $TextFilesList[$Index];
 107 
 108   $NewTextFile = $TextFilesInfo{OutFile}[$Index];
 109   $CategoryCol = $TextFilesInfo{CategoryColNum}[$Index];
 110   $InDelim = $TextFilesInfo{InDelim}[$Index];
 111   @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
 112 
 113   my($Line, @LineWords, $CategoryName, $CategoryCount, %CategoriesNameToCountMap, %CategoriesNameToLinesMap);
 114   # Collect category data...
 115   open TEXTFILE, "$TextFile" or die "Couldn't open $TextFile: $! \n";
 116   # Skip label line...
 117   $_ = <TEXTFILE>;
 118 
 119   %CategoriesNameToCountMap = ();
 120   %CategoriesNameToLinesMap = ();
 121 
 122   while ($Line = GetTextLine(\*TEXTFILE)) {
 123     @LineWords = quotewords($InDelim, 0, $Line);
 124     $CategoryName = ($CategoryCol <= @LineWords) ? $LineWords[$CategoryCol] : "";
 125     if (exists($CategoriesNameToCountMap{$CategoryName})) {
 126       $CategoriesNameToCountMap{$CategoryName} += 1;
 127       push @{$CategoriesNameToLinesMap{$CategoryName}}, $Line;
 128     }
 129     else {
 130       $CategoriesNameToCountMap{$CategoryName} = 1;
 131       @{$CategoriesNameToLinesMap{$CategoryName}} = ();
 132       push @{$CategoriesNameToLinesMap{$CategoryName}}, $Line;
 133     }
 134   }
 135   close TEXTFILE;
 136 
 137   # Setup file names for individual category files...
 138   my(%CategoriesNameToFileHandleMap, %CategoriesNameToFileNameMap, $CategoryFile, $CategoryFileHandle);
 139 
 140   %CategoriesNameToFileHandleMap = ();
 141   %CategoriesNameToFileNameMap = ();
 142 
 143   for $CategoryName (keys %CategoriesNameToCountMap) {
 144     $CategoryFile = $TextFilesInfo{CategoryOutFileRoot}[$Index] . "$CategoryName" . ".$TextFilesInfo{OutFileExt}[$Index]";;
 145     $CategoryFile =~ s/ //g;
 146     $CategoryFileHandle = new FileHandle;
 147     open $CategoryFileHandle, ">$CategoryFile" or die "Couldn't open $CategoryFile: $! \n";
 148     $CategoriesNameToFileNameMap{$CategoryName} = $CategoryFile;
 149     $CategoriesNameToFileHandleMap{$CategoryName} = $CategoryFileHandle;
 150   }
 151 
 152   # Write out summary file...
 153   print "Generating file $NewTextFile...\n";
 154   open NEWTEXTFILE, ">$NewTextFile" or die "Couldn't open $NewTextFile: $! \n";
 155 
 156   # Write out column labels...
 157   @LineWords = ("Category","Count");
 158   $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 159   print NEWTEXTFILE "$Line\n";
 160 
 161   # Write out the category names and count...
 162   for $CategoryName (sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap) {
 163     $CategoryCount = $CategoriesNameToCountMap{$CategoryName};
 164     @LineWords = ("$CategoryName","$CategoryCount");
 165     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 166     print NEWTEXTFILE "$Line\n";
 167   }
 168   close NEWTEXTFILE;
 169 
 170   # Write out a file for each category...
 171   my($ColLabelLine, $LineIndex);
 172 
 173   $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 174   print "\nGenerating text files for each category...\n";
 175 
 176   for $CategoryName (sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap) {
 177     print "Generating file $CategoriesNameToFileNameMap{$CategoryName}...\n";
 178     $CategoryFileHandle = $CategoriesNameToFileHandleMap{$CategoryName};
 179     print $CategoryFileHandle "$ColLabelLine\n";
 180     for $LineIndex (0 .. $#{$CategoriesNameToLinesMap{$CategoryName}}) {
 181       $Line = ${$CategoriesNameToLinesMap{$CategoryName}}[$LineIndex];
 182       @LineWords = quotewords($InDelim, 0, $Line);
 183       $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 184       print $CategoryFileHandle "$Line\n";
 185     }
 186     close $CategoryFileHandle;
 187   }
 188 }
 189 
 190 # Extract data for specific columns...
 191 sub ExtractColumnData {
 192   my($Index) = @_;
 193   my($TextFile, @ColNumsToExtract, $NewTextFile, $InDelim);
 194 
 195   $TextFile = $TextFilesList[$Index];
 196   $NewTextFile =$TextFilesInfo{OutFile}[$Index];
 197   $InDelim = $TextFilesInfo{InDelim}[$Index];
 198   @ColNumsToExtract = @{$TextFilesInfo{ColNumsToExtract}[$Index]};
 199 
 200   print "Generating file $NewTextFile...\n";
 201   open TEXTFILE, "$TextFile" or die "Couldn't open $TextFile: $! \n";
 202   open NEWTEXTFILE, ">$NewTextFile" or die "Couldn't open $NewTextFile: $! \n";
 203 
 204   $_ = <TEXTFILE>;
 205   # Write out column labels...
 206   my($Line, @LineWords, @ColLabels, $ColLabelLine, @ColValues, $ColValuesLine, $ColNum, $ColValue);
 207   @ColLabels = (); $ColLabelLine = "";
 208   for $ColNum (@ColNumsToExtract) {
 209     push @ColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
 210   }
 211   $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 212   print NEWTEXTFILE "$ColLabelLine\n";
 213 
 214   while ($Line = GetTextLine(\*TEXTFILE)) {
 215     @LineWords = quotewords($InDelim, 0, $Line);
 216     @ColValues = (); $ColValuesLine = "";
 217     for $ColNum (@ColNumsToExtract) {
 218       $ColValue = "";
 219       if ($ColNum < @LineWords) {
 220         $ColValue = (defined $LineWords[$ColNum]) ? $LineWords[$ColNum] : "";
 221       }
 222       push @ColValues, $ColValue;
 223     }
 224     $ColValuesLine = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 225     print NEWTEXTFILE "$ColValuesLine\n";
 226   }
 227   close NEWTEXTFILE;
 228   close TEXTFILE;
 229 }
 230 
 231 # Extract data for specific rows...
 232 sub ExtractRowsData {
 233   my($Index) = @_;
 234   my($TextFile, $NewTextFile, $InDelim, $SpecifiedRowsMode);
 235 
 236   $TextFile = $TextFilesList[$Index];
 237   $NewTextFile =$TextFilesInfo{OutFile}[$Index];
 238   $InDelim = $TextFilesInfo{InDelim}[$Index];
 239 
 240   $SpecifiedRowsMode = $OptionsInfo{SpecifiedRowsMode};
 241 
 242   print "Generating file $NewTextFile...\n";
 243   open TEXTFILE, "$TextFile" or die "Couldn't open $TextFile: $! \n";
 244   open NEWTEXTFILE, ">$NewTextFile" or die "Couldn't open $NewTextFile: $! \n";
 245 
 246   my($Line, $RowCount, @LineWords, @ColLabels);
 247 
 248   # Write out column labels...
 249   $Line = <TEXTFILE>;
 250   push @ColLabels, @{$TextFilesInfo{ColLabels}[$Index]};
 251   $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 252   print NEWTEXTFILE "$Line\n";
 253 
 254   if ($SpecifiedRowsMode =~ /^rowsbycolvalue$/i) {
 255     ExtractRowsByColValue($Index, \*TEXTFILE, \*NEWTEXTFILE);
 256   }
 257   elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluelist$/i) {
 258     ExtractRowsByColValueList($Index, \*TEXTFILE, \*NEWTEXTFILE);
 259   }
 260   elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluerange$/i) {
 261     ExtractRowsByColValueRange($Index, \*TEXTFILE, \*NEWTEXTFILE);
 262   }
 263   elsif ($SpecifiedRowsMode =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
 264     ExtractRowByMinOrMaxColValue($Index, \*TEXTFILE, \*NEWTEXTFILE);
 265   }
 266   elsif ($SpecifiedRowsMode =~ /^rownums$/i) {
 267     ExtractRowsByRowNums($Index, \*TEXTFILE, \*NEWTEXTFILE);
 268   }
 269   elsif ($SpecifiedRowsMode =~ /^rownumrange$/i) {
 270     ExtractRowsByRowNumRange($Index, \*TEXTFILE, \*NEWTEXTFILE);
 271   }
 272 
 273   close NEWTEXTFILE;
 274   close TEXTFILE;
 275 }
 276 
 277 # Extract rows by column value...
 278 sub ExtractRowsByColValue {
 279   my($Index, $TextFileRef, $NewTextFileRef) = @_;
 280   my($Line, $ColNum, $ColValue, $Criterion, $Value, $ValueIndex, $InDelim, @LineWords);
 281 
 282   $InDelim = $TextFilesInfo{InDelim}[$Index];
 283 
 284   LINE: while ($Line = GetTextLine($TextFileRef)) {
 285     @LineWords = quotewords($InDelim, 0, $Line);
 286     for ($ValueIndex = 0; $ValueIndex < @{$TextFilesInfo{RowValues}[$Index]}; $ValueIndex = $ValueIndex + 3) {
 287       $ColNum = $TextFilesInfo{RowValues}[$Index][$ValueIndex];
 288       $ColValue = $TextFilesInfo{RowValues}[$Index][$ValueIndex + 1];
 289       $Criterion = $TextFilesInfo{RowValues}[$Index][$ValueIndex + 2];
 290       if ($ColNum > $#LineWords) {
 291         next LINE;
 292       }
 293       $Value = $LineWords[$ColNum];
 294       if ($Criterion =~ /^le$/i) {
 295         if ($Value > $ColValue) {
 296           next LINE;
 297         }
 298       }
 299       elsif ($Criterion =~ /^ge$/i) {
 300         if ($Value < $ColValue) {
 301           next LINE;
 302         }
 303       }
 304       elsif ($Criterion =~ /^eq$/i) {
 305         if ($Value ne $ColValue) {
 306           next LINE;
 307         }
 308       }
 309     }
 310     # Write it out...
 311     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 312     print $NewTextFileRef "$Line\n";
 313   }
 314 }
 315 # Extract rows by column value list...
 316 sub ExtractRowsByColValueList {
 317   my($Index, $TextFileRef, $NewTextFileRef) = @_;
 318   my($Line, $ColNum, $ColValue, $ValueIndex, $Value, $InDelim, %ColValueMap, @LineWords);
 319 
 320   $InDelim = $TextFilesInfo{InDelim}[$Index];
 321   $ColNum = $TextFilesInfo{RowValues}[$Index][0];
 322 
 323   # Setup a col value map...
 324   %ColValueMap = ();
 325   for $ValueIndex (1 .. $#{$TextFilesInfo{RowValues}[$Index]}) {
 326     $Value = $TextFilesInfo{RowValues}[$Index][$ValueIndex];
 327     $ColValueMap{$Value} = $Value;
 328   }
 329 
 330   LINE: while ($Line = GetTextLine($TextFileRef)) {
 331     @LineWords = quotewords($InDelim, 0, $Line);
 332     if ($ColNum > $#LineWords) {
 333       next LINE;
 334     }
 335     $ColValue = $LineWords[$ColNum];
 336     if (exists $ColValueMap{$ColValue}) {
 337       $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 338       print $NewTextFileRef "$Line\n";
 339     }
 340   }
 341 }
 342 
 343 # Extract row by minimum column value...
 344 sub ExtractRowByMinOrMaxColValue {
 345   my($Index, $TextFileRef, $NewTextFileRef) = @_;
 346   my($Line, $ColNum, $ColValue, $FirstValue, $ValueLine, $InDelim, @LineWords);
 347 
 348   $InDelim = $TextFilesInfo{InDelim}[$Index];
 349   $ColNum = $TextFilesInfo{RowValues}[$Index][0];
 350 
 351   $ValueLine = ''; $ColValue = ''; $FirstValue = 1;
 352   LINE: while ($Line = GetTextLine($TextFileRef)) {
 353     @LineWords = quotewords($InDelim, 0, $Line);
 354     if ($ColNum > $#LineWords) {
 355       next LINE;
 356     }
 357     if ($FirstValue) {
 358       $FirstValue = 0;
 359       $ColValue = $LineWords[$ColNum];
 360       $ValueLine = $Line;
 361       next LINE;
 362     }
 363     if ($OptionsInfo{SpecifiedRowsMode} =~ /^rowbymaxcolvalue$/i) {
 364       if ($LineWords[$ColNum] > $ColValue) {
 365         $ColValue = $LineWords[$ColNum];
 366         $ValueLine = $Line;
 367       }
 368     }
 369     else {
 370       if ($LineWords[$ColNum] < $ColValue) {
 371         $ColValue = $LineWords[$ColNum];
 372         $ValueLine = $Line;
 373       }
 374     }
 375   }
 376   if ($ValueLine) {
 377     @LineWords = quotewords($InDelim, 0, $ValueLine);
 378     $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 379     print $NewTextFileRef "$Line\n";
 380   }
 381 }
 382 
 383 # Extract rows by column value range...
 384 sub ExtractRowsByColValueRange {
 385   my($Index, $TextFileRef, $NewTextFileRef) = @_;
 386   my($Line, $ColNum, $ColValue, $MinValue, $MaxValue, $InDelim, @LineWords);
 387 
 388   $InDelim = $TextFilesInfo{InDelim}[$Index];
 389   $ColNum = $TextFilesInfo{RowValues}[$Index][0];
 390   $MinValue = $TextFilesInfo{RowValues}[$Index][1];
 391   $MaxValue = $TextFilesInfo{RowValues}[$Index][2];
 392 
 393   LINE: while ($Line = GetTextLine($TextFileRef)) {
 394     @LineWords = quotewords($InDelim, 0, $Line);
 395     if ($ColNum > $#LineWords) {
 396       next LINE;
 397     }
 398     $ColValue = $LineWords[$ColNum];
 399     if ($ColValue >= $MinValue && $ColValue <= $MaxValue) {
 400       $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 401       print $NewTextFileRef "$Line\n";
 402     }
 403   }
 404 }
 405 
 406 # Extract rows by row number range...
 407 sub ExtractRowsByRowNumRange {
 408   my($Index, $TextFileRef, $NewTextFileRef) = @_;
 409 
 410   my($Line, $MinRowNum, $MaxRowNum, $RowCount, $InDelim, @LineWords);
 411   $InDelim = $TextFilesInfo{InDelim}[$Index];
 412   $MinRowNum = $TextFilesInfo{RowValues}[$Index][0];
 413   $MaxRowNum = $TextFilesInfo{RowValues}[$Index][1];
 414 
 415   $RowCount = 1;
 416   LINE: while ($Line = GetTextLine($TextFileRef)) {
 417     $RowCount++;
 418     if ($RowCount >= $MinRowNum && $RowCount <= $MaxRowNum) {
 419       @LineWords = quotewords($InDelim, 0, $Line);
 420       $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 421       print $NewTextFileRef "$Line\n";
 422     }
 423     elsif ($RowCount > $MaxRowNum) {
 424       last LINE;
 425     }
 426   }
 427 }
 428 
 429 # Extract rows by row numbers...
 430 sub ExtractRowsByRowNums {
 431   my($Index, $TextFileRef, $NewTextFileRef) = @_;
 432   my($Line, $RowNum, $MaxRowNum, $RowCount, $InDelim, %RowNumMap, @LineWords);
 433 
 434   $InDelim = $TextFilesInfo{InDelim}[$Index];
 435 
 436   # Setup a row nums map...
 437   %RowNumMap = ();
 438   $MaxRowNum = $TextFilesInfo{RowValues}[$Index][0];
 439   for $RowNum (@{$TextFilesInfo{RowValues}[$Index]}) {
 440     if ($RowNum > $MaxRowNum) {
 441       $MaxRowNum = $RowNum;
 442     }
 443     $RowNumMap{$RowNum} = $RowNum;
 444   }
 445 
 446   $RowCount = 1;
 447   LINE: while ($Line = GetTextLine($TextFileRef)) {
 448     $RowCount++;
 449     if (exists $RowNumMap{$RowCount}) {
 450       @LineWords = quotewords($InDelim, 0, $Line);
 451       $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 452       print $NewTextFileRef "$Line\n";
 453     }
 454     elsif ($RowCount > $MaxRowNum) {
 455       last LINE;
 456     }
 457   }
 458 }
 459 
 460 # Retrieve text file columns and rows information for specified options...
 461 sub RetrieveColumnsAndRowsInfo {
 462   ProcessColumnsInfo();
 463   ProcessRowsInfo();
 464 }
 465 
 466 # Make sure the specified columns exists in text files...
 467 sub ProcessColumnsInfo {
 468   my($Index, $SpecifiedCategoryCol, $TextFile, @ColNumsToExtract);
 469 
 470   @{$TextFilesInfo{CategoryColNum}} = ();
 471   @{$TextFilesInfo{ColNumsToExtract}} = ();
 472 
 473   $SpecifiedCategoryCol = $OptionsInfo{SpecifiedCategoryCol};
 474 
 475   FILELIST: for $Index (0 .. $#TextFilesList) {
 476     $TextFile = $TextFilesList[$Index];
 477 
 478     $TextFilesInfo{CategoryColNum}[$Index] = 0;
 479     @{$TextFilesInfo{ColNumsToExtract}[$Index]} = ();
 480 
 481     if ($TextFilesInfo{FileOkay}[$Index]) {
 482       if ($OptionsInfo{Mode} =~ /^categories$/i) {
 483         my($CategoryColNum, $CategoryColValid);
 484 
 485         $CategoryColNum = 0;
 486         $CategoryColValid = 1;
 487         if ($SpecifiedCategoryCol) {
 488           if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
 489             if ($SpecifiedCategoryCol <= $TextFilesInfo{ColCount}[$Index]) {
 490               $CategoryColNum = $SpecifiedCategoryCol - 1;
 491             }
 492             else {
 493               $CategoryColValid = 0;
 494             }
 495           }
 496           else {
 497             if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol})) {
 498               $CategoryColNum =  $TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol};
 499             }
 500             else {
 501               $CategoryColValid = 0;
 502             }
 503           }
 504         }
 505         if ($CategoryColValid) {
 506           $TextFilesInfo{CategoryColNum}[$Index] = $CategoryColNum;
 507         }
 508         else {
 509           warn "Warning: Ignoring file $TextFile: Category column specified, $SpecifiedCategoryCol, using \"--categorycol\" option doesn't exist\n";
 510           $TextFilesInfo{FileOkay}[$Index] = 0;
 511         }
 512       }
 513       elsif ($OptionsInfo{Mode} =~ /^columns$/i) {
 514         my($SpecifiedColNum, $ColNum);
 515 
 516         $ColNum = 0;
 517         @ColNumsToExtract = ();
 518 
 519         if (@{$OptionsInfo{SpecifiedColumns}}) {
 520           if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
 521             for $SpecifiedColNum (@{$OptionsInfo{SpecifiedColumns}}) {
 522               if ($SpecifiedColNum >=1 && $SpecifiedColNum <= $TextFilesInfo{ColCount}[$Index]) {
 523                 $ColNum = $SpecifiedColNum - 1;
 524                 push @ColNumsToExtract, $ColNum;
 525               }
 526             }
 527           }
 528           else {
 529             my($ColLabel);
 530             for $ColLabel (@{$OptionsInfo{SpecifiedColumns}}) {
 531               if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) {
 532                 push @ColNumsToExtract, $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
 533               }
 534             }
 535           }
 536         }
 537         else {
 538           push @ColNumsToExtract, $ColNum;
 539         }
 540         if (@ColNumsToExtract) {
 541           push @{$TextFilesInfo{ColNumsToExtract}[$Index]}, @ColNumsToExtract;
 542         }
 543         else {
 544           warn "Warning: Ignoring file $TextFile: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist\n";
 545           $TextFilesInfo{FileOkay}[$Index] = 0;
 546         }
 547       }
 548     }
 549   }
 550 }
 551 
 552 # Process specified rows info...
 553 sub ProcessRowsInfo {
 554   my($Index, $TextFile, $ColID, $ColIDOkay, $Value, $Criterion, $ColNum, @RowValues);
 555 
 556   @{$TextFilesInfo{RowValues}} = ();
 557 
 558   FILELIST: for $Index (0 .. $#TextFilesList) {
 559     $TextFile = $TextFilesList[$Index];
 560     @{$TextFilesInfo{RowValues}[$Index]} = ();
 561 
 562     if ($OptionsInfo{Mode} !~ /^rows$/i) {
 563       next FILELIST;
 564     }
 565     if (!$TextFilesInfo{FileOkay}[$Index]) {
 566       next FILELIST;
 567     }
 568 
 569     @RowValues = ();
 570 
 571     if ($OptionsInfo{RowsMode} =~ /^rowsbycolvalue$/i) {
 572       my($ValueIndex);
 573       for ($ValueIndex = 0; $ValueIndex < @{$OptionsInfo{SpecifiedRowValues}}; $ValueIndex = $ValueIndex + 3) {
 574         $ColID = $OptionsInfo{SpecifiedRowValues}[$ValueIndex];
 575         $Value = $OptionsInfo{SpecifiedRowValues}[$ValueIndex + 1];
 576         $Criterion = $OptionsInfo{SpecifiedRowValues}[$ValueIndex + 2];
 577 
 578         $ColIDOkay = 0;
 579         if ($OptionsInfo{ColMode} =~ /^collabel$/i) {
 580           if (exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID}) {
 581             $ColIDOkay = 1;
 582             $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID};
 583           }
 584         }
 585         else {
 586           if ($ColID >=1 && $ColID <= $TextFilesInfo{ColCount}[$Index]) {
 587             $ColNum = $ColID - 1;
 588             $ColIDOkay = 1;
 589           }
 590         }
 591         if ($ColIDOkay) {
 592           push @RowValues, ($ColNum, $Value, $Criterion);
 593         }
 594       }
 595     }
 596     elsif ($OptionsInfo{RowsMode} =~ /^(rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue)$/i) {
 597       # Process coulumn id...
 598       $ColID = $OptionsInfo{SpecifiedRowValues}[0];
 599       $ColIDOkay = 0;
 600 
 601       if ($OptionsInfo{ColMode} =~ /^collabel$/i) {
 602         if (exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID}) {
 603           $ColIDOkay = 1;
 604           $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID};
 605         }
 606       }
 607       else {
 608         if ($ColID >=1 && $ColID <= $TextFilesInfo{ColCount}[$Index]) {
 609           $ColIDOkay = 1;
 610           $ColNum = $ColID - 1;
 611         }
 612       }
 613       if ($ColIDOkay) {
 614         push @RowValues, $ColNum;
 615         # Get rest of the specified values...
 616         if (@{$OptionsInfo{SpecifiedRowValues}} > 1) {
 617           for $Index (1 .. $#{$OptionsInfo{SpecifiedRowValues}}) {
 618             push @RowValues, $OptionsInfo{SpecifiedRowValues}[$Index];
 619           }
 620         }
 621       }
 622     }
 623     elsif ($OptionsInfo{RowsMode} =~ /^(rownums|rownumrange)$/i) {
 624       push @RowValues, @{$OptionsInfo{SpecifiedRowValues}};
 625     }
 626 
 627     if (@RowValues) {
 628       push @{$TextFilesInfo{RowValues}[$Index]}, @RowValues;
 629     }
 630     else {
 631       warn "Warning: Ignoring file $TextFile: Column specified, $ColID, using \"--rows\" option doesn't exist\n";
 632       $TextFilesInfo{FileOkay}[$Index] = 0;
 633     }
 634   }
 635 }
 636 
 637 # Retrieve information about input text files...
 638 sub RetrieveTextFilesInfo {
 639   my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $CategoryOutFileRoot, $OutFile, $ColNum, $ColLabel);
 640 
 641   %TextFilesInfo = ();
 642 
 643   @{$TextFilesInfo{FileOkay}} = ();
 644   @{$TextFilesInfo{ColCount}} = ();
 645   @{$TextFilesInfo{ColLabels}} = ();
 646   @{$TextFilesInfo{ColLabelToNumMap}} = ();
 647   @{$TextFilesInfo{InDelim}} = ();
 648   @{$TextFilesInfo{OutFile}} = ();
 649   @{$TextFilesInfo{OutFileExt}} = ();
 650   @{$TextFilesInfo{CategoryOutFileRoot}} = ();
 651 
 652   FILELIST: for $Index (0 .. $#TextFilesList) {
 653     $TextFile = $TextFilesList[$Index];
 654 
 655     $TextFilesInfo{FileOkay}[$Index] = 0;
 656     $TextFilesInfo{ColCount}[$Index] = 0;
 657     $TextFilesInfo{InDelim}[$Index] = "";
 658     $TextFilesInfo{OutFile}[$Index] = "";
 659     $TextFilesInfo{OutFileExt}[$Index] = "";
 660     $TextFilesInfo{CategoryOutFileRoot}[$Index] = "";
 661 
 662     @{$TextFilesInfo{ColLabels}[$Index]} = ();
 663     %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();
 664 
 665     if (!(-e $TextFile)) {
 666       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
 667       next FILELIST;
 668     }
 669     if (!CheckFileType($TextFile, "csv tsv")) {
 670       warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
 671       next FILELIST;
 672     }
 673 
 674     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 675     if ($FileExt =~ /^tsv$/i) {
 676       $InDelim = "\t";
 677     }
 678     else {
 679       $InDelim = "\,";
 680       if (!($OptionsInfo{InDelim} =~ /^(comma|semicolon)$/i)) {
 681         warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
 682         next FILELIST;
 683       }
 684       if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
 685         $InDelim = "\;";
 686       }
 687     }
 688 
 689     if (!open TEXTFILE, "$TextFile") {
 690       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 691       next FILELIST;
 692     }
 693 
 694     $Line = GetTextLine(\*TEXTFILE);
 695     @ColLabels = quotewords($InDelim, 0, $Line);
 696     close TEXTFILE;
 697 
 698     $FileDir = ""; $FileName = ""; $FileExt = "";
 699     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 700     $FileExt = "csv";
 701     if ($Options{outdelim} =~ /^tab$/i) {
 702       $FileExt = "tsv";
 703     }
 704 
 705     if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
 706       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 707       if ($RootFileName && $RootFileExt) {
 708         $FileName = $RootFileName;
 709       }
 710       else {
 711         $FileName = $OptionsInfo{OutFileRoot};
 712       }
 713       $OutFileRoot .= $FileName;
 714     }
 715     else {
 716       $OutFileRoot = $FileName;
 717       $OutFileRoot .= ($OptionsInfo{Mode} =~ /^categories$/i) ? "CategoriesSummary" : (($OptionsInfo{Mode} =~ /^rows$/i) ? "ExtractedRows" : "ExtractedColumns");
 718     }
 719     $CategoryOutFileRoot = "$FileName" . "Category";
 720 
 721     $OutFile = $OutFileRoot . ".$FileExt";
 722     if (lc($OutFile) eq lc($TextFile)) {
 723       warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
 724       next FILELIST;
 725     }
 726 
 727     if (!$OptionsInfo{Overwrite}) {
 728       if (-e $OutFile) {
 729         warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
 730         next FILELIST;
 731       }
 732     }
 733 
 734     $TextFilesInfo{FileOkay}[$Index] = 1;
 735     $TextFilesInfo{InDelim}[$Index] = $InDelim;
 736     $TextFilesInfo{CategoryOutFileRoot}[$Index] = $CategoryOutFileRoot;
 737     $TextFilesInfo{OutFile}[$Index] = "$OutFile";
 738     $TextFilesInfo{OutFileExt}[$Index] = "$FileExt";
 739 
 740     $TextFilesInfo{ColCount}[$Index] = @ColLabels;
 741     push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
 742 
 743     for $ColNum (0 .. $#ColLabels) {
 744       $ColLabel = $ColLabels[$ColNum];
 745       $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
 746     }
 747   }
 748 }
 749 
 750 # Process option values...
 751 sub ProcessOptions {
 752   my(@SpecifiedColumns, @SpecifiedRowValues);
 753 
 754   %OptionsInfo = ();
 755 
 756   $OptionsInfo{Mode} = $Options{mode};
 757 
 758   $OptionsInfo{ColMode} = $Options{colmode};
 759 
 760   $OptionsInfo{CategoryCol} = defined $Options{categorycol} ? $Options{categorycol} : undef;
 761   $OptionsInfo{SpecifiedCategoryCol} = "";
 762 
 763   if (defined $Options{categorycol}) {
 764     my(@SpecifiedValues) = split ",", $Options{categorycol};
 765     if (@SpecifiedValues != 1) {
 766       die "Error: Invalid number of values, ",scalar(@SpecifiedValues), " using \"--categorycol\" option: Only one value is allowed.\n";
 767     }
 768     $OptionsInfo{SpecifiedCategoryCol} = $SpecifiedValues[0];
 769     if ($Options{colmode} =~ /^colnum$/i) {
 770       if (!IsPositiveInteger($OptionsInfo{SpecifiedCategoryCol})) {
 771         die "Error: Category column value, $OptionsInfo{SpecifiedCategoryCol}, specified using \"--categorycol\" is not valid. Allowed integer values: > 0.\n";
 772       }
 773     }
 774   }
 775 
 776   $OptionsInfo{Columns} = defined $Options{columns} ? $Options{columns} : undef;
 777   @{$OptionsInfo{SpecifiedColumns}} = ();
 778   @SpecifiedColumns = ();
 779 
 780   if (defined $Options{columns}) {
 781     my(@SpecifiedValues) = split ",", $Options{columns};
 782     if ($Options{colmode} =~ /^colnum$/i) {
 783       my($ColValue);
 784       for $ColValue (@SpecifiedValues) {
 785         if (!IsPositiveInteger($ColValue)) {
 786           die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n";
 787         }
 788       }
 789     }
 790     push @SpecifiedColumns, @SpecifiedValues;
 791   }
 792   @{$OptionsInfo{SpecifiedColumns}} = @SpecifiedColumns;
 793 
 794   $OptionsInfo{InDelim} = $Options{indelim};
 795 
 796   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
 797   $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
 798   $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
 799 
 800   $OptionsInfo{OutFileRoot} = defined $Options{root} ? $Options{root} : undef;
 801 
 802   # Process any specified rows values...
 803   @SpecifiedRowValues = ();
 804   @{$OptionsInfo{SpecifiedRowValues}} = ();
 805 
 806   $OptionsInfo{RowsMode} = $Options{rowsmode};
 807   $OptionsInfo{Rows} = defined $Options{rows} ? $Options{rows} : undef;
 808 
 809   $OptionsInfo{SpecifiedRowsMode} = $Options{rowsmode};
 810 
 811   if (defined $Options{rows}) {
 812     (@SpecifiedRowValues) = split ",", $Options{rows};
 813   }
 814   else {
 815     if ($Options{rowsmode} !~ /^rownums$/i) {
 816       die "Error: Specify value for \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\n";
 817     }
 818     push @SpecifiedRowValues, "1";
 819   }
 820   @{$OptionsInfo{SpecifiedRowValues}} = @SpecifiedRowValues;
 821 
 822   my($SpecifiedColID, $SpecifiedRowID);
 823   # Make sure specified values are okay...
 824   if ($Options{rowsmode} =~ /^rowsbycolvalue$/i) {
 825     if (@SpecifiedRowValues % 3) {
 826       die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain triplets.\n";
 827     }
 828     # Triplet format: colid,value,criteria. Criterion: le,ge,eq
 829     my($Index, $ColID, $Criterion, $Value);
 830     for ($Index = 0; $Index < @SpecifiedRowValues; $Index = $Index + 3) {
 831       $ColID = $SpecifiedRowValues[$Index];
 832       $Value = $SpecifiedRowValues[$Index + 1];
 833       $Criterion = $SpecifiedRowValues[$Index + 2];
 834       if ($Options{colmode} =~ /^colnum$/i) {
 835         if (!IsPositiveInteger($ColID)) {
 836           die "Error: Invalid column id, $ColID, specified in triplet, \"$ColID,$Criterion,$Value\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 837         }
 838       }
 839       if ($Criterion !~ /^(eq|le|ge)$/i) {
 840         die "Error: Invalid criterion value, $Criterion, specified in triplet, \"$ColID,$Criterion,$Value\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed values: le, ge, or eq.\n";
 841       }
 842     }
 843   }
 844   elsif ($Options{rowsmode} =~ /^rowsbycolvaluelist$/i) {
 845     ($SpecifiedColID) = $SpecifiedRowValues[0];
 846     if ($Options{colmode} =~ /^colnum$/i) {
 847       if (!IsPositiveInteger($SpecifiedColID)) {
 848         die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 849       }
 850     }
 851     if (@SpecifiedRowValues == 1) {
 852       die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain more than one value\n";
 853     }
 854   }
 855   elsif ($Options{rowsmode} =~ /^rowsbycolvaluerange$/i) {
 856     if (@SpecifiedRowValues != 3) {
 857       die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain three values\n";
 858     }
 859     ($SpecifiedColID) = $SpecifiedRowValues[0];
 860     if ($Options{colmode} =~ /^colnum$/i) {
 861       if (!IsPositiveInteger($SpecifiedColID)) {
 862         die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 863       }
 864     }
 865     if ($SpecifiedRowValues[1] >= $SpecifiedRowValues[2]) {
 866       die "Error: Invalid value triplet - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: second value < third value\n";
 867     }
 868   }
 869   elsif ($Options{rowsmode} =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
 870     if (@SpecifiedRowValues != 1) {
 871       die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nOnly one value is allowed.\n";
 872     }
 873     ($SpecifiedColID) = $SpecifiedRowValues[0];
 874     if ($Options{colmode} =~ /^colnum$/i) {
 875       if (!IsPositiveInteger($SpecifiedColID)) {
 876         die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 877       }
 878     }
 879   }
 880   elsif ($Options{rowsmode} =~ /^rownums$/i) {
 881     for $SpecifiedRowID (@SpecifiedRowValues) {
 882       if (!IsPositiveInteger($SpecifiedRowID)) {
 883         die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 884       }
 885     }
 886   }
 887   elsif ($Options{rowsmode} =~ /^rownumrange$/i) {
 888     if (@SpecifiedRowValues != 2) {
 889       die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain only two values.\n";
 890     }
 891     for $SpecifiedRowID (@SpecifiedRowValues) {
 892       if (!IsPositiveInteger($SpecifiedRowID)) {
 893         die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
 894       }
 895     }
 896     if ($SpecifiedRowValues[0] >= $SpecifiedRowValues[1]) {
 897       die "Error: Invalid value pair -  ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: First value < second value\n";
 898     }
 899   }
 900 }
 901 
 902 # Setup script usage and retrieve command line arguments specified using various options; dies on the first invalid option value...
 903 sub SetupScriptUsage {
 904   # Start from the default values; GetOptions below overrides them with whatever is specified on the command line...
 905   %Options = ();
 906   $Options{colmode} = "colnum";
 907   $Options{indelim} = "comma";
 908   $Options{mode} = "columns";
 909   $Options{outdelim} = "comma";
 910   $Options{quote} = "yes";
 911   $Options{rowsmode} = "rownums";
 912
 913   if (!GetOptions(\%Options, "categorycol=s", "columns=s", "colmode|c=s", "help|h", "indelim=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "rows=s", "rowsmode=s", "workingdir|w=s")) {
 914     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 915   }
 916   if ($Options{workingdir}) {
 917     if (! -d $Options{workingdir}) {
 918       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 919     }
 920     # Use low precedence "or": with "||", die binds to the directory name and never runs after a failed chdir...
 921     chdir($Options{workingdir}) or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 922   }
 923   if ($Options{mode} !~ /^(columns|rows|categories)$/i) {
 924     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: columns, rows or categories \n";
 925   }
 926   if ($Options{colmode} !~ /^(colnum|collabel)$/i) {
 927     die "Error: The value specified, $Options{colmode}, for option \"--colmode\" is not valid. Allowed values: colnum or collabel \n";
 928   }
 929   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 930     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 931   }
 932   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
 933     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
 934   }
 935   if ($Options{quote} !~ /^(yes|no)$/i) {
 936     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
 937   }
 938   if ($Options{rowsmode} !~ /^(rowsbycolvalue|rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue|rownums|rownumrange)$/i) {
 939     die "Error: The value specified, $Options{rowsmode}, for option \"--rowsmode\" is not valid. Allowed values: rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue, rowbymaxcolvalue, rownums, rownumrange\n";
 940   }
 941 }