#!/usr/bin/perl -w
#
# File: ExtractFromTextFiles.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability or fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use FileHandle;
use Benchmark;
use FileUtil;
use TextUtil;

# File-level state shared with the subroutines of this script...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Direct method call instead of the ambiguous indirect object form "new Benchmark"...
$StartTime = Benchmark->new;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Expand the command line arguments into the list of csv/tsv files to process...
my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect column information for all the text files...
print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
RetrieveColumnsAndRowsInfo();

# Generate output files...
my($FileIndex);
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for $FileIndex (0 .. $#TextFilesList) {
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    ExtractFromTextFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Extract appropriate data from text file...
#
# Dispatches on $OptionsInfo{Mode}: "categories" and "rows" have dedicated
# handlers; any other mode is handled as column extraction.
#
# Arguments:
#   $Index - index of the input file in @TextFilesList.
sub ExtractFromTextFile {
  my($Index) = @_;

  if ($OptionsInfo{Mode} =~ /^categories$/i) {
    ExtractCategoryData($Index);
  }
  elsif ($OptionsInfo{Mode} =~ /^rows$/i) {
    ExtractRowsData($Index);
  }
  else {
    ExtractColumnData($Index);
  }
}

# Generate category files...
#
# Writes a summary file listing each category value found in the category
# column along with its row count, and one additional file per category
# containing the column labels and the rows belonging to that category.
#
# Arguments:
#   $Index - index of the input file in @TextFilesList.
#
# Dies when the input file or any of the output files can't be opened, or when
# an output file can't be closed.
sub ExtractCategoryData {
  my($Index) = @_;
  my($TextFile, $CategoryCol, $NewTextFile, $InDelim, @ColLabels);

  $TextFile = $TextFilesList[$Index];

  $NewTextFile = $TextFilesInfo{OutFile}[$Index];
  $CategoryCol = $TextFilesInfo{CategoryColNum}[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

  my($Line, @LineWords, $CategoryName, $CategoryCount, %CategoriesNameToCountMap, %CategoriesNameToLinesMap);

  # Collect category data...
  open my $TextFileHandle, "<", $TextFile or die "Couldn't open $TextFile: $! \n";
  # Skip label line...
  $Line = <$TextFileHandle>;

  %CategoriesNameToCountMap = ();
  %CategoriesNameToLinesMap = ();

  while ($Line = GetTextLine($TextFileHandle)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    # $CategoryCol is a zero based index: it must be strictly less than the number
    # of words. Rows without a value in the category column are collected under an
    # empty category name...
    $CategoryName = ($CategoryCol < @LineWords && defined $LineWords[$CategoryCol]) ? $LineWords[$CategoryCol] : "";
    $CategoriesNameToCountMap{$CategoryName} += 1;
    push @{$CategoriesNameToLinesMap{$CategoryName}}, $Line;
  }
  close $TextFileHandle;

  # Setup file names for individual category files...
  my(%CategoriesNameToFileNameMap, $CategoryFile);

  %CategoriesNameToFileNameMap = ();
  for $CategoryName (keys %CategoriesNameToCountMap) {
    $CategoryFile = $TextFilesInfo{CategoryOutFileRoot}[$Index] . "$CategoryName" . ".$TextFilesInfo{OutFileExt}[$Index]";
    $CategoryFile =~ s/ //g;
    $CategoriesNameToFileNameMap{$CategoryName} = $CategoryFile;
  }

  # Write out summary file...
  print "Generating file $NewTextFile...\n";
  open my $NewTextFileHandle, ">", $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Write out column labels...
  @LineWords = ("Category","Count");
  $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Write out the category names and count...
  for $CategoryName (sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap) {
    $CategoryCount = $CategoriesNameToCountMap{$CategoryName};
    @LineWords = ("$CategoryName","$CategoryCount");
    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  # Buffered write errors surface at close...
  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";

  # Write out a file for each category...
  my($ColLabelLine, $CategoryLine);

  $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print "\nGenerating text files for each category...\n";

  for $CategoryName (sort { lc($a) cmp lc($b) } keys %CategoriesNameToCountMap) {
    $CategoryFile = $CategoriesNameToFileNameMap{$CategoryName};
    print "Generating file $CategoryFile...\n";

    # Open one category file at a time: holding a handle open for every category
    # can run into the limit on open files for data with many categories...
    open my $CategoryFileHandle, ">", $CategoryFile or die "Couldn't open $CategoryFile: $! \n";
    print $CategoryFileHandle "$ColLabelLine\n";
    for $CategoryLine (@{$CategoriesNameToLinesMap{$CategoryName}}) {
      @LineWords = quotewords($InDelim, 0, $CategoryLine);
      $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CategoryFileHandle "$Line\n";
    }
    close $CategoryFileHandle or die "Couldn't close $CategoryFile: $! \n";
  }
}

# Extract data for specific columns...
#
# Writes the labels and values of the columns listed in
# $TextFilesInfo{ColNumsToExtract}[$Index] to the output file. A column missing
# from a data row is written out as an empty value.
#
# Arguments:
#   $Index - index of the input file in @TextFilesList.
#
# Dies when the input or output file can't be opened, or when the output file
# can't be closed.
sub ExtractColumnData {
  my($Index) = @_;
  my($TextFile, @ColNumsToExtract, $NewTextFile, $InDelim);

  $TextFile = $TextFilesList[$Index];
  $NewTextFile = $TextFilesInfo{OutFile}[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColNumsToExtract = @{$TextFilesInfo{ColNumsToExtract}[$Index]};

  print "Generating file $NewTextFile...\n";
  open my $TextFileHandle, "<", $TextFile or die "Couldn't open $TextFile: $! \n";
  open my $NewTextFileHandle, ">", $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  my($Line, @LineWords, @ColLabels, $ColLabelLine, @ColValues, $ColValuesLine);

  # Skip label line...
  $Line = <$TextFileHandle>;

  # Write out column labels...
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]}[@ColNumsToExtract];
  $ColLabelLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$ColLabelLine\n";

  while ($Line = GetTextLine($TextFileHandle)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    # Use an empty value for columns missing from short rows...
    @ColValues = map { ($_ < @LineWords && defined $LineWords[$_]) ? $LineWords[$_] : "" } @ColNumsToExtract;
    $ColValuesLine = JoinWords(\@ColValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$ColValuesLine\n";
  }
  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";
  close $TextFileHandle;
}

# Extract data for specific rows...
#
# Writes the column labels to the output file and hands both file handles to
# the row extraction routine matching $OptionsInfo{SpecifiedRowsMode}. The
# label line of the input file has already been consumed by the time the
# extraction routine is called.
#
# Arguments:
#   $Index - index of the input file in @TextFilesList.
#
# Dies when the input or output file can't be opened, or when the output file
# can't be closed.
sub ExtractRowsData {
  my($Index) = @_;
  my($TextFile, $NewTextFile, $SpecifiedRowsMode, $Line, @ColLabels);

  $TextFile = $TextFilesList[$Index];
  $NewTextFile = $TextFilesInfo{OutFile}[$Index];

  $SpecifiedRowsMode = $OptionsInfo{SpecifiedRowsMode};

  print "Generating file $NewTextFile...\n";
  open my $TextFileHandle, "<", $TextFile or die "Couldn't open $TextFile: $! \n";
  open my $NewTextFileHandle, ">", $NewTextFile or die "Couldn't open $NewTextFile: $! \n";

  # Skip label line and write out column labels...
  $Line = <$TextFileHandle>;
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  if ($SpecifiedRowsMode =~ /^rowsbycolvalue$/i) {
    ExtractRowsByColValue($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluelist$/i) {
    ExtractRowsByColValueList($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rowsbycolvaluerange$/i) {
    ExtractRowsByColValueRange($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
    ExtractRowByMinOrMaxColValue($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rownums$/i) {
    ExtractRowsByRowNums($Index, $TextFileHandle, $NewTextFileHandle);
  }
  elsif ($SpecifiedRowsMode =~ /^rownumrange$/i) {
    ExtractRowsByRowNumRange($Index, $TextFileHandle, $NewTextFileHandle);
  }

  close $NewTextFileHandle or die "Couldn't close $NewTextFile: $! \n";
  close $TextFileHandle;
}

# Extract rows by column value...
#
# A row is written out only when it satisfies every (column number, value,
# criterion) triplet stored in $TextFilesInfo{RowValues}[$Index]. Criteria
# "le" and "ge" compare numerically; "eq" compares as strings. A row too short
# to contain any one of the columns is skipped.
#
# Arguments:
#   $Index          - index of the input file in @TextFilesList.
#   $TextFileRef    - input file handle positioned after the label line.
#   $NewTextFileRef - output file handle.
sub ExtractRowsByColValue {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($InDelim, $CriteriaRef, $Row, $OutRow, $Start, $FieldNum, $Target, $Test, $Field, @Fields);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  $CriteriaRef = $TextFilesInfo{RowValues}[$Index];

  ROW: while ($Row = GetTextLine($TextFileRef)) {
    @Fields = quotewords($InDelim, 0, $Row);

    # Walk through the triplets and move on to the next row at the first failure...
    $Start = 0;
    while ($Start < @{$CriteriaRef}) {
      ($FieldNum, $Target, $Test) = @{$CriteriaRef}[$Start .. $Start + 2];
      $Start += 3;

      next ROW if ($FieldNum > $#Fields);
      $Field = $Fields[$FieldNum];

      if ($Test =~ /^le$/i) {
        next ROW if ($Field > $Target);
      }
      elsif ($Test =~ /^ge$/i) {
        next ROW if ($Field < $Target);
      }
      elsif ($Test =~ /^eq$/i) {
        next ROW if ($Field ne $Target);
      }
    }

    # All the criteria are met: write it out...
    $OutRow = JoinWords(\@Fields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print {$NewTextFileRef} "$OutRow\n";
  }
}

# Extract rows by column value list...
#
# The first entry in $TextFilesInfo{RowValues}[$Index] is the column number;
# the remaining entries are the values to look for. A row is written out when
# its value in that column is one of the listed values.
#
# Arguments:
#   $Index          - index of the input file in @TextFilesList.
#   $TextFileRef    - input file handle positioned after the label line.
#   $NewTextFileRef - output file handle.
sub ExtractRowsByColValueList {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($InDelim, $FieldNum, $Row, $OutRow, @Wanted, %IsWanted, @Fields);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  ($FieldNum, @Wanted) = @{$TextFilesInfo{RowValues}[$Index]};

  # Setup a lookup table for the specified values...
  %IsWanted = map { $_ => $_ } @Wanted;

  while ($Row = GetTextLine($TextFileRef)) {
    @Fields = quotewords($InDelim, 0, $Row);
    next if ($FieldNum > $#Fields);
    next unless (exists $IsWanted{$Fields[$FieldNum]});

    $OutRow = JoinWords(\@Fields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print {$NewTextFileRef} "$OutRow\n";
  }
}

# Extract row by minimum or maximum column value...
#
# Scans all the rows and writes out the single row with the largest value
# (rowbymaxcolvalue) or the smallest value (any other rows mode) in the
# specified column. Values are compared numerically and the first row found
# wins ties. Nothing is written out when no row contains the column.
#
# Arguments:
#   $Index          - index of the input file in @TextFilesList.
#   $TextFileRef    - input file handle positioned after the label line.
#   $NewTextFileRef - output file handle.
sub ExtractRowByMinOrMaxColValue {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($InDelim, $FieldNum, $WantMax, $Row, $OutRow, $Candidate, $BestValue, $BestRow, $SeenAny, @Fields);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  $FieldNum = $TextFilesInfo{RowValues}[$Index][0];
  $WantMax = ($OptionsInfo{SpecifiedRowsMode} =~ /^rowbymaxcolvalue$/i) ? 1 : 0;

  $BestRow = '';
  $BestValue = '';
  $SeenAny = 0;

  while ($Row = GetTextLine($TextFileRef)) {
    @Fields = quotewords($InDelim, 0, $Row);
    next if ($FieldNum > $#Fields);
    $Candidate = $Fields[$FieldNum];

    if (!$SeenAny) {
      # The first usable row is the initial best candidate...
      $SeenAny = 1;
      $BestValue = $Candidate;
      $BestRow = $Row;
      next;
    }
    if ($WantMax ? ($Candidate > $BestValue) : ($Candidate < $BestValue)) {
      $BestValue = $Candidate;
      $BestRow = $Row;
    }
  }

  if ($BestRow) {
    @Fields = quotewords($InDelim, 0, $BestRow);
    $OutRow = JoinWords(\@Fields, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print {$NewTextFileRef} "$OutRow\n";
  }
}

# Extract rows by column value range...
#
# $TextFilesInfo{RowValues}[$Index] holds the column number followed by the
# minimum and maximum values. A row is written out when its value in that
# column lies within the inclusive numeric range; a row too short to contain
# the column is skipped.
#
# Arguments:
#   $Index          - index of the input file in @TextFilesList.
#   $TextFileRef    - input file handle positioned after the label line.
#   $NewTextFileRef - output file handle.
sub ExtractRowsByColValueRange {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $ColNum, $ColValue, $MinValue, $MaxValue, $InDelim, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  ($ColNum, $MinValue, $MaxValue) = @{$TextFilesInfo{RowValues}[$Index]}[0 .. 2];

  LINE: while ($Line = GetTextLine($TextFileRef)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    if ($ColNum > $#LineWords) {
      next LINE;
    }
    $ColValue = $LineWords[$ColNum];
    if ($ColValue >= $MinValue && $ColValue <= $MaxValue) {
      $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileRef "$Line\n";
    }
  }
}

# Extract rows by row number range...
#
# $TextFilesInfo{RowValues}[$Index] holds the first and last row numbers. Row
# counting starts at 1 before any row is read and is incremented for each row
# read from $TextFileRef, so the first row read here is row number 2. Reading
# stops after the last row in the range.
#
# Arguments:
#   $Index          - index of the input file in @TextFilesList.
#   $TextFileRef    - input file handle positioned after the label line.
#   $NewTextFileRef - output file handle.
sub ExtractRowsByRowNumRange {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $MinRowNum, $MaxRowNum, $RowCount, $InDelim, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];
  ($MinRowNum, $MaxRowNum) = @{$TextFilesInfo{RowValues}[$Index]}[0 .. 1];

  $RowCount = 1;
  LINE: while ($Line = GetTextLine($TextFileRef)) {
    $RowCount++;
    if ($RowCount >= $MinRowNum && $RowCount <= $MaxRowNum) {
      @LineWords = quotewords($InDelim, 0, $Line);
      $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileRef "$Line\n";
    }
    elsif ($RowCount > $MaxRowNum) {
      last LINE;
    }
  }
}

# Extract rows by row numbers...
#
# $TextFilesInfo{RowValues}[$Index] holds the row numbers to extract. Rows are
# numbered as in ExtractRowsByRowNumRange: the first row read here is row
# number 2. Reading stops once the largest specified row number has been
# passed.
#
# Arguments:
#   $Index          - index of the input file in @TextFilesList.
#   $TextFileRef    - input file handle positioned after the label line.
#   $NewTextFileRef - output file handle.
sub ExtractRowsByRowNums {
  my($Index, $TextFileRef, $NewTextFileRef) = @_;
  my($Line, $RowNum, $MaxRowNum, $RowCount, $InDelim, %RowNumMap, @LineWords);

  $InDelim = $TextFilesInfo{InDelim}[$Index];

  # Setup a row nums map...
  %RowNumMap = ();
  $MaxRowNum = $TextFilesInfo{RowValues}[$Index][0];
  for $RowNum (@{$TextFilesInfo{RowValues}[$Index]}) {
    if ($RowNum > $MaxRowNum) {
      $MaxRowNum = $RowNum;
    }
    $RowNumMap{$RowNum} = $RowNum;
  }

  $RowCount = 1;
  LINE: while ($Line = GetTextLine($TextFileRef)) {
    $RowCount++;
    if (exists $RowNumMap{$RowCount}) {
      @LineWords = quotewords($InDelim, 0, $Line);
      $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileRef "$Line\n";
    }
    elsif ($RowCount > $MaxRowNum) {
      last LINE;
    }
  }
}

# Retrieve text file columns and rows information for specified options...
sub RetrieveColumnsAndRowsInfo {
  ProcessColumnsInfo();
  ProcessRowsInfo();
}

# Make sure the specified columns exists in text files...
#
# For each usable input file, maps the category column (categories mode) or
# the columns to extract (columns mode) to zero based column numbers and
# stores them in $TextFilesInfo{CategoryColNum} and
# $TextFilesInfo{ColNumsToExtract}. The first column is used when no column is
# specified. A file is marked as not okay, with a warning, when the category
# column or all of the specified columns are missing from it.
sub ProcessColumnsInfo {
  my($Index, $SpecifiedCategoryCol, $TextFile, @ColNumsToExtract);

  @{$TextFilesInfo{CategoryColNum}} = ();
  @{$TextFilesInfo{ColNumsToExtract}} = ();

  $SpecifiedCategoryCol = $OptionsInfo{SpecifiedCategoryCol};

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{CategoryColNum}[$Index] = 0;
    @{$TextFilesInfo{ColNumsToExtract}[$Index]} = ();

    if (!$TextFilesInfo{FileOkay}[$Index]) {
      next FILELIST;
    }

    if ($OptionsInfo{Mode} =~ /^categories$/i) {
      my($CategoryColNum, $CategoryColValid);

      $CategoryColNum = 0;
      $CategoryColValid = 1;
      if ($SpecifiedCategoryCol) {
        if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
          if ($SpecifiedCategoryCol <= $TextFilesInfo{ColCount}[$Index]) {
            # Specified column numbers start from 1...
            $CategoryColNum = $SpecifiedCategoryCol - 1;
          }
          else {
            $CategoryColValid = 0;
          }
        }
        else {
          if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol})) {
            $CategoryColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCategoryCol};
          }
          else {
            $CategoryColValid = 0;
          }
        }
      }
      if ($CategoryColValid) {
        $TextFilesInfo{CategoryColNum}[$Index] = $CategoryColNum;
      }
      else {
        warn "Warning: Ignoring file $TextFile: Category column specified, $SpecifiedCategoryCol, using \"--categorycol\" option doesn't exist\n";
        $TextFilesInfo{FileOkay}[$Index] = 0;
      }
    }
    elsif ($OptionsInfo{Mode} =~ /^columns$/i) {
      my($SpecifiedColNum, $ColLabel);

      @ColNumsToExtract = ();

      if (@{$OptionsInfo{SpecifiedColumns}}) {
        if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
          # Column numbers outside the file's column range are silently left out...
          for $SpecifiedColNum (@{$OptionsInfo{SpecifiedColumns}}) {
            if ($SpecifiedColNum >= 1 && $SpecifiedColNum <= $TextFilesInfo{ColCount}[$Index]) {
              push @ColNumsToExtract, $SpecifiedColNum - 1;
            }
          }
        }
        else {
          # Column labels missing from the file are silently left out...
          for $ColLabel (@{$OptionsInfo{SpecifiedColumns}}) {
            if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) {
              push @ColNumsToExtract, $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
            }
          }
        }
      }
      else {
        # Default to the first column...
        push @ColNumsToExtract, 0;
      }
      if (@ColNumsToExtract) {
        push @{$TextFilesInfo{ColNumsToExtract}[$Index]}, @ColNumsToExtract;
      }
      else {
        warn "Warning: Ignoring file $TextFile: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist\n";
        $TextFilesInfo{FileOkay}[$Index] = 0;
      }
    }
  }
}

# Process specified rows info...
#
# In rows mode, converts the values specified using "--rows" option into the
# per file lists stored in $TextFilesInfo{RowValues}: column ids are replaced
# by zero based column numbers; row numbers are copied as specified. A file is
# marked as not okay, with a warning, when no usable values are left for it.
sub ProcessRowsInfo {
  my($Index, $TextFile, $ColID, $ColIDOkay, $Value, $Criterion, $ColNum, @RowValues);

  @{$TextFilesInfo{RowValues}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];
    @{$TextFilesInfo{RowValues}[$Index]} = ();

    if ($OptionsInfo{Mode} !~ /^rows$/i) {
      next FILELIST;
    }
    if (!$TextFilesInfo{FileOkay}[$Index]) {
      next FILELIST;
    }

    @RowValues = ();

    if ($OptionsInfo{RowsMode} =~ /^rowsbycolvalue$/i) {
      # Triplets whose column doesn't exist in the file are left out...
      my($ValueIndex);
      for ($ValueIndex = 0; $ValueIndex < @{$OptionsInfo{SpecifiedRowValues}}; $ValueIndex = $ValueIndex + 3) {
        $ColID = $OptionsInfo{SpecifiedRowValues}[$ValueIndex];
        $Value = $OptionsInfo{SpecifiedRowValues}[$ValueIndex + 1];
        $Criterion = $OptionsInfo{SpecifiedRowValues}[$ValueIndex + 2];

        $ColIDOkay = 0;
        if ($OptionsInfo{ColMode} =~ /^collabel$/i) {
          if (exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID}) {
            $ColIDOkay = 1;
            $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID};
          }
        }
        else {
          if ($ColID >= 1 && $ColID <= $TextFilesInfo{ColCount}[$Index]) {
            $ColNum = $ColID - 1;
            $ColIDOkay = 1;
          }
        }
        if ($ColIDOkay) {
          push @RowValues, ($ColNum, $Value, $Criterion);
        }
      }
    }
    elsif ($OptionsInfo{RowsMode} =~ /^(rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue)$/i) {
      # Process column id...
      $ColID = $OptionsInfo{SpecifiedRowValues}[0];
      $ColIDOkay = 0;

      if ($OptionsInfo{ColMode} =~ /^collabel$/i) {
        if (exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID}) {
          $ColIDOkay = 1;
          $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColID};
        }
      }
      else {
        if ($ColID >= 1 && $ColID <= $TextFilesInfo{ColCount}[$Index]) {
          $ColIDOkay = 1;
          $ColNum = $ColID - 1;
        }
      }
      if ($ColIDOkay) {
        push @RowValues, $ColNum;
        # Get rest of the specified values. A slice is used instead of a loop over
        # $Index to avoid reusing the loop variable of the file loop...
        my($LastValueIndex) = $#{$OptionsInfo{SpecifiedRowValues}};
        if ($LastValueIndex >= 1) {
          push @RowValues, @{$OptionsInfo{SpecifiedRowValues}}[1 .. $LastValueIndex];
        }
      }
    }
    elsif ($OptionsInfo{RowsMode} =~ /^(rownums|rownumrange)$/i) {
      push @RowValues, @{$OptionsInfo{SpecifiedRowValues}};
    }

    if (@RowValues) {
      push @{$TextFilesInfo{RowValues}[$Index]}, @RowValues;
    }
    else {
      warn "Warning: Ignoring file $TextFile: Column specified, $ColID, using \"--rows\" option doesn't exist\n";
      $TextFilesInfo{FileOkay}[$Index] = 0;
    }
  }
}

# Retrieve information about input text files...
#
# Initializes %TextFilesInfo and, for each file in @TextFilesList, checks that
# the file can be used, reads its column labels, and works out the input
# delimiter along with the output file names. Files which can't be used are
# left marked as not okay after a warning.
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $OutFileExt, $CategoryOutFileRoot, $OutFile, $ColNum, $ColLabel);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFile}} = ();
  @{$TextFilesInfo{OutFileExt}} = ();
  @{$TextFilesInfo{CategoryOutFileRoot}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFile}[$Index] = "";
    $TextFilesInfo{OutFileExt}[$Index] = "";
    $TextFilesInfo{CategoryOutFileRoot}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # The input delimiter is tab for tsv files and the one specified using
    # "--indelim" option for all other files...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if (!($OptionsInfo{InDelim} =~ /^(comma|semicolon)$/i)) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Retrieve the column label line...
    my($TextFileHandle);
    if (!open $TextFileHandle, "<", $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    $Line = GetTextLine($TextFileHandle);
    close $TextFileHandle;

    # Nothing to extract without a column label line...
    if (!defined($Line) || !length($Line)) {
      warn "Warning: Ignoring file $TextFile: It doesn't contain a column label line\n";
      next FILELIST;
    }
    @ColLabels = quotewords($InDelim, 0, $Line);

    # Setup output file names...
    $OutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $OutFileExt = "tsv";
    }

    if ($OptionsInfo{OutFileRoot} && (@TextFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName;
      $OutFileRoot .= ($OptionsInfo{Mode} =~ /^categories$/i) ? "CategoriesSummary" : (($OptionsInfo{Mode} =~ /^rows$/i) ? "ExtractedRows" : "ExtractedColumns");
    }
    $CategoryOutFileRoot = "$FileName" . "Category";

    $OutFile = $OutFileRoot . ".$OutFileExt";
    if (lc($OutFile) eq lc($TextFile)) {
      warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
      next FILELIST;
    }

    if (!$OptionsInfo{Overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
        next FILELIST;
      }
    }

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{CategoryOutFileRoot}[$Index] = $CategoryOutFileRoot;
    $TextFilesInfo{OutFile}[$Index] = "$OutFile";
    $TextFilesInfo{OutFileExt}[$Index] = "$OutFileExt";

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;

    # Map column labels to zero based column numbers...
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }
}

# Process option values...
#
# Copies the command line option values from %Options into %OptionsInfo,
# splits the comma delimited values of "--categorycol", "--columns" and
# "--rows" options into lists, and validates them against the values of
# "--colmode" and "--rowsmode" options. Dies with an error message on the
# first invalid value.
sub ProcessOptions {
  my(@SpecifiedColumns, @SpecifiedRowValues);

  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{ColMode} = $Options{colmode};

  $OptionsInfo{CategoryCol} = $Options{categorycol};
  $OptionsInfo{SpecifiedCategoryCol} = "";

  if (defined $Options{categorycol}) {
    my(@SpecifiedValues) = split ",", $Options{categorycol};
    if (@SpecifiedValues != 1) {
      die "Error: Invalid number of values, ",scalar(@SpecifiedValues), " using \"--categorycol\" option: Only one value is allowed.\n";
    }
    $OptionsInfo{SpecifiedCategoryCol} = $SpecifiedValues[0];
    if ($Options{colmode} =~ /^colnum$/i) {
      if (!IsPositiveInteger($OptionsInfo{SpecifiedCategoryCol})) {
        die "Error: Category column value, $OptionsInfo{SpecifiedCategoryCol}, specified using \"--categorycol\" is not valid. Allowed integer values: > 0.\n";
      }
    }
  }

  $OptionsInfo{Columns} = $Options{columns};
  @{$OptionsInfo{SpecifiedColumns}} = ();
  @SpecifiedColumns = ();

  if (defined $Options{columns}) {
    my(@SpecifiedValues) = split ",", $Options{columns};
    if ($Options{colmode} =~ /^colnum$/i) {
      my($ColValue);
      for $ColValue (@SpecifiedValues) {
        if (!IsPositiveInteger($ColValue)) {
          die "Error: Column value, $ColValue, specified using \"--columns\" is not valid: Allowed integer values: > 0.\n";
        }
      }
    }
    push @SpecifiedColumns, @SpecifiedValues;
  }
  @{$OptionsInfo{SpecifiedColumns}} = @SpecifiedColumns;

  $OptionsInfo{InDelim} = $Options{indelim};

  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
  $OptionsInfo{Overwrite} = $Options{overwrite};

  $OptionsInfo{OutFileRoot} = $Options{root};

  # Process any specified rows values...
  @SpecifiedRowValues = ();
  @{$OptionsInfo{SpecifiedRowValues}} = ();

  $OptionsInfo{RowsMode} = $Options{rowsmode};
  $OptionsInfo{Rows} = $Options{rows};

  $OptionsInfo{SpecifiedRowsMode} = $Options{rowsmode};

  if (defined $Options{rows}) {
    (@SpecifiedRowValues) = split ",", $Options{rows};
  }
  else {
    # Only rownums mode has a default value for "--rows" option...
    if ($Options{rowsmode} !~ /^rownums$/i) {
      die "Error: Specify value for \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\n";
    }
    push @SpecifiedRowValues, "1";
  }
  @{$OptionsInfo{SpecifiedRowValues}} = @SpecifiedRowValues;

  my($SpecifiedColID, $SpecifiedRowID);
  # Make sure specified values are okay...
  if ($Options{rowsmode} =~ /^rowsbycolvalue$/i) {
    if (@SpecifiedRowValues % 3) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain triplets.\n";
    }
    # Triplet format: colid,value,criteria. Criterion: le,ge,eq
    my($Index, $ColID, $Criterion, $Value);
    for ($Index = 0; $Index < @SpecifiedRowValues; $Index = $Index + 3) {
      $ColID = $SpecifiedRowValues[$Index];
      $Value = $SpecifiedRowValues[$Index + 1];
      $Criterion = $SpecifiedRowValues[$Index + 2];
      # Error messages echo the triplet in the order it was specified: colid,value,criterion...
      if ($Options{colmode} =~ /^colnum$/i) {
        if (!IsPositiveInteger($ColID)) {
          die "Error: Invalid column id, $ColID, specified in triplet, \"$ColID,$Value,$Criterion\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
        }
      }
      if ($Criterion !~ /^(eq|le|ge)$/i) {
        die "Error: Invalid criterion value, $Criterion, specified in triplet, \"$ColID,$Value,$Criterion\", using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed values: le, ge, or eq.\n";
      }
    }
  }
  elsif ($Options{rowsmode} =~ /^rowsbycolvaluelist$/i) {
    ($SpecifiedColID) = $SpecifiedRowValues[0];
    if ($Options{colmode} =~ /^colnum$/i) {
      if (!IsPositiveInteger($SpecifiedColID)) {
        die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
    if (@SpecifiedRowValues == 1) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain more than one value\n";
    }
  }
  elsif ($Options{rowsmode} =~ /^rowsbycolvaluerange$/i) {
    if (@SpecifiedRowValues != 3) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain three values\n";
    }
    ($SpecifiedColID) = $SpecifiedRowValues[0];
    if ($Options{colmode} =~ /^colnum$/i) {
      if (!IsPositiveInteger($SpecifiedColID)) {
        die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
    if ($SpecifiedRowValues[1] >= $SpecifiedRowValues[2]) {
      die "Error: Invalid value triplet - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: second value < third value\n";
    }
  }
  elsif ($Options{rowsmode} =~ /^(rowbymincolvalue|rowbymaxcolvalue)$/i) {
    if (@SpecifiedRowValues != 1) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nOnly one value is allowed.\n";
    }
    ($SpecifiedColID) = $SpecifiedRowValues[0];
    if ($Options{colmode} =~ /^colnum$/i) {
      if (!IsPositiveInteger($SpecifiedColID)) {
        die "Error: Rows value, $SpecifiedColID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
  }
  elsif ($Options{rowsmode} =~ /^rownums$/i) {
    for $SpecifiedRowID (@SpecifiedRowValues) {
      if (!IsPositiveInteger($SpecifiedRowID)) {
        die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
  }
  elsif ($Options{rowsmode} =~ /^rownumrange$/i) {
    if (@SpecifiedRowValues != 2) {
      die "Error: Invalid number of values, ", scalar(@SpecifiedRowValues) , ", specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nIt must contain only two values.\n";
    }
    for $SpecifiedRowID (@SpecifiedRowValues) {
      if (!IsPositiveInteger($SpecifiedRowID)) {
        die "Error: Rows value, $SpecifiedRowID, specified using \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode} is not valid. Allowed integer values: > 0.\n";
      }
    }
    if ($SpecifiedRowValues[0] >= $SpecifiedRowValues[1]) {
      die "Error: Invalid value pair - ", JoinWords(\@SpecifiedRowValues, ',', 0) , " - specified by \"--rows\" option with \"--rowsmode\" value of $Options{rowsmode}.\nAllowed values: First value < second value\n";
    }
  }
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with default values, overrides them with the values specified
# on the command line, changes to the working directory when one is specified,
# and validates the values of options which allow only a fixed set of values.
# Dies with an error message on the first invalid value.
sub SetupScriptUsage {

  # Setup default and retrieve all the options...
  %Options = ();
  $Options{colmode} = "colnum";
  $Options{indelim} = "comma";
  $Options{mode} = "columns";
  $Options{outdelim} = "comma";
  $Options{quote} = "yes";
  $Options{rowsmode} = "rownums";

  if (!GetOptions(\%Options, "categorycol=s", "columns=s", "colmode|c=s", "help|h", "indelim=s", "mode|m=s", "outdelim=s", "overwrite|o", "quote|q=s", "root|r=s", "rows=s", "rowsmode=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    # Use low precedence "or": with "||" the die is attached to the directory
    # name instead of the result of chdir and a failed chdir goes unnoticed...
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(columns|rows|categories)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: columns, rows or categories \n";
  }
  if ($Options{colmode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{colmode}, for option \"--colmode\" is not valid. Allowed values: colnum or collabel \n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{rowsmode} !~ /^(rowsbycolvalue|rowsbycolvaluelist|rowsbycolvaluerange|rowbymincolvalue|rowbymaxcolvalue|rownums|rownumrange)$/i) {
    die "Error: The value specified, $Options{rowsmode}, for option \"--rowsmode\" is not valid. Allowed values: rowsbycolvalue, rowsbycolvaluelist, rowsbycolvaluerange, rowbymincolvalue, rowbymaxcolvalue, rownums, rownumrange\n";
  }
}