#!/usr/bin/perl -w
#
# File: AnalyzeTextFilesData.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use StatisticsUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Explicit class method call; the indirect object form "new Benchmark" is
# parsed heuristically and is best avoided.
$StartTime = Benchmark->new;

# Get the options and setup script; show the usage text and stop when help is
# requested or no input file is named on the command line...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect column information for all the text files...
print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# Generate output files; only files flagged as okay are analyzed...
my($FileIndex);
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for $FileIndex (0 .. $#TextFilesList) {
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    AnalyzeTextFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Analyze data...
#
# Reads the data lines of the text file at position $Index in @TextFilesList,
# collects the values of each column listed in UniqueColNumsToAnalyze for that
# file and dispatches to the analysis routines selected by the specified
# statistical functions.
#
# Parameters:
#   $Index - position of the file in @TextFilesList and in the %TextFilesInfo arrays.
#
# Dies when the text file can't be opened.
sub AnalyzeTextFile {
  my($Index) = @_;
  my($TextFile, $TextFileHandle, $Line, $InDelim, $ColNum, $Value, @LineWords, @ColNumsToAnalyze, %ColValuesToAnalyzeMap);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColNumsToAnalyze = @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]};
  %ColValuesToAnalyzeMap = ();
  for $ColNum (@ColNumsToAnalyze) {
    @{$ColValuesToAnalyzeMap{$ColNum}} = ();
  }

  my($LineCount, $InvalidLineCount, @InvalidColLabels);

  # Three argument open: the file name is never interpreted as a mode or a pipe...
  open $TextFileHandle, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";
  # Skip over column labels line in text file and collect appropriate column data
  # for analysis...
  $Line = GetTextLine($TextFileHandle);
  $LineCount = 1;
  $InvalidLineCount = 0;
  # Test for a defined, non-empty line instead of plain truth: a data line
  # containing just "0" is false in boolean context and would otherwise end the
  # loop early...
  while (defined($Line = GetTextLine($TextFileHandle)) && length($Line)) {
    $LineCount++;
    @LineWords = quotewords($InDelim, 0, $Line);
    @InvalidColLabels = ();
    COLNUM: for $ColNum (@ColNumsToAnalyze) {
      $Value = $LineWords[$ColNum];
      if ($OptionsInfo{CheckData}) {
        if (!IsNumerical($Value)) {
          # Remember the column label for reporting and leave the value out...
          push @InvalidColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
          next COLNUM;
        }
      }
      push @{$ColValuesToAnalyzeMap{$ColNum}}, $Value;
    }
    if (@InvalidColLabels) {
      $InvalidLineCount++;
      # The higher the detail level, the more is reported about the line...
      if ($OptionsInfo{DetailLevel} >=4 ) {
        print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0)," - to be analyzed: $Line \n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 3) {
        print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for column(s) - ", JoinWords(\@InvalidColLabels, ", ", 0)," - to be analyzed...\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Line number $LineCount contains ", scalar(@InvalidColLabels)," non-numerical or empty value(s) for columns to be analyzed...\n";
      }
    }
  }
  if ($InvalidLineCount && ($OptionsInfo{DetailLevel} >= 1)) {
    print "Non-numerical or empty data present in $InvalidLineCount line(s)...\n";
  }
  close $TextFileHandle;

  # Perform the analysis...
  my(@SpecifiedFunctionNames, $SpecifiedFunction);
  @SpecifiedFunctionNames = ();

  # Functions that write their own output files are handled separately below;
  # everything else goes into the per-column summary file...
  for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
      push @SpecifiedFunctionNames, $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)};
    }
  }
  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%ColValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
    if ($OptionsInfo{AllColumnPairs}) {
      PerformMatrixAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformColumnPairAnalysis($Index, \%ColValuesToAnalyzeMap);
    }
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ) {
    PerformStandardScoresAnalysis($Index, \%ColValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
    PerformFrequencyAnalysis($Index, \%ColValuesToAnalyzeMap);
  }
}

# Calculate values for various statistical functions...
#
# Writes one output file containing a row for each column in ColNumsToAnalyze and
# a field for each function named in $SpecifiedFunctionNamesRef.
#
# Parameters:
#   $Index                     - position of the file in the %TextFilesInfo arrays.
#   $SpecifiedFunctionNamesRef - reference to an array of function names; apart
#                                from Count, each name is invoked as a function.
#   $ColValuesToAnalyzeMapRef  - reference to a hash mapping column number to a
#                                reference to an array of collected values.
#
# Dies when the output file can't be opened or closed.
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, $Line, $SpecifiedFunction, $Label, @ColLabels, @ColNumsToAnalyze);

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $TextFilesInfo{OutFileExt}[$Index];

  print "Generating new text file $NewTextFile...\n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ();
  push @ColLabels, "ColumnID";
  for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      # Prefix the label with the suffixed rank and drop the letter "K" from it...
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Go over each column to be analyzed...
  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};

  # Statistical functions are invoked through their name strings, which needs
  # symbolic references; relax only that part of "strict"...
  no strict 'refs';

  my($ColValuesRef, $ColNum, $Value, @RowValues, %CalculatedValues);
  %CalculatedValues = ();
  for $ColNum (@ColNumsToAnalyze) {
    @RowValues = ();
    # Setup column id...
    push @RowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum];
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!@{$ColValuesRef}) {
        # No values were collected for this column; write out an empty field...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = scalar(@{$ColValuesRef});
      }
      elsif ($SpecifiedFunction =~ /^KLargest$/i) {
        $Value = $SpecifiedFunction->($ColValuesRef, $OptionsInfo{KLargest});
      }
      elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
        $Value = $SpecifiedFunction->($ColValuesRef, $OptionsInfo{KSmallest});
      }
      elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
        # Standard deviation is cached so StandardError can reuse it...
        if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
          $CalculatedValues{$ColNum}{StandardDeviation} = $SpecifiedFunction->($ColValuesRef);
        }
        $Value = $CalculatedValues{$ColNum}{StandardDeviation};
      }
      elsif ($SpecifiedFunction =~ /^StandardError$/i) {
        if (!exists($CalculatedValues{$ColNum}{StandardDeviation})) {
          $CalculatedValues{$ColNum}{StandardDeviation} = StandardDeviation($ColValuesRef);
        }
        if (defined $CalculatedValues{$ColNum}{StandardDeviation}) {
          # Pass the number of values explicitly: an array placed in an argument
          # list is flattened, which hands over the data values themselves.
          # NOTE(review): assumes StandardError takes (standard deviation, count);
          # confirm against StatisticsUtil.
          $Value = $SpecifiedFunction->($CalculatedValues{$ColNum}{StandardDeviation}, scalar(@{$ColValuesRef}));
        }
      }
      elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
        $Value = $SpecifiedFunction->($ColValuesRef, $OptionsInfo{TrimFraction});
      }
      else {
        $Value = $SpecifiedFunction->($ColValuesRef);
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  # Buffered write errors surface at close time...
  close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}

# Calculate covariance, correlation, rsquare for specified column pairs....
# Writes one "ColumnPairsAnalysis" output file with a row for each column pair
# listed in ColPairs1ToAnalyze/ColPairs2ToAnalyze. The row holds both column
# labels followed by Correlation, RSquare and Covariance fields as requested;
# a Correlation field is also written whenever RSquare is requested.
#
# Parameters:
#   $Index                    - position of the file in the %TextFilesInfo arrays.
#   $ColValuesToAnalyzeMapRef - reference to a hash mapping column number to a
#                               reference to an array of collected values.
#
# Dies when the output file can't be opened or closed.
sub PerformColumnPairAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "ColumnPairsAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  # Three argument open with a lexical handle: the file name is never
  # interpreted as a mode and the handle doesn't leak into the package...
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out the column labels...
  @ColLabels = ();
  push @ColLabels, ("ColumnID1", "ColumnID2");
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @ColLabels, "Correlation";
    if ($CalculateRSquare) {
      push @ColLabels, "RSquare";
    }
  }
  if ($CalculateCovariance) {
    push @ColLabels, "Covariance";
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Go over each column pair...
  my($CorrelationValue, $RSquareValue, $CovarianceValue, $ColIndex, $ColNum1, $ColNum2, $ColValuesRef1, $ColValuesRef2, @ColPairs1ToAnalyze, @ColPairs2ToAnalyze, @RowValues, $Value);

  @ColPairs1ToAnalyze = @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]};
  @ColPairs2ToAnalyze = @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]};
  for $ColIndex (0 .. $#ColPairs1ToAnalyze) {
    @RowValues = ();
    $ColNum1 = $ColPairs1ToAnalyze[$ColIndex];
    $ColNum2 = $ColPairs2ToAnalyze[$ColIndex];
    $ColValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}};
    $ColValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}};

    # Setup column ids...
    push @RowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum1];
    push @RowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum2];

    if (@$ColValuesRef1 != @$ColValuesRef2) {
      # Print a warning and keep the row aligned with the header by writing an
      # empty field for each requested value...
      warn "Warning: Skipping analysis for column pair $TextFilesInfo{ColLabels}[$Index][$ColNum1], $TextFilesInfo{ColLabels}[$Index][$ColNum2]: Number of valid data values must be same.\n";
      if ($CalculateCorrelation || $CalculateRSquare) {
        push @RowValues, "";
        if ($CalculateRSquare) {
          push @RowValues, "";
        }
      }
      if ($CalculateCovariance) {
        push @RowValues, "";
      }
    }
    else {
      # Calculate appropriate value; each one is formatted to the specified
      # precision and zero is added to drop trailing zeros...
      if ($CalculateCorrelation || $CalculateRSquare) {
        $CorrelationValue = Correlation($ColValuesRef1, $ColValuesRef2);
        $Value = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : "";
        push @RowValues, $Value;
        if ($CalculateRSquare) {
          # RSquare is the square of the unrounded correlation value...
          $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
          $Value = (length($RSquareValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : "";
          push @RowValues, $Value;
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($ColValuesRef1, $ColValuesRef2);
        $Value = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
        push @RowValues, $Value;
      }
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  # Buffered write errors surface at close time...
  close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}

# Generate histogram numbers...
# Writes, for each column in ColNumsToAnalyze, a separate "FrequencyAnalysis"
# output file whose name includes the column label. The file holds a "Bins" and
# a "Frequency" field with one row for each bin returned by Frequency, in
# ascending bin order. A column without collected values gets a file containing
# just the labels line.
#
# Parameters:
#   $Index                    - position of the file in the %TextFilesInfo arrays.
#   $ColValuesToAnalyzeMapRef - reference to a hash mapping column number to a
#                               reference to an array of collected values.
#
# Dies when an output file can't be opened or closed.
sub PerformFrequencyAnalysis {
  my($Index, $ColValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, @ColLabels, @RowValues, $Line, $ColNum, @ColNumsToAnalyze, $ColValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);

  @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]};
  for $ColNum (@ColNumsToAnalyze) {
    my($NewTextFileHandle);

    $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . $TextFilesInfo{ColLabels}[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesInfo{OutFileExt}[$Index];
    print "Generating new text file $NewTextFile...\n";
    # Three argument open with a lexical handle: the file name, which includes a
    # column label taken from the input file, is never interpreted as a mode...
    open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

    # Write out the column labels...
    @ColLabels = ();
    push @ColLabels , ("Bins", "Frequency");
    $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";

    # Calculate and write out frequency values; explicit bin range values take
    # precedence over the number of bins...
    %FrequencyMap = ();
    $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}};
    if (@$ColValuesRef) {
      if (@{$OptionsInfo{BinRange}}) {
        %FrequencyMap = Frequency($ColValuesRef, \@{$OptionsInfo{BinRange}});
      }
      else {
        %FrequencyMap = Frequency($ColValuesRef, $OptionsInfo{NumOfBins});
      }
    }
    for $BinValue (sort { $a <=> $b } keys %FrequencyMap) {
      $FrequencyValue = $FrequencyMap{$BinValue};

      # Format to the specified precision and add zero to drop trailing zeros...
      @RowValues = ();
      $Value = (length($BinValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $BinValue) + 0) : "";
      push @RowValues, $Value;
      $Value = (length($FrequencyValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $FrequencyValue) + 0) : "";
      push @RowValues, $Value;

      $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileHandle "$Line\n";
    }
    # Buffered write errors surface at close time...
    close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
  }
}

# Calculate covariance, correlation/rsquare matrices....
385 sub PerformMatrixAnalysis { 386 my($Index, $ColValuesToAnalyzeMapRef) = @_; 387 my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance); 388 389 $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0; 390 $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0; 391 $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0; 392 393 $CorrelationTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "CorrelationMatrix." . $TextFilesInfo{OutFileExt}[$Index]; 394 $RSquareTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "RSquareMatrix." . $TextFilesInfo{OutFileExt}[$Index]; 395 $CovarianceTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "CovarianceMatrix." . $TextFilesInfo{OutFileExt}[$Index]; 396 397 my($TextFilesList, $Delimiter); 398 $TextFilesList = ""; 399 if ($CalculateCorrelation || $CalculateRSquare) { 400 $TextFilesList = $CorrelationTextFile; 401 if ($CalculateRSquare) { 402 $TextFilesList .= ", $CorrelationTextFile"; 403 } 404 } 405 $Delimiter = length($TextFilesList) ? "," : ""; 406 if ($CalculateCovariance) { 407 $TextFilesList .= "${Delimiter} ${CorrelationTextFile}"; 408 } 409 if ($TextFilesList =~ /\,/) { 410 print "Generating new text files $TextFilesList...\n" 411 } 412 else { 413 print "Generating new text file $TextFilesList...\n" 414 } 415 if ($CalculateCorrelation || $CalculateRSquare) { 416 open CORRELATIONTEXTFILE, ">$CorrelationTextFile" or die "Error: Can't open $CorrelationTextFile: $! \n"; 417 if ($CalculateRSquare) { 418 open RSQUARETEXTFILE, ">$RSquareTextFile" or die "Error: Can't open $RSquareTextFile: $! \n"; 419 } 420 } 421 if ($CalculateCovariance) { 422 open COVARIANCETEXTFILE, ">$CovarianceTextFile" or die "Error: Can't open $CovarianceTextFile: $! 
\n"; 423 } 424 425 my($Line, $Value, $CorrelationValue, $RSquareValue, $CovarianceValue, $ColNum, $ColNum1, $ColNum2, $ColValuesRef1, $ColValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues); 426 427 # Write out the column labels... 428 @ColLabels = (); 429 push @ColLabels, ""; 430 for $ColNum (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 431 push @ColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum]; 432 } 433 $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 434 if ($CalculateCorrelation || $CalculateRSquare) { 435 print CORRELATIONTEXTFILE "$Line\n"; 436 if ($CalculateRSquare) { 437 print RSQUARETEXTFILE "$Line\n"; 438 } 439 } 440 if ($CalculateCovariance) { 441 print COVARIANCETEXTFILE "$Line\n"; 442 } 443 444 # Due to symmetric nature of these matrices, only one half needs to be 445 # calculated. So, just calculate the lower half and copy it to upper half... 446 my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap); 447 448 %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = (); 449 for $ColNum1 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 450 for $ColNum2 (0 .. $ColNum1) { 451 $ColValuesRef1 = \@{$ColValuesToAnalyzeMapRef->{$ColNum1}}; 452 $ColValuesRef2 = \@{$ColValuesToAnalyzeMapRef->{$ColNum2}}; 453 if ($CalculateCorrelation || $CalculateRSquare) { 454 $CorrelationValue = Correlation($ColValuesRef1, $ColValuesRef2); 455 $CorrelationValue = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : ""; 456 $CorrelationMatrixMap{$ColNum1}{$ColNum2} = $CorrelationValue; 457 if ($ColNum1 != $ColNum2) { 458 $CorrelationMatrixMap{$ColNum2}{$ColNum1} = $CorrelationValue; 459 } 460 if ($CalculateRSquare) { 461 $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : ""; 462 $RSquareValue = (length($RSquareValue)) ? 
(sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : ""; 463 $RSquareMatrixMap{$ColNum1}{$ColNum2} = $RSquareValue; 464 if ($ColNum1 != $ColNum2) { 465 $RSquareMatrixMap{$ColNum2}{$ColNum1} = $RSquareValue; 466 } 467 } 468 } 469 if ($CalculateCovariance) { 470 $CovarianceValue = Covariance($ColValuesRef1, $ColValuesRef2); 471 $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : ""; 472 $CovarianceMatrixMap{$ColNum1}{$ColNum2} = $CovarianceValue; 473 if ($ColNum1 != $ColNum2) { 474 $CovarianceMatrixMap{$ColNum2}{$ColNum1} = $CovarianceValue; 475 } 476 } 477 } 478 } 479 480 # Write out the matrices... 481 for $ColNum1 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 482 @CorrelationRowValues = (); 483 @RSquareRowValues = (); 484 @CovarianceRowValues = (); 485 if ($CalculateCorrelation || $CalculateRSquare) { 486 push @CorrelationRowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum1]; 487 if ($CalculateRSquare) { 488 push @RSquareRowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum1]; 489 } 490 } 491 if ($CalculateCovariance) { 492 push @CovarianceRowValues, $TextFilesInfo{ColLabels}[$Index][$ColNum1]; 493 } 494 for $ColNum2 (0 .. 
($TextFilesInfo{ColCount}[$Index] - 1)) { 495 if ($CalculateCorrelation || $CalculateRSquare) { 496 push @CorrelationRowValues, $CorrelationMatrixMap{$ColNum1}{$ColNum2}; 497 if ($CalculateRSquare) { 498 push @RSquareRowValues, $RSquareMatrixMap{$ColNum1}{$ColNum2}; 499 } 500 } 501 if ($CalculateCovariance) { 502 push @CovarianceRowValues, $CovarianceMatrixMap{$ColNum1}{$ColNum2}; 503 } 504 } 505 if ($CalculateCorrelation || $CalculateRSquare) { 506 $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 507 print CORRELATIONTEXTFILE "$Line\n"; 508 if ($CalculateRSquare) { 509 $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 510 print RSQUARETEXTFILE "$Line\n"; 511 } 512 } 513 if ($CalculateCovariance) { 514 $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 515 print COVARIANCETEXTFILE "$Line\n"; 516 } 517 } 518 if ($CalculateCorrelation || $CalculateRSquare) { 519 close CORRELATIONTEXTFILE; 520 if ($CalculateRSquare) { 521 close RSQUARETEXTFILE; 522 } 523 } 524 if ($CalculateCovariance) { 525 close COVARIANCETEXTFILE; 526 } 527 } 528 529 # Calculate standard scores... 530 sub PerformStandardScoresAnalysis { 531 my($Index, $ColValuesToAnalyzeMapRef) = @_; 532 my($StandardScores, $StandardScoresN, $NewTextFile, @ColLabels, $Label, $NewLine); 533 534 $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0; 535 $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0; 536 537 $NewTextFile = $TextFilesInfo{OutFileRoot}[$Index] . "StandardScores." . $TextFilesInfo{OutFileExt}[$Index]; 538 print "Generating new text file $NewTextFile...\n"; 539 open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n"; 540 541 my($ColValuesRef, $ColNum, @ColNumsToAnalyze); 542 # Write out column labels... 
543 @ColLabels = (); 544 @ColNumsToAnalyze = @{$TextFilesInfo{ColNumsToAnalyze}[$Index]}; 545 for $ColNum (@ColNumsToAnalyze) { 546 $Label = $TextFilesInfo{ColLabels}[$Index][$ColNum]; 547 if ($StandardScores) { 548 push @ColLabels, "${Label}\(StandardScores)"; 549 } 550 if ($StandardScoresN) { 551 push @ColLabels, "${Label}\(StandardScoresN)"; 552 } 553 } 554 $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 555 print NEWTEXTFILE "$NewLine\n"; 556 557 # Go over each column to be analyzed and calculate standard deviation 558 # and mean values... 559 my(%StandardDeviationMap, %StandardDeviationNMap, %MeanMap); 560 %StandardDeviationMap = (); 561 %StandardDeviationNMap = (); 562 %MeanMap = (); 563 for $ColNum (@ColNumsToAnalyze) { 564 $ColValuesRef = \@{$ColValuesToAnalyzeMapRef->{$ColNum}}; 565 if (!exists($MeanMap{$ColNum})) { 566 $MeanMap{$ColNum} = Mean($ColValuesRef); 567 } 568 if ($StandardScores) { 569 if (!exists($StandardDeviationMap{$ColNum})) { 570 $StandardDeviationMap{$ColNum} = StandardDeviation($ColValuesRef); 571 } 572 } 573 if ($StandardScoresN) { 574 if (!exists($StandardDeviationNMap{$ColNum})) { 575 $StandardDeviationNMap{$ColNum} = StandardDeviationN($ColValuesRef); 576 } 577 } 578 } 579 # 580 # Go over each row and calculate standard scores for each column 581 # using (x[i] - mean) / (n - 1) for StandardScores and (x[i] - mean) / n 582 # for StandardScoresN; write out the calculated values as well... 583 584 my($TextFile, $InDelim, $Line, $Value, $ValueOkay, $ScoreValue, @RowValues, @LineWords); 585 $TextFile = $TextFilesList[$Index]; 586 $InDelim = $TextFilesInfo{InDelim}[$Index]; 587 588 open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! 
\n"; 589 $Line = GetTextLine(\*TEXTFILE); 590 while ($Line = GetTextLine(\*TEXTFILE)) { 591 @LineWords = quotewords($InDelim, 0, $Line); 592 @RowValues = (); 593 COLNUM: for $ColNum (@ColNumsToAnalyze) { 594 $Value = $LineWords[$ColNum]; 595 $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1; 596 if ($StandardScores) { 597 $ScoreValue = $ValueOkay ? (($Value - $MeanMap{$ColNum})/$StandardDeviationMap{$ColNum}) : ""; 598 $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $ScoreValue) + 0) : ""; 599 push @RowValues, $ScoreValue; 600 } 601 if ($StandardScoresN) { 602 $ScoreValue = $ValueOkay ? (($Value - $MeanMap{$ColNum})/$StandardDeviationNMap{$ColNum}) : ""; 603 $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $ScoreValue) + 0) : ""; 604 push @RowValues, $ScoreValue; 605 } 606 } 607 $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); 608 print NEWTEXTFILE "$NewLine\n"; 609 } 610 close TEXTFILE; 611 close NEWTEXTFILE; 612 } 613 614 # Make sure the specified columns exists in text files... 615 sub ProcessColumnsInfo { 616 my($Index, $TextFile, $ColNum, $NewColNum, $ColIndex, @ColNumsToAnalyze, %UniqueColNumsToAnalyzeMap); 617 618 @{$TextFilesInfo{ColNumsToAnalyze}} = (); 619 @{$TextFilesInfo{ColPairs1ToAnalyze}} = (); 620 @{$TextFilesInfo{ColPairs2ToAnalyze}} = (); 621 @{$TextFilesInfo{UniqueColNumsToAnalyze}} = (); 622 623 FILELIST: for $Index (0 .. 
$#TextFilesList) { 624 $TextFile = $TextFilesList[$Index]; 625 626 @{$TextFilesInfo{ColNumsToAnalyze}[$Index]} = (); 627 @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]} = (); 628 @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]} = (); 629 @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]} = (); 630 631 %UniqueColNumsToAnalyzeMap = (); 632 633 if ($TextFilesInfo{FileOkay}[$Index]) { 634 @ColNumsToAnalyze = (); 635 if (@{$OptionsInfo{SpecifiedColumns}}) { 636 if ($OptionsInfo{ColMode} =~ /^colnum$/i) { 637 for $ColNum (@{$OptionsInfo{SpecifiedColumns}}) { 638 if ($ColNum >=1 && $ColNum <= $TextFilesInfo{ColCount}[$Index]) { 639 $NewColNum = $ColNum -1; 640 push @ColNumsToAnalyze, $NewColNum; 641 } 642 } 643 } 644 else { 645 my($ColLabel); 646 for $ColLabel (@{$OptionsInfo{SpecifiedColumns}}) { 647 if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) { 648 push @ColNumsToAnalyze, $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel}; 649 } 650 } 651 } 652 } 653 elsif (defined $OptionsInfo{Columns} && $OptionsInfo{Columns} =~ /^All$/i) { 654 for $ColNum (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 655 push @ColNumsToAnalyze, $ColNum; 656 } 657 } 658 else { 659 push @ColNumsToAnalyze, 0; 660 } 661 if (@ColNumsToAnalyze) { 662 push @{$TextFilesInfo{ColNumsToAnalyze}[$Index]}, @ColNumsToAnalyze; 663 # Set up unique columns map as well... 664 for $ColNum (@ColNumsToAnalyze) { 665 if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) { 666 $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum; 667 } 668 } 669 } 670 else { 671 warn "Warning: Ignoring file $TextFile: None of the columns specified, @{$OptionsInfo{SpecifiedColumns}}, using \"--columns\" option exist.\n"; 672 $TextFilesInfo{FileOkay}[$Index] = 0; 673 next FILELIST; 674 } 675 if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) { 676 # Make sure specific frequency files don't exist... 
677 my($FrequencyFile); 678 for $ColNum (@ColNumsToAnalyze) { 679 $FrequencyFile = $TextFilesInfo{OutFileRoot}[$Index] . $TextFilesInfo{ColLabels}[$Index][$ColNum] . "FrequencyAnalysis." . $TextFilesInfo{OutFileExt}[$Index]; 680 if (-e $FrequencyFile) { 681 warn "Warning: Ignoring file $TextFile: The file $FrequencyFile already exists.\n"; 682 $TextFilesInfo{FileOkay}[$Index] = 0; 683 next FILELIST; 684 } 685 } 686 } 687 # Setup specified column pairs... 688 if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) { 689 my(@ColPairsToAnalyze, $ColNum1, $ColNum2); 690 if (@{$OptionsInfo{SpecifiedColumnPairs}}) { 691 # Make sure both columns exist... 692 if ($OptionsInfo{ColMode} =~ /^colnum$/i) { 693 for ($ColIndex = 0; (($ColIndex + 1) < @{$OptionsInfo{SpecifiedColumnPairs}}); $ColIndex += 2 ) { 694 $ColNum1 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex]; 695 $ColNum2 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex + 1]; 696 if ($ColNum1 >=1 && $ColNum1 <= $TextFilesInfo{ColCount}[$Index] && $ColNum2 >=1 && $ColNum2 <= $TextFilesInfo{ColCount}[$Index]) { 697 $ColNum1 -= 1; 698 $ColNum2 -= 1; 699 push @ColPairsToAnalyze, ($ColNum1, $ColNum2); 700 } 701 } 702 } 703 else { 704 my($ColLabel1, $ColLabel2); 705 for ($ColIndex = 0; (($ColIndex + 1) < @{$OptionsInfo{SpecifiedColumnPairs}}); $ColIndex += 2 ) { 706 $ColLabel1 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex]; 707 $ColLabel2 = $OptionsInfo{SpecifiedColumnPairs}[$ColIndex + 1]; 708 if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel1}) && exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel2})) { 709 $ColNum1 = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel1}; 710 $ColNum2 = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel2}; 711 push @ColPairsToAnalyze, ($ColNum1, $ColNum2); 712 } 713 } 714 } 715 } 716 elsif ($OptionsInfo{AllColumnPairs}) { 
717 for $ColNum1 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 718 for $ColNum2 (0 .. ($TextFilesInfo{ColCount}[$Index] - 1)) { 719 push @ColPairsToAnalyze, ($ColNum1, $ColNum2); 720 } 721 } 722 } 723 else { 724 if ($TextFilesInfo{ColCount}[$Index] >= 2) { 725 push @ColPairsToAnalyze, (0,1); 726 } 727 } 728 if (@ColPairsToAnalyze) { 729 if (@ColPairsToAnalyze % 2) { 730 warn "Warning: Ignoring file $TextFile: Invalid number of values specified using \"--columnpairs\" option: It must contain even number of valid values.\n"; 731 $TextFilesInfo{FileOkay}[$Index] = 0; 732 next FILELIST; 733 } 734 else { 735 for ($ColIndex = 0; $ColIndex < @ColPairsToAnalyze; $ColIndex += 2) { 736 push @{$TextFilesInfo{ColPairs1ToAnalyze}[$Index]}, $ColPairsToAnalyze[$ColIndex]; 737 push @{$TextFilesInfo{ColPairs2ToAnalyze}[$Index]}, $ColPairsToAnalyze[$ColIndex + 1]; 738 } 739 # Set up unique columns map as well... 740 for $ColNum (@ColPairsToAnalyze) { 741 if (!exists $UniqueColNumsToAnalyzeMap{$ColNum}) { 742 $UniqueColNumsToAnalyzeMap{$ColNum} = $ColNum; 743 } 744 } 745 } 746 } 747 } 748 # Setup uniques columns array... 749 push @{$TextFilesInfo{UniqueColNumsToAnalyze}[$Index]}, (sort keys %UniqueColNumsToAnalyzeMap); 750 } 751 } 752 } 753 754 # Retrieve information about input text files... 755 sub RetrieveTextFilesInfo { 756 my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $OutFile, $OutFileExt, $ColNum, $ColLabel); 757 758 %TextFilesInfo = (); 759 760 @{$TextFilesInfo{FileOkay}} = (); 761 @{$TextFilesInfo{ColCount}} = (); 762 @{$TextFilesInfo{ColLabels}} = (); 763 @{$TextFilesInfo{ColLabelToNumMap}} = (); 764 @{$TextFilesInfo{InDelim}} = (); 765 @{$TextFilesInfo{OutFileRoot}} = (); 766 @{$TextFilesInfo{OutFileExt}} = (); 767 768 FILELIST: for $Index (0 .. 
$#TextFilesList) { 769 $TextFile = $TextFilesList[$Index]; 770 771 $TextFilesInfo{FileOkay}[$Index] = 0; 772 $TextFilesInfo{ColCount}[$Index] = 0; 773 $TextFilesInfo{InDelim}[$Index] = ""; 774 $TextFilesInfo{OutFileRoot}[$Index] = ""; 775 $TextFilesInfo{OutFileExt}[$Index] = ""; 776 777 @{$TextFilesInfo{ColLabels}[$Index]} = (); 778 %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = (); 779 780 if (!(-e $TextFile)) { 781 warn "Warning: Ignoring file $TextFile: It doesn't exist\n"; 782 next FILELIST; 783 } 784 if (!CheckFileType($TextFile, "csv tsv")) { 785 warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n"; 786 next FILELIST; 787 } 788 ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile); 789 if ($FileExt =~ /^tsv$/i) { 790 $InDelim = "\t"; 791 } 792 else { 793 $InDelim = "\,"; 794 if ($Options{indelim} !~ /^(comma|semicolon)$/i) { 795 warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n"; 796 next FILELIST; 797 } 798 if ($Options{indelim} =~ /^semicolon$/i) { 799 $InDelim = "\;"; 800 } 801 } 802 803 if (!open TEXTFILE, "$TextFile") { 804 warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n"; 805 next FILELIST; 806 } 807 808 $Line = GetTextLine(\*TEXTFILE); 809 @ColLabels = quotewords($InDelim, 0, $Line); 810 close TEXTFILE; 811 812 $FileDir = ""; $FileName = ""; $FileExt = ""; 813 ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile); 814 $FileExt = "csv"; 815 if ($Options{outdelim} =~ /^tab$/i) { 816 $FileExt = "tsv"; 817 } 818 $OutFileExt = $FileExt; 819 if ($Options{root} && (@TextFilesList == 1)) { 820 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root}); 821 if ($RootFileName && $RootFileExt) { 822 $FileName = $RootFileName; 823 } 824 else { 825 $FileName = $Options{root}; 826 } 827 $OutFileRoot = $FileName; 828 } 829 else { 830 $OutFileRoot = $FileName; 831 } 832 $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . 
".$OutFileExt"; 833 834 if (lc($OutFile) eq lc($TextFile)) { 835 warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n"; 836 next FILELIST; 837 } 838 if (!$Options{overwrite}) { 839 if (-e $OutFile) { 840 warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n"; 841 next FILELIST; 842 } 843 if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) { 844 if ($OptionsInfo{AllColumnPairs}) { 845 if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) && (-e "${OutFileRoot}CovarianceMatrix.${FileExt}")) { 846 warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}Covariance.${FileExt} already exists.\n"; 847 next FILELIST; 848 } 849 if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) && (-e "${OutFileRoot}CorrelationMatrix.${FileExt}")) { 850 warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}CorrelationMatrix.${FileExt} already exists.\n"; 851 next FILELIST; 852 } 853 if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) && (-e "${OutFileRoot}RSquareMatrix.${FileExt}")) { 854 warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}RSquareMatrix.${FileExt} already exists.\n"; 855 next FILELIST; 856 } 857 } 858 else { 859 if (-e "${OutFileRoot}ColumnPairsAnalysis.${FileExt}") { 860 warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}ColumnPairsAnalysis.${FileExt} already exists.\n"; 861 next FILELIST; 862 } 863 } 864 } 865 if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) && (-e "${OutFileRoot}StandardScores.${FileExt}")) { 866 warn "Warning: Ignoring file $TextFile: The file ${OutFileRoot}StandardScores.${FileExt} already exists.\n"; 867 next FILELIST; 868 } 869 } 870 871 $TextFilesInfo{FileOkay}[$Index] = 1; 872 
# Closing part of RetrieveTextFilesInfo: the current file passed all checks, so
# record its delimiter, output file name parts and column labels...
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{OutFileRoot}[$Index] = "$OutFileRoot";
    $TextFilesInfo{OutFileExt}[$Index] = "$OutFileExt";

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    # Map each column label to its zero-based column number...
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }
}

# Process option values...
#
# Rebuilds %OptionsInfo from the raw command line values in %Options:
#
#   . Resolves --mode into a list of statistical functions, plus a lowercase
#     name to canonical name map and the FileNameMode string
#   . Translates output delimiter, quoting, and assorted numeric options
#   . Parses --frequencybins into either a bin count or an explicit bin range
#   . Parses --columns and --columnpairs into lists of specified values
#
# Dies with an error message as soon as an invalid value is encountered.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{DetailLevel} = $Options{detail};

  # All the statistical functions available for analysis along with a lookup
  # table going from lowercase name to canonical name...
  my(@KnownFunctions, %CanonicalNameOf);
  @KnownFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);
  %CanonicalNameOf = map { (lc($_) => $_) } @KnownFunctions;

  $OptionsInfo{SpecifiedStatisticalFunctionsMap} = {};
  $OptionsInfo{SpecifiedStatisticalFunctions} = [];

  # Predefined mode values: pattern to match, name used in output file names,
  # and functions to calculate. These are tried in order...
  my(@PresetModes, $PresetMode, $MatchedPreset);
  @PresetModes = (
    [ qr/^DescriptiveStatisticsBasic$/i, "DescriptiveStatisticsBasic",
      [ qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum) ] ],
    [ qr/^DescriptiveStatisticsAll$/i, "DescriptiveStatisticsAll",
      [ qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum) ] ],
    [ qr/^All$/i, "AllStatistics", [ @KnownFunctions ] ],
  );
  PRESET: for $PresetMode (@PresetModes) {
    if ($Options{mode} =~ $PresetMode->[0]) {
      $MatchedPreset = $PresetMode;
      last PRESET;
    }
  }

  if ($MatchedPreset) {
    $OptionsInfo{FileNameMode} = $MatchedPreset->[1];
    @{$OptionsInfo{SpecifiedStatisticalFunctions}} = @{$MatchedPreset->[2]};
  }
  else {
    # Anything else is treated as a comma delimited list of function names...
    $OptionsInfo{FileNameMode} = "SpecifiedStatistics";

    my($ModeValue, $RequestedName, @Rejected);
    ($ModeValue = $Options{mode}) =~ s/ //g;
    @Rejected = ();
    for $RequestedName (split ",", $ModeValue) {
      if (exists $CanonicalNameOf{lc($RequestedName)}) {
        push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, $RequestedName;
        next;
      }
      push @Rejected, $RequestedName;
    }
    if (@Rejected) {
      if (@Rejected == 1) {
        warn "Error: The value specified, $Rejected[0] , for option \"-m --mode\" is not valid.\n";
      }
      else {
        warn "Error: The values specified - ", JoinWords(\@Rejected, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
      }
      die "Allowed values:", JoinWords(\@KnownFunctions, ", ", 0), "\n";
    }
  }

  # Track each distinct function under its lowercase name; the value is the
  # canonical spelling of the function name...
  my($FunctionName, $FunctionKey);
  for $FunctionName (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    $FunctionKey = lc($FunctionName);
    unless (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{$FunctionKey}) {
      $OptionsInfo{SpecifiedStatisticalFunctionsMap}{$FunctionKey} = $CanonicalNameOf{$FunctionKey};
    }
  }

  # Output delimiter: tab takes precedence over semicolon; comma otherwise...
  if ($Options{outdelim} =~ /tab/i) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = 0;
  if ($Options{quote} =~ /yes/i) {
    $OptionsInfo{OutQuote} = 1;
  }

  # These stay undefined unless specified on the command line...
  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{Root} = $Options{root};

  $OptionsInfo{CheckData} = 1;
  if ($Options{fast}) {
    $OptionsInfo{CheckData} = 0;
  }
  $OptionsInfo{Precision} = $Options{precision};

  $OptionsInfo{KLargest} = $Options{klargest};
  $OptionsInfo{KSmallest} = $Options{ksmallest};

  $OptionsInfo{TrimFraction} = $Options{trimfraction};

  # Frequency bins: a value containing a comma is an explicit bin range;
  # otherwise, it's the number of bins...
  $OptionsInfo{NumOfBins} = 10;
  $OptionsInfo{BinRange} = [];
  if ($Options{frequencybins} !~ /\,/) {
    $OptionsInfo{NumOfBins} = $Options{frequencybins};
    if (!IsPositiveInteger($OptionsInfo{NumOfBins})) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
    }
  }
  else {
    my(@BinLimits, $BinLimit, $LowerIndex, $UpperIndex);
    @BinLimits = split /\,/, $Options{frequencybins};
    if (@BinLimits < 2) {
      die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
    }
    # All values are checked for being numeric before any ordering check...
    for $BinLimit (@BinLimits) {
      if (!IsNumerical($BinLimit)) {
        die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
      }
    }
    # Each value must be smaller than every value following it...
    for $LowerIndex (0 .. $#BinLimits) {
      for $UpperIndex (($LowerIndex + 1) .. $#BinLimits) {
        if ($BinLimits[$LowerIndex] >= $BinLimits[$UpperIndex]) {
          die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
        }
      }
    }
    push @{$OptionsInfo{BinRange}}, @BinLimits;
  }

  # Make sure all the values are positive integers; used for values specified
  # by "--columns" and "--columnpairs" options in colnum mode...
  my($CheckColNumValues) = sub {
    my($OptionName, @ColValues) = @_;
    my($ColValue);
    for $ColValue (@ColValues) {
      if (!IsPositiveInteger($ColValue)) {
        die "Error: Column value, $ColValue, specified using \"$OptionName\" is not valid: Allowed integer values: > 0.\n";
      }
    }
  };
  my($ColNumMode) = ($Options{colmode} =~ /^colnum$/i) ? 1 : 0;

  # Setup specified columns...
  $OptionsInfo{ColMode} = $Options{colmode};
  $OptionsInfo{Columns} = $Options{columns};

  $OptionsInfo{SpecifiedColumns} = [];
  if (defined $Options{columns} && $Options{columns} !~ /^All$/i) {
    my(@ColumnValues) = split ",", $Options{columns};
    if ($ColNumMode) {
      $CheckColNumValues->("--columns", @ColumnValues);
    }
    push @{$OptionsInfo{SpecifiedColumns}}, @ColumnValues;
  }

  # Setup specified column pairs...
  $OptionsInfo{SpecifiedColumnPairs} = [];
  $OptionsInfo{AllColumnPairs} = 0;
  if (defined($Options{columnpairs}) && $Options{columnpairs} =~ /^AllPairs$/i) {
    $OptionsInfo{AllColumnPairs} = 1;
  }
  if (defined($Options{columnpairs}) && !$OptionsInfo{AllColumnPairs}) {
    my(@PairValues) = split ",", $Options{columnpairs};
    if (@PairValues % 2) {
      die "Error: Invalid number of values specified using \"--columnpairs\" option: It must contain even number of values.\n";
    }
    if ($ColNumMode) {
      $CheckColNumValues->("--columnpairs", @PairValues);
    }
    push @{$OptionsInfo{SpecifiedColumnPairs}}, @PairValues;
  }

}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values, lets GetOptions override them from
# the command line, changes to the working directory when one is specified, and
# validates the option values. Dies with an error message on the first problem
# found.
#
sub SetupScriptUsage {

  # Default values for the options...
  %Options = (
    colmode => "colnum",
    detail => 1,
    indelim => "comma",
    frequencybins => 10,
    klargest => 2,
    ksmallest => 2,
    mode => "DescriptiveStatisticsBasic",
    outdelim => "comma",
    precision => 2,
    quote => "yes",
    trimfraction => 0.1,
  );

  # Retrieve all the options...
  my(@OptionSpecs) = ("colmode|c=s", "columns=s", "columnpairs=s", "detail|d=i", "frequencybins=s", "fast|f", "help|h", "indelim=s", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to working directory before anything else gets checked...
  if ($Options{workingdir}) {
    unless (-d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  $Options{colmode} =~ /^(colnum|collabel)$/i
    or die "Error: The value specified, $Options{colmode}, for option \"-c --colmode\" is not valid. Allowed values: colnum or collabel\n";

  IsPositiveInteger($Options{detail})
    or die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";

  $Options{indelim} =~ /^(comma|semicolon)$/i
    or die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";

  $Options{outdelim} =~ /^(comma|semicolon|tab)$/i
    or die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";

  $Options{quote} =~ /^(yes|no)$/i
    or die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";

  # Options sharing the same positive integer check and error message layout:
  # option key followed by option name(s) shown in the message...
  my($PositiveIntegerOption, $OptionKey, $OptionLabel);
  for $PositiveIntegerOption (["precision", "-p --precision"], ["klargest", "--klargest"], ["ksmallest", "--ksmallest"]) {
    ($OptionKey, $OptionLabel) = @{$PositiveIntegerOption};
    if (!IsPositiveInteger($Options{$OptionKey})) {
      die "Error: The value specified, $Options{$OptionKey}, for option \"$OptionLabel\" is not valid. Allowed values: > 0 \n";
    }
  }

  # Trim fraction must be a float strictly between 0 and 1; the numeric
  # comparisons are only reached for values passing the float check...
  if (!IsFloat($Options{trimfraction}) || $Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) {
    die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
  }
}