#!/usr/bin/perl -w
#
# File: AnalyzeSDFilesData.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2025 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use SDFileUtil;
use TextUtil;
use StatisticsUtil;

# File-level state shared by the subroutines below: the script name, the raw
# command line options and the benchmark timestamps used for the final
# "Total time" report.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call; the indirect object form ("new Benchmark") is ambiguous
# to the parser and discouraged.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Expand the command line arguments into the list of SD files to process...
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sd sdf");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect information about SD files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();
ProcessSDFilesDataLabelsInfo();

# Generate output files...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Files rejected during the checks above have FileOkay set to 0...
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    AnalyzeSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Analyze data...
#
# Collect the values of all data fields to be analyzed for the SD file at
# position $Index in @SDFilesList and hand them over to the appropriate
# Perform* subroutines based on the specified statistical functions.
#
# Parameters:
#   $Index - Index of the SD file in @SDFilesList and %SDFilesInfo arrays.
#
sub AnalyzeSDFile {
  my($Index) = @_;
  my($SDFile, $DataLabel, $DataValue, @DataLabelsToAnalyze, %DataFieldValuesToAnalyzeMap);

  $SDFile = $SDFilesList[$Index];
  @DataLabelsToAnalyze = @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]};
  %DataFieldValuesToAnalyzeMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    @{$DataFieldValuesToAnalyzeMap{$DataLabel}} = ();
  }

  # Collect appropriate data field label values for analysis...
  my($SDFileHandle, $CmpdString, @CmpdLines, %DataFieldValues, $CmpdCount, $InvalidCmpdCount, @InvalidCmpdDataLabels);
  # Three argument open keeps the file name from being interpreted as a mode...
  open $SDFileHandle, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";
  $CmpdCount = 0;
  $InvalidCmpdCount = 0;
  while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @InvalidCmpdDataLabels = ();
    DATALABEL: for $DataLabel (@DataLabelsToAnalyze) {
      # Data fields missing from a compound record are silently skipped...
      if (exists $DataFieldValues{$DataLabel}) {
        $DataValue = $DataFieldValues{$DataLabel};
        if ($OptionsInfo{CheckData}) {
          if (!IsNumerical($DataValue)) {
            push @InvalidCmpdDataLabels, $DataLabel;
            next DATALABEL;
          }
        }
        push @{$DataFieldValuesToAnalyzeMap{$DataLabel}}, $DataValue;
      }
    }
    if (@InvalidCmpdDataLabels) {
      $InvalidCmpdCount++;
      # Higher detail levels report progressively more about the bad record...
      if ($OptionsInfo{DetailLevel} >=4 ) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed:\n$CmpdString \n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 3) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed...\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field to be analyzed...\n";
      }
    }
  }
  if ($InvalidCmpdCount && ($OptionsInfo{DetailLevel} >= 1)) {
    print "Non-numerical or empty data present in $InvalidCmpdCount compound record(s)...\n";
  }
  close $SDFileHandle;

  # Perform the analysis...
  my(@SpecifiedFunctionNames, $SpecifiedFunction);
  @SpecifiedFunctionNames = ();

  # Functions listed in the pattern below are written to their own files by
  # dedicated subroutines; everything else goes through PerformAnalysis...
  for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
    if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
      push @SpecifiedFunctionNames, $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)};
    }
  }
  if (@SpecifiedFunctionNames) {
    PerformAnalysis($Index, \@SpecifiedFunctionNames, \%DataFieldValuesToAnalyzeMap)
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
    if ($OptionsInfo{AllDataLabelPairs} || $OptionsInfo{CommonDataLabelPairs}) {
      PerformMatrixAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
    else {
      # Perform pairwise analysis for specified columns and write out calculated values - correlation
      # rsquare, or covariance - in the same file.
      PerformDataLabelPairAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
    }
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ) {
    PerformStandardScoresAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }
  if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
    PerformFrequencyAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
  }

}

# Calculate values for various statistical functions...
#
# Write out a text file containing one row per data field label and one column
# per specified statistical function.
#
# Parameters:
#   $Index - Index of the SD file in %SDFilesInfo arrays.
#   $SpecifiedFunctionNamesRef - Reference to an array of function names; each
#     name is invoked as a subroutine in the current package.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping data field labels
#     to references of arrays containing values to analyze.
#
sub PerformAnalysis {
  my($Index, $SpecifiedFunctionNamesRef, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, $Line, $SpecifiedFunction, $Label, @ColLabels, @DataLabelsToAnalyze);

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $SDFilesInfo{NewTextFileExt}[$Index];

  print "Generating new text file $NewTextFile...\n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out column labels...
  @ColLabels = ();
  push @ColLabels, "DataLabel";
  for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
    $Label = $SpecifiedFunction;
    if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
      # Turn, for example, KLargest with a k value of 2 into "2ndLargest"...
      my($KthValue);
      $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
      $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
      $Label =~ s/K//g;
    }
    elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
      $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
    }
    push @ColLabels, $Label;
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Go over each column to be analyzed...
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};

  # Statistical functions are invoked using function name strings; only the
  # "refs" stricture needs to be turned off for that.
  no strict 'refs';

  my($DataValuesRef, $DataLabel, $Value, @RowValues, %CalculatedValues);
  %CalculatedValues = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    @RowValues = ();
    # Setup column id...
    push @RowValues, $DataLabel;
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
      $Value = "";
      if (!@{$DataValuesRef}) {
        # Invalid column values...
        push @RowValues, $Value;
        next FUNCTIONNAME;
      }
      if ($SpecifiedFunction =~ /^Count$/i) {
        $Value = @{$DataValuesRef};
      }
      elsif ($SpecifiedFunction =~ /^KLargest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KLargest});
      }
      elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KSmallest});
      }
      elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
        # Standard deviation is cached as it's also needed by StandardError...
        if (exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $Value = $CalculatedValues{$DataLabel}{StandardDeviation};
        }
        else {
          $Value = &$SpecifiedFunction($DataValuesRef);
          $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
        }
      }
      elsif ($SpecifiedFunction =~ /^StandardError$/i) {
        if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
          $Value = StandardDeviation($DataValuesRef);
          $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
        }
        if (defined $CalculatedValues{$DataLabel}{StandardDeviation}) {
          # Pass the number of values and not the values themselves: an array in
          # an argument list is flattened, which would turn the first data value
          # into the count.
          $Value = &$SpecifiedFunction($CalculatedValues{$DataLabel}{StandardDeviation}, scalar(@{$DataValuesRef}));
        }
      }
      elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
        $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{TrimFraction});
      }
      else {
        $Value = &$SpecifiedFunction($DataValuesRef);
      }
      # Format the output value. And add zero to get rid of trailing zeros...
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
      push @RowValues, $Value;
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  # Buffered write errors surface at close...
  close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}

# Calculate covariance, correlation, rsquare for specified data field label pairs....
#
# Write out a text file containing one row per specified data field label pair
# along with the requested correlation, rsquare and covariance values. Pairs
# with different number of valid data values are reported with empty values.
#
# Parameters:
#   $Index - Index of the SD file in %SDFilesInfo arrays.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping data field labels
#     to references of arrays containing values to analyze.
#
sub PerformDataLabelPairAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "DataFieldPairsAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  # Three argument open keeps the file name from being interpreted as a mode...
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  # Write out the column labels. Correlation column is always present whenever
  # rsquare is requested...
  @ColLabels = ();
  push @ColLabels, ("DataLabel1", "DataLabel2");
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @ColLabels, "Correlation";
    if ($CalculateRSquare) {
      push @ColLabels, "RSquare";
    }
  }
  if ($CalculateCovariance) {
    push @ColLabels, "Covariance";
  }
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$Line\n";

  # Go over each data field pair...
  my($CorrelationValue, $RSquareValue, $CovarianceValue, $LabelIndex, $DataLabel1, $DataLabel2, $DataValues1, $DataValues2, @DataLabelPairs1ToAnalyze, @DataLabelPairs2ToAnalyze, @RowValues, $Value);

  @DataLabelPairs1ToAnalyze = @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]};
  @DataLabelPairs2ToAnalyze = @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]};
  for $LabelIndex (0 .. $#DataLabelPairs1ToAnalyze) {
    @RowValues = ();
    $DataLabel1 = $DataLabelPairs1ToAnalyze[$LabelIndex];
    $DataLabel2 = $DataLabelPairs2ToAnalyze[$LabelIndex];
    $DataValues1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
    $DataValues2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};

    # Setup column ids...
    push @RowValues, $DataLabel1;
    push @RowValues, $DataLabel2;

    if (@$DataValues1 != @$DataValues2) {
      # Print a warning and write out empty values to keep columns aligned...
      warn "Warning: Skipping analysis for data field pair $DataLabel1, $DataLabel2: Number of valid data values must be same.\n";
      if ($CalculateCorrelation || $CalculateRSquare) {
        push @RowValues, "";
        if ($CalculateRSquare) {
          push @RowValues, "";
        }
      }
      if ($CalculateCovariance) {
        push @RowValues, "";
      }
    }
    else {
      # Calculate appropriate value. Formatted values have zero added to get
      # rid of trailing zeros...
      if ($CalculateCorrelation || $CalculateRSquare) {
        $CorrelationValue = Correlation($DataValues1, $DataValues2);
        $Value = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : "";
        push @RowValues, $Value;
        if ($CalculateRSquare) {
          # RSquare is derived from the unformatted correlation value...
          $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
          $Value = (length($RSquareValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : "";
          push @RowValues, $Value;
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($DataValues1, $DataValues2);
        $Value = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
        push @RowValues, $Value;
      }
    }
    $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";
  }
  # Buffered write errors surface at close...
  close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
}

# Generate histogram numbers...
#
# Write out one text file per data field label containing bin and frequency
# values calculated by Frequency using either the specified bin range or the
# specified number of bins.
#
# Parameters:
#   $Index - Index of the SD file in %SDFilesInfo arrays.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping data field labels
#     to references of arrays containing values to analyze.
#
sub PerformFrequencyAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($NewTextFile, $NewTextFileHandle, @ColLabels, @RowValues, $Line, $DataLabel, @DataLabelsToAnalyze, $DataValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);

  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $DataLabel . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
    print "Generating new text file $NewTextFile...\n";
    # Three argument open keeps the file name, which contains the data field
    # label, from being interpreted as a mode...
    open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

    # Write out the column labels...
    @ColLabels = ();
    push @ColLabels , ("Bins", "Frequency");
    $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$Line\n";

    # Calculate and write out frequency values. Data fields without any valid
    # values end up with a file containing only the column labels...
    %FrequencyMap = ();
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (@$DataValuesRef) {
      if (@{$OptionsInfo{BinRange}}) {
        %FrequencyMap = Frequency($DataValuesRef, \@{$OptionsInfo{BinRange}});
      }
      else {
        %FrequencyMap = Frequency($DataValuesRef, $OptionsInfo{NumOfBins});
      }
    }
    # Bins are written out in ascending numerical order...
    for $BinValue (sort { $a <=> $b } keys %FrequencyMap) {
      $FrequencyValue = $FrequencyMap{$BinValue};

      # Format values. And add zero to get rid of trailing zeros...
      @RowValues = ();
      $Value = (length($BinValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $BinValue) + 0) : "";
      push @RowValues, $Value;
      $Value = (length($FrequencyValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $FrequencyValue) + 0) : "";
      push @RowValues, $Value;

      $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileHandle "$Line\n";
    }
    # Buffered write errors surface at close...
    close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";
  }
}

# Calculate covariance, correlation/rsquare matrices....
#
# Write out correlation, rsquare and covariance matrices, one text file per
# matrix, for all or common data field labels. Correlation matrix is written
# out whenever correlation or rsquare is requested.
#
# Parameters:
#   $Index - Index of the SD file in %SDFilesInfo arrays.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping data field labels
#     to references of arrays containing values to analyze.
#
sub PerformMatrixAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);

  $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
  $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
  $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;

  $CorrelationTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CorrelationMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $RSquareTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "RSquareMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];
  $CovarianceTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CovarianceMatrix." . $SDFilesInfo{NewTextFileExt}[$Index];

  # Report the names of the files which are actually generated...
  my(@TextFiles, $TextFilesList);
  @TextFiles = ();
  if ($CalculateCorrelation || $CalculateRSquare) {
    push @TextFiles, $CorrelationTextFile;
    if ($CalculateRSquare) {
      push @TextFiles, $RSquareTextFile;
    }
  }
  if ($CalculateCovariance) {
    push @TextFiles, $CovarianceTextFile;
  }
  $TextFilesList = join(", ", @TextFiles);
  if (@TextFiles > 1) {
    print "Generating new text files $TextFilesList...\n"
  }
  else {
    print "Generating new text file $TextFilesList...\n"
  }

  my($CorrelationFileHandle, $RSquareFileHandle, $CovarianceFileHandle);
  if ($CalculateCorrelation || $CalculateRSquare) {
    open $CorrelationFileHandle, ">", $CorrelationTextFile or die "Error: Can't open $CorrelationTextFile: $! \n";
    if ($CalculateRSquare) {
      open $RSquareFileHandle, ">", $RSquareTextFile or die "Error: Can't open $RSquareTextFile: $! \n";
    }
  }
  if ($CalculateCovariance) {
    open $CovarianceFileHandle, ">", $CovarianceTextFile or die "Error: Can't open $CovarianceTextFile: $! \n";
  }

  my($Line, $Value, $CorrelationValue, $CorrelationOkay, $CovarianceValue, $DataLabel1, $DataLabel2, $DataValuesRef1, $DataValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues, @DataLabelsToAnalyze);

  # Labels used for rows and columns of the matrices...
  @DataLabelsToAnalyze = $OptionsInfo{AllDataLabelPairs} ? @{$SDFilesInfo{AllDataLabels}[$Index]} : @{$SDFilesInfo{CommonDataLabels}[$Index]};

  # Write out the column labels: these must be the same labels which are used
  # to generate matrix columns.
  #
  # NOTE(review): Data rows start with a row label while the column labels line
  # has no corresponding leading entry; confirm the expected layout.
  @ColLabels = ();
  push @ColLabels, @DataLabelsToAnalyze;
  $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  if ($CalculateCorrelation || $CalculateRSquare) {
    print $CorrelationFileHandle "$Line\n";
    if ($CalculateRSquare) {
      print $RSquareFileHandle "$Line\n";
    }
  }
  if ($CalculateCovariance) {
    print $CovarianceFileHandle "$Line\n";
  }

  # Due to symmetric nature of these matrices, only one half needs to be
  # calculated. So, just calculate the lower half and copy it to upper half...
  my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap, $LabelIndex1, $LabelIndex2);

  %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();

  for $LabelIndex1 (0 .. $#DataLabelsToAnalyze) {
    $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
    for $LabelIndex2 (0 .. $LabelIndex1) {
      $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
      $DataValuesRef1 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
      $DataValuesRef2 = \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
      if ($CalculateCorrelation || $CalculateRSquare) {
        $CorrelationValue = Correlation($DataValuesRef1, $DataValuesRef2);
        $CorrelationOkay = (defined($CorrelationValue) && length($CorrelationValue)) ? 1 : 0;

        # Format the output value. And add zero to get rid of trailing zeros...
        $Value = $CorrelationOkay ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : "";
        $CorrelationMatrixMap{$DataLabel1}{$DataLabel2} = $Value;
        $CorrelationMatrixMap{$DataLabel2}{$DataLabel1} = $Value;
        if ($CalculateRSquare) {
          # Square the unformatted correlation value to avoid compounding the
          # rounding error...
          $Value = $CorrelationOkay ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue ** 2) + 0) : "";
          $RSquareMatrixMap{$DataLabel1}{$DataLabel2} = $Value;
          $RSquareMatrixMap{$DataLabel2}{$DataLabel1} = $Value;
        }
      }
      if ($CalculateCovariance) {
        $CovarianceValue = Covariance($DataValuesRef1, $DataValuesRef2);
        $Value = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
        $CovarianceMatrixMap{$DataLabel1}{$DataLabel2} = $Value;
        $CovarianceMatrixMap{$DataLabel2}{$DataLabel1} = $Value;
      }
    }
  }

  # Write out the matrices...
  for $DataLabel1 (@DataLabelsToAnalyze) {
    @CorrelationRowValues = ();
    @RSquareRowValues = ();
    @CovarianceRowValues = ();

    # Each row starts with its data field label...
    if ($CalculateCorrelation || $CalculateRSquare) {
      push @CorrelationRowValues, $DataLabel1;
      if ($CalculateRSquare) {
        push @RSquareRowValues, $DataLabel1;
      }
    }
    if ($CalculateCovariance) {
      push @CovarianceRowValues, $DataLabel1;
    }
    for $DataLabel2 (@DataLabelsToAnalyze) {
      if ($CalculateCorrelation || $CalculateRSquare) {
        push @CorrelationRowValues, $CorrelationMatrixMap{$DataLabel1}{$DataLabel2};
        if ($CalculateRSquare) {
          push @RSquareRowValues, $RSquareMatrixMap{$DataLabel1}{$DataLabel2};
        }
      }
      if ($CalculateCovariance) {
        push @CovarianceRowValues, $CovarianceMatrixMap{$DataLabel1}{$DataLabel2};
      }
    }
    if ($CalculateCorrelation || $CalculateRSquare) {
      $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CorrelationFileHandle "$Line\n";
      if ($CalculateRSquare) {
        $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print $RSquareFileHandle "$Line\n";
      }
    }
    if ($CalculateCovariance) {
      $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $CovarianceFileHandle "$Line\n";
    }
  }

  # Buffered write errors surface at close...
  if ($CalculateCorrelation || $CalculateRSquare) {
    close $CorrelationFileHandle or die "Error: Can't close $CorrelationTextFile: $! \n";
    if ($CalculateRSquare) {
      close $RSquareFileHandle or die "Error: Can't close $RSquareTextFile: $! \n";
    }
  }
  if ($CalculateCovariance) {
    close $CovarianceFileHandle or die "Error: Can't close $CovarianceTextFile: $! \n";
  }
}

# Calculate standard scores...
#
# Write out a text file containing one row per compound record and, for each
# data field label, standard scores calculated using StandardDeviation and/or
# StandardDeviationN values.
#
# Parameters:
#   $Index - Index of the SD file in @SDFilesList and %SDFilesInfo arrays.
#   $DataValuesToAnalyzeMapRef - Reference to a hash mapping data field labels
#     to references of arrays containing values to analyze.
#
sub PerformStandardScoresAnalysis {
  my($Index, $DataValuesToAnalyzeMapRef) = @_;
  my($StandardScores, $StandardScoresN, $NewTextFile, $NewTextFileHandle, @ColLabels, $NewLine);

  $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0;
  $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0;

  $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "StandardScores." . $SDFilesInfo{NewTextFileExt}[$Index];
  print "Generating new text file $NewTextFile...\n";
  open $NewTextFileHandle, ">", $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";

  my($DataLabel, @DataLabelsToAnalyze);
  # Write out column labels...
  @ColLabels = ();
  @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
  for $DataLabel (@DataLabelsToAnalyze) {
    if ($StandardScores) {
      push @ColLabels, "${DataLabel}\(StandardScores)";
    }
    if ($StandardScoresN) {
      push @ColLabels, "${DataLabel}\(StandardScoresN)";
    }
  }
  $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileHandle "$NewLine\n";

  # Go over each column to be analyzed and calculate standard deviation
  # and mean values...
  my($DataValuesRef, %StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
  %StandardDeviationMap = ();
  %StandardDeviationNMap = ();
  %MeanMap = ();
  for $DataLabel (@DataLabelsToAnalyze) {
    $DataValuesRef = \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
    if (!exists($MeanMap{$DataLabel})) {
      $MeanMap{$DataLabel} = Mean($DataValuesRef);
    }
    if ($StandardScores) {
      if (!exists($StandardDeviationMap{$DataLabel})) {
        $StandardDeviationMap{$DataLabel} = StandardDeviation($DataValuesRef);
      }
    }
    if ($StandardScoresN) {
      if (!exists($StandardDeviationNMap{$DataLabel})) {
        $StandardDeviationNMap{$DataLabel} = StandardDeviationN($DataValuesRef);
      }
    }
  }

  # Go over each compound record and calculate standard scores for each data
  # field using (x[i] - mean) / StandardDeviation for StandardScores and
  # (x[i] - mean) / StandardDeviationN for StandardScoresN; write out the
  # calculated values as well...

  my($SDFile, $SDFileHandle, $Value, $ValueOkay, $Deviation, $ScoreValue, @RowValues, $CmpdString, @CmpdLines, %DataFieldValues);
  $SDFile = $SDFilesList[$Index];

  open $SDFileHandle, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";
  while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    @RowValues = ();
    for $DataLabel (@DataLabelsToAnalyze) {
      $Value = "";
      if (exists $DataFieldValues{$DataLabel}) {
        $Value = $DataFieldValues{$DataLabel};
      }
      # A score can't be calculated for missing or empty values, for values
      # failing the optional numerical check, or without a mean value...
      $ValueOkay = (defined($Value) && length($Value)) ? 1 : 0;
      if ($ValueOkay && $OptionsInfo{CheckData} && !IsNumerical($Value)) {
        $ValueOkay = 0;
      }
      if (!defined($MeanMap{$DataLabel})) {
        $ValueOkay = 0;
      }
      if ($StandardScores) {
        # Undefined or zero standard deviation leads to an empty score instead
        # of a fatal division by zero...
        $Deviation = $StandardDeviationMap{$DataLabel};
        $ScoreValue = ($ValueOkay && defined($Deviation) && $Deviation != 0) ? (($Value - $MeanMap{$DataLabel})/$Deviation) : "";
        $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $ScoreValue) + 0) : "";
        push @RowValues, $ScoreValue;
      }
      if ($StandardScoresN) {
        $Deviation = $StandardDeviationNMap{$DataLabel};
        $ScoreValue = ($ValueOkay && defined($Deviation) && $Deviation != 0) ? (($Value - $MeanMap{$DataLabel})/$Deviation) : "";
        $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $ScoreValue) + 0) : "";
        push @RowValues, $ScoreValue;
      }
    }
    $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileHandle "$NewLine\n";
  }
  close $SDFileHandle;
  # Buffered write errors surface at close...
  close $NewTextFileHandle or die "Error: Can't close $NewTextFile: $! \n";

}

# Make sure the specified data field labels exists in SD files...
#
# For each okay SD file, set up the following per file arrays in %SDFilesInfo:
# DataLabelsToAnalyze, DataLabelPairs1ToAnalyze, DataLabelPairs2ToAnalyze and
# UniqueDataLabelsToAnalyze. Files without any data field labels to analyze, or
# with already existing frequency analysis files when overwrite is off, are
# marked as not okay.
#
sub ProcessSDFilesDataLabelsInfo {
  my($Index, $DataFieldIndex, $SDFile, $DataLabel, @DataLabelsToAnalyze, %UniqueDataLabelsToAnalyzeMap);

  @{$SDFilesInfo{DataLabelsToAnalyze}} = ();
  @{$SDFilesInfo{DataLabelPairs1ToAnalyze}} = ();
  @{$SDFilesInfo{DataLabelPairs2ToAnalyze}} = ();
  @{$SDFilesInfo{UniqueDataLabelsToAnalyze}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]} = ();
    @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]} = ();

    %UniqueDataLabelsToAnalyzeMap = ();

    if ($SDFilesInfo{FileOkay}[$Index]) {
      # Explicitly specified labels take precedence, followed by all data
      # labels and then common data labels...
      @DataLabelsToAnalyze = ();
      if (@{$OptionsInfo{SpecifiedDataLabels}}) {
        for $DataLabel (@{$OptionsInfo{SpecifiedDataLabels}}) {
          if (exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel})) {
            push @DataLabelsToAnalyze, $DataLabel;
          }
        }
      }
      elsif (defined($OptionsInfo{DataFields}) && $OptionsInfo{DataFields} =~ /^All$/i) {
        push @DataLabelsToAnalyze, @{$SDFilesInfo{AllDataLabels}[$Index]};
      }
      else {
        push @DataLabelsToAnalyze, @{$SDFilesInfo{CommonDataLabels}[$Index]};
      }
      if (@DataLabelsToAnalyze) {
        push @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]}, @DataLabelsToAnalyze;
        # Set up unique data field label map as well...
        for $DataLabel (@DataLabelsToAnalyze) {
          if (!exists $UniqueDataLabelsToAnalyzeMap{$DataLabel}) {
            $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
          }
        }
      }
      else {
        warn "Warning: Ignoring file $SDFile: None of the data field labels specified, @{$OptionsInfo{SpecifiedDataLabels}}, using \"--datafields\" option exist.\n";
        $SDFilesInfo{FileOkay}[$Index] = 0;
        next FILELIST;
      }
      if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
        # Make sure specific frequency files don't exist...
        my($FrequencyFile);
        for $DataLabel (@DataLabelsToAnalyze) {
          $FrequencyFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel} . "FrequencyAnalysis." . $SDFilesInfo{NewTextFileExt}[$Index];
          if (-e $FrequencyFile) {
            warn "Warning: Ignoring file $SDFile: The file $FrequencyFile already exists.\n";
            $SDFilesInfo{FileOkay}[$Index] = 0;
            next FILELIST;
          }
        }
      }
      # Setup specified data field label pairs...
      if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) {
        my(@DataLabelPairsToAnalyze, $DataLabel1, $DataLabel2);
        if (@{$OptionsInfo{SpecifiedDataLabelPairs}}) {
          # Make sure both data field labels exist; pairs with a missing label
          # are dropped...
          for ($DataFieldIndex = 0; (($DataFieldIndex + 1) < @{$OptionsInfo{SpecifiedDataLabelPairs}}); $DataFieldIndex += 2 ) {
            $DataLabel1 = $OptionsInfo{SpecifiedDataLabelPairs}[$DataFieldIndex];
            $DataLabel2 = $OptionsInfo{SpecifiedDataLabelPairs}[$DataFieldIndex + 1];
            if (exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel1}) && exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel2})) {
              push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
            }
          }
        }
        elsif ($OptionsInfo{AllDataLabelPairs}) {
          for $DataLabel1 (@{$SDFilesInfo{AllDataLabels}[$Index]}) {
            for $DataLabel2 (@{$SDFilesInfo{AllDataLabels}[$Index]}) {
              push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
            }
          }
        }
        else {
          for $DataLabel1 (@{$SDFilesInfo{CommonDataLabels}[$Index]}) {
            for $DataLabel2 (@{$SDFilesInfo{CommonDataLabels}[$Index]}) {
              push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
            }
          }
        }
        if (@DataLabelPairsToAnalyze) {
          # NOTE(review): Labels are always added two at a time above, so this
          # odd count check appears to be purely defensive.
          if (@DataLabelPairsToAnalyze % 2) {
            warn "Warning: Ignoring file $SDFile: Invalid number values specified using \"--datafieldpairs\" option: It must contain even number of valid values.\n";
            $SDFilesInfo{FileOkay}[$Index] = 0;
            next FILELIST;
          }
          else {
            # Split the flat list of pairs into two parallel arrays...
            for ($DataFieldIndex = 0; $DataFieldIndex < @DataLabelPairsToAnalyze; $DataFieldIndex += 2) {
              push @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex];
              push @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex + 1];
            }
            # Set up unique data field label map as well...
            for $DataLabel (@DataLabelPairsToAnalyze) {
              if (!exists $UniqueDataLabelsToAnalyzeMap{$DataLabel}) {
                $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
              }
            }
          }
        }
      }
      # Setup unique data field label array...
      push @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]}, (sort keys %UniqueDataLabelsToAnalyzeMap);
    }
  }
}

# Retrieve information about input SD files...
#
# Initialize %SDFilesInfo and, for each SD file, check its existence and type,
# set up names for the output files, make sure the output files don't already
# exist unless overwrite is on, and collect compound count along with all and
# common data field labels. Files passing all the checks are marked as okay.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $SDFileHandle, $Index, $FileDir, $FileExt, $FileName, $OutFile, $OutFileRoot, $OutFileExt, $CmpdCount);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{CmpdCount}} = ();
  @{$SDFilesInfo{NewTextFileRoot}} = ();
  @{$SDFilesInfo{NewTextFileExt}} = ();

  # Keys used by the rest of the script...
  @{$SDFilesInfo{AllDataLabels}} = ();
  @{$SDFilesInfo{AllDataLabelsMap}} = ();
  @{$SDFilesInfo{CommonDataLabels}} = ();

  # Previously initialized key names; retained for backward compatibility.
  @{$SDFilesInfo{AllDataFieldLabels}} = ();
  @{$SDFilesInfo{AllDataFieldLabelsMap}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;

    $SDFilesInfo{CmpdCount}[$Index] = 0;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "";
    $SDFilesInfo{NewTextFileExt}[$Index] = "";

    @{$SDFilesInfo{AllDataLabels}[$Index]} = ();
    %{$SDFilesInfo{AllDataLabelsMap}[$Index]} = ();
    @{$SDFilesInfo{CommonDataLabels}[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new text files...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $OutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $OutFileExt = "tsv";
    }
    # Specified root is used only for a single input file...
    if ($Options{root} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $Options{root};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName;
    }
    $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . ".$OutFileExt";

    if (!$OptionsInfo{Overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
        next FILELIST;
      }

      # Collect names of other output files using the same names and
      # extension as the subroutines which generate them...
      my(@OtherOutFiles, $OtherOutFile);
      @OtherOutFiles = ();
      if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
        if ($OptionsInfo{AllDataLabelPairs} || $OptionsInfo{CommonDataLabelPairs}) {
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance})) {
            push @OtherOutFiles, "${OutFileRoot}CovarianceMatrix.${OutFileExt}";
          }
          # Correlation matrix file is also generated for rsquare...
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
            push @OtherOutFiles, "${OutFileRoot}CorrelationMatrix.${OutFileExt}";
          }
          if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
            push @OtherOutFiles, "${OutFileRoot}RSquareMatrix.${OutFileExt}";
          }
        }
        else {
          push @OtherOutFiles, "${OutFileRoot}DataFieldPairsAnalysis.${OutFileExt}";
        }
      }
      if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn})) {
        push @OtherOutFiles, "${OutFileRoot}StandardScores.${OutFileExt}";
      }
      for $OtherOutFile (@OtherOutFiles) {
        if (-e $OtherOutFile) {
          warn "Warning: Ignoring file $SDFile: The file $OtherOutFile already exists.\n";
          next FILELIST;
        }
      }
    }

    if (!open $SDFileHandle, "<", $SDFile) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($Label, $DataFieldLabelsRef, $CommonDataFieldLabelsRef, @DataFieldLabels, @CommonDataFieldLabels);
    $CmpdCount = 0;
    @DataFieldLabels = ();
    @CommonDataFieldLabels = ();
    ($CmpdCount, $DataFieldLabelsRef, $CommonDataFieldLabelsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
    push @DataFieldLabels, @{$DataFieldLabelsRef};
    push @CommonDataFieldLabels, @{$CommonDataFieldLabelsRef};
    close $SDFileHandle;

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{NewTextFileRoot}[$Index] = "$OutFileRoot";
    $SDFilesInfo{NewTextFileExt}[$Index] = "$OutFileExt";

    $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;
    push @{$SDFilesInfo{AllDataLabels}[$Index]}, @DataFieldLabels;
    push @{$SDFilesInfo{CommonDataLabels}[$Index]}, @CommonDataFieldLabels;
    for $Label (@DataFieldLabels) {
      $SDFilesInfo{AllDataLabelsMap}[$Index]{$Label} = $Label;
    }
  }
}

# Process option values...
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{DataFields} = defined $Options{datafields} ? $Options{datafields} : undef;

  $OptionsInfo{DetailLevel} = $Options{detail};

  # Setup supported statistical functions...
866 my($SupportedFunction, @SupportedStatisticaFunctions, %SupportedStatisticaFunctionsMap); 867 868 %SupportedStatisticaFunctionsMap = (); 869 @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN); 870 871 for $SupportedFunction (@SupportedStatisticaFunctions) { 872 $SupportedStatisticaFunctionsMap{lc($SupportedFunction)} = $SupportedFunction; 873 } 874 875 # Setup a list of functions to use for analysis... 876 my($SpecifiedFunction); 877 878 %{$OptionsInfo{SpecifiedStatisticalFunctionsMap}} = (); 879 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = (); 880 881 # Check mode values... 882 if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) { 883 $OptionsInfo{FileNameMode} = "DescriptiveStatisticsBasic"; 884 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum); 885 } 886 elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) { 887 $OptionsInfo{FileNameMode} = "DescriptiveStatisticsAll"; 888 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance RSquare Frequency KLargest KSmallest Sum); 889 } 890 elsif ($Options{mode} =~ /^All$/i ) { 891 $OptionsInfo{FileNameMode} = "AllStatistics"; 892 @{$OptionsInfo{SpecifiedStatisticalFunctions}} = @SupportedStatisticaFunctions; 893 } 894 else { 895 $OptionsInfo{FileNameMode} = "SpecifiedStatistics"; 896 897 # Comma delimited list of functions... 
898 my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions); 899 900 $Mode = $Options{mode}; 901 $Mode =~ s/ //g; 902 @SpecifiedFunctions = split ",", $Mode; 903 @UnsupportedSpecifiedFunctions = (); 904 for $SpecifiedFunction (@SpecifiedFunctions) { 905 if (exists($SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)})) { 906 push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, $SpecifiedFunction; 907 } 908 else { 909 push @UnsupportedSpecifiedFunctions, $SpecifiedFunction; 910 } 911 } 912 if (@UnsupportedSpecifiedFunctions) { 913 if (@UnsupportedSpecifiedFunctions > 1) { 914 warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n"; 915 } 916 else { 917 warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n"; 918 } 919 die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n"; 920 } 921 } 922 923 FUNCTION: for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) { 924 if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} ) { 925 next FUNCTION; 926 } 927 $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} = $SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)}; 928 } 929 930 # Setup delimiter and quotes... 931 $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,"); 932 $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0; 933 934 $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef; 935 $OptionsInfo{Root} = defined $Options{root} ? $Options{root} : undef; 936 937 # Setup miscellaneous options... 938 $OptionsInfo{CheckData} = $Options{fast} ? 
0 : 1; 939 $OptionsInfo{Precision} = $Options{precision}; 940 941 $OptionsInfo{KLargest} = $Options{klargest}; 942 $OptionsInfo{KSmallest} = $Options{ksmallest}; 943 944 $OptionsInfo{TrimFraction} = $Options{trimfraction}; 945 946 # Setup frequency bin values... 947 $OptionsInfo{NumOfBins} = 10; 948 @{$OptionsInfo{BinRange}} = (); 949 if ($Options{frequencybins} =~ /\,/) { 950 my($BinValue, @SpecifiedBinRange); 951 @SpecifiedBinRange = split /\,/, $Options{frequencybins}; 952 if (@SpecifiedBinRange < 2) { 953 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n"; 954 } 955 for $BinValue (@SpecifiedBinRange) { 956 if (!IsNumerical($BinValue)) { 957 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n"; 958 } 959 } 960 my($Index1, $Index2); 961 for $Index1 (0 .. $#SpecifiedBinRange) { 962 for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) { 963 if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) { 964 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n"; 965 } 966 } 967 } 968 push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange; 969 } 970 else { 971 $OptionsInfo{NumOfBins} = $Options{frequencybins}; 972 if (!IsPositiveInteger($OptionsInfo{NumOfBins})) { 973 die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n"; 974 } 975 } 976 977 # Setup specified data field labels... 
978 @{$OptionsInfo{SpecifiedDataLabels}} = (); 979 if (defined $Options{datafields} && $Options{datafields} !~ /^(All|Common)$/i ) { 980 my(@SpecifiedValues) = split ",", $Options{datafields}; 981 push @{$OptionsInfo{SpecifiedDataLabels}}, @SpecifiedValues; 982 } 983 @{$OptionsInfo{SpecifiedDataLabelPairs}} = (); 984 $OptionsInfo{AllDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^AllPairs$/i) ? 1 : 0; 985 $OptionsInfo{CommonDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^CommonPairs$/i) ? 1 : 0; 986 if (defined($Options{datafieldpairs}) && !$OptionsInfo{AllDataLabelPairs} && !$OptionsInfo{CommonDataLabelPairs}) { 987 my(@SpecifiedValues) = split ",", $Options{datafieldpairs}; 988 if (@SpecifiedValues % 2) { 989 die "Error: Invalid number of values specified using \"--datafieldpairs\" option: It must contain even number of values.\n"; 990 } 991 push @{$OptionsInfo{SpecifiedDataLabelPairs}}, @SpecifiedValues; 992 } 993 994 } 995 996 # Setup script usage and retrieve command line arguments specified using various options... 997 sub SetupScriptUsage { 998 999 # Retrieve all the options... 
1000 %Options = (); 1001 $Options{detail} = 0; 1002 $Options{datafields} = "Common"; 1003 $Options{datafieldpairs} = "CommonPairs"; 1004 $Options{frequencybins} = 10; 1005 $Options{klargest} = 2; 1006 $Options{ksmallest} = 2; 1007 $Options{mode} = "DescriptiveStatisticsBasic"; 1008 $Options{outdelim} = "comma"; 1009 $Options{precision} = 2; 1010 $Options{quote} = "yes"; 1011 $Options{trimfraction} = 0.1; 1012 1013 if (!GetOptions(\%Options, "datafields=s", "datafieldpairs=s", "detail|d=i", "frequencybins=s", "fast|f", "help|h", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s")) { 1014 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 1015 } 1016 if ($Options{workingdir}) { 1017 if (! -d $Options{workingdir}) { 1018 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 1019 } 1020 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 1021 } 1022 if (!IsInteger($Options{detail})) { 1023 die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: >= 0\n"; 1024 } 1025 if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { 1026 die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n"; 1027 } 1028 if ($Options{quote} !~ /^(yes|no)$/i) { 1029 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"; 1030 } 1031 if (!IsPositiveInteger($Options{precision})) { 1032 die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. 
Allowed values: > 0 \n"; 1033 } 1034 if (!IsPositiveInteger($Options{klargest})) { 1035 die "Error: The value specified, $Options{klargest}, for option \"--klargest\" is not valid. Allowed values: > 0 \n"; 1036 } 1037 if (!IsPositiveInteger($Options{ksmallest})) { 1038 die "Error: The value specified, $Options{ksmallest}, for option \"--ksmallest\" is not valid. Allowed values: > 0 \n"; 1039 } 1040 if (IsFloat($Options{trimfraction})) { 1041 if ($Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) { 1042 die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n"; 1043 } 1044 } 1045 else { 1046 die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n"; 1047 } 1048 } 1049