MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: AnalyzeSDFilesData.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
   16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use SDFileUtil;
  34 use TextUtil;
  35 use StatisticsUtil;
  36 
  37 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  38 
  39 # Autoflush STDOUT
  40 $| = 1;
  41 
  42 # Starting message...
  43 $ScriptName = basename($0);
  44 print "\n$ScriptName: Starting...\n\n";
  45 $StartTime = new Benchmark;
  46 
  47 # Get the options and setup script...
  48 SetupScriptUsage();
  49 if ($Options{help} || @ARGV < 1) {
  50   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  51 }
  52 
  53 my(@SDFilesList);
  54 @SDFilesList = ExpandFileNames(\@ARGV, "sd sdf");
  55 
  56 print "Processing options...\n";
  57 my(%OptionsInfo);
  58 ProcessOptions();
  59 
  60 # Collect information about SD files...
  61 print "Checking input SD file(s)...\n";
  62 my(%SDFilesInfo);
  63 RetrieveSDFilesInfo();
  64 ProcessSDFilesDataLabelsInfo();
  65 
  66 # Generate output files...
  67 my($FileIndex);
  68 if (@SDFilesList > 1) {
  69   print "\nProcessing SD files...\n";
  70 }
  71 for $FileIndex (0 .. $#SDFilesList) {
  72   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  73     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  74     AnalyzeSDFile($FileIndex);
  75   }
  76 }
  77 print "\n$ScriptName:Done...\n\n";
  78 
  79 $EndTime = new Benchmark;
  80 $TotalTime = timediff ($EndTime, $StartTime);
  81 print "Total time: ", timestr($TotalTime), "\n";
  82 
  83 ###############################################################################
  84 
  85 # Analyze data...
  86 sub AnalyzeSDFile {
  87   my($Index) = @_;
  88   my($SDFile, $DataLabel, $DataValue, @DataLabelsToAnalyze, %DataFieldValuesToAnalyzeMap);
  89 
  90   $SDFile = $SDFilesList[$Index];
  91   @DataLabelsToAnalyze = @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]};
  92   %DataFieldValuesToAnalyzeMap = ();
  93   for $DataLabel (@DataLabelsToAnalyze) {
  94     @{$DataFieldValuesToAnalyzeMap{$DataLabel}} = ();
  95   }
  96 
  97   # Collect appropriate data field label values for analysis...
  98   my($CmpdString, @CmpdLines, %DataFieldValues, $CmpdCount, $InvalidCmpdCount, @InvalidCmpdDataLabels);
  99   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
 100   $CmpdCount = 0;
 101   $InvalidCmpdCount = 0;
 102   while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 103     $CmpdCount++;
 104     @CmpdLines = split "\n", $CmpdString;
 105     %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 106     @InvalidCmpdDataLabels = ();
 107     DATALABEL: for $DataLabel (@DataLabelsToAnalyze) {
 108       if (exists $DataFieldValues{$DataLabel}) {
 109         $DataValue = $DataFieldValues{$DataLabel};
 110         if ($OptionsInfo{CheckData}) {
 111           if (!IsNumerical($DataValue)) {
 112             push @InvalidCmpdDataLabels, $DataLabel;
 113             next DATALABEL;
 114           }
 115         }
 116         push @{$DataFieldValuesToAnalyzeMap{$DataLabel}}, $DataValue;
 117       }
 118     }
 119     if (@InvalidCmpdDataLabels) {
 120       $InvalidCmpdCount++;
 121       if ($OptionsInfo{DetailLevel} >=4 ) {
 122         print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed:\n$CmpdString \n";
 123       }
 124       elsif ($OptionsInfo{DetailLevel} >= 3) {
 125         print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field(s) - ", JoinWords(\@InvalidCmpdDataLabels, ", ", 0)," - to be analyzed...\n";
 126       }
 127       elsif ($OptionsInfo{DetailLevel} >= 2) {
 128         print "Compound record $CmpdCount contains ", scalar(@InvalidCmpdDataLabels)," non-numerical or empty value(s) for data field to be analyzed...\n";
 129       }
 130     }
 131   }
 132   if ($InvalidCmpdCount && ($OptionsInfo{DetailLevel} >= 1)) {
 133     print "Non-numerical or empty data present in $InvalidCmpdCount compound record(s)...\n";
 134   }
 135   close SDFILE;
 136 
 137   # Perform the analysis...
 138   my(@SpecifiedFunctionNames, $SpecifiedFunction);
 139   @SpecifiedFunctionNames = ();
 140 
 141   for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
 142     if ($SpecifiedFunction !~ /^(Covariance|Correlation|Frequency|Rsquare|StandardScores|StandardScoresN)$/i) {
 143       push @SpecifiedFunctionNames, $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)};
 144     }
 145   }
 146   if (@SpecifiedFunctionNames) {
 147     PerformAnalysis($Index, \@SpecifiedFunctionNames, \%DataFieldValuesToAnalyzeMap)
 148   }
 149   if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
 150     if ($OptionsInfo{AllDataLabelPairs} || $OptionsInfo{CommonDataLabelPairs}) {
 151       PerformMatrixAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 152     }
 153     else {
 154       # Perform pairwise analysis for specified columns and write out calculated values - correlation
 155       # rsquare, or covariance - in the same file.
 156       PerformDataLabelPairAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 157     }
 158   }
 159   if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ) {
 160     PerformStandardScoresAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 161   }
 162   if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
 163     PerformFrequencyAnalysis($Index, \%DataFieldValuesToAnalyzeMap);
 164   }
 165 
 166 }
 167 
 168 # Calculate values for various statistical functions...
 169 sub PerformAnalysis {
 170   my($Index, $SpecifiedFunctionNamesRef, $DataValuesToAnalyzeMapRef) = @_;
 171   my($NewTextFile, $Line, $SpecifiedFunction, $Label, @ColLabels, @DataLabelsToAnalyze);
 172 
 173   $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $OptionsInfo{FileNameMode} . "." . $SDFilesInfo{NewTextFileExt}[$Index];
 174 
 175   print "Generating new text file $NewTextFile...\n";
 176   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 177 
 178   # Write out column labels...
 179   @ColLabels = ();
 180   push @ColLabels, "DataLabel";
 181   for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
 182     $Label = $SpecifiedFunction;
 183     if ($SpecifiedFunction =~ /^(KLargest|KSmallest)$/i) {
 184       my($KthValue);
 185       $KthValue = ($SpecifiedFunction =~ /^KLargest$/i) ? $OptionsInfo{KLargest} : $OptionsInfo{KSmallest};
 186       $Label = AddNumberSuffix($KthValue) . "$SpecifiedFunction";
 187       $Label =~ s/K//g;
 188     }
 189     elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
 190       $Label = "${SpecifiedFunction}($OptionsInfo{TrimFraction})";
 191     }
 192     push @ColLabels, $Label;
 193   }
 194   $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 195   print NEWTEXTFILE "$Line\n";
 196 
 197   # Go over each column to be analyzed...
 198   @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
 199 
 200   # Turn off "strict"; otherwise, invoking statistical functions using function name string
 201   # is problematic.
 202   no strict;
 203 
 204   my($DataValuesRef, $DataLabel, $Value, @RowValues, %CalculatedValues);
 205   %CalculatedValues = ();
 206   for $DataLabel (@DataLabelsToAnalyze) {
 207     @RowValues = ();
 208     # Setup column id...
 209     push @RowValues, $DataLabel;
 210     $DataValuesRef =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 211     FUNCTIONNAME: for $SpecifiedFunction (@{$SpecifiedFunctionNamesRef}) {
 212       $Value = "";
 213       if (!@{$DataValuesToAnalyzeMapRef->{$DataLabel}}) {
 214         # Invalid column values...
 215         push @RowValues, $Value;
 216         next FUNCTIONNAME;
 217       }
 218       if ($SpecifiedFunction =~ /^Count$/i) {
 219         $Value = @{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 220       }
 221       elsif ($SpecifiedFunction =~ /^KLargest$/i) {
 222         $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KLargest});
 223       }
 224       elsif ($SpecifiedFunction =~ /^KSmallest$/i) {
 225         $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{KSmallest});
 226       }
 227       elsif ($SpecifiedFunction =~ /^StandardDeviation$/i) {
 228         if (exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
 229           $Value = $CalculatedValues{$DataLabel}{StandardDeviation};
 230         }
 231         else {
 232           $Value = &$SpecifiedFunction($DataValuesRef);
 233           $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
 234         }
 235       }
 236       elsif ($SpecifiedFunction =~ /^StandardError$/i) {
 237         if (!exists($CalculatedValues{$DataLabel}{StandardDeviation})) {
 238           $Value = StandardDeviation($DataValuesRef);
 239           $CalculatedValues{$DataLabel}{StandardDeviation} = $Value;
 240         }
 241         if (defined $CalculatedValues{$DataLabel}{StandardDeviation}) {
 242           $Value = &$SpecifiedFunction($CalculatedValues{$DataLabel}{StandardDeviation}, @{$DataValuesToAnalyzeMapRef->{$DataLabel}});
 243         }
 244       }
 245       elsif ($SpecifiedFunction =~ /^TrimMean$/i) {
 246         $Value = &$SpecifiedFunction($DataValuesRef, $OptionsInfo{TrimFraction});
 247       }
 248       else {
 249         $Value = &$SpecifiedFunction($DataValuesRef);
 250       }
 251       # Format the output value. And add zero to get rid of tariling zeros...
 252       $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : "";
 253       push @RowValues, $Value;
 254     }
 255     $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 256     print NEWTEXTFILE "$Line\n";
 257   }
 258   close NEWTEXTFILE;
 259 }
 260 
 261 # Calculate covariance, correlation, rsquare for specified data field label pairs....
 262 sub PerformDataLabelPairAnalysis {
 263   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 264   my($NewTextFile, @ColLabels, $Line, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);
 265 
 266   $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
 267   $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
 268   $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;
 269 
 270   $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "DataFieldPairsAnalysis." .  $SDFilesInfo{NewTextFileExt}[$Index];
 271   print "Generating new text file $NewTextFile...\n";
 272   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 273 
 274   # Write out the column labels...
 275   @ColLabels = ();
 276   push @ColLabels, ("DataLabel1", "DataLabel2");
 277   if ($CalculateCorrelation || $CalculateRSquare) {
 278     push @ColLabels, "Correlation";
 279     if ($CalculateRSquare) {
 280       push @ColLabels, "RSquare";
 281     }
 282   }
 283   if ($CalculateCovariance) {
 284     push @ColLabels, "Covariance";
 285   }
 286   $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 287   print NEWTEXTFILE "$Line\n";
 288 
 289   # Go over each data field pair...
 290   my($CorrelationValue, $RSquareValue, $CovarianceValue,  $LabelIndex, $DataLabel1, $DataLabel2, $DataValues1, $DataValues2, @DataLabelPairs1ToAnalyze, @DataLabelPairs2ToAnalyze, @RowValues, $Value);
 291 
 292   @DataLabelPairs1ToAnalyze = @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]};
 293   @DataLabelPairs2ToAnalyze = @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]};
 294   for $LabelIndex (0 .. $#DataLabelPairs1ToAnalyze) {
 295     @RowValues = ();
 296     $DataLabel1 = $DataLabelPairs1ToAnalyze[$LabelIndex];
 297     $DataLabel2 = $DataLabelPairs2ToAnalyze[$LabelIndex];
 298     $DataValues1 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
 299     $DataValues2 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
 300 
 301     # Setup column ids...
 302     push @RowValues, $DataLabel1;
 303     push @RowValues, $DataLabel2;
 304 
 305     if (@$DataValues1 != @$DataValues2) {
 306       # Print a warning...
 307       warn "Warning: Skipping analysis for data field pair $DataLabel1, $DataLabel2: Number of valid data values must be same.\n";
 308       if ($CalculateCorrelation || $CalculateRSquare) {
 309         push @RowValues, "";
 310         if ($CalculateRSquare) {
 311           push @RowValues, "";
 312         }
 313       }
 314       if ($CalculateCovariance) {
 315         push @RowValues, "";
 316       }
 317     }
 318     else {
 319       # Calculate appropriate value...
 320       if ($CalculateCorrelation || $CalculateRSquare) {
 321         $CorrelationValue = Correlation($DataValues1, $DataValues2);
 322         $Value = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : "";
 323         push @RowValues, $Value;
 324         if ($CalculateRSquare) {
 325           $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
 326           $Value = (length($RSquareValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : "";
 327           push @RowValues, $Value;
 328         }
 329       }
 330       if ($CalculateCovariance) {
 331         $CovarianceValue = Covariance($DataValues1, $DataValues2);
 332         $Value = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
 333         push @RowValues, $Value;
 334       }
 335     }
 336     $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 337     print NEWTEXTFILE "$Line\n";
 338   }
 339   close NEWTEXTFILE;
 340 }
 341 
 342 # Generate histogram numbers...
 343 sub PerformFrequencyAnalysis {
 344   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 345   my($NewTextFile, $ColLabel, @ColLabels, @RowValues, $Line, $DataLabel, @DataLabelsToAnalyze, $DataValuesRef, $BinValue, $FrequencyValue, $Value, %FrequencyMap);
 346 
 347   @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
 348   for $DataLabel (@DataLabelsToAnalyze) {
 349     $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $DataLabel . "FrequencyAnalysis." .  $SDFilesInfo{NewTextFileExt}[$Index];
 350     print "Generating new text file $NewTextFile...\n";
 351     open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 352 
 353     # Write out the column labels...
 354     @ColLabels = ();
 355     push @ColLabels , ("Bins", "Frequency");
 356     $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 357     print NEWTEXTFILE "$Line\n";
 358 
 359     #Calculate and write out frequency values...
 360     %FrequencyMap = ();
 361     $DataValuesRef =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 362     if (@$DataValuesRef) {
 363       if (@{$OptionsInfo{BinRange}}) {
 364         %FrequencyMap = Frequency($DataValuesRef, \@{$OptionsInfo{BinRange}});
 365       }
 366       else {
 367         %FrequencyMap = Frequency($DataValuesRef, $OptionsInfo{NumOfBins});
 368       }
 369     }
 370     for $BinValue (sort { $a <=> $b }  keys %FrequencyMap) {
 371       $FrequencyValue = $FrequencyMap{$BinValue};
 372 
 373       @RowValues = ();
 374       $Value = (length($BinValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $BinValue) + 0) : "";
 375       push @RowValues, $Value;
 376       $Value = (length($FrequencyValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $FrequencyValue) + 0) : "";
 377       push @RowValues, $Value;
 378 
 379       $Line = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 380       print NEWTEXTFILE "$Line\n";
 381     }
 382     close NEWTEXTFILE;
 383   }
 384 }
 385 
 386 # Calculate covariance, correlation/rsquare matrices....
 387 sub PerformMatrixAnalysis {
 388   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 389   my($CorrelationTextFile, $CovarianceTextFile, $RSquareTextFile, $CalculateCorrelation, $CalculateRSquare, $CalculateCovariance);
 390 
 391   $CalculateCorrelation = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) ? 1 : 0;
 392   $CalculateRSquare = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) ? 1 : 0;
 393   $CalculateCovariance = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) ? 1 : 0;
 394 
 395   $CorrelationTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CorrelationMatrix." .  $SDFilesInfo{NewTextFileExt}[$Index];
 396   $RSquareTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "RSquareMatrix." .  $SDFilesInfo{NewTextFileExt}[$Index];
 397   $CovarianceTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "CovarianceMatrix." .  $SDFilesInfo{NewTextFileExt}[$Index];
 398 
 399   my($TextFilesList, $Delimiter);
 400   $TextFilesList =  "";
 401   if ($CalculateCorrelation || $CalculateRSquare) {
 402     $TextFilesList = $CorrelationTextFile;
 403     if ($CalculateRSquare) {
 404       $TextFilesList .= ", $CorrelationTextFile";
 405     }
 406   }
 407   $Delimiter = length($TextFilesList) ? "," : "";
 408   if ($CalculateCovariance) {
 409     $TextFilesList .= "${Delimiter} ${CorrelationTextFile}";
 410   }
 411   if ($TextFilesList =~ /\,/) {
 412     print "Generating new text files $TextFilesList...\n"
 413   }
 414   else {
 415     print "Generating new text file $TextFilesList...\n"
 416   }
 417   if ($CalculateCorrelation || $CalculateRSquare) {
 418     open CORRELATIONTEXTFILE, ">$CorrelationTextFile" or die "Error: Can't open $CorrelationTextFile: $! \n";
 419     if ($CalculateRSquare) {
 420       open RSQUARETEXTFILE, ">$RSquareTextFile" or die "Error: Can't open $RSquareTextFile: $! \n";
 421     }
 422   }
 423   if ($CalculateCovariance) {
 424     open COVARIANCETEXTFILE, ">$CovarianceTextFile" or die "Error: Can't open $CovarianceTextFile: $! \n";
 425   }
 426 
 427   my($Line, $Value, $CorrelationValue, $RSquareValue, $CovarianceValue, $DataLabel, $DataLabel1, $DataLabel2, $DataValuesRef1, $DataValuesRef2, @ColLabels, @CovarianceRowValues, @CorrelationRowValues, @RSquareRowValues);
 428 
 429   # Write out the column labels...
 430   @ColLabels = ();
 431   push @ColLabels, @{$SDFilesInfo{AllDataLabels}[$Index]};
 432   $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 433   if ($CalculateCorrelation || $CalculateRSquare) {
 434     print CORRELATIONTEXTFILE "$Line\n";
 435     if ($CalculateRSquare) {
 436       print RSQUARETEXTFILE "$Line\n";
 437     }
 438   }
 439   if ($CalculateCovariance) {
 440     print COVARIANCETEXTFILE "$Line\n";
 441   }
 442 
 443   # Due to symmetric nature of these matrices, only one half needs to be
 444   # calculated. So, just calculate the lower half and copy it to upper half...
 445   my(%CorrelationMatrixMap, %RSquareMatrixMap, %CovarianceMatrixMap, $LabelIndex1, $LabelIndex2, @DataLabelsToAnalyze);
 446 
 447   %CorrelationMatrixMap = (); %RSquareMatrixMap = (); %CovarianceMatrixMap = ();
 448   @DataLabelsToAnalyze = ();
 449   @DataLabelsToAnalyze = $OptionsInfo{AllDataLabelPairs} ? @{$SDFilesInfo{AllDataLabels}[$Index]} : @{$SDFilesInfo{CommonDataLabels}[$Index]};
 450 
 451   for $LabelIndex1 (0 .. (@DataLabelsToAnalyze - 1)) {
 452     $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
 453     for $LabelIndex2 (0 .. $LabelIndex1) {
 454       $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
 455       $DataValuesRef1 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel1}};
 456       $DataValuesRef2 =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel2}};
 457       if ($CalculateCorrelation || $CalculateRSquare) {
 458         $CorrelationValue = Correlation($DataValuesRef1, $DataValuesRef2);
 459         $CorrelationValue = (defined($CorrelationValue) && length($CorrelationValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CorrelationValue) + 0) : "";
 460         $CorrelationMatrixMap{$DataLabel1}{$DataLabel2} = $CorrelationValue;
 461         if ($DataLabel1 ne $DataLabel2) {
 462           $CorrelationMatrixMap{$DataLabel2}{$DataLabel1} = $CorrelationValue;
 463         }
 464         if ($CalculateRSquare) {
 465           $RSquareValue = (defined($CorrelationValue) && length($CorrelationValue)) ? ($CorrelationValue ** 2) : "";
 466           $RSquareValue = (length($RSquareValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $RSquareValue) + 0) : "";
 467           $RSquareMatrixMap{$DataLabel1}{$DataLabel2} = $RSquareValue;
 468           if ($DataLabel1 ne $DataLabel2) {
 469             $RSquareMatrixMap{$DataLabel2}{$DataLabel1} = $RSquareValue;
 470           }
 471         }
 472       }
 473       if ($CalculateCovariance) {
 474         $CovarianceValue = Covariance($DataValuesRef1, $DataValuesRef2);
 475         $CovarianceValue = (defined($CovarianceValue) && length($CovarianceValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CovarianceValue) + 0) : "";
 476         $CovarianceMatrixMap{$DataLabel1}{$DataLabel2} = $CovarianceValue;
 477         if ($DataLabel1 ne $DataLabel2) {
 478           $CovarianceMatrixMap{$DataLabel2}{$DataLabel1} = $CovarianceValue;
 479         }
 480       }
 481     }
 482   }
 483 
 484   # Write out the matrices...
 485   for $LabelIndex1 (0 .. (@DataLabelsToAnalyze - 1)) {
 486     $DataLabel1 = $DataLabelsToAnalyze[$LabelIndex1];
 487     @CorrelationRowValues = ();
 488     @RSquareRowValues = ();
 489     @CovarianceRowValues = ();
 490     if ($CalculateCorrelation || $CalculateRSquare) {
 491       push @CorrelationRowValues, $DataLabel1;
 492       if ($CalculateRSquare) {
 493         push @RSquareRowValues, $DataLabel1;
 494       }
 495     }
 496     if ($CalculateCovariance) {
 497       push @CovarianceRowValues, $DataLabel;
 498     }
 499     for $LabelIndex2 (0 .. (@DataLabelsToAnalyze - 1)) {
 500       $DataLabel2 = $DataLabelsToAnalyze[$LabelIndex2];
 501       if ($CalculateCorrelation || $CalculateRSquare) {
 502         push @CorrelationRowValues, $CorrelationMatrixMap{$DataLabel1}{$DataLabel2};
 503         if ($CalculateRSquare) {
 504           push @RSquareRowValues, $RSquareMatrixMap{$DataLabel1}{$DataLabel2};
 505         }
 506       }
 507       if ($CalculateCovariance) {
 508         push @CovarianceRowValues, $CovarianceMatrixMap{$DataLabel1}{$DataLabel2};
 509       }
 510     }
 511     if ($CalculateCorrelation || $CalculateRSquare) {
 512       $Line = JoinWords(\@CorrelationRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 513       print CORRELATIONTEXTFILE "$Line\n";
 514       if ($CalculateRSquare) {
 515         $Line = JoinWords(\@RSquareRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 516         print RSQUARETEXTFILE "$Line\n";
 517       }
 518     }
 519     if ($CalculateCovariance) {
 520       $Line = JoinWords(\@CovarianceRowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 521       print COVARIANCETEXTFILE "$Line\n";
 522     }
 523   }
 524   if ($CalculateCorrelation || $CalculateRSquare) {
 525     close CORRELATIONTEXTFILE;
 526     if ($CalculateRSquare) {
 527       close RSQUARETEXTFILE;
 528     }
 529   }
 530   if ($CalculateCovariance) {
 531     close COVARIANCETEXTFILE;
 532   }
 533 }
 534 
 535 # Calculate standard scores...
 536 sub PerformStandardScoresAnalysis {
 537   my($Index, $DataValuesToAnalyzeMapRef) = @_;
 538   my($StandardScores, $StandardScoresN, $NewTextFile, @ColLabels, $Label, $NewLine);
 539 
 540   $StandardScores = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) ? 1 : 0;
 541   $StandardScoresN = exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscoresn}) ? 1 : 0;
 542 
 543   $NewTextFile = $SDFilesInfo{NewTextFileRoot}[$Index] . "StandardScores." .  $SDFilesInfo{NewTextFileExt}[$Index];
 544   print "Generating new text file $NewTextFile...\n";
 545   open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
 546 
 547   my($DataLabel, @DataLabelsToAnalyze);
 548   # Write out column labels...
 549   @ColLabels = ();
 550   @DataLabelsToAnalyze = @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]};
 551   for $DataLabel (@DataLabelsToAnalyze) {
 552     if ($StandardScores) {
 553       push @ColLabels, "${DataLabel}\(StandardScores)";
 554     }
 555     if ($StandardScoresN) {
 556       push @ColLabels, "${DataLabel}\(StandardScoresN)";
 557     }
 558   }
 559   $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 560   print NEWTEXTFILE "$NewLine\n";
 561 
 562   # Go over each column to be analyzed and calculate standard deviation
 563   # and mean values...
 564   my($DataValuesRef, %StandardDeviationMap, %StandardDeviationNMap, %MeanMap);
 565   %StandardDeviationMap = ();
 566   %StandardDeviationNMap = ();
 567   %MeanMap = ();
 568   for $DataLabel (@DataLabelsToAnalyze) {
 569     $DataValuesRef =  \@{$DataValuesToAnalyzeMapRef->{$DataLabel}};
 570     if (!exists($MeanMap{$DataLabel})) {
 571       $MeanMap{$DataLabel} = Mean($DataValuesRef);
 572     }
 573     if ($StandardScores) {
 574       if (!exists($StandardDeviationMap{$DataLabel})) {
 575         $StandardDeviationMap{$DataLabel} = StandardDeviation($DataValuesRef);
 576       }
 577     }
 578     if ($StandardScoresN) {
 579       if (!exists($StandardDeviationNMap{$DataLabel})) {
 580         $StandardDeviationNMap{$DataLabel} = StandardDeviationN($DataValuesRef);
 581       }
 582     }
 583   }
 584   #
 585   # Go over each data field and calculate standard scores for each column
 586   # using (x[i] - mean) / (n - 1) for StandardScores and (x[i] - mean) / n
 587   # for StandardScoresN; write out the calculated values as well...
 588 
 589   my($SDFile, $Value, $ValueOkay, $ScoreValue, @RowValues, $CmpdString, @CmpdLines, %DataFieldValues);
 590   $SDFile = $SDFilesList[$Index];
 591 
 592   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
 593   while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 594     @CmpdLines = split "\n", $CmpdString;
 595     %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 596     @RowValues = ();
 597     for $DataLabel (@DataLabelsToAnalyze) {
 598       $Value = "";
 599       if (exists $DataFieldValues{$DataLabel}) {
 600         $Value = $DataFieldValues{$DataLabel};
 601       }
 602       $ValueOkay = ($OptionsInfo{CheckData} && !IsNumerical($Value)) ? 0 : 1;
 603       if ($StandardScores) {
 604         $ScoreValue = $ValueOkay ? (($Value - $MeanMap{$DataLabel})/$StandardDeviationMap{$DataLabel}) : "";
 605         $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $ScoreValue) + 0) : "";
 606         push @RowValues, $ScoreValue;
 607       }
 608       if ($StandardScoresN) {
 609         $ScoreValue = $ValueOkay ? (($Value - $MeanMap{$DataLabel})/$StandardDeviationNMap{$DataLabel}) : "";
 610         $ScoreValue = (defined($ScoreValue) && length($ScoreValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $ScoreValue) + 0) : "";
 611         push @RowValues, $ScoreValue;
 612       }
 613     }
 614     $NewLine = JoinWords(\@RowValues, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
 615     print NEWTEXTFILE "$NewLine\n";
 616   }
 617   close SDFILE;
 618   close NEWTEXTFILE;
 619 
 620 }
 621 
 622 # Make sure the specified data field labels exists in SD files...
 623 sub ProcessSDFilesDataLabelsInfo {
 624   my($Index, $DataFieldIndex, $SDFile, $DataLabel, @DataLabelsToAnalyze, %UniqueDataLabelsToAnalyzeMap);
 625 
 626   @{$SDFilesInfo{DataLabelsToAnalyze}} = ();
 627   @{$SDFilesInfo{DataLabelPairs1ToAnalyze}} = ();
 628   @{$SDFilesInfo{DataLabelPairs2ToAnalyze}} = ();
 629   @{$SDFilesInfo{UniqueDataLabelsToAnalyze}} = ();
 630 
 631   FILELIST: for $Index (0 .. $#SDFilesList) {
 632     $SDFile = $SDFilesList[$Index];
 633 
 634     @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]} = ();
 635     @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]} = ();
 636     @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]} = ();
 637     @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]} = ();
 638 
 639     %UniqueDataLabelsToAnalyzeMap = ();
 640 
 641     if ($SDFilesInfo{FileOkay}[$Index]) {
 642       @DataLabelsToAnalyze = ();
 643       if (@{$OptionsInfo{SpecifiedDataLabels}}) {
 644         for $DataLabel (@{$OptionsInfo{SpecifiedDataLabels}}) {
 645           if (exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel})) {
 646             push @DataLabelsToAnalyze, $DataLabel;
 647           }
 648         }
 649       }
 650       elsif (defined($OptionsInfo{DataFields}) && $OptionsInfo{DataFields} =~ /^All$/i) {
 651         push @DataLabelsToAnalyze, @{$SDFilesInfo{AllDataLabels}[$Index]};
 652       }
 653       else {
 654         push @DataLabelsToAnalyze, @{$SDFilesInfo{CommonDataLabels}[$Index]};
 655       }
 656       if (@DataLabelsToAnalyze) {
 657         push @{$SDFilesInfo{DataLabelsToAnalyze}[$Index]}, @DataLabelsToAnalyze;
 658         # Set up unique data field label map as well...
 659         for $DataLabel (@DataLabelsToAnalyze) {
 660           if (!exists $UniqueDataLabelsToAnalyzeMap{$DataLabel}) {
 661             $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
 662           }
 663         }
 664       }
 665       else {
 666         warn "Warning: Ignoring file $SDFile: None of the data field labels specified, @{$OptionsInfo{SpecifiedDataLabels}}, using \"--datafields\" option exist.\n";
 667         $SDFilesInfo{FileOkay}[$Index] = 0;
 668         next FILELIST;
 669       }
 670       if (!$OptionsInfo{Overwrite} && exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{frequency})) {
 671         # Make sure specific frequency files don't exist...
 672         my($FrequencyFile);
 673         for $DataLabel (@DataLabelsToAnalyze) {
 674           $FrequencyFile = $SDFilesInfo{NewTextFileRoot}[$Index] . $SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel} . "FrequencyAnalysis." .  $SDFilesInfo{NewTextFileExt}[$Index];
 675           if (-e $FrequencyFile) {
 676             warn "Warning: Ignoring file $SDFile: The file $FrequencyFile already exists.\n";
 677             $SDFilesInfo{FileOkay}[$Index] = 0;
 678             next FILELIST;
 679           }
 680         }
 681       }
 682       # Setup specified data field label pairs...
 683       if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance} || exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) {
 684         my(@DataLabelPairsToAnalyze, $DataLabel1, $DataLabel2);
 685         if (@{$OptionsInfo{SpecifiedDataLabelPairs}}) {
 686           # Make sure both data field labels exist...
 687           my($DataFieldIndex);
 688           for ($DataFieldIndex = 0; (($DataFieldIndex + 1) < @{$OptionsInfo{SpecifiedDataLabelPairs}}); $DataFieldIndex += 2 ) {
 689             $DataLabel1 = $OptionsInfo{SpecifiedDataLabelPairs}[$DataFieldIndex];
 690             $DataLabel2 = $OptionsInfo{SpecifiedDataLabelPairs}[$DataFieldIndex + 1];
 691             if (exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel1}) && exists($SDFilesInfo{AllDataLabelsMap}[$Index]{$DataLabel2})) {
 692               push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
 693             }
 694           }
 695         }
 696         elsif ($OptionsInfo{AllDataLabelPairs}) {
 697           for $DataLabel1 (@{$SDFilesInfo{AllDataLabels}[$Index]}) {
 698             for $DataLabel2 (@{$SDFilesInfo{AllDataLabels}[$Index]}) {
 699               push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
 700             }
 701           }
 702         }
 703         else {
 704           for $DataLabel1 (@{$SDFilesInfo{CommonDataLabels}[$Index]}) {
 705             for $DataLabel2 (@{$SDFilesInfo{CommonDataLabels}[$Index]}) {
 706               push @DataLabelPairsToAnalyze, ($DataLabel1, $DataLabel2);
 707             }
 708           }
 709         }
 710         if (@DataLabelPairsToAnalyze) {
 711           if (@DataLabelPairsToAnalyze % 2) {
 712             warn "Warning: Ignoring file $SDFile: Invalid number  values specified using \"--datafieldpairs\" option: It must contain even number of valid values.\n";
 713             $SDFilesInfo{FileOkay}[$Index] = 0;
 714             next FILELIST;
 715           }
 716           else {
 717             for ($DataFieldIndex = 0; $DataFieldIndex < @DataLabelPairsToAnalyze; $DataFieldIndex += 2) {
 718               push @{$SDFilesInfo{DataLabelPairs1ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex];
 719               push @{$SDFilesInfo{DataLabelPairs2ToAnalyze}[$Index]}, $DataLabelPairsToAnalyze[$DataFieldIndex + 1];
 720             }
 721             # Set up unique data field labe map as well...
 722             for $DataLabel (@DataLabelPairsToAnalyze) {
 723               if (!exists $UniqueDataLabelsToAnalyzeMap{$DataLabel}) {
 724                 $UniqueDataLabelsToAnalyzeMap{$DataLabel} = $DataLabel;
 725               }
 726             }
 727           }
 728         }
 729       }
 730       # Setup unique data field label array...
 731       push @{$SDFilesInfo{UniqueDataLabelsToAnalyze}[$Index]}, (sort keys %UniqueDataLabelsToAnalyzeMap);
 732     }
 733   }
 734 }
 735 
 736 # Retrieve information about input SD files...
 737 sub RetrieveSDFilesInfo {
 738   my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFile, $OutFileRoot, $OutFileExt, $CmpdCount);
 739 
 740   %SDFilesInfo = ();
 741 
 742   @{$SDFilesInfo{FileOkay}} = ();
 743   @{$SDFilesInfo{CmpdCount}} = ();
 744   @{$SDFilesInfo{NewTextFileRoot}} = ();
 745   @{$SDFilesInfo{NewTextFileExt}} = ();
 746 
 747   @{$SDFilesInfo{AllDataFieldLabels}} = ();
 748   @{$SDFilesInfo{AllDataFieldLabelsMap}} = ();
 749   @{$SDFilesInfo{CommonDataLabels}} = ();
 750 
 751   FILELIST: for $Index (0 .. $#SDFilesList) {
 752     $SDFile = $SDFilesList[$Index];
 753 
 754     $SDFilesInfo{FileOkay}[$Index] = 0;
 755 
 756     $SDFilesInfo{CmpdCount}[$Index] = 0;
 757     $SDFilesInfo{NewTextFileRoot}[$Index] = "";
 758     $SDFilesInfo{NewTextFileExt}[$Index] = "";
 759 
 760     @{$SDFilesInfo{AllDataLabels}[$Index]} = ();
 761     %{$SDFilesInfo{AllDataLabelsMap}[$Index]} = ();
 762     @{$SDFilesInfo{CommonDataLabels}[$Index]} = ();
 763 
 764     if (!(-e $SDFile)) {
 765       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 766       next FILELIST;
 767     }
 768     if (!CheckFileType($SDFile, "sd sdf")) {
 769       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 770       next FILELIST;
 771     }
 772 
 773     # Generate appropriate name for the new text files...
 774     $FileDir = ""; $FileName = ""; $FileExt = "";
 775     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 776     $OutFileExt = "csv";
 777     if ($Options{outdelim} =~ /^tab$/i) {
 778       $OutFileExt = "tsv";
 779     }
 780     if ($Options{root} && (@SDFilesList == 1)) {
 781       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
 782       if ($RootFileName && $RootFileExt) {
 783         $FileName = $RootFileName;
 784       }
 785       else {
 786         $FileName = $Options{root};
 787       }
 788       $OutFileRoot = $FileName;
 789     }
 790     else {
 791       $OutFileRoot = $FileName;
 792     }
 793     $OutFile = $OutFileRoot . $OptionsInfo{FileNameMode} . ".$OutFileExt";
 794 
 795     if (!$OptionsInfo{Overwrite}) {
 796       if (-e $OutFile) {
 797         warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
 798         next FILELIST;
 799       }
 800       if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) || exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare})) {
 801         if ($OptionsInfo{AllDataLabelPairs}) {
 802           if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{covariance}) && (-e "${OutFileRoot}CovarianceMatrix.${FileExt}")) {
 803             warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}Covariance.${FileExt} already exists.\n";
 804             next FILELIST;
 805           }
 806           if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{correlation}) && (-e "${OutFileRoot}CorrelationMatrix.${FileExt}")) {
 807             warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}CorrelationMatrix.${FileExt} already exists.\n";
 808             next FILELIST;
 809           }
 810           if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{rsquare}) && (-e "${OutFileRoot}RSquareMatrix.${FileExt}")) {
 811             warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}RSquareMatrix.${FileExt} already exists.\n";
 812             next FILELIST;
 813           }
 814         }
 815         else {
 816           if (-e "${OutFileRoot}ColumnPairsAnalysis.${FileExt}") {
 817             warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}ColumnPairsAnalysis.${FileExt} already exists.\n";
 818             next FILELIST;
 819           }
 820         }
 821       }
 822       if (exists($OptionsInfo{SpecifiedStatisticalFunctionsMap}{standardscores}) && (-e "${OutFileRoot}StandardScores.${FileExt}")) {
 823         warn "Warning: Ignoring file $SDFile: The file ${OutFileRoot}StandardScores.${FileExt} already exists.\n";
 824         next FILELIST;
 825       }
 826     }
 827 
 828     if (!open SDFILE, "$SDFile") {
 829       warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 830       next FILELIST;
 831     }
 832 
 833     my($CmpdCount, $Label, $DataFieldLabelsRef, $CommonDataFieldLabelsRef, @DataFieldLabels, @CommonDataFieldLabels);
 834     $CmpdCount = 0;
 835     @DataFieldLabels = ();
 836     @CommonDataFieldLabels = ();
 837     ($CmpdCount, $DataFieldLabelsRef, $CommonDataFieldLabelsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
 838     push @DataFieldLabels, @{$DataFieldLabelsRef};
 839     push @CommonDataFieldLabels, @{$CommonDataFieldLabelsRef};
 840     close SDFILE;
 841 
 842     $SDFilesInfo{FileOkay}[$Index] = 1;
 843     $SDFilesInfo{NewTextFileRoot}[$Index] = "$OutFileRoot";
 844     $SDFilesInfo{NewTextFileExt}[$Index] = "$OutFileExt";
 845 
 846     $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;
 847     push @{$SDFilesInfo{AllDataLabels}[$Index]}, @DataFieldLabels;
 848     push @{$SDFilesInfo{CommonDataLabels}[$Index]}, @CommonDataFieldLabels;
 849     for $Label (@DataFieldLabels) {
 850       $SDFilesInfo{AllDataLabelsMap}[$Index]{$Label} = $Label;
 851     }
 852   }
 853 }
 854 
 855 # Process option values...
 856 sub ProcessOptions {
 857   %OptionsInfo = ();
 858 
 859   $OptionsInfo{Mode} = $Options{mode};
 860 
 861   $OptionsInfo{DataFields} = defined $Options{datafields} ? $Options{datafields} : undef;
 862 
 863   $OptionsInfo{DetailLevel} = $Options{detail};
 864 
 865   # Setup supported statistical functions...
 866   my($SupportedFunction, @SupportedStatisticaFunctions, %SupportedStatisticaFunctionsMap);
 867 
 868   %SupportedStatisticaFunctionsMap = ();
 869   @SupportedStatisticaFunctions = qw(Average AverageDeviation Correlation Count Covariance GeometricMean Frequency HarmonicMean KLargest KSmallest Kurtosis Maximum Minimum Mean Median Mode RSquare Skewness Sum SumOfSquares StandardDeviation StandardDeviationN StandardError StandardScores StandardScoresN TrimMean Variance VarianceN);
 870 
 871   for $SupportedFunction (@SupportedStatisticaFunctions) {
 872     $SupportedStatisticaFunctionsMap{lc($SupportedFunction)} = $SupportedFunction;
 873   }
 874 
 875   # Setup a list of functions to use for analysis...
 876   my($SpecifiedFunction);
 877 
 878   %{$OptionsInfo{SpecifiedStatisticalFunctionsMap}} = ();
 879   @{$OptionsInfo{SpecifiedStatisticalFunctions}} = ();
 880 
 881   # Check mode values...
 882   if ($Options{mode} =~ /^DescriptiveStatisticsBasic$/i ) {
 883     $OptionsInfo{FileNameMode} = "DescriptiveStatisticsBasic";
 884     @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean Median StandardDeviation StandardError Variance Sum);
 885   }
 886   elsif ($Options{mode} =~ /^DescriptiveStatisticsAll$/i ) {
 887     $OptionsInfo{FileNameMode} = "DescriptiveStatisticsAll";
 888     @{$OptionsInfo{SpecifiedStatisticalFunctions}} = qw(Count Maximum Minimum Mean GeometricMean HarmonicMean TrimMean Median Mode StandardDeviation Kurtosis Skewness StandardError Variance  RSquare Frequency  KLargest KSmallest Sum);
 889   }
 890   elsif ($Options{mode} =~ /^All$/i ) {
 891     $OptionsInfo{FileNameMode} = "AllStatistics";
 892     @{$OptionsInfo{SpecifiedStatisticalFunctions}} = @SupportedStatisticaFunctions;
 893   }
 894   else {
 895     $OptionsInfo{FileNameMode} = "SpecifiedStatistics";
 896 
 897     # Comma delimited list of functions...
 898     my($Mode, @SpecifiedFunctions, @UnsupportedSpecifiedFunctions);
 899 
 900     $Mode = $Options{mode};
 901     $Mode =~ s/ //g;
 902     @SpecifiedFunctions = split ",", $Mode;
 903     @UnsupportedSpecifiedFunctions = ();
 904     for $SpecifiedFunction (@SpecifiedFunctions) {
 905       if (exists($SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)})) {
 906         push @{$OptionsInfo{SpecifiedStatisticalFunctions}}, $SpecifiedFunction;
 907       }
 908       else {
 909         push @UnsupportedSpecifiedFunctions, $SpecifiedFunction;
 910       }
 911     }
 912     if (@UnsupportedSpecifiedFunctions) {
 913       if (@UnsupportedSpecifiedFunctions > 1) {
 914         warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedFunctions, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
 915       }
 916       else {
 917         warn "Error: The value specified, @UnsupportedSpecifiedFunctions , for option \"-m --mode\" is not valid.\n";
 918       }
 919       die "Allowed values:", JoinWords(\@SupportedStatisticaFunctions, ", ", 0), "\n";
 920     }
 921   }
 922 
 923   FUNCTION: for $SpecifiedFunction (@{$OptionsInfo{SpecifiedStatisticalFunctions}}) {
 924     if (exists $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} ) {
 925       next FUNCTION;
 926     }
 927     $OptionsInfo{SpecifiedStatisticalFunctionsMap}{lc($SpecifiedFunction)} = $SupportedStatisticaFunctionsMap{lc($SpecifiedFunction)};
 928   }
 929 
 930   # Setup delimiter and quotes...
 931   $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /tab/i ) ? "\t" : (($Options{outdelim} =~ /semicolon/i) ? "\;" : "\,");
 932   $OptionsInfo{OutQuote} = ($Options{quote} =~ /yes/i ) ? 1 : 0;
 933 
 934   $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
 935   $OptionsInfo{Root} = defined $Options{root} ? $Options{root} : undef;
 936 
 937   # Setup miscellaneous options...
 938   $OptionsInfo{CheckData} = $Options{fast} ? 0 : 1;
 939   $OptionsInfo{Precision} = $Options{precision};
 940 
 941   $OptionsInfo{KLargest} = $Options{klargest};
 942   $OptionsInfo{KSmallest} = $Options{ksmallest};
 943 
 944   $OptionsInfo{TrimFraction} = $Options{trimfraction};
 945 
 946   # Setup frequency bin values...
 947   $OptionsInfo{NumOfBins} = 10;
 948   @{$OptionsInfo{BinRange}} = ();
 949   if ($Options{frequencybins} =~ /\,/) {
 950     my($BinValue, @SpecifiedBinRange);
 951     @SpecifiedBinRange = split /\,/,  $Options{frequencybins};
 952     if (@SpecifiedBinRange < 2) {
 953       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain at least two values. \n";
 954     }
 955     for $BinValue (@SpecifiedBinRange) {
 956       if (!IsNumerical($BinValue)) {
 957         die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Contains non numeric values. \n";
 958       }
 959     }
 960     my($Index1, $Index2);
 961     for $Index1 (0 .. $#SpecifiedBinRange) {
 962       for $Index2 (($Index1 + 1) .. $#SpecifiedBinRange) {
 963         if ($SpecifiedBinRange[$Index1] >= $SpecifiedBinRange[$Index2]) {
 964           die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid: Must contain values in ascending order. \n";
 965         }
 966       }
 967     }
 968     push @{$OptionsInfo{BinRange}}, @SpecifiedBinRange;
 969   }
 970   else {
 971     $OptionsInfo{NumOfBins} = $Options{frequencybins};
 972     if (!IsPositiveInteger($OptionsInfo{NumOfBins})) {
 973       die "Error: The value specified, $Options{frequencybins}, for option \"--frequencybins\" is not valid. Allowed values: positive integer or \"number,number,[number]...\". \n";
 974     }
 975   }
 976 
 977   # Setup specified data field labels...
 978   @{$OptionsInfo{SpecifiedDataLabels}} = ();
 979   if (defined $Options{datafields} && $Options{datafields} !~ /^(All|Common)$/i ) {
 980     my(@SpecifiedValues) = split ",", $Options{datafields};
 981     push @{$OptionsInfo{SpecifiedDataLabels}}, @SpecifiedValues;
 982   }
 983   @{$OptionsInfo{SpecifiedDataLabelPairs}} = ();
 984   $OptionsInfo{AllDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^AllPairs$/i) ? 1 : 0;
 985   $OptionsInfo{CommonDataLabelPairs} = (defined($Options{datafieldpairs}) && $Options{datafieldpairs} =~ /^CommonPairs$/i) ? 1 : 0;
 986   if (defined($Options{datafieldpairs}) && !$OptionsInfo{AllDataLabelPairs} && !$OptionsInfo{CommonDataLabelPairs}) {
 987     my(@SpecifiedValues) = split ",", $Options{datafieldpairs};
 988     if (@SpecifiedValues % 2) {
 989       die "Error: Invalid number of values specified using \"--datafieldpairs\" option: It must contain even number of values.\n";
 990     }
 991     push @{$OptionsInfo{SpecifiedDataLabelPairs}}, @SpecifiedValues;
 992   }
 993 
 994 }
 995 
 996 # Setup script usage  and retrieve command line arguments specified using various options...
 997 sub SetupScriptUsage {
 998 
 999   # Retrieve all the options...
1000   %Options = ();
1001   $Options{detail} = 0;
1002   $Options{datafields} = "Common";
1003   $Options{datafieldpairs} = "CommonPairs";
1004   $Options{frequencybins} = 10;
1005   $Options{klargest} = 2;
1006   $Options{ksmallest} = 2;
1007   $Options{mode} = "DescriptiveStatisticsBasic";
1008   $Options{outdelim} = "comma";
1009   $Options{precision} = 2;
1010   $Options{quote} = "yes";
1011   $Options{trimfraction} = 0.1;
1012 
1013   if (!GetOptions(\%Options, "datafields=s", "datafieldpairs=s", "detail|d=i", "frequencybins=s", "fast|f", "help|h", "klargest=i", "ksmallest=i", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "trimfraction=f", "workingdir|w=s")) {
1014     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
1015   }
1016   if ($Options{workingdir}) {
1017     if (! -d $Options{workingdir}) {
1018       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
1019     }
1020     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
1021   }
1022   if (!IsInteger($Options{detail})) {
1023     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: >= 0\n";
1024   }
1025   if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
1026     die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
1027   }
1028   if ($Options{quote} !~ /^(yes|no)$/i) {
1029     die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
1030   }
1031   if (!IsPositiveInteger($Options{precision})) {
1032     die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
1033   }
1034   if (!IsPositiveInteger($Options{klargest})) {
1035     die "Error: The value specified, $Options{klargest}, for option \"--klargest\" is not valid. Allowed values: > 0 \n";
1036   }
1037   if (!IsPositiveInteger($Options{ksmallest})) {
1038     die "Error: The value specified, $Options{ksmallest}, for option \"--ksmallest\" is not valid. Allowed values: > 0 \n";
1039   }
1040   if (IsFloat($Options{trimfraction})) {
1041     if ($Options{trimfraction} <= 0 || $Options{trimfraction} >= 1.0) {
1042       die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
1043     }
1044   }
1045   else {
1046     die "Error: The value specified, $Options{trimfraction}, for option \"--trimfraction\" is not valid. Allowed values: > 0 and < 1.0\n";
1047   }
1048 }
1049