MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # File: InfoTextFiles.pl
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability or fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use FindBin; use lib "$FindBin::Bin/../lib";
  28 use Getopt::Long;
  29 use File::Basename;
  30 use Text::ParseWords;
  31 use Benchmark;
  32 use FileUtil;
  33 use TextUtil;
  34 
  35 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  36 
  37 # Autoflush STDOUT
  38 $| = 1;
  39 
  40 # Starting message...
  41 $ScriptName = basename($0);
  42 print "\n$ScriptName: Starting...\n\n";
  43 $StartTime = new Benchmark;
  44 
  45 # Get the options and setup script...
  46 SetupScriptUsage();
  47 if ($Options{help} || @ARGV < 1) {
  48   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  49 }
  50 
  51 my(@TextFilesList);
  52 @TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");
  53 
  54 # Process options...
  55 print "Processing options...\n";
  56 my(%OptionsInfo);
  57 ProcessOptions();
  58 
  59 print "Checking input text file(s)...\n";
  60 my(%TextFilesInfo);
  61 RetrieveTextFilesInfo();
  62 ProcessColumnsInfo();
  63 
  64 # Generate output files...
  65 my($FileIndex);
  66 if (@TextFilesList > 1) {
  67   print "\nProcessing text files...\n";
  68 }
  69 for $FileIndex (0 .. $#TextFilesList) {
  70   if ($TextFilesInfo{FileOkay}[$FileIndex]) {
  71     print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  72     ListTextFileInfo($FileIndex);
  73   }
  74 }
  75 ListTotalSizeOfFiles();
  76 
  77 print "\n$ScriptName:Done...\n\n";
  78 
  79 $EndTime = new Benchmark;
  80 $TotalTime = timediff ($EndTime, $StartTime);
  81 print "Total time: ", timestr($TotalTime), "\n";
  82 
  83 ###############################################################################
  84 
  85 # List appropriate information...
  86 sub ListTextFileInfo {
  87   my($Index) = @_;
  88   my($TextFile,  $Line, $InDelim, $LineCount, $EmptyLinesCount, $EmptyColDataLinesCount, $GreaterThanMaxColLinesCount, $Label, $Value, $ColNum, $EmptyColValueFound, $PrintTextLine, $NonNumericalDataFound, @ColLabels, @LineWords, %EmptyColValuesCountMap, %NonEmptyColValuesCountMap, %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap, %NumericalColValuesCountMap,);
  89 
  90   $TextFile = $TextFilesList[$Index];
  91   $InDelim = $TextFilesInfo{InDelim}[$Index];
  92   @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
  93 
  94   open TEXTFILE, "$TextFile" or die "Error: Can't open $TextFile: $! \n";
  95 
  96   $LineCount = 0;
  97   $EmptyLinesCount = 0;
  98   $EmptyColDataLinesCount = 0;
  99   $GreaterThanMaxColLinesCount = 0;
 100 
 101   %EmptyColValuesCountMap = ();
 102   %NonEmptyColValuesCountMap = ();
 103   %SpecifiedNonNumericalColValuesCountMap = ();
 104   %NonNumericalColValuesCountMap = ();
 105   %NumericalColValuesCountMap = ();
 106 
 107   if ($OptionsInfo{ParseLines}) {
 108     # Skip over column labels from old file...
 109     if (<TEXTFILE>) {
 110       $LineCount++;
 111       LINE: while ($Line = <TEXTFILE>) {
 112         $LineCount++;
 113         $PrintTextLine = 0;
 114         $Line =~ s/(\r\n)|(\r)|\n//g;
 115         @LineWords = quotewords($InDelim, 0, $Line);
 116         if ($OptionsInfo{CountEmpty}) {
 117           # Count lines with no data...
 118           if (!@LineWords) {
 119             $EmptyLinesCount++;
 120             if ($OptionsInfo{DetailLevel} >= 2) {
 121               print "Line number $LineCount is empty...\n";
 122             }
 123             next LINE;
 124           }
 125           # Count lines with empty data for some columns...
 126           $EmptyColValueFound = 0;
 127           VALUE: for $Value (@LineWords) {
 128               if (!IsNotEmpty($Value)) {
 129                 $EmptyColValueFound = 1;
 130                 next VALUE;
 131               }
 132           }
 133           if ($EmptyColValueFound) {
 134             $EmptyColDataLinesCount++;
 135             if ($OptionsInfo{DetailLevel} >= 2) {
 136               print "Line number $LineCount contains empty column value(s)...\n";
 137             }
 138             $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
 139           }
 140           # Count lines with columns greater than the column label line...
 141           if (@LineWords > @ColLabels) {
 142             $GreaterThanMaxColLinesCount++;
 143             if ($OptionsInfo{DetailLevel} >= 2) {
 144               print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n";
 145             }
 146             $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
 147           }
 148           # Count empty values for each coulmn...
 149           for $ColNum (0 .. $#LineWords) {
 150             if ($ColNum < @ColLabels) {
 151               $Label = $ColLabels[$ColNum];
 152               if (IsNotEmpty($LineWords[$ColNum])) {
 153                 if (exists($NonEmptyColValuesCountMap{$Label})) {
 154                   $NonEmptyColValuesCountMap{$Label} += 1;
 155                 }
 156                 else {
 157                   $NonEmptyColValuesCountMap{$Label} = 1;
 158                 }
 159               }
 160               else {
 161                 $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
 162                 if (exists($EmptyColValuesCountMap{$Label})) {
 163                   $EmptyColValuesCountMap{$Label} += 1;
 164                 }
 165                 else {
 166                   $EmptyColValuesCountMap{$Label} = 1;
 167                 }
 168               }
 169             }
 170           }
 171         }
 172         if ($OptionsInfo{CheckData}) {
 173           for $ColNum (0 .. $#LineWords) {
 174             if ($ColNum < @ColLabels) {
 175               if (IsNumerical($LineWords[$ColNum])) {
 176                 $Label = $ColLabels[$ColNum];
 177                 if (exists($NumericalColValuesCountMap{$Label})) {
 178                   $NumericalColValuesCountMap{$Label} += 1;
 179                 }
 180                 else {
 181                   $NumericalColValuesCountMap{$Label} = 1;
 182                 }
 183               }
 184               else {
 185                 $Label = $ColLabels[$ColNum];
 186                 if (IsNotEmpty($LineWords[$ColNum])) {
 187                   if (exists($NonNumericalColValuesCountMap{$Label})) {
 188                     $NonNumericalColValuesCountMap{$Label} += 1;
 189                   }
 190                   else {
 191                     $NonNumericalColValuesCountMap{$Label} = 1;
 192                   }
 193                 }
 194               }
 195             }
 196           }
 197         }
 198         if ($OptionsInfo{CheckNumericalData}) {
 199           $NonNumericalDataFound = 0;
 200           for $ColNum (@{$TextFilesInfo{NumericalDataColNums}[$Index]}) {
 201             if ($ColNum < @LineWords) {
 202               if (!IsNumerical($LineWords[$ColNum])) {
 203                 $NonNumericalDataFound = 1;
 204                 $Label = $ColLabels[$ColNum];
 205                 if (exists($SpecifiedNonNumericalColValuesCountMap{$Label})) {
 206                   $SpecifiedNonNumericalColValuesCountMap{$Label} += 1;
 207                 }
 208                 else {
 209                   $SpecifiedNonNumericalColValuesCountMap{$Label} = 1;
 210                 }
 211               }
 212             }
 213           }
 214           if ($NonNumericalDataFound) {
 215             $PrintTextLine = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;
 216             if ($OptionsInfo{DetailLevel} >=2 ) {
 217               print "Line number $LineCount contains non-numerical data for some specified column(s)...\n";
 218             }
 219           }
 220         }
 221         if ($PrintTextLine) {
 222           print "Line $LineCount: $Line\n\n";
 223         }
 224       }
 225     }
 226   }
 227   else {
 228     while (<TEXTFILE>) {
 229       $LineCount++;
 230     }
 231   }
 232   close TEXTFILE;
 233 
 234   print "\nNumber of lines: $LineCount\n";
 235   print "Number of columns: $TextFilesInfo{ColCount}[$Index]\n";
 236   print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n";
 237 
 238   if ($OptionsInfo{CountEmpty}) {
 239     print "\nNumber of lines with no data: $EmptyLinesCount\n";
 240     print "Number of lines with some missing column data: $EmptyColDataLinesCount\n";
 241     print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n";
 242     PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap);
 243     PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap);
 244   }
 245 
 246   if ($OptionsInfo{CheckData}) {
 247     print "\n";
 248     PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap);
 249     PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap);
 250     print "\n";
 251   }
 252 
 253   if ($OptionsInfo{CheckNumericalData} && @{$TextFilesInfo{NumericalDataColLabels}[$Index]}) {
 254     PrintDataInformation("Number of non-numerical data values for each column(s)", \@{$TextFilesInfo{NumericalDataColLabels}[$Index]}, \%SpecifiedNonNumericalColValuesCountMap);
 255   }
 256 
 257   # File size and modification information...
 258   print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$Index]), " \n";
 259   print "Last modified: ", $TextFilesInfo{FileLastModified}[$Index], " \n";
 260 }
 261 
 262 # Total size of all the fiels...
 263 sub ListTotalSizeOfFiles {
 264   my($FileOkayCount, $TotalSize, $Index);
 265 
 266   $FileOkayCount = 0;
 267   $TotalSize = 0;
 268 
 269   for $Index (0 .. $#TextFilesList) {
 270     if ($TextFilesInfo{FileOkay}[$Index]) {
 271       $FileOkayCount++;
 272       $TotalSize += $TextFilesInfo{FileSize}[$Index];
 273     }
 274   }
 275   if ($FileOkayCount > 1) {
 276     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 277   }
 278 }
 279 
 280 # List data information...
 281 sub PrintDataInformation {
 282   my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;
 283   my($Line, $Label);
 284 
 285   $Line = "";
 286   for $Label (@{$DataLabelRef}) {
 287     $Line .= " \"$Label\" - " . (exists($DataLabelToValueMapRef->{$Label}) ? $DataLabelToValueMapRef->{$Label} : 0) . ",";
 288   }
 289   $Line =~ s/\,$//g;
 290   print "$InfoLabel: $Line\n";
 291 }
 292 
 293 # Retrieve information about input text files...
 294 sub RetrieveTextFilesInfo {
 295   my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels,  $ColNum, $ColLabel, $ModifiedTimeString, $ModifiedDateString);
 296 
 297   %TextFilesInfo = ();
 298   @{$TextFilesInfo{FileOkay}} = ();
 299   @{$TextFilesInfo{ColCount}} = ();
 300   @{$TextFilesInfo{ColLabels}} = ();
 301   @{$TextFilesInfo{ColLabelToNumMap}} = ();
 302   @{$TextFilesInfo{InDelim}} = ();
 303   @{$TextFilesInfo{FileSize}} = ();
 304   @{$TextFilesInfo{FileLastModified}} = ();
 305 
 306   FILELIST: for $Index (0 .. $#TextFilesList) {
 307     $TextFile = $TextFilesList[$Index];
 308 
 309     $TextFilesInfo{FileOkay}[$Index] = 0;
 310     $TextFilesInfo{ColCount}[$Index] = 0;
 311     $TextFilesInfo{InDelim}[$Index] = "";
 312     $TextFilesInfo{FileSize}[$Index] = 0;
 313     $TextFilesInfo{FileLastModified}[$Index] = '';
 314     @{$TextFilesInfo{ColLabels}[$Index]} = ();
 315     %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();
 316 
 317     if (!(-e $TextFile)) {
 318       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
 319       next FILELIST;
 320     }
 321     if (!CheckFileType($TextFile, "csv tsv")) {
 322       warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
 323       next FILELIST;
 324     }
 325     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
 326     if ($FileExt =~ /^tsv$/i) {
 327       $InDelim = "\t";
 328     }
 329     else {
 330       $InDelim = "\,";
 331       if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
 332         warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
 333         next FILELIST;
 334       }
 335       if ($OptionsInfo{InDelim} =~ /^semicolon$/i) {
 336         $InDelim = "\;";
 337       }
 338     }
 339 
 340     if (!open TEXTFILE, "$TextFile") {
 341       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
 342       next FILELIST;
 343     }
 344 
 345     $Line = GetTextLine(\*TEXTFILE);
 346     @ColLabels = quotewords($InDelim, 0, $Line);
 347     close TEXTFILE;
 348 
 349     $TextFilesInfo{FileOkay}[$Index] = 1;
 350     $TextFilesInfo{InDelim}[$Index] = $InDelim;
 351 
 352     $TextFilesInfo{ColCount}[$Index] = @ColLabels;
 353     push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
 354     for $ColNum (0 .. $#ColLabels) {
 355       $ColLabel = $ColLabels[$ColNum];
 356       $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
 357     }
 358     $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile);
 359     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
 360     $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 361   }
 362 
 363 }
 364 
 365 # Make sure specified numerical data columns are okay...
 366 sub ProcessColumnsInfo {
 367   my($Index, $TextFile);
 368 
 369   @{$TextFilesInfo{NumericalDataColNums}} = ();
 370   @{$TextFilesInfo{NumericalDataColLabels}} = ();
 371 
 372   FILELIST: for $Index (0 .. $#TextFilesList) {
 373     $TextFile = $TextFilesList[$Index];
 374     @{$TextFilesInfo{NumericalDataColNums}[$Index]} = ();
 375     @{$TextFilesInfo{NumericalDataColLabels}[$Index]} = ();
 376 
 377     if ($TextFilesInfo{FileOkay}[$Index]) {
 378       my($SpecifiedColNum, $ColNum, $ColLabel, @SpecifiedColNums, @SpecifiedColLabels);
 379       @SpecifiedColNums = ();
 380       if ($OptionsInfo{Mode} =~ /^colnum$/i) {
 381         for $SpecifiedColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
 382           if ($SpecifiedColNum <= $TextFilesInfo{ColCount}[$Index]) {
 383             $ColNum = $SpecifiedColNum - 1;
 384             push @SpecifiedColNums, $ColNum;
 385             push @SpecifiedColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
 386           }
 387         }
 388       }
 389       else {
 390         for $ColLabel (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
 391           if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel})) {
 392             $ColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel};
 393             push @SpecifiedColNums, $ColNum;
 394             push @SpecifiedColLabels, $ColLabel;
 395           }
 396         }
 397       }
 398       if (@SpecifiedColNums) {
 399         push @{$TextFilesInfo{NumericalDataColNums}[$Index]}, @SpecifiedColNums;
 400         push @{$TextFilesInfo{NumericalDataColLabels}[$Index]}, @SpecifiedColLabels;
 401       }
 402     }
 403   }
 404 }
 405 
 406 # Process option values...
 407 sub ProcessOptions {
 408   %OptionsInfo = ();
 409 
 410   $OptionsInfo{Mode} = $Options{mode};
 411 
 412   $OptionsInfo{All} = $Options{all} ? $Options{all} : 0;
 413   $OptionsInfo{Count} = $Options{count} ? $Options{count} : 0;
 414 
 415   $OptionsInfo{DetailLevel} = $Options{detail} ? $Options{detail} : 1;
 416 
 417   $OptionsInfo{Empty} = $Options{empty} ? $Options{empty} : 0;
 418 
 419   $OptionsInfo{InDelim} = $Options{indelim};
 420   $OptionsInfo{NumericalDataCols} = $Options{numericaldatacols} ? $Options{numericaldatacols} : 0;
 421 
 422   $OptionsInfo{ParseLines} = ($Options{all} || $Options{empty} || $Options{numericaldatacols}) ? 1 : 0;
 423   $OptionsInfo{CountEmpty} = ($Options{all} || $Options{empty}) ? 1 : 0;
 424   $OptionsInfo{CheckData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
 425   $OptionsInfo{CheckNumericalData} = ($Options{all} || $Options{numericaldatacols}) ? 1 : 0;
 426 
 427   @{$OptionsInfo{SpecifiedNumericalDataCols}} = ();
 428   if ($Options{numericaldatacols}) {
 429     @{$OptionsInfo{SpecifiedNumericalDataCols}} = split ",", $Options{numericaldatacols};
 430     if ($Options{mode} =~ /^colnum$/i) {
 431       my($ColNum);
 432       for $ColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
 433         if (!IsPositiveInteger($ColNum)) {
 434           die "Error: Invalid value $ColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n";
 435         }
 436       }
 437     }
 438   }
 439 
 440 }
 441 
 442 # Setup script usage  and retrieve command line arguments specified using various options...
 443 sub SetupScriptUsage {
 444 
 445   # Retrieve all the options...
 446   %Options = ();
 447   $Options{detail} = 1;
 448   $Options{mode} = "colnum";
 449   $Options{indelim} = "comma";
 450   if (!GetOptions(\%Options, "all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h", "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s")) {
 451     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 452   }
 453   if ($Options{workingdir}) {
 454     if (! -d $Options{workingdir}) {
 455       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 456     }
 457     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 458   }
 459   if ($Options{mode} !~ /^(colnum|collabel)$/i) {
 460     die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n";
 461   }
 462   if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
 463     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
 464   }
 465   if (!IsPositiveInteger($Options{detail})) {
 466     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 467   }
 468 }
 469