#!/usr/bin/perl -w
#
# File: InfoTextFiles.pl
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2024 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
ProcessColumnsInfo();

# List information about each input file which passed the checks above...
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for my $FileIndex (0 .. $#TextFilesList) {
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    ListTextFileInfo($FileIndex);
  }
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# List appropriate information for one input text file...
#
# Parameters:
#   $Index - position of the file in @TextFilesList; the same index is used to
#            look up its delimiter, column labels, size and so on in %TextFilesInfo.
#
# Always prints number of lines, number of columns, column labels, file size and
# last modification time. Data lines are read and split into column values only
# when $OptionsInfo{ParseLines} is set; otherwise lines are just counted. Depending
# on CountEmpty, CheckData and CheckNumericalData in %OptionsInfo, additional
# per-column counts are collected and printed. $OptionsInfo{DetailLevel} >= 2
# reports offending line numbers and >= 3 also echoes the offending lines.
#
# Dies when the file can't be opened.
#
sub ListTextFileInfo {
  my($Index) = @_;
  my($TextFile, $TextFileHandle, $InDelim, $Line, $LineCount, $EmptyLinesCount, $EmptyColDataLinesCount, $GreaterThanMaxColLinesCount, $Label, $ColNum, $PrintTextLine, $NonNumericalDataFound, $ReportLineNums, $EchoLines, @ColLabels, @LineWords, %EmptyColValuesCountMap, %NonEmptyColValuesCountMap, %SpecifiedNonNumericalColValuesCountMap, %NonNumericalColValuesCountMap, %NumericalColValuesCountMap);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};

  # Three argument open with an explicit read mode: a file name starting with
  # ">" or ending with "|" must never be interpreted as a mode or a command...
  open $TextFileHandle, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";

  $LineCount = 0;
  $EmptyLinesCount = 0;
  $EmptyColDataLinesCount = 0;
  $GreaterThanMaxColLinesCount = 0;

  %EmptyColValuesCountMap = ();
  %NonEmptyColValuesCountMap = ();
  %SpecifiedNonNumericalColValuesCountMap = ();
  %NonNumericalColValuesCountMap = ();
  %NumericalColValuesCountMap = ();

  $ReportLineNums = ($OptionsInfo{DetailLevel} >= 2) ? 1 : 0;
  $EchoLines = ($OptionsInfo{DetailLevel} >= 3) ? 1 : 0;

  if ($OptionsInfo{ParseLines}) {
    # Skip over column labels line. Test for definedness instead of truth: a
    # labels line consisting of just "0" is still a line...
    my($ColLabelsLine);
    if (defined($ColLabelsLine = <$TextFileHandle>)) {
      $LineCount++;
      LINE: while ($Line = <$TextFileHandle>) {
        $LineCount++;
        $PrintTextLine = 0;
        $Line =~ s/(\r\n)|(\r)|\n//g;
        @LineWords = quotewords($InDelim, 0, $Line);

        if ($OptionsInfo{CountEmpty}) {
          # Count lines with no data...
          if (!@LineWords) {
            $EmptyLinesCount++;
            if ($ReportLineNums) {
              print "Line number $LineCount is empty...\n";
            }
            next LINE;
          }

          # Count lines with empty data for some columns...
          if (grep { !IsNotEmpty($_) } @LineWords) {
            $EmptyColDataLinesCount++;
            if ($ReportLineNums) {
              print "Line number $LineCount contains empty column value(s)...\n";
            }
            $PrintTextLine = $EchoLines;
          }

          # Count lines with columns greater than the column label line...
          if (@LineWords > @ColLabels) {
            $GreaterThanMaxColLinesCount++;
            if ($ReportLineNums) {
              print "Line number $LineCount contains more than ", scalar(@ColLabels), " columns...\n";
            }
            $PrintTextLine = $EchoLines;
          }

          # Count empty and non-empty values for each column; values beyond the
          # last column label have no label to be counted under and are ignored...
          for $ColNum (0 .. $#LineWords) {
            if ($ColNum < @ColLabels) {
              $Label = $ColLabels[$ColNum];
              if (IsNotEmpty($LineWords[$ColNum])) {
                $NonEmptyColValuesCountMap{$Label}++;
              }
              else {
                $PrintTextLine = $EchoLines;
                $EmptyColValuesCountMap{$Label}++;
              }
            }
          }
        }

        if ($OptionsInfo{CheckData}) {
          # Count numerical and non-numerical values for each column; empty
          # values are not counted in either category...
          for $ColNum (0 .. $#LineWords) {
            if ($ColNum < @ColLabels) {
              $Label = $ColLabels[$ColNum];
              if (IsNumerical($LineWords[$ColNum])) {
                $NumericalColValuesCountMap{$Label}++;
              }
              elsif (IsNotEmpty($LineWords[$ColNum])) {
                $NonNumericalColValuesCountMap{$Label}++;
              }
            }
          }
        }

        if ($OptionsInfo{CheckNumericalData}) {
          # Count non-numerical values in columns explicitly specified as numerical...
          $NonNumericalDataFound = 0;
          for $ColNum (@{$TextFilesInfo{NumericalDataColNums}[$Index]}) {
            if ($ColNum < @LineWords) {
              if (!IsNumerical($LineWords[$ColNum])) {
                $NonNumericalDataFound = 1;
                $Label = $ColLabels[$ColNum];
                $SpecifiedNonNumericalColValuesCountMap{$Label}++;
              }
            }
          }
          if ($NonNumericalDataFound) {
            $PrintTextLine = $EchoLines;
            if ($ReportLineNums) {
              print "Line number $LineCount contains non-numerical data for some specified column(s)...\n";
            }
          }
        }

        if ($PrintTextLine) {
          print "Line $LineCount: $Line\n\n";
        }
      }
    }
  }
  else {
    # No per-line analysis requested: just count the lines...
    while (<$TextFileHandle>) {
      $LineCount++;
    }
  }
  close $TextFileHandle;

  print "\nNumber of lines: $LineCount\n";
  print "Number of columns: $TextFilesInfo{ColCount}[$Index]\n";
  print "Column labels: ", JoinWords(\@ColLabels, ", ", 1), "\n";

  if ($OptionsInfo{CountEmpty}) {
    print "\nNumber of lines with no data: $EmptyLinesCount\n";
    print "Number of lines with some missing column data: $EmptyColDataLinesCount\n";
    print "Number of lines containing greater than ", scalar(@ColLabels), " columns: $GreaterThanMaxColLinesCount\n";
    PrintDataInformation("Number of non-empty values for each column(s)", \@ColLabels, \%NonEmptyColValuesCountMap);
    PrintDataInformation("Number of empty values for each column(s)", \@ColLabels, \%EmptyColValuesCountMap);
  }

  if ($OptionsInfo{CheckData}) {
    print "\n";
    PrintDataInformation("Number of non-numerical data values for each column(s)", \@ColLabels, \%NonNumericalColValuesCountMap);
    PrintDataInformation("Number of numerical data values for each column(s)", \@ColLabels, \%NumericalColValuesCountMap);
    print "\n";
  }

  if ($OptionsInfo{CheckNumericalData} && @{$TextFilesInfo{NumericalDataColLabels}[$Index]}) {
    PrintDataInformation("Number of non-numerical data values for each column(s)", $TextFilesInfo{NumericalDataColLabels}[$Index], \%SpecifiedNonNumericalColValuesCountMap);
  }

  # File size and modification information...
  print "\nFile size: ", FormatFileSize($TextFilesInfo{FileSize}[$Index]), " \n";
  print "Last modified: ", $TextFilesInfo{FileLastModified}[$Index], " \n";
}

# Total size of all the files...
sub ListTotalSizeOfFiles {
  # Adds up the sizes recorded in %TextFilesInfo for every file marked as okay
  # and prints the total; nothing is printed unless more than one file is okay.
  my($FileOkayCount, $TotalSize, $Index);

  $FileOkayCount = 0;
  $TotalSize = 0;

  for $Index (0 .. $#TextFilesList) {
    if ($TextFilesInfo{FileOkay}[$Index]) {
      $FileOkayCount++;
      $TotalSize += $TextFilesInfo{FileSize}[$Index];
    }
  }
  if ($FileOkayCount > 1) {
    print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
  }
}

# List data information...
#
# Parameters:
#   $InfoLabel              - text printed in front of the counts.
#   $DataLabelRef           - reference to an array of labels, in print order.
#   $DataLabelToValueMapRef - reference to a hash mapping a label to its count;
#                             labels missing from the hash are printed as 0.
#
# Prints a single line of the form: InfoLabel:  "Label1" - N1, "Label2" - N2
#
sub PrintDataInformation {
  my($InfoLabel, $DataLabelRef, $DataLabelToValueMapRef) = @_;
  my($Line);

  $Line = join ",", map {
    " \"$_\" - " . (exists($DataLabelToValueMapRef->{$_}) ? $DataLabelToValueMapRef->{$_} : 0)
  } @{$DataLabelRef};

  print "$InfoLabel: $Line\n";
}

# Retrieve information about input text files...
#
# Fills %TextFilesInfo with one entry per file in @TextFilesList: FileOkay,
# ColCount, ColLabels, ColLabelToNumMap, InDelim, FileSize and FileLastModified.
# A file which doesn't exist, isn't a csv/tsv file, can't be opened, or is a csv
# file with an unusable "--indelim" value is reported with a warning and left
# marked as not okay, with default values for everything else.
#
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $TextFileHandle, $FileDir, $FileName, $FileExt, $InDelim, $Line, $ColNum, $ModifiedTimeString, $ModifiedDateString, @ColLabels);

  %TextFilesInfo = ();
  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{FileSize}} = ();
  @{$TextFilesInfo{FileLastModified}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Start with defaults so that a skipped file still has an entry for each key...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{FileSize}[$Index] = 0;
    $TextFilesInfo{FileLastModified}[$Index] = '';
    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Delimiter is a tab for tsv files; for everything else it's picked by "--indelim"...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      if ($OptionsInfo{InDelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $OptionsInfo{InDelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      $InDelim = ($OptionsInfo{InDelim} =~ /^semicolon$/i) ? ";" : ",";
    }

    # Three argument open with an explicit read mode: a file name starting with
    # ">" or ending with "|" must never be interpreted as a mode or a command...
    if (!open $TextFileHandle, "<", $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    # Column labels come from the line returned by GetTextLine...
    $Line = GetTextLine($TextFileHandle);
    @ColLabels = quotewords($InDelim, 0, $Line);
    close $TextFileHandle;

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabels[$ColNum]} = $ColNum;
    }

    $TextFilesInfo{FileSize}[$Index] = FileSize($TextFile);
    ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($TextFile);
    $TextFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
  }

}

# Make sure specified numerical data columns are okay...
#
# For each okay file, maps the columns given in
# $OptionsInfo{SpecifiedNumericalDataCols} to zero based column numbers and
# labels, stored as NumericalDataColNums and NumericalDataColLabels in
# %TextFilesInfo. In "colnum" mode the specified values are one based column
# numbers; otherwise they are column labels. Specified columns which a file
# doesn't have are silently left out for that file.
#
sub ProcessColumnsInfo {
  my($Index);

  @{$TextFilesInfo{NumericalDataColNums}} = ();
  @{$TextFilesInfo{NumericalDataColLabels}} = ();

  for $Index (0 .. $#TextFilesList) {
    @{$TextFilesInfo{NumericalDataColNums}[$Index]} = ();
    @{$TextFilesInfo{NumericalDataColLabels}[$Index]} = ();

    if ($TextFilesInfo{FileOkay}[$Index]) {
      my($SpecifiedCol, $ColNum, @SpecifiedColNums, @SpecifiedColLabels);

      if ($OptionsInfo{Mode} =~ /^colnum$/i) {
        for $SpecifiedCol (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
          if ($SpecifiedCol <= $TextFilesInfo{ColCount}[$Index]) {
            # Specified column numbers start from 1...
            $ColNum = $SpecifiedCol - 1;
            push @SpecifiedColNums, $ColNum;
            push @SpecifiedColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
          }
        }
      }
      else {
        for $SpecifiedCol (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
          if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCol})) {
            push @SpecifiedColNums, $TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedCol};
            push @SpecifiedColLabels, $SpecifiedCol;
          }
        }
      }

      if (@SpecifiedColNums) {
        push @{$TextFilesInfo{NumericalDataColNums}[$Index]}, @SpecifiedColNums;
        push @{$TextFilesInfo{NumericalDataColLabels}[$Index]}, @SpecifiedColLabels;
      }
    }
  }
}

# Process option values...
sub ProcessOptions {
  # Copies the command line options from %Options into %OptionsInfo and derives
  # the flags which control how each file is analyzed:
  #
  #   ParseLines         - data lines have to be read and split into values.
  #   CountEmpty         - count empty lines and empty column values.
  #   CheckData          - count numerical and non-numerical values for all columns.
  #   CheckNumericalData - check the columns specified by "--numericaldatacols".
  #
  # Dies when "--numericaldatacols" contains a value which isn't a positive
  # integer while "--mode" is "colnum".
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{All} = $Options{all} ? $Options{all} : 0;
  $OptionsInfo{Count} = $Options{count} ? $Options{count} : 0;

  $OptionsInfo{DetailLevel} = $Options{detail} ? $Options{detail} : 1;

  $OptionsInfo{Empty} = $Options{empty} ? $Options{empty} : 0;

  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{NumericalDataCols} = $Options{numericaldatacols} ? $Options{numericaldatacols} : 0;

  # Every analysis below works on parsed data lines, so each option enabling one
  # of them has to turn on line parsing as well. This includes "--datacheck":
  # without it, using "--datacheck" by itself reports zero for all columns...
  $OptionsInfo{ParseLines} = ($Options{all} || $Options{empty} || $Options{datacheck} || $Options{numericaldatacols}) ? 1 : 0;
  $OptionsInfo{CountEmpty} = ($Options{all} || $Options{empty}) ? 1 : 0;
  $OptionsInfo{CheckData} = ($Options{all} || $Options{datacheck}) ? 1 : 0;
  $OptionsInfo{CheckNumericalData} = ($Options{all} || $Options{numericaldatacols}) ? 1 : 0;

  @{$OptionsInfo{SpecifiedNumericalDataCols}} = ();
  if ($Options{numericaldatacols}) {
    @{$OptionsInfo{SpecifiedNumericalDataCols}} = split ",", $Options{numericaldatacols};
    if ($Options{mode} =~ /^colnum$/i) {
      for my $ColNum (@{$OptionsInfo{SpecifiedNumericalDataCols}}) {
        if (!IsPositiveInteger($ColNum)) {
          die "Error: Invalid value $ColNum specified using \"--numericaldatacols\" option: Allowed values: > 0\n";
        }
      }
    }
  }

}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with defaults (detail 1, mode "colnum", indelim "comma") and
# then with the values parsed from the command line. Changes the current
# directory when "-w --workingdir" is specified. Dies on unknown options, on a
# working directory which isn't a directory or can't be entered, and on invalid
# values for "--mode", "--indelim" and "--detail".
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{detail} = 1;
  $Options{mode} = "colnum";
  $Options{indelim} = "comma";
  if (!GetOptions(\%Options, "all|a", "count|c", "datacheck", "detail|d=i", "empty|e", "help|h", "indelim=s", "mode|m=s", "numericaldatacols|n=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: colnum or collabel\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
}