MayaChemTools:Code:TextUtil.pm

   1 package TextUtil;
   2 #
   3 # File: TextUtil.pm
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2025 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use Exporter;
  28 use Text::ParseWords;
  29 
  30 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  31 
  32 @ISA = qw(Exporter);
  33 @EXPORT = qw(AddNumberSuffix ContainsWhiteSpaces GetTextLine GetTextFileDataByUniqueKey GetTextFileDataByNonUniqueKey HashCode IsEmpty IsNumberPowerOfNumber IsInteger IsPositiveInteger IsFloat IsNotEmpty IsNumerical JoinWords SplitWords  QuoteAWord RemoveLeadingWhiteSpaces RemoveTrailingWhiteSpaces RemoveLeadingAndTrailingWhiteSpaces WrapText);
  34 @EXPORT_OK = qw();
  35 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  36 
  37 # Add number suffix...
  38 sub AddNumberSuffix {
  39   my($Value) = @_;
  40   my($ValueWithSuffix, $Suffix);
  41 
  42   $ValueWithSuffix = $Value;
  43   if (!IsPositiveInteger($Value)) {
  44     return $ValueWithSuffix;
  45   }
  46   $Suffix = "th";
  47   if ($Value < 10 || $Value > 20) {
  48     my $Remainder = $Value % 10;
  49     $Suffix = ($Remainder == 1) ? "st" : (($Remainder == 2) ? "nd" : (($Remainder == 3) ? "rd" : "th"));
  50   }
  51   $ValueWithSuffix = "${ValueWithSuffix}${Suffix}";
  52   return $ValueWithSuffix;
  53 }
  54 
  55 # Check out the string: Doen it contain any white space characters?
  56 sub ContainsWhiteSpaces {
  57   my($TheString) = @_;
  58   my($Status) = 0;
  59 
  60   if (defined($TheString) && length($TheString)) {
  61     $Status = ($TheString =~ /[ \t\r\n\f]/ ) ? 1 : 0;
  62   }
  63   return $Status;
  64 }
  65 
  66 # Read the line, change to UNIX new line char, and chop off new line char as well...
  67 sub GetTextLine {
  68   my($TextFileRef) = @_;
  69   my($Line) = '';
  70 
  71   # Get the next non empty line...
  72   LINE: while (defined($_ = <$TextFileRef>)) {
  73     # Change Windows and Mac new line char to UNIX...
  74     s/(\r\n)|(\r)/\n/g;
  75 
  76     # Take out any new line char at the end by explicitly removing it instead of using
  77     # chomp, which might not always work correctly on files generated on a system
  78     # with a value of input line separator different from the current system...
  79     s/\n$//g;
  80 
  81     # Doesn't hurt to chomp...
  82     chomp;
  83 
  84     $Line = $_;
  85     if (length $Line) {
  86       last LINE;
  87     }
  88   }
  89   return $Line;
  90 }
  91 
  92 # Load data from a CSV file into the specified hash reference using a specific
  93 # column for unique data key values.
  94 #
  95 # The lines starting with # are treated as comments and ignored. First line
  96 # not starting with # must contain column labels and the number of columns in
  97 # all other data rows must match the number of column labels.
  98 #
  99 # The first column is assumed to contain data key value by default; all other columns
 100 # contain data as indicated in their column labels.
 101 #
 102 # In order to avoid dependence of data access on the specified column labels, the
 103 # column data is loaded into hash with Column<Num> hash keys, where column number
 104 # start from 1. The data key column is not available as Colnum<Num> hash key;
 105 #
 106 # The format of the data structure loaded into a specified hash reference is:
 107 #
 108 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys
 109 # @{$TextDataMapRef->{ColLabels}} - Array of column labels
 110 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs
 111 # $TextDataMapRef->{NumOfCols} - Number of columns
 112 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey>
 113 # %{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair: <DataCol<Num>, DataKey>
 114 #
 115 # Caveats:
 116 #   . The column number start from 1.
 117 #   . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs.
 118 #
 119 sub GetTextFileDataByUniqueKey {
 120   my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_;
 121 
 122   return _GetTextFileData("UniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim);
 123 }
 124 
 125 # Load data from a CSV file into the specified hash reference using a specific
 126 # column for non-unique data key values.
 127 #
 128 # The lines starting with # are treated as comments and ignored. First line
 129 # not starting with # must contain column labels and the number of columns in
 130 # all other data rows must match the number of column labels.
 131 #
 132 # The first column is assumed to contain data key value by default; all other columns
 133 # contain data as indicated in their column labels.
 134 #
 135 # In order to avoid dependence of data access on the specified column labels, the
 136 # column data is loaded into hash with Column<Num> hash keys, where column number
 137 # start from 1. The data key column is not available as Colnum<Num> hash key;
 138 #
 139 # The format of the data structure loaded into a specified hash reference is:
 140 #
 141 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys
 142 # @{$TextDataMapRef->{ColLabels}} - Array of column labels
 143 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs
 144 # $TextDataMapRef->{NumOfCols} - Number of columns
 145 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey>
 146 # @{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair with data as an array: <DataCol<Num>, DataKey>
 147 #
 148 # Caveats:
 149 #   . The column number start from 1.
 150 #   . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs.
 151 #
 152 sub GetTextFileDataByNonUniqueKey {
 153   my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_;
 154 
 155   return _GetTextFileData("NonUniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim);
 156 }
 157 
 158 # Loadtext file data using unique or non-uniqye data column key...
 159 #
 160 sub _GetTextFileData {
 161   my($DataKeyMode, $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_;
 162   my($DataKeyColIndex, $LineCount, $IgnoredLineCount, $UniqueDataKeyMode, $DataKey, $Line, $NumOfCols, $ColIndex, $ColNum, $ColID, $ColValue, @LineWords, @ColLabels, @DataColIDs, @DataColNums);
 163 
 164   print "\nProcessing text data file $TextDataFile...\n";
 165 
 166   $UniqueDataKeyMode = 0;
 167   if ($DataKeyMode =~ /^UniqueKey$/i) {
 168     $UniqueDataKeyMode = 1;
 169   }
 170 
 171   # Setup default values...
 172   $DataKeyColNum = defined  $DataKeyColNum ? $DataKeyColNum : 1;
 173 
 174   if (defined $InDelim) {
 175     if ($InDelim =~ /^tab$/i) {
 176       $InDelim = "\t";
 177     }
 178     elsif ($InDelim =~ /^semicolon$/i) {
 179       $InDelim = "\;";
 180     }
 181     elsif ($InDelim =~ /^comma$/i) {
 182       $InDelim = "\,";
 183     }
 184     else {
 185       warn "Warning: Ignoring specified input delimiter: $InDelim. Supported values: comma, semicolon or tab. Using default comma delimiter...";
 186       $InDelim = "\,";
 187     }
 188   }
 189   else {
 190     if ($TextDataFile =~ /\.tsv$/i) {
 191       $InDelim = "\t";
 192     }
 193     elsif ($TextDataFile =~ /\.csv$/i) {
 194       $InDelim = "\,";
 195     }
 196     else {
 197       warn "Warning: Unknown file extension. Using default comma delimiter...";
 198       $InDelim = "\,";
 199     }
 200   }
 201 
 202   ($LineCount, $IgnoredLineCount) = (0) x 2;
 203 
 204   open TEXTDATAFILE, "$TextDataFile" or die "Couldn't open $TextDataFile: $! ...";
 205 
 206   # Skip lines up to column labels...
 207   LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) {
 208     $LineCount++;
 209     if ($Line =~ /^#/) {
 210       $IgnoredLineCount++;
 211     }
 212     else {
 213       last LINE;
 214     }
 215   }
 216 
 217   # Initialize data map...
 218   %{$TextDataMapRef} = ();
 219   @{$TextDataMapRef->{DataKeys}} = ();
 220   @{$TextDataMapRef->{ColLabels}} = ();
 221   @{$TextDataMapRef->{DataColIDs}} = ();
 222   $TextDataMapRef->{NumOfCols} = undef;
 223 
 224   # Process column labels...
 225   @ColLabels= quotewords($InDelim, 0, $Line);
 226   $NumOfCols = @ColLabels;
 227 
 228   if ($DataKeyColNum < 1 || $DataKeyColNum > $NumOfCols) {
 229     warn "Warning: Ignoring text data file $TextDataFile: Invalid data key column number, $DataKeyColNum, specified. It must be > 0 or <= $NumOfCols, number of columns in the text file ...";
 230     return;
 231   }
 232   $DataKeyColIndex = $DataKeyColNum - 1;
 233 
 234   $TextDataMapRef->{NumOfCols} = $NumOfCols;
 235   push @{$TextDataMapRef->{ColLabels}}, @ColLabels;
 236 
 237   # Set up column data IDs for tracking the data...
 238   @DataColNums = ();
 239   @DataColIDs = ();
 240   COLNUM: for $ColNum (1 .. $NumOfCols) {
 241     if ($ColNum == $DataKeyColNum) {
 242       next COLNUM;
 243     }
 244     push @DataColNums, $ColNum;
 245     $ColID = "DataCol${ColNum}";
 246     push @DataColIDs, $ColID;
 247   }
 248   push @{$TextDataMapRef->{DataColIDs}}, @DataColIDs;
 249 
 250   # Initialize column data hash...
 251   %{$TextDataMapRef->{DataKey}} = ();
 252   for $ColIndex (0 .. $#DataColNums) {
 253     $ColNum = $DataColNums[$ColIndex];
 254     $ColID = $DataColIDs[$ColIndex];
 255     %{$TextDataMapRef->{$ColID}} = ();
 256   }
 257 
 258   LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) {
 259     $LineCount++;
 260     if ($Line =~ /^#/) {
 261       $IgnoredLineCount++;
 262       next LINE;
 263     }
 264 
 265     @LineWords = quotewords($InDelim, 0, $Line);
 266     if (@LineWords != $NumOfCols) {
 267       $IgnoredLineCount++;
 268       warn "Warning: The number of data fields, @LineWords, in $TextDataFile must be $NumOfCols.\nIgnoring line number $LineCount: $Line...\n";
 269       next LINE;
 270     }
 271     $DataKey = $LineWords[$DataKeyColIndex];
 272 
 273     if ($UniqueDataKeyMode) {
 274       if (exists $TextDataMapRef->{DataKey}{$DataKey}) {
 275         $IgnoredLineCount++;
 276         warn "Warning: The data key, $DataKey, in data column key number, $DataKeyColNum, is already present.\nIgnoring line number $LineCount: $Line...\n";
 277         next LINE;
 278       }
 279       push @{$TextDataMapRef->{DataKeys}}, $DataKey;
 280       $TextDataMapRef->{DataKey}{$DataKey} = $DataKey;
 281     }
 282     else {
 283       if (!exists $TextDataMapRef->{DataKey}{$DataKey}) {
 284         push @{$TextDataMapRef->{DataKeys}}, $DataKey;
 285         $TextDataMapRef->{DataKey}{$DataKey} = $DataKey;
 286 
 287         for $ColIndex (0 .. $#DataColNums) {
 288           $ColNum = $DataColNums[$ColIndex];
 289           $ColID = $DataColIDs[$ColIndex];
 290           @{$TextDataMapRef->{$ColID}{$DataKey}} = ();
 291         }
 292       }
 293     }
 294 
 295     # Track column data values...
 296     for $ColIndex (0 .. $#DataColNums) {
 297       $ColID = $DataColIDs[$ColIndex];
 298 
 299       $ColNum = $DataColNums[$ColIndex];
 300       $ColValue = $LineWords[$ColNum - 1];
 301 
 302       if ($UniqueDataKeyMode) {
 303         $TextDataMapRef->{$ColID}{$DataKey} = $ColValue;
 304       }
 305       else {
 306         push @{$TextDataMapRef->{$ColID}{$DataKey}}, $ColValue;
 307       }
 308     }
 309 
 310   }
 311 
 312   print "\nTotal number of lines in file $TextDataFile: $LineCount\n";
 313   print "Total number of lines ignored: $IgnoredLineCount\n";
 314 
 315   close TEXTDATAFILE;
 316 }
 317 
 318 # Returns a 32 bit integer hash code using One-at-a-time algorithm By Bob Jenkins [Ref 38]. It's also implemented in
 319 # Perl for internal hash keys in hv.h include file.
 320 #
 321 # It's not clear how to force Perl perform unsigned integer arithmetic irrespective of the OS/Platform and
 322 # the value of use64bitint flag used during its compilation.
 323 #
 324 # In order to generate a consistent 32 bit has code across OS/platforms, the following methodology appear
 325 # to work:
 326 #
 327 #    o Use MaxHashCodeMask to retrieve appropriate bits after left shifting by bit operators and additions
 328 #    o Stay away from "use integer" to avoid signed integer arithmetic for bit operators
 329 #
 330 #
 331 #   MaxHashCodeMask (2147483647) corresponds to the maximum value which can be stored in 31 bits
 332 #
 333 my($MaxHashCodeMask);
 334 $MaxHashCodeMask = 2**31 - 1;
 335 
 336 sub HashCode {
 337   my($String) = @_;
 338   my($HashCode, $Value, $ShiftedHashCode);
 339 
 340   $HashCode = 0;
 341   for $Value (unpack('C*', $String)) {
 342     $HashCode += $Value;
 343 
 344     $ShiftedHashCode = $HashCode << 10;
 345     if ($ShiftedHashCode > $MaxHashCodeMask) {
 346       $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask;
 347     }
 348 
 349     $HashCode += $ShiftedHashCode;
 350     if ($HashCode > $MaxHashCodeMask) {
 351       $HashCode = $HashCode & $MaxHashCodeMask;
 352     }
 353 
 354     $HashCode ^= ($HashCode >> 6);
 355   }
 356 
 357   $ShiftedHashCode = $HashCode << 3;
 358   if ($ShiftedHashCode > $MaxHashCodeMask) {
 359     $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask;
 360   }
 361 
 362   $HashCode += $ShiftedHashCode;
 363   if ($HashCode > $MaxHashCodeMask) {
 364     $HashCode = $HashCode & $MaxHashCodeMask;
 365   }
 366   $HashCode ^= ($HashCode >> 11);
 367 
 368   $ShiftedHashCode = $HashCode << 15;
 369   if ($ShiftedHashCode > $MaxHashCodeMask) {
 370     $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask;
 371   }
 372 
 373   $HashCode += $ShiftedHashCode;
 374   if ($HashCode > $MaxHashCodeMask) {
 375     $HashCode = $HashCode & $MaxHashCodeMask;
 376   }
 377   return $HashCode;
 378 }
 379 
 380 # Check out the string: Is it defined and has a non zero length?
 381 sub IsEmpty {
 382   my($TheString) = @_;
 383   my($Status) = 1;
 384 
 385   $Status = (defined($TheString) && length($TheString)) ? 0 : 1;
 386 
 387   return $Status;
 388 }
 389 
 390 # Is first specified number power of second specified number...
 391 sub IsNumberPowerOfNumber {
 392   my($FirstNum, $SecondNum) = @_;
 393   my($PowerValue);
 394 
 395   $PowerValue = log($FirstNum)/log($SecondNum);
 396 
 397   return IsInteger($PowerValue) ? 1 : 0;
 398 }
 399 
 400 # Check out the string: Is it an integer?
 401 sub IsInteger {
 402   my($TheString) = @_;
 403   my($Status) = 0;
 404 
 405   if (defined($TheString) && length($TheString)) {
 406     $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString);
 407     $TheString =~ s/^[+-]//;
 408     $Status = ($TheString =~ /[^0-9]/) ? 0 : 1;
 409   }
 410   return $Status;
 411 }
 412 
 413 # Check out the string: Is it an integer with value > 0?
 414 sub IsPositiveInteger {
 415   my($TheString) = @_;
 416   my($Status) = 0;
 417 
 418   $Status = IsInteger($TheString) ? ($TheString > 0 ? 1 : 0) : 0;
 419 
 420   return $Status;
 421 }
 422 
 423 
 424 # Check out the string: Is it a float?
 425 sub IsFloat {
 426   my($TheString) = @_;
 427   my($Status) = 0;
 428 
 429   if (defined($TheString) && length($TheString)) {
 430     $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString);
 431     $TheString =~ s/^[+-]//;
 432     $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1);
 433   }
 434   return $Status;
 435 }
 436 
 437 # Check out the string: Is it defined and has a non zero length?
 438 sub IsNotEmpty {
 439   my($TheString) = @_;
 440   my($Status);
 441 
 442   $Status = IsEmpty($TheString) ? 0 : 1;
 443 
 444   return $Status;
 445 }
 446 
 447 # Check out the string: Does it only contain numerical data?
 448 sub IsNumerical {
 449   my($TheString) = @_;
 450   my($Status) = 0;
 451 
 452   if (defined($TheString) && length($TheString)) {
 453     $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString);
 454     $TheString =~ s/^[+-]//;
 455     $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1);
 456   }
 457   return $Status;
 458 }
 459 
 460 # Join different words using delimiter and quote parameters. And return as
 461 # a string value.
 462 sub JoinWords {
 463   my($Words, $Delim, $Quote) = @_;
 464 
 465   if (!@$Words) {
 466     return "";
 467   }
 468 
 469   $Quote = $Quote ? "\"" : "";
 470   my(@NewWords) = map { (defined($_) && length($_)) ? "${Quote}$_${Quote}" : "${Quote}${Quote}" } @$Words;
 471 
 472   return join $Delim, @NewWords;
 473 }
 474 
 475 # Split string value containing quoted or unquoted words in to an array containing
 476 # unquoted words.
 477 #
 478 # This function is used to split strings generated by JoinWords.
 479 #
 480 sub SplitWords {
 481   my($Line, $Delim) = @_;
 482 
 483   if (!$Line) {
 484     return ();
 485   }
 486 
 487   # Is it a quoted string?
 488   if ($Line =~ /^\"/) {
 489     # Take out first and last quote...
 490     $Line =~ s/^\"//; $Line =~ s/\"$//;
 491 
 492     $Delim = "\"$Delim\"";
 493   }
 494   return split /$Delim/, $Line;
 495 }
 496 
 497 # Based on quote parameter, figure out what to do
 498 sub QuoteAWord {
 499   my($Word, $Quote) = @_;
 500   my($QuotedWord);
 501 
 502   $QuotedWord = "";
 503   if ($Word) {
 504     $QuotedWord = $Word;
 505   }
 506   if ($Quote) {
 507     $QuotedWord = "\"$QuotedWord\"";
 508   }
 509   return ($QuotedWord);
 510 }
 511 
 512 # Remove leading white space characters from the string...
 513 sub RemoveLeadingWhiteSpaces {
 514   my($InString) = @_;
 515   my($OutString, $TrailingString, $LeadingWhiteSpace);
 516 
 517   $OutString = $InString;
 518   if (length($InString) && ContainsWhiteSpaces($InString)) {
 519     $OutString =~ s/^([ \t\r\n\f]*)(.*?)$/$2/;
 520   }
 521   return $OutString;
 522 }
 523 
 524 # Remove Trailing white space characters from the string...
 525 sub RemoveTrailingWhiteSpaces {
 526   my($InString) = @_;
 527   my($OutString, $LeadingString, $TrailingWhiteSpace);
 528 
 529   $OutString = $InString;
 530   if (length($InString) && ContainsWhiteSpaces($InString)) {
 531     $OutString =~ s/^(.*?)([ \t\r\n\f]*)$/$1/;
 532   }
 533   return $OutString;
 534 }
 535 
 536 # Remove both leading and trailing white space characters from the string...
 537 sub RemoveLeadingAndTrailingWhiteSpaces {
 538   my($InString) = @_;
 539   my($OutString);
 540 
 541   $OutString = $InString;
 542   if (length($InString) && ContainsWhiteSpaces($InString)) {
 543     $OutString =~ s/^([ \t\r\n\f]*)(.*?)([ \t\r\n\f]*)$/$2/;
 544   }
 545   return $OutString;
 546 }
 547 
 548 # Wrap text string...
 549 sub WrapText {
 550   my($InString, $WrapLength, $WrapDelimiter);
 551   my($OutString);
 552 
 553   $WrapLength = 40;
 554   $WrapDelimiter = "\n";
 555   if (@_ == 3) {
 556     ($InString, $WrapLength, $WrapDelimiter) = @_;
 557   }
 558   elsif (@_ == 2) {
 559     ($InString, $WrapLength) = @_;
 560   }
 561   else {
 562     ($InString, $WrapLength) = @_;
 563   }
 564   $OutString = $InString;
 565   if ($InString && (length($InString) > $WrapLength)) {
 566     $OutString = "";
 567     my($Index, $Length, $FirstPiece, $StringPiece);
 568     $Index = 0; $Length = length($InString);
 569     $FirstPiece = 1;
 570     for ($Index = 0; $Index < $Length; $Index += $WrapLength) {
 571       if (($Index + $WrapLength) < $Length) {
 572         $StringPiece = substr($InString, $Index, $WrapLength);
 573       }
 574       else {
 575         # Last piece of the string...
 576         $StringPiece = substr($InString, $Index, $WrapLength);
 577       }
 578       if ($FirstPiece) {
 579         $FirstPiece = 0;
 580         $OutString = $StringPiece;
 581       }
 582       else {
 583         $OutString .= "${WrapDelimiter}${StringPiece}";
 584       }
 585     }
 586   }
 587   return $OutString;
 588 }
 589