1 package TextUtil; 2 # 3 # File: TextUtil.pm 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use Exporter; 28 use Text::ParseWords; 29 30 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 31 32 @ISA = qw(Exporter); 33 @EXPORT = qw(AddNumberSuffix ContainsWhiteSpaces GetTextLine GetTextFileDataByUniqueKey GetTextFileDataByNonUniqueKey HashCode IsEmpty IsNumberPowerOfNumber IsInteger IsPositiveInteger IsFloat IsNotEmpty IsNumerical JoinWords SplitWords QuoteAWord RemoveLeadingWhiteSpaces RemoveTrailingWhiteSpaces RemoveLeadingAndTrailingWhiteSpaces WrapText); 34 @EXPORT_OK = qw(); 35 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 36 37 # Add number suffix... 38 sub AddNumberSuffix { 39 my($Value) = @_; 40 my($ValueWithSuffix, $Suffix); 41 42 $ValueWithSuffix = $Value; 43 if (!IsPositiveInteger($Value)) { 44 return $ValueWithSuffix; 45 } 46 $Suffix = "th"; 47 if ($Value < 10 || $Value > 20) { 48 my $Remainder = $Value % 10; 49 $Suffix = ($Remainder == 1) ? "st" : (($Remainder == 2) ? "nd" : (($Remainder == 3) ? "rd" : "th")); 50 } 51 $ValueWithSuffix = "${ValueWithSuffix}${Suffix}"; 52 return $ValueWithSuffix; 53 } 54 55 # Check out the string: Doen it contain any white space characters? 56 sub ContainsWhiteSpaces { 57 my($TheString) = @_; 58 my($Status) = 0; 59 60 if (defined($TheString) && length($TheString)) { 61 $Status = ($TheString =~ /[ \t\r\n\f]/ ) ? 1 : 0; 62 } 63 return $Status; 64 } 65 66 # Read the line, change to UNIX new line char, and chop off new line char as well... 67 sub GetTextLine { 68 my($TextFileRef) = @_; 69 my($Line) = ''; 70 71 # Get the next non empty line... 72 LINE: while (defined($_ = <$TextFileRef>)) { 73 # Change Windows and Mac new line char to UNIX... 74 s/(\r\n)|(\r)/\n/g; 75 76 # Take out any new line char at the end by explicitly removing it instead of using 77 # chomp, which might not always work correctly on files generated on a system 78 # with a value of input line separator different from the current system... 79 s/\n$//g; 80 81 # Doesn't hurt to chomp... 82 chomp; 83 84 $Line = $_; 85 if (length $Line) { 86 last LINE; 87 } 88 } 89 return $Line; 90 } 91 92 # Load data from a CSV file into the specified hash reference using a specific 93 # column for unique data key values. 94 # 95 # The lines starting with # are treated as comments and ignored. First line 96 # not starting with # must contain column labels and the number of columns in 97 # all other data rows must match the number of column labels. 98 # 99 # The first column is assumed to contain data key value by default; all other columns 100 # contain data as indicated in their column labels. 101 # 102 # In order to avoid dependence of data access on the specified column labels, the 103 # column data is loaded into hash with Column<Num> hash keys, where column number 104 # start from 1. The data key column is not available as Colnum<Num> hash key; 105 # 106 # The format of the data structure loaded into a specified hash reference is: 107 # 108 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys 109 # @{$TextDataMapRef->{ColLabels}} - Array of column labels 110 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs 111 # $TextDataMapRef->{NumOfCols} - Number of columns 112 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey> 113 # %{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair: <DataCol<Num>, DataKey> 114 # 115 # Caveats: 116 # . The column number start from 1. 117 # . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs. 118 # 119 sub GetTextFileDataByUniqueKey { 120 my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_; 121 122 return _GetTextFileData("UniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim); 123 } 124 125 # Load data from a CSV file into the specified hash reference using a specific 126 # column for non-unique data key values. 127 # 128 # The lines starting with # are treated as comments and ignored. First line 129 # not starting with # must contain column labels and the number of columns in 130 # all other data rows must match the number of column labels. 131 # 132 # The first column is assumed to contain data key value by default; all other columns 133 # contain data as indicated in their column labels. 134 # 135 # In order to avoid dependence of data access on the specified column labels, the 136 # column data is loaded into hash with Column<Num> hash keys, where column number 137 # start from 1. The data key column is not available as Colnum<Num> hash key; 138 # 139 # The format of the data structure loaded into a specified hash reference is: 140 # 141 # @{$TextDataMapRef->{DataKeys}} - Array of unique data keys 142 # @{$TextDataMapRef->{ColLabels}} - Array of column labels 143 # @{$TextDataMapRef->{DataColIDs}} - Array of data column IDs 144 # $TextDataMapRef->{NumOfCols} - Number of columns 145 # %{$TextDataMapRef->{DataKey}} - Hash keys pair: <DataKey, DataKey> 146 # @{$TextDataMapRef->{DataCol<Num>}} - Hash keys pair with data as an array: <DataCol<Num>, DataKey> 147 # 148 # Caveats: 149 # . The column number start from 1. 150 # . Column data for data key column column is not loaded into <Column<Num>, DataKey> hash keys pairs. 151 # 152 sub GetTextFileDataByNonUniqueKey { 153 my($TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_; 154 155 return _GetTextFileData("NonUniqueKey", $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim); 156 } 157 158 # Loadtext file data using unique or non-uniqye data column key... 159 # 160 sub _GetTextFileData { 161 my($DataKeyMode, $TextDataFile, $TextDataMapRef, $DataKeyColNum, $InDelim) = @_; 162 my($DataKeyColIndex, $LineCount, $IgnoredLineCount, $UniqueDataKeyMode, $DataKey, $Line, $NumOfCols, $ColIndex, $ColNum, $ColID, $ColValue, @LineWords, @ColLabels, @DataColIDs, @DataColNums); 163 164 print "\nProcessing text data file $TextDataFile...\n"; 165 166 $UniqueDataKeyMode = 0; 167 if ($DataKeyMode =~ /^UniqueKey$/i) { 168 $UniqueDataKeyMode = 1; 169 } 170 171 # Setup default values... 172 $DataKeyColNum = defined $DataKeyColNum ? $DataKeyColNum : 1; 173 174 if (defined $InDelim) { 175 if ($InDelim =~ /^tab$/i) { 176 $InDelim = "\t"; 177 } 178 elsif ($InDelim =~ /^semicolon$/i) { 179 $InDelim = "\;"; 180 } 181 elsif ($InDelim =~ /^comma$/i) { 182 $InDelim = "\,"; 183 } 184 else { 185 warn "Warning: Ignoring specified input delimiter: $InDelim. Supported values: comma, semicolon or tab. Using default comma delimiter..."; 186 $InDelim = "\,"; 187 } 188 } 189 else { 190 if ($TextDataFile =~ /\.tsv$/i) { 191 $InDelim = "\t"; 192 } 193 elsif ($TextDataFile =~ /\.csv$/i) { 194 $InDelim = "\,"; 195 } 196 else { 197 warn "Warning: Unknown file extension. Using default comma delimiter..."; 198 $InDelim = "\,"; 199 } 200 } 201 202 ($LineCount, $IgnoredLineCount) = (0) x 2; 203 204 open TEXTDATAFILE, "$TextDataFile" or die "Couldn't open $TextDataFile: $! ..."; 205 206 # Skip lines up to column labels... 207 LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) { 208 $LineCount++; 209 if ($Line =~ /^#/) { 210 $IgnoredLineCount++; 211 } 212 else { 213 last LINE; 214 } 215 } 216 217 # Initialize data map... 218 %{$TextDataMapRef} = (); 219 @{$TextDataMapRef->{DataKeys}} = (); 220 @{$TextDataMapRef->{ColLabels}} = (); 221 @{$TextDataMapRef->{DataColIDs}} = (); 222 $TextDataMapRef->{NumOfCols} = undef; 223 224 # Process column labels... 225 @ColLabels= quotewords($InDelim, 0, $Line); 226 $NumOfCols = @ColLabels; 227 228 if ($DataKeyColNum < 1 || $DataKeyColNum > $NumOfCols) { 229 warn "Warning: Ignoring text data file $TextDataFile: Invalid data key column number, $DataKeyColNum, specified. It must be > 0 or <= $NumOfCols, number of columns in the text file ..."; 230 return; 231 } 232 $DataKeyColIndex = $DataKeyColNum - 1; 233 234 $TextDataMapRef->{NumOfCols} = $NumOfCols; 235 push @{$TextDataMapRef->{ColLabels}}, @ColLabels; 236 237 # Set up column data IDs for tracking the data... 238 @DataColNums = (); 239 @DataColIDs = (); 240 COLNUM: for $ColNum (1 .. $NumOfCols) { 241 if ($ColNum == $DataKeyColNum) { 242 next COLNUM; 243 } 244 push @DataColNums, $ColNum; 245 $ColID = "DataCol${ColNum}"; 246 push @DataColIDs, $ColID; 247 } 248 push @{$TextDataMapRef->{DataColIDs}}, @DataColIDs; 249 250 # Initialize column data hash... 251 %{$TextDataMapRef->{DataKey}} = (); 252 for $ColIndex (0 .. $#DataColNums) { 253 $ColNum = $DataColNums[$ColIndex]; 254 $ColID = $DataColIDs[$ColIndex]; 255 %{$TextDataMapRef->{$ColID}} = (); 256 } 257 258 LINE: while ($Line = TextUtil::GetTextLine(\*TEXTDATAFILE)) { 259 $LineCount++; 260 if ($Line =~ /^#/) { 261 $IgnoredLineCount++; 262 next LINE; 263 } 264 265 @LineWords = quotewords($InDelim, 0, $Line); 266 if (@LineWords != $NumOfCols) { 267 $IgnoredLineCount++; 268 warn "Warning: The number of data fields, @LineWords, in $TextDataFile must be $NumOfCols.\nIgnoring line number $LineCount: $Line...\n"; 269 next LINE; 270 } 271 $DataKey = $LineWords[$DataKeyColIndex]; 272 273 if ($UniqueDataKeyMode) { 274 if (exists $TextDataMapRef->{DataKey}{$DataKey}) { 275 $IgnoredLineCount++; 276 warn "Warning: The data key, $DataKey, in data column key number, $DataKeyColNum, is already present.\nIgnoring line number $LineCount: $Line...\n"; 277 next LINE; 278 } 279 push @{$TextDataMapRef->{DataKeys}}, $DataKey; 280 $TextDataMapRef->{DataKey}{$DataKey} = $DataKey; 281 } 282 else { 283 if (!exists $TextDataMapRef->{DataKey}{$DataKey}) { 284 push @{$TextDataMapRef->{DataKeys}}, $DataKey; 285 $TextDataMapRef->{DataKey}{$DataKey} = $DataKey; 286 287 for $ColIndex (0 .. $#DataColNums) { 288 $ColNum = $DataColNums[$ColIndex]; 289 $ColID = $DataColIDs[$ColIndex]; 290 @{$TextDataMapRef->{$ColID}{$DataKey}} = (); 291 } 292 } 293 } 294 295 # Track column data values... 296 for $ColIndex (0 .. $#DataColNums) { 297 $ColID = $DataColIDs[$ColIndex]; 298 299 $ColNum = $DataColNums[$ColIndex]; 300 $ColValue = $LineWords[$ColNum - 1]; 301 302 if ($UniqueDataKeyMode) { 303 $TextDataMapRef->{$ColID}{$DataKey} = $ColValue; 304 } 305 else { 306 push @{$TextDataMapRef->{$ColID}{$DataKey}}, $ColValue; 307 } 308 } 309 310 } 311 312 print "\nTotal number of lines in file $TextDataFile: $LineCount\n"; 313 print "Total number of lines ignored: $IgnoredLineCount\n"; 314 315 close TEXTDATAFILE; 316 } 317 318 # Returns a 32 bit integer hash code using One-at-a-time algorithm By Bob Jenkins [Ref 38]. It's also implemented in 319 # Perl for internal hash keys in hv.h include file. 320 # 321 # It's not clear how to force Perl perform unsigned integer arithmetic irrespective of the OS/Platform and 322 # the value of use64bitint flag used during its compilation. 323 # 324 # In order to generate a consistent 32 bit has code across OS/platforms, the following methodology appear 325 # to work: 326 # 327 # o Use MaxHashCodeMask to retrieve appropriate bits after left shifting by bit operators and additions 328 # o Stay away from "use integer" to avoid signed integer arithmetic for bit operators 329 # 330 # 331 # MaxHashCodeMask (2147483647) corresponds to the maximum value which can be stored in 31 bits 332 # 333 my($MaxHashCodeMask); 334 $MaxHashCodeMask = 2**31 - 1; 335 336 sub HashCode { 337 my($String) = @_; 338 my($HashCode, $Value, $ShiftedHashCode); 339 340 $HashCode = 0; 341 for $Value (unpack('C*', $String)) { 342 $HashCode += $Value; 343 344 $ShiftedHashCode = $HashCode << 10; 345 if ($ShiftedHashCode > $MaxHashCodeMask) { 346 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask; 347 } 348 349 $HashCode += $ShiftedHashCode; 350 if ($HashCode > $MaxHashCodeMask) { 351 $HashCode = $HashCode & $MaxHashCodeMask; 352 } 353 354 $HashCode ^= ($HashCode >> 6); 355 } 356 357 $ShiftedHashCode = $HashCode << 3; 358 if ($ShiftedHashCode > $MaxHashCodeMask) { 359 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask; 360 } 361 362 $HashCode += $ShiftedHashCode; 363 if ($HashCode > $MaxHashCodeMask) { 364 $HashCode = $HashCode & $MaxHashCodeMask; 365 } 366 $HashCode ^= ($HashCode >> 11); 367 368 $ShiftedHashCode = $HashCode << 15; 369 if ($ShiftedHashCode > $MaxHashCodeMask) { 370 $ShiftedHashCode = $ShiftedHashCode & $MaxHashCodeMask; 371 } 372 373 $HashCode += $ShiftedHashCode; 374 if ($HashCode > $MaxHashCodeMask) { 375 $HashCode = $HashCode & $MaxHashCodeMask; 376 } 377 return $HashCode; 378 } 379 380 # Check out the string: Is it defined and has a non zero length? 381 sub IsEmpty { 382 my($TheString) = @_; 383 my($Status) = 1; 384 385 $Status = (defined($TheString) && length($TheString)) ? 0 : 1; 386 387 return $Status; 388 } 389 390 # Is first specified number power of second specified number... 391 sub IsNumberPowerOfNumber { 392 my($FirstNum, $SecondNum) = @_; 393 my($PowerValue); 394 395 $PowerValue = log($FirstNum)/log($SecondNum); 396 397 return IsInteger($PowerValue) ? 1 : 0; 398 } 399 400 # Check out the string: Is it an integer? 401 sub IsInteger { 402 my($TheString) = @_; 403 my($Status) = 0; 404 405 if (defined($TheString) && length($TheString)) { 406 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString); 407 $TheString =~ s/^[+-]//; 408 $Status = ($TheString =~ /[^0-9]/) ? 0 : 1; 409 } 410 return $Status; 411 } 412 413 # Check out the string: Is it an integer with value > 0? 414 sub IsPositiveInteger { 415 my($TheString) = @_; 416 my($Status) = 0; 417 418 $Status = IsInteger($TheString) ? ($TheString > 0 ? 1 : 0) : 0; 419 420 return $Status; 421 } 422 423 424 # Check out the string: Is it a float? 425 sub IsFloat { 426 my($TheString) = @_; 427 my($Status) = 0; 428 429 if (defined($TheString) && length($TheString)) { 430 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString); 431 $TheString =~ s/^[+-]//; 432 $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1); 433 } 434 return $Status; 435 } 436 437 # Check out the string: Is it defined and has a non zero length? 438 sub IsNotEmpty { 439 my($TheString) = @_; 440 my($Status); 441 442 $Status = IsEmpty($TheString) ? 0 : 1; 443 444 return $Status; 445 } 446 447 # Check out the string: Does it only contain numerical data? 448 sub IsNumerical { 449 my($TheString) = @_; 450 my($Status) = 0; 451 452 if (defined($TheString) && length($TheString)) { 453 $TheString = RemoveLeadingAndTrailingWhiteSpaces($TheString); 454 $TheString =~ s/^[+-]//; 455 $Status = ($TheString =~ /[^0-9.eE]/) ? 0 : (((length($TheString) == 1) && ($TheString =~ /[.eE]/)) ? 0 : 1); 456 } 457 return $Status; 458 } 459 460 # Join different words using delimiter and quote parameters. And return as 461 # a string value. 462 sub JoinWords { 463 my($Words, $Delim, $Quote) = @_; 464 465 if (!@$Words) { 466 return ""; 467 } 468 469 $Quote = $Quote ? "\"" : ""; 470 my(@NewWords) = map { (defined($_) && length($_)) ? "${Quote}$_${Quote}" : "${Quote}${Quote}" } @$Words; 471 472 return join $Delim, @NewWords; 473 } 474 475 # Split string value containing quoted or unquoted words in to an array containing 476 # unquoted words. 477 # 478 # This function is used to split strings generated by JoinWords. 479 # 480 sub SplitWords { 481 my($Line, $Delim) = @_; 482 483 if (!$Line) { 484 return (); 485 } 486 487 # Is it a quoted string? 488 if ($Line =~ /^\"/) { 489 # Take out first and last quote... 490 $Line =~ s/^\"//; $Line =~ s/\"$//; 491 492 $Delim = "\"$Delim\""; 493 } 494 return split /$Delim/, $Line; 495 } 496 497 # Based on quote parameter, figure out what to do 498 sub QuoteAWord { 499 my($Word, $Quote) = @_; 500 my($QuotedWord); 501 502 $QuotedWord = ""; 503 if ($Word) { 504 $QuotedWord = $Word; 505 } 506 if ($Quote) { 507 $QuotedWord = "\"$QuotedWord\""; 508 } 509 return ($QuotedWord); 510 } 511 512 # Remove leading white space characters from the string... 513 sub RemoveLeadingWhiteSpaces { 514 my($InString) = @_; 515 my($OutString, $TrailingString, $LeadingWhiteSpace); 516 517 $OutString = $InString; 518 if (length($InString) && ContainsWhiteSpaces($InString)) { 519 $OutString =~ s/^([ \t\r\n\f]*)(.*?)$/$2/; 520 } 521 return $OutString; 522 } 523 524 # Remove Trailing white space characters from the string... 525 sub RemoveTrailingWhiteSpaces { 526 my($InString) = @_; 527 my($OutString, $LeadingString, $TrailingWhiteSpace); 528 529 $OutString = $InString; 530 if (length($InString) && ContainsWhiteSpaces($InString)) { 531 $OutString =~ s/^(.*?)([ \t\r\n\f]*)$/$1/; 532 } 533 return $OutString; 534 } 535 536 # Remove both leading and trailing white space characters from the string... 537 sub RemoveLeadingAndTrailingWhiteSpaces { 538 my($InString) = @_; 539 my($OutString); 540 541 $OutString = $InString; 542 if (length($InString) && ContainsWhiteSpaces($InString)) { 543 $OutString =~ s/^([ \t\r\n\f]*)(.*?)([ \t\r\n\f]*)$/$2/; 544 } 545 return $OutString; 546 } 547 548 # Wrap text string... 549 sub WrapText { 550 my($InString, $WrapLength, $WrapDelimiter); 551 my($OutString); 552 553 $WrapLength = 40; 554 $WrapDelimiter = "\n"; 555 if (@_ == 3) { 556 ($InString, $WrapLength, $WrapDelimiter) = @_; 557 } 558 elsif (@_ == 2) { 559 ($InString, $WrapLength) = @_; 560 } 561 else { 562 ($InString, $WrapLength) = @_; 563 } 564 $OutString = $InString; 565 if ($InString && (length($InString) > $WrapLength)) { 566 $OutString = ""; 567 my($Index, $Length, $FirstPiece, $StringPiece); 568 $Index = 0; $Length = length($InString); 569 $FirstPiece = 1; 570 for ($Index = 0; $Index < $Length; $Index += $WrapLength) { 571 if (($Index + $WrapLength) < $Length) { 572 $StringPiece = substr($InString, $Index, $WrapLength); 573 } 574 else { 575 # Last piece of the string... 576 $StringPiece = substr($InString, $Index, $WrapLength); 577 } 578 if ($FirstPiece) { 579 $FirstPiece = 0; 580 $OutString = $StringPiece; 581 } 582 else { 583 $OutString .= "${WrapDelimiter}${StringPiece}"; 584 } 585 } 586 } 587 return $OutString; 588 } 589