MayaChemTools

   1 package Parsers::Lexer;
   2 #
   3 # File: Lexer.pm
   4 # Author: Manish Sud <msud@san.rr.com>
   5 #
   6 # Copyright (C) 2024 Manish Sud. All rights reserved.
   7 #
   8 # This file is part of MayaChemTools.
   9 #
  10 # MayaChemTools is free software; you can redistribute it and/or modify it under
  11 # the terms of the GNU Lesser General Public License as published by the Free
  12 # Software Foundation; either version 3 of the License, or (at your option) any
  13 # later version.
  14 #
  15 # MayaChemTools is distributed in the hope that it will be useful, but without
  16 # any warranty; without even the implied warranty of merchantability of fitness
  17 # for a particular purpose.  See the GNU Lesser General Public License for more
  18 # details.
  19 #
  20 # You should have received a copy of the GNU Lesser General Public License
  21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  23 # Boston, MA, 02111-1307, USA.
  24 #
  25 
  26 use strict;
  27 use Carp;
  28 use Exporter;
  29 use Scalar::Util ();
  30 
  31 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  32 
  33 @ISA = qw(Exporter);
  34 @EXPORT = qw();
  35 @EXPORT_OK = qw();
  36 
  37 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  38 
  39 # Setup class variables...
  40 my($ClassName);
  41 _InitializeClass();
  42 
  43 # Overload Perl functions...
  44 use overload '""' => 'StringifyLexer';
  45 
  46 # Class constructor...
  47 sub new {
  48   my($Class, $Input, @TokensSpec) = @_;
  49 
  50   # Initialize object...
  51   my $This = {};
  52   bless $This, ref($Class) || $Class;
  53   $This->_InitializeLexer();
  54 
  55   $This->_ValidateParametersAndGenerateLexer($Input, @TokensSpec);
  56 
  57   return $This;
  58 }
  59 
  60 
  61 # Initialize class ...
  62 sub _InitializeClass {
  63   #Class name...
  64   $ClassName = __PACKAGE__;
  65 }
  66 
  67 # Initialize object data...
  68 #
  69 sub _InitializeLexer {
  70   my($This) = @_;
  71 
  72   # Input parameter used by lexer to retrieve text to be lexed. Supported parameter types:
  73   #   . Reference to input iterator function
  74   #   . Reference to an open file handle
  75   #   . Text string
  76   #
  77   $This->{Input} = undef;
  78 
  79   # Type of input paramater determined using Perl ref function:
  80   #   . InputIterator - ref returns CODE
  81   #   . FileStream - ref return GLOB and fileno is valid
  82   #   . String - ref return an empty string
  83   #
  84   $This->{InputType} = '';
  85 
  86   # Tokens specifications supplied by the caller. It's an array containing references
  87   # to arrays with each containing TokenLabel and TokenMatchRegex pair along with
  88   # an option reference to code to be executed after a matched.
  89   #
  90   # For example:
  91   #
  92   # @LexerTokensSpec = (
  93   #                        [ 'LETTER', qr/[a-zA-Z]/ ],
  94   #                        [ 'NUMBER', qr/\d+/ ],
  95   #                        [ 'SPACE', qr/[ ]*/, sub { my($This, $TokenLabel, $MatchedText) = @_; return ''; } ],
  96   #                        [ 'NEWLINE', qr/(?:\r\n|\r|\n)/, sub { my($This, $TokenLabel, $MatchedText) = @_;  return "\n"; } ],
  97   #                        [ 'CHAR', qr/[\.]/ ],
  98   #                       );
  99   #
 100   @{$This->{TokensSpec}} = ();
 101 
 102   # Refernce to chained lexer...
 103   $This->{ChainedLexer} = undef;
 104 
 105   return $This;
 106 }
 107 
 108 # Validate input parameters and generate a chained lexer...
 109 #
 110 sub _ValidateParametersAndGenerateLexer {
 111   my($This, $Input, @TokensSpec) = @_;
 112 
 113   #
 114   # Validate input to be lexed...
 115   if (!defined $Input) {
 116     croak "Error: ${ClassName}->new: Object can't be instantiated: Input is not defined. Supported values: a reference to input iterator function, a reference to an open file handle or a text string...";
 117   }
 118   $This->{Input} = $Input;
 119 
 120   # Check input parameter type...
 121   my($InputType);
 122 
 123   $InputType = ref $Input;
 124   if ($InputType =~ /CODE/i) {
 125     # Input iterator...
 126     $This->{InputType} = "InputIterator";
 127   }
 128   elsif ($InputType =~ /GLOB/i && defined fileno $Input) {
 129     # Input stream...
 130     $This->{InputType} = "FileStream";
 131   }
 132   elsif ($InputType) {
 133     # Perl ref function returns nonempty string for all other references...
 134     croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string...";
 135   }
 136   else {
 137     # Input string...
 138     $This->{InputType} = "String";
 139   }
 140 
 141   # Check tokens specifications...
 142   if (!@TokensSpec) {
 143     croak "Error: ${ClassName}->new: TokensSpec is not defined or the array doesn't contain any values. Supported values: a reference to an array containg token label, regular expression to match and an option reference to function to modify matched values...";
 144   }
 145   @{$This->{TokensSpec}} = @TokensSpec;
 146 
 147   $This->_GenerateLexer($Input, @TokensSpec);
 148 
 149   return $This;
 150 }
 151 
 152 # Generate a lexer using reference to an input iterator function, an open file
 153 # handle or an input string passed as first parameter by the caller along
 154 # with token specifications as second paramater...
 155 #
 156 sub _GenerateLexer {
 157   my($This, $Input, @TokensSpec) = @_;
 158 
 159   if ($This->{InputType} =~ /^InputIterator$/i) {
 160     $This->_GenerateInputIteratorLexer($Input, @TokensSpec);
 161   }
 162   elsif ($This->{InputType} =~ /^FileStream$/i) {
 163     $This->_GenerateInputFileStreamLexer($Input, @TokensSpec);
 164   }
 165   elsif ($This->{InputType} =~ /^String$/i) {
 166     $This->_GenerateInputStringLexer($Input, @TokensSpec);
 167   }
 168   else {
 169     croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string...";
 170   }
 171 
 172   return $This;
 173 }
 174 
 175 # Generate a lexer using specifed input iterator...
 176 #
 177 sub _GenerateInputIteratorLexer {
 178   my($This, $InputIteratorRef, @TokensSpec) = @_;
 179 
 180   $This->_GenerateChainedLexer($InputIteratorRef, @TokensSpec);
 181 
 182   return $This;
 183 }
 184 
 185 # Generate a lexer using specifed input file stream reference...
 186 #
 187 sub _GenerateInputFileStreamLexer {
 188   my($This, $FileHandleRef, @TokensSpec) = @_;
 189 
 190   # Iterator is a annoymous function reference and Perl keeps $FileHandleRef
 191   # in scope during its execution.
 192 
 193   $This->_GenerateChainedLexer( sub { return <$FileHandleRef>; }, @TokensSpec);
 194 
 195   return $This;
 196 }
 197 
 198 # Generate a lexer using specifed input string...
 199 #
 200 sub _GenerateInputStringLexer {
 201   my($This, $Text, @TokensSpec) = @_;
 202   my(@InputText) = ($Text);
 203 
 204   # Iterator is a annoymous function reference and Perl keeps @InputText
 205   # in scope during its execution.
 206 
 207   $This->_GenerateChainedLexer( sub { return shift @InputText; }, @TokensSpec);
 208 
 209   return $This;
 210 }
 211 
 212 # Get next available token label and value pair as an array reference or unrecognized
 213 # text from input stream by either removing it from the input or simply peeking ahead...
 214 #
 215 # Supported mode values: Peek, Next. Default: Next
 216 #
 217 sub Lex {
 218   my($This, $Mode) = @_;
 219 
 220   return $This->{ChainedLexer}->($Mode)
 221 }
 222 
 223 # Get next available token label and value pair as an array reference or unrecognized
 224 # text from input stream by either removing it from the input stream...
 225 #
 226 sub Next {
 227   my($This) = @_;
 228 
 229   return $This->Lex();
 230 }
 231 
 232 # Get next available token label and value pair as an array reference or unrecognized
 233 # text from input stream by simply peeking ahead and without removing it from the input
 234 # stream..
 235 #
 236 sub Peek {
 237   my($This) = @_;
 238 
 239   return $This->Lex('Peek')
 240 }
 241 
 242 # Get a reference to lexer method to be used by the caller...
 243 #
 244 sub GetLex {
 245   my($This) = @_;
 246 
 247   return sub { $This->Lex(); };
 248 }
 249 
 250 # The chained lexer generation is implemented based on examples in Higher-order Perl
 251 # [ Ref 126 ] book.
 252 #
 253 # Generate a lexer using specified input iterator and chaining it with other lexers generated
 254 # for all token specifications. The lexer generated for first token specification uses input
 255 # iterator to retrieve any available input text; the subsequent chained lexeres for rest
 256 # of the tokens use lexers generated for previous token specifications to get next input
 257 # which might be unmatched input text or a reference to an array containing token and
 258 # matched text pair.
 259 #
 260 sub _GenerateChainedLexer {
 261   my($This, $InputIteratorRef, @TokensSpec) = @_;
 262   my($TokenSpecRef, $ChainedLexer);
 263 
 264   $ChainedLexer = undef;
 265   for $TokenSpecRef (@TokensSpec) {
 266     $ChainedLexer = defined $ChainedLexer ? $This->_GenerateLexerForToken($ChainedLexer, @{$TokenSpecRef}) : $This->_GenerateLexerForToken($InputIteratorRef, @{$TokenSpecRef});
 267   }
 268 
 269   $This->{ChainedLexer} = $ChainedLexer;
 270 
 271   return $This;
 272 }
 273 
 274 
 275 # Generate a lexer using specifed token specification using specified input or
 276 # input retrieved using another token lexer. The lexer retrieving input from the
 277 # specified input stream is at the bottom of the chain.
 278 #
 279 sub _GenerateLexerForToken {
 280   my($This, $InputIteratorOrLexer, $TokenLabel, $RegexPattern, $TokenMatchActionRef) = @_;
 281   my($TokenMatchAndSplitRef, $InputBuffer, @ProcessedTokens);
 282 
 283   # Input buffer for a specific lexer in chained lexers containing unprocessed
 284   # text for token specifications retrieved from a downstrean lexer or intial
 285   # input...
 286   #
 287   $InputBuffer = "";
 288 
 289   # @ProcessedTokens contains either references to an array containing token label
 290   # and matched text or any unmatched input text string...
 291   #
 292   @ProcessedTokens = ();
 293 
 294   # Setup a default annoymous function reference to generate an array reference
 295   # containing $Token and text matched to $RegexPattern.
 296   #
 297   $TokenMatchActionRef = defined $TokenMatchActionRef ? $TokenMatchActionRef : sub { my($This, $Label, $MatchedText) = @_; return [$Label, $MatchedText]  };
 298 
 299   # Setup an annoymous function to match and split input text using $RegexPattern for
 300   # a specific token during its lexer invocation in chained lexers.
 301   #
 302   # The usage of parenthesis around $RegexPattern during split allows capturing of matched
 303   # text, which is subsequently processed to retrieve matched $Token values. The split function
 304   # inserts a "" separator in the returned array as first entry whenever $InputText starts with
 305   # $RegexPattern. $InputText is returned as the only element for no match.
 306   #
 307   $TokenMatchAndSplitRef = sub { my($InputText) = @_; return split /($RegexPattern)/, $InputText; };
 308 
 309   # Setup a lexer for $TokenLabel as an annoymous function and return its reference to caller
 310   # which in turns chains the lexers for all $Tokens before returning a reference to a lexer
 311   # at top of the lexer chain.
 312   #
 313   # Perl maintains scope of all variables defined with in the scope of the current function
 314   # during invocation of annoymous function even after the return call.
 315   #
 316   return sub {
 317     my($Mode) = @_;
 318 
 319     # Currenly supported value for mode: Peek, Next
 320     #
 321     $Mode = defined $Mode ? $Mode : 'Next';
 322 
 323     while (@ProcessedTokens == 0 && defined $InputBuffer ) {
 324       # Get any new input....
 325       my $NewInput = $InputIteratorOrLexer->();
 326 
 327       if (ref $NewInput) {
 328         # Input is an array reference containing matched token and text returned by
 329         # a chained lexer downstream lexer...
 330         #
 331         # Match $RegexPattern in available buffer text to retieve any matched text
 332         # for current $Token. $Separator might be "": $RegexPattern is at start of
 333         # of $InputBuffer
 334         #
 335         # Process input buffer containing text to be matched for the current lexer
 336         # which didn't get processed earlier during @NewTokens > 2  while loop:
 337         # no match for current lexer or more input available. It maintains order
 338         # of token matching in input stream.
 339         #
 340         my($Separator, $MatchedTokenRefOrText);
 341 
 342         ($Separator, $MatchedTokenRefOrText) = $TokenMatchAndSplitRef->($InputBuffer);
 343         if (defined $MatchedTokenRefOrText) {
 344           $MatchedTokenRefOrText = $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenRefOrText);
 345         }
 346 
 347         # Collect valid token references or text...
 348         push @ProcessedTokens, grep { defined $_ && $_ ne "" } ($Separator, $MatchedTokenRefOrText, $NewInput);
 349 
 350         # Empty put buffer...
 351         $InputBuffer = "";
 352 
 353         # Get out of the loop as processed token refererences and/or text  are available...
 354         last;
 355       }
 356 
 357       # Process input retrieved from downstream lexer or input iterator which hasn't
 358       # been processed into tokens..
 359       if (defined $NewInput) {
 360         $InputBuffer .= $NewInput;
 361       }
 362 
 363       # Retrieve any matched tokens from available input for the current lexer...
 364       #
 365       my(@NewTokens) = $TokenMatchAndSplitRef->($InputBuffer);
 366 
 367       while ( @NewTokens > 2 || @NewTokens && !defined $NewInput) {
 368         # Scenario 1: Complete match
 369         #   @NewTokens > 2 : Availability of separator, matched token text, separator.
 370         #   The separator might correspond to token for a token for upstream lexer followed
 371         #   by matched token from current lexer. It ends up getting passed to upsrteam
 372         #   lexer for processing.
 373         #
 374         # Scenario 2: No more input available from iterator or downstream lexer
 375         #   @NewTokens <= 2 and no more input implies any left over text in buffer. And
 376         #   it ends up getting passed to upsrteam for processing.
 377         #
 378 
 379         # Take off any unprocessed input text that doesn't match off the buffer: It'll be
 380         # passed to upstream chained lexer for processing...
 381         #
 382         push @ProcessedTokens, shift @NewTokens;
 383 
 384         if (@NewTokens) {
 385           my $MatchedTokenText = shift @NewTokens;
 386           push @ProcessedTokens, $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenText);
 387         }
 388       }
 389 
 390       # Retrieve any leftover text from NewTokens and put it back into InputBuffer for
 391       # processing by current lexer. All token references have been taken out....
 392       #
 393       $InputBuffer = "";
 394       if (@NewTokens) {
 395         $InputBuffer = join "", @NewTokens;
 396       }
 397 
 398       if (!defined $NewInput) {
 399         # No more input from the downstream lexer...
 400         $InputBuffer = undef;
 401       }
 402 
 403       # Clean up any empty strings from ProcessedTokens containing token
 404       # references or text...
 405       @ProcessedTokens = grep { $_ ne "" } @ProcessedTokens;
 406 
 407     }
 408 
 409     # Return reference to an array containing token and matched text or just unmatched input text...
 410     my $TokenRefOrText = undef;
 411 
 412     if (@ProcessedTokens) {
 413       # Get first available reference either by just peeking or removing it from the list
 414       # of available tokens...
 415       $TokenRefOrText = ($Mode =~ /^Peek$/i) ?  $ProcessedTokens[0] : shift @ProcessedTokens;
 416     }
 417 
 418     return $TokenRefOrText;
 419   };
 420 }
 421 
 422 # Is it a lexer object?
 423 sub _IsLexer {
 424   my($Object) = @_;
 425 
 426   return (Scalar::Util::blessed($Object) && $Object->isa($ClassName)) ? 1 : 0;
 427 }
 428 
 429 # Return a string containing information about lexer...
 430 sub StringifyLexer {
 431   my($This) = @_;
 432   my($LexerString);
 433 
 434   $LexerString = "Lexer: PackageName: $ClassName; " . $This->_GetLexerInfoString();
 435 
 436   return $LexerString;
 437 }
 438 
 439 # Return a string containing information about lexer...
 440 sub _GetLexerInfoString {
 441   my($This) = @_;
 442   my($LexerInfoString, $TokensSpec, $TokenSpec, $TokenLabel, $TokenMatchRegex, $TokenMatchAction);
 443 
 444   $LexerInfoString = "InputType: $This->{InputType}";
 445 
 446   if ($This->{InputType} =~ /^String$/i) {
 447     $LexerInfoString .= "; InputString: $This->{Input}";
 448   }
 449 
 450   $TokensSpec = "TokensSpecifications: <None>";
 451   if (@{$This->{TokensSpec}}) {
 452     $TokensSpec = "TokensSpecifications: < [Label, MatchRegex, MatchAction]:";
 453     for $TokenSpec (@{$This->{TokensSpec}}) {
 454       ($TokenLabel, $TokenMatchRegex) = @{$TokenSpec};
 455       $TokenMatchAction = (@{$TokenSpec} == 3) ? "$TokenSpec->[2]" : "undefined";
 456       $TokensSpec .= " [$TokenLabel, $TokenMatchRegex, $TokenMatchAction]";
 457     }
 458     $TokensSpec .= " >";
 459   }
 460 
 461   $LexerInfoString .= "; $TokensSpec";
 462 
 463   return $LexerInfoString;
 464 }
 465