1 package Parsers::Lexer; 2 # 3 # File: Lexer.pm 4 # Author: Manish Sud <msud@san.rr.com> 5 # 6 # Copyright (C) 2024 Manish Sud. All rights reserved. 7 # 8 # This file is part of MayaChemTools. 9 # 10 # MayaChemTools is free software; you can redistribute it and/or modify it under 11 # the terms of the GNU Lesser General Public License as published by the Free 12 # Software Foundation; either version 3 of the License, or (at your option) any 13 # later version. 14 # 15 # MayaChemTools is distributed in the hope that it will be useful, but without 16 # any warranty; without even the implied warranty of merchantability of fitness 17 # for a particular purpose. See the GNU Lesser General Public License for more 18 # details. 19 # 20 # You should have received a copy of the GNU Lesser General Public License 21 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 22 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 23 # Boston, MA, 02111-1307, USA. 24 # 25 26 use strict; 27 use Carp; 28 use Exporter; 29 use Scalar::Util (); 30 31 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 32 33 @ISA = qw(Exporter); 34 @EXPORT = qw(); 35 @EXPORT_OK = qw(); 36 37 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 38 39 # Setup class variables... 40 my($ClassName); 41 _InitializeClass(); 42 43 # Overload Perl functions... 44 use overload '""' => 'StringifyLexer'; 45 46 # Class constructor... 47 sub new { 48 my($Class, $Input, @TokensSpec) = @_; 49 50 # Initialize object... 51 my $This = {}; 52 bless $This, ref($Class) || $Class; 53 $This->_InitializeLexer(); 54 55 $This->_ValidateParametersAndGenerateLexer($Input, @TokensSpec); 56 57 return $This; 58 } 59 60 61 # Initialize class ... 62 sub _InitializeClass { 63 #Class name... 64 $ClassName = __PACKAGE__; 65 } 66 67 # Initialize object data... 68 # 69 sub _InitializeLexer { 70 my($This) = @_; 71 72 # Input parameter used by lexer to retrieve text to be lexed. Supported parameter types: 73 # . Reference to input iterator function 74 # . Reference to an open file handle 75 # . Text string 76 # 77 $This->{Input} = undef; 78 79 # Type of input paramater determined using Perl ref function: 80 # . InputIterator - ref returns CODE 81 # . FileStream - ref return GLOB and fileno is valid 82 # . String - ref return an empty string 83 # 84 $This->{InputType} = ''; 85 86 # Tokens specifications supplied by the caller. It's an array containing references 87 # to arrays with each containing TokenLabel and TokenMatchRegex pair along with 88 # an option reference to code to be executed after a matched. 89 # 90 # For example: 91 # 92 # @LexerTokensSpec = ( 93 # [ 'LETTER', qr/[a-zA-Z]/ ], 94 # [ 'NUMBER', qr/\d+/ ], 95 # [ 'SPACE', qr/[ ]*/, sub { my($This, $TokenLabel, $MatchedText) = @_; return ''; } ], 96 # [ 'NEWLINE', qr/(?:\r\n|\r|\n)/, sub { my($This, $TokenLabel, $MatchedText) = @_; return "\n"; } ], 97 # [ 'CHAR', qr/[\.]/ ], 98 # ); 99 # 100 @{$This->{TokensSpec}} = (); 101 102 # Refernce to chained lexer... 103 $This->{ChainedLexer} = undef; 104 105 return $This; 106 } 107 108 # Validate input parameters and generate a chained lexer... 109 # 110 sub _ValidateParametersAndGenerateLexer { 111 my($This, $Input, @TokensSpec) = @_; 112 113 # 114 # Validate input to be lexed... 115 if (!defined $Input) { 116 croak "Error: ${ClassName}->new: Object can't be instantiated: Input is not defined. Supported values: a reference to input iterator function, a reference to an open file handle or a text string..."; 117 } 118 $This->{Input} = $Input; 119 120 # Check input parameter type... 121 my($InputType); 122 123 $InputType = ref $Input; 124 if ($InputType =~ /CODE/i) { 125 # Input iterator... 126 $This->{InputType} = "InputIterator"; 127 } 128 elsif ($InputType =~ /GLOB/i && defined fileno $Input) { 129 # Input stream... 130 $This->{InputType} = "FileStream"; 131 } 132 elsif ($InputType) { 133 # Perl ref function returns nonempty string for all other references... 134 croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string..."; 135 } 136 else { 137 # Input string... 138 $This->{InputType} = "String"; 139 } 140 141 # Check tokens specifications... 142 if (!@TokensSpec) { 143 croak "Error: ${ClassName}->new: TokensSpec is not defined or the array doesn't contain any values. Supported values: a reference to an array containg token label, regular expression to match and an option reference to function to modify matched values..."; 144 } 145 @{$This->{TokensSpec}} = @TokensSpec; 146 147 $This->_GenerateLexer($Input, @TokensSpec); 148 149 return $This; 150 } 151 152 # Generate a lexer using reference to an input iterator function, an open file 153 # handle or an input string passed as first parameter by the caller along 154 # with token specifications as second paramater... 155 # 156 sub _GenerateLexer { 157 my($This, $Input, @TokensSpec) = @_; 158 159 if ($This->{InputType} =~ /^InputIterator$/i) { 160 $This->_GenerateInputIteratorLexer($Input, @TokensSpec); 161 } 162 elsif ($This->{InputType} =~ /^FileStream$/i) { 163 $This->_GenerateInputFileStreamLexer($Input, @TokensSpec); 164 } 165 elsif ($This->{InputType} =~ /^String$/i) { 166 $This->_GenerateInputStringLexer($Input, @TokensSpec); 167 } 168 else { 169 croak "Error: ${ClassName}->new: Object can't be instantiated: Invalid input parameter type specified. Supported parameter types: a reference to input iterator function, a reference to an open file handle or a text string..."; 170 } 171 172 return $This; 173 } 174 175 # Generate a lexer using specifed input iterator... 176 # 177 sub _GenerateInputIteratorLexer { 178 my($This, $InputIteratorRef, @TokensSpec) = @_; 179 180 $This->_GenerateChainedLexer($InputIteratorRef, @TokensSpec); 181 182 return $This; 183 } 184 185 # Generate a lexer using specifed input file stream reference... 186 # 187 sub _GenerateInputFileStreamLexer { 188 my($This, $FileHandleRef, @TokensSpec) = @_; 189 190 # Iterator is a annoymous function reference and Perl keeps $FileHandleRef 191 # in scope during its execution. 192 193 $This->_GenerateChainedLexer( sub { return <$FileHandleRef>; }, @TokensSpec); 194 195 return $This; 196 } 197 198 # Generate a lexer using specifed input string... 199 # 200 sub _GenerateInputStringLexer { 201 my($This, $Text, @TokensSpec) = @_; 202 my(@InputText) = ($Text); 203 204 # Iterator is a annoymous function reference and Perl keeps @InputText 205 # in scope during its execution. 206 207 $This->_GenerateChainedLexer( sub { return shift @InputText; }, @TokensSpec); 208 209 return $This; 210 } 211 212 # Get next available token label and value pair as an array reference or unrecognized 213 # text from input stream by either removing it from the input or simply peeking ahead... 214 # 215 # Supported mode values: Peek, Next. Default: Next 216 # 217 sub Lex { 218 my($This, $Mode) = @_; 219 220 return $This->{ChainedLexer}->($Mode) 221 } 222 223 # Get next available token label and value pair as an array reference or unrecognized 224 # text from input stream by either removing it from the input stream... 225 # 226 sub Next { 227 my($This) = @_; 228 229 return $This->Lex(); 230 } 231 232 # Get next available token label and value pair as an array reference or unrecognized 233 # text from input stream by simply peeking ahead and without removing it from the input 234 # stream.. 235 # 236 sub Peek { 237 my($This) = @_; 238 239 return $This->Lex('Peek') 240 } 241 242 # Get a reference to lexer method to be used by the caller... 243 # 244 sub GetLex { 245 my($This) = @_; 246 247 return sub { $This->Lex(); }; 248 } 249 250 # The chained lexer generation is implemented based on examples in Higher-order Perl 251 # [ Ref 126 ] book. 252 # 253 # Generate a lexer using specified input iterator and chaining it with other lexers generated 254 # for all token specifications. The lexer generated for first token specification uses input 255 # iterator to retrieve any available input text; the subsequent chained lexeres for rest 256 # of the tokens use lexers generated for previous token specifications to get next input 257 # which might be unmatched input text or a reference to an array containing token and 258 # matched text pair. 259 # 260 sub _GenerateChainedLexer { 261 my($This, $InputIteratorRef, @TokensSpec) = @_; 262 my($TokenSpecRef, $ChainedLexer); 263 264 $ChainedLexer = undef; 265 for $TokenSpecRef (@TokensSpec) { 266 $ChainedLexer = defined $ChainedLexer ? $This->_GenerateLexerForToken($ChainedLexer, @{$TokenSpecRef}) : $This->_GenerateLexerForToken($InputIteratorRef, @{$TokenSpecRef}); 267 } 268 269 $This->{ChainedLexer} = $ChainedLexer; 270 271 return $This; 272 } 273 274 275 # Generate a lexer using specifed token specification using specified input or 276 # input retrieved using another token lexer. The lexer retrieving input from the 277 # specified input stream is at the bottom of the chain. 278 # 279 sub _GenerateLexerForToken { 280 my($This, $InputIteratorOrLexer, $TokenLabel, $RegexPattern, $TokenMatchActionRef) = @_; 281 my($TokenMatchAndSplitRef, $InputBuffer, @ProcessedTokens); 282 283 # Input buffer for a specific lexer in chained lexers containing unprocessed 284 # text for token specifications retrieved from a downstrean lexer or intial 285 # input... 286 # 287 $InputBuffer = ""; 288 289 # @ProcessedTokens contains either references to an array containing token label 290 # and matched text or any unmatched input text string... 291 # 292 @ProcessedTokens = (); 293 294 # Setup a default annoymous function reference to generate an array reference 295 # containing $Token and text matched to $RegexPattern. 296 # 297 $TokenMatchActionRef = defined $TokenMatchActionRef ? $TokenMatchActionRef : sub { my($This, $Label, $MatchedText) = @_; return [$Label, $MatchedText] }; 298 299 # Setup an annoymous function to match and split input text using $RegexPattern for 300 # a specific token during its lexer invocation in chained lexers. 301 # 302 # The usage of parenthesis around $RegexPattern during split allows capturing of matched 303 # text, which is subsequently processed to retrieve matched $Token values. The split function 304 # inserts a "" separator in the returned array as first entry whenever $InputText starts with 305 # $RegexPattern. $InputText is returned as the only element for no match. 306 # 307 $TokenMatchAndSplitRef = sub { my($InputText) = @_; return split /($RegexPattern)/, $InputText; }; 308 309 # Setup a lexer for $TokenLabel as an annoymous function and return its reference to caller 310 # which in turns chains the lexers for all $Tokens before returning a reference to a lexer 311 # at top of the lexer chain. 312 # 313 # Perl maintains scope of all variables defined with in the scope of the current function 314 # during invocation of annoymous function even after the return call. 315 # 316 return sub { 317 my($Mode) = @_; 318 319 # Currenly supported value for mode: Peek, Next 320 # 321 $Mode = defined $Mode ? $Mode : 'Next'; 322 323 while (@ProcessedTokens == 0 && defined $InputBuffer ) { 324 # Get any new input.... 325 my $NewInput = $InputIteratorOrLexer->(); 326 327 if (ref $NewInput) { 328 # Input is an array reference containing matched token and text returned by 329 # a chained lexer downstream lexer... 330 # 331 # Match $RegexPattern in available buffer text to retieve any matched text 332 # for current $Token. $Separator might be "": $RegexPattern is at start of 333 # of $InputBuffer 334 # 335 # Process input buffer containing text to be matched for the current lexer 336 # which didn't get processed earlier during @NewTokens > 2 while loop: 337 # no match for current lexer or more input available. It maintains order 338 # of token matching in input stream. 339 # 340 my($Separator, $MatchedTokenRefOrText); 341 342 ($Separator, $MatchedTokenRefOrText) = $TokenMatchAndSplitRef->($InputBuffer); 343 if (defined $MatchedTokenRefOrText) { 344 $MatchedTokenRefOrText = $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenRefOrText); 345 } 346 347 # Collect valid token references or text... 348 push @ProcessedTokens, grep { defined $_ && $_ ne "" } ($Separator, $MatchedTokenRefOrText, $NewInput); 349 350 # Empty put buffer... 351 $InputBuffer = ""; 352 353 # Get out of the loop as processed token refererences and/or text are available... 354 last; 355 } 356 357 # Process input retrieved from downstream lexer or input iterator which hasn't 358 # been processed into tokens.. 359 if (defined $NewInput) { 360 $InputBuffer .= $NewInput; 361 } 362 363 # Retrieve any matched tokens from available input for the current lexer... 364 # 365 my(@NewTokens) = $TokenMatchAndSplitRef->($InputBuffer); 366 367 while ( @NewTokens > 2 || @NewTokens && !defined $NewInput) { 368 # Scenario 1: Complete match 369 # @NewTokens > 2 : Availability of separator, matched token text, separator. 370 # The separator might correspond to token for a token for upstream lexer followed 371 # by matched token from current lexer. It ends up getting passed to upsrteam 372 # lexer for processing. 373 # 374 # Scenario 2: No more input available from iterator or downstream lexer 375 # @NewTokens <= 2 and no more input implies any left over text in buffer. And 376 # it ends up getting passed to upsrteam for processing. 377 # 378 379 # Take off any unprocessed input text that doesn't match off the buffer: It'll be 380 # passed to upstream chained lexer for processing... 381 # 382 push @ProcessedTokens, shift @NewTokens; 383 384 if (@NewTokens) { 385 my $MatchedTokenText = shift @NewTokens; 386 push @ProcessedTokens, $TokenMatchActionRef->($This, $TokenLabel, $MatchedTokenText); 387 } 388 } 389 390 # Retrieve any leftover text from NewTokens and put it back into InputBuffer for 391 # processing by current lexer. All token references have been taken out.... 392 # 393 $InputBuffer = ""; 394 if (@NewTokens) { 395 $InputBuffer = join "", @NewTokens; 396 } 397 398 if (!defined $NewInput) { 399 # No more input from the downstream lexer... 400 $InputBuffer = undef; 401 } 402 403 # Clean up any empty strings from ProcessedTokens containing token 404 # references or text... 405 @ProcessedTokens = grep { $_ ne "" } @ProcessedTokens; 406 407 } 408 409 # Return reference to an array containing token and matched text or just unmatched input text... 410 my $TokenRefOrText = undef; 411 412 if (@ProcessedTokens) { 413 # Get first available reference either by just peeking or removing it from the list 414 # of available tokens... 415 $TokenRefOrText = ($Mode =~ /^Peek$/i) ? $ProcessedTokens[0] : shift @ProcessedTokens; 416 } 417 418 return $TokenRefOrText; 419 }; 420 } 421 422 # Is it a lexer object? 423 sub _IsLexer { 424 my($Object) = @_; 425 426 return (Scalar::Util::blessed($Object) && $Object->isa($ClassName)) ? 1 : 0; 427 } 428 429 # Return a string containing information about lexer... 430 sub StringifyLexer { 431 my($This) = @_; 432 my($LexerString); 433 434 $LexerString = "Lexer: PackageName: $ClassName; " . $This->_GetLexerInfoString(); 435 436 return $LexerString; 437 } 438 439 # Return a string containing information about lexer... 440 sub _GetLexerInfoString { 441 my($This) = @_; 442 my($LexerInfoString, $TokensSpec, $TokenSpec, $TokenLabel, $TokenMatchRegex, $TokenMatchAction); 443 444 $LexerInfoString = "InputType: $This->{InputType}"; 445 446 if ($This->{InputType} =~ /^String$/i) { 447 $LexerInfoString .= "; InputString: $This->{Input}"; 448 } 449 450 $TokensSpec = "TokensSpecifications: <None>"; 451 if (@{$This->{TokensSpec}}) { 452 $TokensSpec = "TokensSpecifications: < [Label, MatchRegex, MatchAction]:"; 453 for $TokenSpec (@{$This->{TokensSpec}}) { 454 ($TokenLabel, $TokenMatchRegex) = @{$TokenSpec}; 455 $TokenMatchAction = (@{$TokenSpec} == 3) ? "$TokenSpec->[2]" : "undefined"; 456 $TokensSpec .= " [$TokenLabel, $TokenMatchRegex, $TokenMatchAction]"; 457 } 458 $TokensSpec .= " >"; 459 } 460 461 $LexerInfoString .= "; $TokensSpec"; 462 463 return $LexerInfoString; 464 } 465