#
# verified to revision 5969 - use svn blame unilex.icn &>unilex.blame to get new revisions
# have added the regexp changes not included in the main repository
#
#
# A hand-written lex(1)-compatible Unicon lexer.
#
link escape
#$define debuglex 1
$ifndef NoPatternIntegration
$define PatternIntegration 1
$endif
global yytext, #GV: this variable holds the current
#: text that has been collected during
#: the token processing in the current
#: lexer being used.
yyin, #GV: this variable holds either the current
#: file being parsed or a list of the
#: previously read files for which
#: $include statements were found.
yytoken, #GV: holds the current token that has been
#: found by the current lexer in use. The
#: value will always be a token record.
debuglex #GV: if the lexer is to be debugged, set this
#: to any non-null value.
global OctalCharacters, #GV: cset of octal digits
DecimalCharacters, #GV: cset of decimal digits
LetterCharacters, #GV: cset of letters, includes "_",
#: valid first character of identifiers
HexadecimalCharacters, #GV: cset of hexadecimal digits
RadixNumberCharacters, #GV: cset of digits and letters, used
#: for radix defined numbers
FS, #GV: Not used
IS, #GV: Not used
WhitespaceCharacters, #GV: cset of whitespace characters
idchars #GV: cset of valid characters used in
#: identifiers from 2nd character on
global errors #GV: holds a count of the current number of
#: errors that have been found by both the
#: lexer and parser.
#
# global variable to reference the relevant lexer procedure
#
global yylex2 #GV: This variable holds the current lexer
#: in use. Allows for specialised lexers
#: to be incorporated as required. In
#: the current case allows for a
#: specialised lexer to be used for regex.
# E [Ee][+-]?{D}+
#
#
# tokens are records mainly for historical reasons at this point.
# unilex.icn can probably still be compiled with icont, simplifying
# bootstrapping, but if not for that, token should become a class.
#
#RD:
#: token() - contains the collected information for a token. It will include a
#: token type, the string value of the token, the line number/column number and
#: the filename in which the token has been found. Within the code, certain tokens
#: are generated which are not actually found in the source code (list comprehensions
#: are one such code example) and the line number/column number and source are a
#: set of values that indicate this. The source filename in these cases is "__faux.icn"
#:
record token(
tok, #RF: the token type for this token,
#: see the token type codes as found
#: in the source file ytab_h.icn.
s, #RF: string representation of the token
line, #RF: line number in the source code
#: where this token has been found
column, #RF: column that is the start of the token
filename #RF: name of the source file in which
#: token has occurred
)
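#
# An illustrative instance (hypothetical values): scanning the
# identifier "count" at line 12, column 5 of main.icn would produce
#
#   token(IDENT, "count", 12, 5, "main.icn")
#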
#PD:
#: init_csets() - a procedure to initialise a number of global variables
#: to hold the various csets that will be used in scanning for the next
#: token in the input source. The original names used in the code were
#: cryptic single character names. They have been renamed to reflect what
#: they actually represent. An example of this is O being renamed to
#: OctalCharacters. A number of these were not actually used in the lexer
#: or elsewhere in the compiler.
#:
#: The value associated with RadixNumberCharacters and its need within the
#: lexer has changed. A different process has been used to determine if a
#: radix defined number is valid or not. The original code left this up to
#: the icont compiler to determine if it was of the correct format. The
#: unicon compiler now determines this, in part for the preparation of the
#: unicon compiler to take on more functionality.
#:
procedure init_csets()
OctalCharacters := '01234567' #GI: We set the global variable to hold
#: those characters that are valid for
#: octal digits (01234567).
#:
DecimalCharacters := &digits #GI: We set the global variable to hold
#: those characters that are valid for
#: decimal digits (0123456789)
#:
LetterCharacters := &letters ++ '_'
#GI: We set the global variable to hold
#: those characters that are valid
#: letter characters. This will include
#: both lower and upper case latin alphabet
#: characters and the underscore character (_)
#:
HexadecimalCharacters := &digits ++ 'abcdefABCDEF'
#GI: We set the global variable to hold
#: those characters which are valid
#: hexadecimal characters. This includes
#: the decimal characters and the lower and upper
#: case letters (abcdefABCDEF)
#:
RadixNumberCharacters := &digits ++ &letters
#GI: In relation to the radix defined numbers,
#: the base for these numbers ranges from 2 to
#: 36. As such, we use the letters to represent
#: the relevant additional characters needed for
#: all bases above 10. Hexadecimal is
#: covered as well, having been previously defined.
#: We set the global variable to handle the entire
#: possible range of characters required (determined
#: by base 36) and this will include the decimal digits
#: and the lower and upper case latin letters.
#:
#FS := 'fFlL' # Not used anywhere
#IS := 'uUlL' # Not Used anywhere
WhitespaceCharacters := ' \t\v' #GI: We set the global variable to the valid whitespace
#: characters, which are the space, tab and vertical tab
#: characters
#:
idchars := LetterCharacters ++ DecimalCharacters
#GI: We set the global variable to the valid identifier
#: characters (position 2 onwards) which include the
#: upper and lower case latin characters, the decimal
#: digits and the underscore character
#:
end
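#
# Illustrative use in string scanning (a hypothetical fragment, not part
# of the lexer itself): once init_csets() has run, an identifier can be
# matched with
#
#   ident := tab(any(LetterCharacters))
#   ident ||:= tab(many(idchars))
#
# which mirrors how do_letters() below consumes identifiers.
#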
$include "ytab_h.icn"
global yylineno, #GV: this will hold the current line number in the
#: current file being processed.
#:
yycolno, #GV: this will hold the current column number in
#: the current line being processed.
#:
yyfilename #GV: this is the name of the current file being
#: processed
#:
global tokflags #GV: the tokflags will tell you whether
#: the token can start an expression
#: or end an expression, as well as
#: whether a newline was seen since
#: the last token
#:
#
# since an identifier token (as either a reserved word or an actual identifier)
# can be the start of some parse expression or it can end a parse
# expression or be neither, instead of just using "magic" numbers to indicate
# this, we will give descriptive names to these values. We use the following defines
# to specify the relevant values. Since these are flag values, we make sure that
# each defined value is a power of 2 to specify the bit needed.
#
$define Neither 0 #GD: the token is not found at the beginning
#: or the ending of a parse expression
$define Beginner 1 #GD: the token can begin a parse expression
$define Ender 2 #GD: the token can end a parse expression
$define Newline 4 #GD: the newline character has been found
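#
# For example, after scanning the reserved word "return" (Beginner+Ender)
# followed by a newline, tokflags would hold Beginner+Ender+Newline = 7.
# Combinations like this are what allow the lexer/parser to decide where
# an implicit semicolon may be inserted between lines.
#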
global lastid #GV: This global does not appear to be used in the source code.
global buffer #GV: holds the current text being parsed.
global lastchar #GV: This global is initialised but is not otherwise used.
#PD:
#: reswords() - this procedure is only ever called once and in the calling of it,
#: the global variable which initially holds the procedure value as defined in
#: the code below is overwritten by the results returned by this procedure.
#: Thereafter, reswords holds a table value and for all intents and purposes
#: the procedure value is no longer accessible.
#:
procedure reswords()
static t
initial {
t := table([Beginner+Ender, IDENT])
t["abstract"] := [Neither, ABSTRACT ]
t["break"] := [Beginner+Ender, BREAK ]
t["by"] := [Neither, BY ]
t["case"] := [Beginner, CASE ]
t["class"] := [Neither, CLASS ]
t["create"] := [Beginner, CREATE ]
t["critical"] := [Beginner, CRITICAL ]
t["default"] := [Beginner, DEFAULT ]
t["do"] := [Neither, DO ]
t["else"] := [Neither, ELSE ]
t["end"] := [Beginner, END ]
t["every"] := [Beginner, EVERY ]
t["fail"] := [Beginner+Ender, FAIL ]
t["global"] := [Neither, GLOBAL ]
t["if"] := [Beginner, IF ]
t["import"] := [Neither, IMPORT ]
t["initial"] := [Beginner, iconINITIAL ]
t["initially"] := [Ender, INITIALLY ]
t["invocable"] := [Neither, INVOCABLE ]
t["link"] := [Neither, LINK ]
t["local"] := [Beginner, LOCAL ]
t["method"] := [Neither, METHOD ]
t["next"] := [Beginner+Ender, NEXT ]
t["not"] := [Beginner, NOT ]
t["of"] := [Neither, OF ]
t["package"] := [Neither, PACKAGE ]
t["procedure"] := [Neither, PROCEDURE ]
t["record"] := [Neither, RECORD ]
t["repeat"] := [Beginner, REPEAT ]
t["return"] := [Beginner+Ender, RETURN ]
t["static"] := [Beginner, STATIC ]
t["suspend"] := [Beginner+Ender, SUSPEND ]
t["then"] := [Neither, THEN ]
t["thread"] := [Beginner, THREAD ]
t["to"] := [Neither, TO ]
t["until"] := [Beginner, UNTIL ]
t["while"] := [Beginner, WHILE ]
}
return t
end
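#
# Example lookups (after the initial call overwrites the procedure value
# with the table):
#
#   reswords["while"]  yields [Beginner, WHILE]
#   reswords["fail"]   yields [Beginner+Ender, FAIL]
#   reswords["foobar"] yields the default [Beginner+Ender, IDENT]
#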
#PD:
#: lex_error() - print a message when a token is not recognised. This is
#: not used in the compiler. It does, however, appear in the Robert Parlett parser
#: application. The replacement for this is the uni_error procedure. This code should
#: be cleaned up in all systems that use the lexer processes.
#:
procedure lex_error()
yyerror("token not recognized")
end
#PD:
#: uni_error(s) - print and count the error messages being printed. If
#: called with no parameter, a standard error message is displayed.
#:
#:@param s string message to be displayed
#:@fails always fails, returns no results.
#:
procedure uni_error(s)
#
# if errors is not initialised, set the value to 0
#
/errors := 0
#
# if no message is supplied, set the value to a standard message
#
/s := "token not recognized"
write("uni_error calls yyerror ", image(s))
yyerror(s)
errors +:= 1
end
#PD:
#: dyslexia() - print a message in those cases which have caused problems
#: in coding previously. These arise when a misunderstanding or a typo
#: has occurred, but the result is still legal unicon source.
#:
#: An example is the occurrence of :+ (as in [i:+j]) which is legal but
#: what was wanted was +: instead (as in [i+:j]).
#:
#: This will print out a warning message to &errout but will not stop the
#: compilation from occurring.
#:
procedure dyslexia()
static messages #SV: this will hold a table of
#: possible character sequences
#: and the associated warnings
local testing_value, #LV: temporary to hold the current
#: key sequence from messages
current_pos #LV: the current position in the
#: source being examined. This is
#: used to reset the scanning
#: position for the actual token
#: scanning.
#
# initialise the different kinds of possible errors that are nonetheless legal unicon
#
initial {
messages := table()
messages[":+"] := "token may be malformed, did you mean +: ?"
messages[":-"] := "token may be malformed, did you mean -: ?"
messages["&&"] := "pattern operator && is non-standard, did you mean & ?"
messages["keys"] := "identifier may be malformed, did you mean \"key\" ?"
messages["procs"] := "identifier may be malformed, did you mean \"proc\" ?"
}
#
# we test each of the test strings against the current position which
# is 1 character back from the &pos and if found print a warning and
# exit loop
#
current_pos := &pos - 1
every testing_value := key(messages) do {
#
# this is to ensure that the testing position is always the same
# position each time.
#
&pos := current_pos
if =testing_value then {
warning(messages[testing_value], yylineno, yyfilename, testing_value)
break
}
}
#
# always restore the position for scanning back to the original value
# that &pos had before entry to this procedure
#
&pos := current_pos + 1
end
#PD:
#: yylex2Normal() - The original lexer procedure has been renamed from
#: yylex2 to yylex2Normal to allow for multiple lexers to be used within
#: the unicon compiler. This was brought about because of the use of a
#: sub-language for the regex facilities being introduced. The alternate
#: lexers can then be initiated by a specific token within the grammar.
#:
#: This is the standard unicon lexer.
#:
procedure yylex2Normal()
local rv, #LV: holds the value to be returned
#: from the token-handling procedure
new_filename #LV: a temporary used in determining
#: if a new filename has been
#: specified in the source code
#: via the use of a #line directive
static punc_table #SV: holds a table that points to
#: the appropriate procedure for
#: handling the next token to be
#: found in the input based on
#: the current character that has
#: been seen in the input source
initial {
#
# set up a number of csets that will be used in the string scanning
# processes for determining the next token in the input source.
#
init_csets()
#
# this uses the specific functionality of the unicon/icon language that
# all procedure definitions automatically create a global variable whose value
# can then be overwritten (hence, losing access to the original function). We
# do this here to have the global variable hold a special table which
# will return a 2 element list about the kind of identifier token found. This
# is applicable to both reserved words and all other identifiers that
# are valid within the unicon language. The first element of the list
# gives information about whether this identifier can start or finish
# an expression after/before a new line is encountered. The second
# element of the list is the applicable identifier/reserved word lexical
# type that will be used by the parser.
#
reswords := reswords()
#
# as we encounter a character in the input, we can use this to select
# the specific lexical procedure to handle the various associated lexical entities.
# if the character found is not a valid identifier or other symbol,
# a specific error procedure will be called to handle this situation
#
punc_table := table(uni_error)
punc_table["'"] := do_literal
punc_table["\""] := do_literal
punc_table["!"] := do_bang
punc_table["%"] := do_mod
punc_table["&"] := do_and
punc_table["*"] := do_star
punc_table["+"] := do_plus
punc_table["-"] := do_minus
punc_table["."] := do_dot
punc_table["/"] := do_slash
punc_table[":"] := do_colon
punc_table["<"] := do_less
punc_table["="] := do_equal
punc_table[">"] := do_greater
punc_table["?"] := do_qmark
punc_table["@"] := do_at
punc_table["\\"] := do_backslash
punc_table["^"] := do_caret
punc_table["|"] := do_or
punc_table["~"] := do_tilde
punc_table["("] := do_lparen
punc_table[")"] := do_rparen
punc_table["["] := do_lbrack
punc_table["]"] := do_rbrack
punc_table["{"] := do_lbrace
punc_table["}"] := do_rbrace
punc_table[","] := do_comma
punc_table[";"] := do_semi
punc_table["$"] := do_dollar
punc_table["`"] := do_backquote
every punc_table[!DecimalCharacters] := do_digits
every punc_table[!LetterCharacters] := do_letters
}
yycolno +:= *yytext
repeat {
if pos(0) then {
fail
} else if ="#" then {
if ="line " then {
if yylineno := integer(tab(many(&digits))) then {
=" \""
new_filename := tab(find("\"")|0)
if *new_filename > 0 then {
yyfilename := new_filename
}
}
}
tab(find("\n") | 0)
} else if ="\n" then {
yylineno +:= 1
yycolno := 1
if tokflags < Newline then {
tokflags +:= Newline
}
} else if tab(any(' ')) then {
yycolno +:= 1;
} else if tab(any('\v\^l')) then {
# skip these
} else if tab(any('\t')) then {
yycolno +:= 1
while (yycolno-1) % 8 ~= 0 do {
yycolno +:= 1
}
} else {
yytext := move(1)
#
# could we put a test in here for the various kinds of oopsies that
# Clinton was talking about in his email.
#
dyslexia()
#
# the actual scanning for the next token
#
if rv := punc_table[yytext]() then {
return rv
}
}
}
end
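#
# Example of the #line directive handling above (illustrative): a source
# line such as
#
#   #line 42 "original.icn"
#
# resets yylineno to 42 and yyfilename to "original.icn" before the rest
# of the directive line is discarded.
#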
###############################################################################
###############################################################################
#
# New regex lexer - switched on by the grammar reaching the specific production
# that starts the regex grammar rules
#
# This is the example using < and >
#
# The normal lexer is switched back when the REGEXEND token is encountered
# and returned to the parser.
#
###############################################################################
#
# The global variables below are used by the regex lexer to handle various
# conditions within a regex
#
#
global regexskipchars, #GV: holds the cset of special characters that
#: represent various regex controls, included
#: in this cset is the character that starts
#: the REGEXEND delimiter,
#:
debugwrite, #GV: used to hold the debugging function when
#: testing the lexer after making changes,
#: the value is either write or 1. In normal
#: running, this value is 1
#:
regexintlit, #GV: this is a flag to indicate that the next
#: set of characters are to be collected as an integer literal
#:
regexnoskip #GV: this is a flag to indicate whether or not
#: to collect following characters into a single
#: string value
#:
#PD:
#: yylex2Regex() - This is the replacement lexer that is used when the regex
#: expressions are being processed. The global variable yylex2 is assigned this
#: procedure value on entry into the regex during parsing.
#:
procedure yylex2Regex()
local rv #LV: holds the value to be returned
#: from the token-handling procedure
static punc_table #SV: holds a table that points to the
#: appropriate procedure for handling the
#: next regex token based on the current
#: character seen in the input source
initial {
#
# each of the keys is one of the special characters recognised by the
# regex lexer. All characters not present as keys use the common handler
# routine do_regexp_common
#
punc_table := table(do_regexp_common)
punc_table["\\"] := do_regexp_backslash
punc_table["["] := do_regexp_lbrack
punc_table["]"] := do_regexp_rbrack
punc_table["("] := do_regexp_lparen
punc_table[")"] := do_regexp_rparen
punc_table["{"] := do_regexp_lbrace
punc_table["}"] := do_regexp_rbrace
punc_table["*"] := do_regexp_star
punc_table["+"] := do_regexp_plus
punc_table["|"] := do_regexp_bar
punc_table["?"] := do_regexp_qmark
punc_table["^"] := do_regexp_caret
punc_table["."] := do_regexp_dot
punc_table["-"] := do_regexp_hyphen
punc_table["\""] := do_regexp_quote
#
# These are the relevant routines that are used to determine when the
# regex is terminated. During testing the symbols (: and :) were used as
# the designators. Normal usage uses < and >.
#
#punc_table[":"] := do_regexp_colon
punc_table[">"] := do_regexp_nmgt
}
#
# When debugging changes to the lexer, uncomment the following assignment
# to get debugging output from the lexer.
#
#debugwrite := write
#
# In normal operation the following assignment ensures that no debugging
# output occurs
#
debugwrite := 1
#
# The basic design of the regex lexer is that each character is returned
# to the parser individually. However, this leads to a greatly increased
# number of pattern matching calls that will be run at runtime.
# To reduce the number of pattern matching calls, a sequence of non-special
# characters can be returned as a single string. This assignment is used to
# determine when a group of characters can be grouped together. Non-special
# characters include all the non-printable control characters, including newline.
#
#regexskipchars := '\\[]{}()*+?:.^|-"' # use if ending symbol starts with :
regexskipchars := '\\[]{}()*+?>.^|-"' # use if ending symbol is >
yycolno +:= *yytext
debugwrite("yylex2Regex: yytext:\"", yytext, "\"")
if pos(0) then {
fail
}
yytext := move(1)
debugwrite("yylex2Regex: yytext:\"", yytext, "\"")
if yytext == "\n" then {
yylineno +:= 1
yycolno := 1
}
if rv := punc_table[yytext]() then {
debugwrite("yylex2Regex: rv:", rv, " yytext:\"", yytext, "\"")
return rv
}
end
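#
# Illustrative token stream (assuming < and > as the regex delimiters):
# for the source fragment <ab*c>, successive calls deliver roughly
#
#   REGEXCHAR("a"), REGEXCHAR("b"), REGEXSTAR, REGEXCHAR("c"), REGEXEND
#
# with the "b" returned on its own because it is the operand of the
# postfix *.
#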
#PD:
#: do_regexp_common() - This procedure handles all characters that are not
#: special regex characters
#:
procedure do_regexp_common()
local i, #LV: used to locate the end of string
#: to be returned
j #LV: used to locate the start of the
#: collected string, if more than one
#: character is selected.
#
# This handles the creation of an INTLIT token that is found in the regex
# expression form {n}. At present, this code only allows a single integer
# value to be used in the {n} form
#
if \regexintlit then {
yytext ||:= tab(many(&digits))
regexintlit := &null
debugwrite("do_regexp_common: digits:", yytext)
return INTLIT
}
debugwrite("do_regexp_common: regexskip:", \regexskip | "&null")
#
# The following assignment records the current position so that we can
# determine whether multiple characters are available to collect. If
# there are, upto() below will return a position after the current &pos
j := &pos
debugwrite("do_regexp_common: skip j:", j)
#
# We can collect as many characters as possible to return as a string value
#
if /regexnoskip then {
debugwrite("do_regexp_common: skip:\"", yytext,"\"")
#
# are there a number of characters to collect?
#
i := upto(regexskipchars)
#
# if so we can collect them together
#
if i > j then {
debugwrite("do_regexp_common: skip:", i)
#
# But make sure that the character following these is not one of
# *, + or ?, as these require the last character to be removed
# from the string
#
if &subject[i] == ("*" | "+" | "?") then {
i -:= 1
debugwrite("do_regexp_common: skip:", i)
}
#
# now we collect those characters, if any of those characters are
# non-printable, they will be converted into a printable escaped
# form by the use of the internal function image().
#
yytext ||:= tab(i)
#
#
yytext := image(yytext)[2:-1]
debugwrite("do_regexp_common: skip:\"", yytext, "\"")
#
# set the flag so that we don't try to collect any more until the
# special characters found have been processed.
#
regexnoskip := 1
}
} else {
debugwrite("do_regexp_common: skip:\"", yytext, "\"")
#
# if the flag has been set, reset it
#
regexnoskip := &null
}
return REGEXCHAR
end
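#
# Worked example (illustrative): given the remaining input hello*, the
# first call collects "hell" as a single REGEXCHAR (the "o" is left
# behind because it is the operand of the postfix *), the second call
# returns REGEXCHAR for "o" alone, and the third call returns REGEXSTAR.
#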
#PD:
#: do_regexp_colon() - this procedure was used when the regex ender was set to :).
#: The code is left in as an example of how to change the regex ender string. If
#: the following character is not a ) then just return the value as a normal REGEXCHAR.
#:
procedure do_regexp_colon()
if yytext ||:= =")" then {
yylex2 := yylex2Normal
tokflags +:= Ender
regexnoskip := &null
return REGEXEND
}
return REGEXCHAR
end
#PD:
#: do_regexp_nmgt() - This is the normal REGEXEND processing code.
#:
procedure do_regexp_nmgt()
#
# we need to reset the lexer being used to the normal unicon/icon lexer.
# we do it here as this is where we have found the terminating character for
# the regex processing. the procedure above [do_regexp_colon] gives an example
# of how to deal with a multi-character termination string. This procedure
# is the example to use if an alternative regex ending character is used. It
# does need to be co-ordinated with the appropriate changes in the grammar. See
# the non-terminal definition for [expr11] in the appropriate [.y] file.
#
yylex2 := yylex2Normal
tokflags +:= Ender
regexnoskip := &null
return REGEXEND
end
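#
# Sketch of the lexer-switching protocol (as assumed from the code
# above): the grammar action that recognises the start of a regex
# assigns
#
#   yylex2 := yylex2Regex
#
# and this procedure restores yylex2 := yylex2Normal once the closing >
# is seen, so the very next token request goes through the normal lexer.
#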
#PD:
#: convert_backslash() - This procedure handles the processing of escapes of characters.
#: The handling of \xhh and \^c and \ddd is now done. Each form is converted to
#: the actual character and then back to the escaped form as designated by the
#: internal function image(). If the character that has been
#: escaped is not one of the designated escape characters then just return
#: the character as itself.
#:
#: We treat " and ' as special cases because of their use in strings, csets and regexes.
#:
procedure convert_backslash()
static conversion_set #SV: holds the default escaped conversion
#: strings
local ch, #LV: temporary to hold a character being tested
ch2, #LV: temporary to hold a following character being tested
rval #LV: hold the value to be returned
initial {
#
# The backslash is escaped so as to allow the correct processing by the
# icont compiler.
#
conversion_set := table()
conversion_set["\\"] := "\\\\"
conversion_set["b"] := "\\b"
conversion_set["d"] := "\\d"
conversion_set["e"] := "\\e"
conversion_set["f"] := "\\f"
conversion_set["l"] := "\\l"
conversion_set["n"] := "\\n"
conversion_set["r"] := "\\r"
conversion_set["t"] := "\\t"
conversion_set["v"] := "\\v"
conversion_set["\""] := "\\\""
conversion_set["\'"] := "\\\'"
}
# need to handle \x and \^ as both of these are possible character definitions
#
# find out what character we have escaped
#
ch := move(1)
if ch == "x" then {
if member(HexadecimalCharacters, &subject[&pos:&pos + 1]) then {
ch := move(1)
}
if member(HexadecimalCharacters, &subject[&pos:&pos + 1]) then {
ch ||:= move(1)
}
#
# convert the hex characters to the equivalent integer value and then
# to the specific character. image() will convert the character to a
# standard format
#
rval := image(char(integer("16r" || ch)))[2:-1]
} else if ch == "^" then {
ch := move(1)
#
# convert the control characters to the equivalent integer value using
# the lower 5 bits and then to the specific character. image() will
# convert the character to a standard format
#
rval := image(char(iand(ord(ch), 31)))[2:-1]
} else if member(OctalCharacters, ch) then {
if member(OctalCharacters, &subject[&pos:&pos + 1]) then {
ch ||:= move(1)
}
if member(OctalCharacters, &subject[&pos:&pos + 1]) then {
ch ||:= move(1)
}
#
# convert the octal characters to the equivalent integer value and then
# to the specific character. image() will convert the character to a
# standard format
#
rval := image(char(integer("8r" || ch)))[2:-1]
} else {
#
# set the text to either the correct encoding or just the character itself
#
rval := (\conversion_set[ch] | ch )
#
}
return rval
end
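#
# Worked examples (illustrative):
#
#   \x41  ->  char(16r41)             ->  "A"
#   \^a   ->  char(iand(ord("a"),31)) ->  control-A, rendered as \x01
#   \101  ->  char(8r101)             ->  "A"
#   \q    ->  "q" (not a recognised escape, returned unchanged)
#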
#PD:
#: do_regexp_backslash() - this procedure handles the backslash processing for
#: the regex lexer
#:
procedure do_regexp_backslash()
yytext := convert_backslash()
# we need to reset the no skip condition and return a REGEXCHAR
#
regexnoskip := &null
return REGEXCHAR
end
#PD:
#: do_regexp_quote() - this procedure converts a " character into the encoded
#: format for the pattern matching routines.
#:
procedure do_regexp_quote()
yytext := "\\\""
regexnoskip := &null
return REGEXCHAR
end
#
# each of the following procedures returns the specific code for the special
# character found and resets the no skip condition.
#PD:
#: do_regexp_star() - processes the regex * postfix operator
#:
#:
procedure do_regexp_star()
regexnoskip := &null
return REGEXSTAR
end
#PD:
#: do_regexp_plus() - processes the regex + postfix operator
#:
procedure do_regexp_plus()
regexnoskip := &null
return REGEXPLUS
end
#PD:
#: do_regexp_bar() - processes the regex | infix operator
#:
procedure do_regexp_bar()
regexnoskip := &null
return REGEXBAR
end
#PD:
#: do_regexp_qmark() - processes the ? postfix operator
#:
procedure do_regexp_qmark()
regexnoskip := &null
return REGEXQMARK
end
#PD:
#: do_regexp_caret() - processes the ^ operator
#:
procedure do_regexp_caret()
regexnoskip := &null
return REGEXCARET
end
#PD:
#: do_regexp_dot() - processes the . operator
#:
procedure do_regexp_dot()
regexnoskip := &null
return REGEXDOT
end
#PD:
#: do_regexp_hyphen() - processes the - infix operator
#:
procedure do_regexp_hyphen()
regexnoskip := &null
return REGEXHYPHEN
end
#PD:
#: do_regexp_lbrack() - processes the [ prefix operator
#:
procedure do_regexp_lbrack()
regexnoskip := &null
return REGEXLBRACK
end
#PD:
#: do_regexp_rbrack() - processes the ] postfix operator
#:
procedure do_regexp_rbrack()
regexnoskip := &null
return REGEXRBRACK
end
#PD:
#: do_regexp_lparen() - processes the ( prefix operator
#:
procedure do_regexp_lparen()
regexnoskip := &null
return REGEXLPAREN
end
#PD:
#: do_regexp_rparen() - processes the ) postfix operator
#:
procedure do_regexp_rparen()
regexnoskip := &null
return REGEXRPAREN
end
#PD:
#: do_regexp_lbrace() - processes the { prefix operator
#:
procedure do_regexp_lbrace()
regexnoskip := &null
return REGEXLBRACE
end
#PD:
#: do_regexp_rbrace() - processes the } postfix operator
#:
procedure do_regexp_rbrace()
regexnoskip := &null
return REGEXRBRACE
end
###############################################################################
#
# End of new lexer code
#
###############################################################################
#PD:
#: do_letters() - this procedure will collect as many characters as possible that
#: are part of the valid set of id characters. The global table reswords holds
#: all reserved words and returns the relevant list associated with each of those
#: reserved words. The table has been created to return a standard value for any
#: entry that is not found in this table. This is the applicable value for all
#: other identifiers
#:
procedure do_letters()
local x #LV: temporary to hold list returned
#: from reswords table lookup
#
# yytext already contains the first character of the identifier, we need to
# append to yytext all the following valid identifier characters.
#
yytext ||:= tab(many(idchars))
#
# using the fact that the global variable has had the procedure value
# originally assigned by the runtime overwritten by the results of the original
# call. It now holds a table that automatically returns the required information
# needed for parsing identifiers and reserved words.
#
x := reswords[yytext]
#
# each of the reserved words has a specific set of token flags that are
# specified in the first list element of the result and for all other
# identifiers, the generic token flags applicable are found in the first list
# element of the not found entry.
#
tokflags +:= x[1]
#
# the second list element specifies the type of identifier found, for each
# reserved word, a specific to that reserved word value is returned, for all
# other identifiers, it is the value IDENT
#
return x[2]
end
#PD:
#: radixcset(radix) - this procedure returns the required radix cset for the valid
#: characters applicable for any radix from 2 to 36, the radix is specified as a
#: string starting with digits and terminated by either r or R,
#: for example "24r" or "15R".
#:
#: this ensures that errors in the radix specification are found at the earliest
#: and is not left to the icont or iconc translation process. This allows for a
#: possible future where the unicon compiler generates the icode files instead
#: of icont.
#:
procedure radixcset(radix)
static lcase, #SV: the lower case cset converted to a string
ucase, #SV: the upper case cset converted to a string
digits #SV: the digits cset converted to a string
local i, #LV: used as index into strings for the valid
#: characters applicable to the specified radix
cset1 #LV: the resultant cset of valid radix characters
initial {
lcase := "" || &lcase
ucase := "" || &ucase
digits := "" || &digits
}
if i := (10 >= integer(radix[1:-1])) + 1 then {
cset1 := digits[1:i]
} else {
cset1 := &digits
#
# calculate the number of letters (ucase/lcase) required
i := (10 < integer(radix[1:-1])) - 9
cset1 ++:= cset(lcase[1:i]) ++ cset(ucase[1:i])
}
return cset1
end
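#
# Worked examples (illustrative):
#
#   radixcset("8r")  returns '01234567'
#   radixcset("16r") returns '0123456789abcdefABCDEF'
#   radixcset("36R") returns &digits ++ &lcase ++ &ucase
#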
#PD:
#: do_digits() - this procedure handles all the various formats that numbers are
#: able to take and includes integers, reals, radix and decimal multiplier formats.
#: If additional number formats become an option, this will be the place in which
#: they will be decoded. Such formats could include rational and complex number
#: formats.
#:
procedure do_digits()
local radix, #LV: the radix for radix defined integers
c, #LV: checks for any following alpha character
#: after a KMGTP
expstr, #LV: temporary used to determine if real
#: exponent is greater than 308
dsz, #LV: this is a temporary for the purposes