"""
Author:
Created:
Purpose:
Course:
Notes:
History:
"""
# ------------------------------------------------ STEP 1: PRELIMINARIES / ENVIRONMENT SETUP
# import the libraries we'll need
import ply.lex as lex # lexical analysis / tokenization
import ply.yacc as yacc # parser
from ASTNODE import ASTNODE # simple class for creating nodes for an Abstract Syntax Tree (AST)
from Common import Common # a useful class and method for getting the type of an object
from ReadFile import ReadFile  # a simple but useful class for reading files
# ------------------------------------------------ STEP 2: SET UP LEXER
"""
See Section 4 of https://www.dabeaz.com/ply/ply.html. Sections 4 through 4.4.
To handle reserved words, you should write a single rule to match an identifier and do a special name lookup
in a function like this:
reserved = {
'if' : 'IF',
'then' : 'THEN',
'else' : 'ELSE',
'while' : 'WHILE',
...
}
Note:
* t_ definitions with regular expressions do not need to be created for reserved words
* reserved words represent concrete syntax in the source language of your compiler
"""
reserved = {}
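# A hedged sketch (commented out, not part of this minimal grammar): once the
# reserved dictionary above is populated, a single identifier rule can
# reclassify matches via a name lookup, as in the PLY manual. This assumes an
# 'ID' entry is also added to the tokens list below.
# reserved = {
#     'if': 'IF',
#     'then': 'THEN',
#     'else': 'ELSE',
#     'while': 'WHILE',
# }
#
# def t_ID(t):
#     r'[a-zA-Z_][a-zA-Z_0-9]*'
#     t.type = reserved.get(t.value, 'ID')  # check for reserved words
#     return t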
"""
Tokens are elements of your grammar that are not reserved words, for example an ID for an identifier,
or DQ_STRING for a string wrapped with double quotes. A list of tokens is required.
Example: tokens = ['ID', 'NUMBER', ... ]
"""
# noinspection SpellCheckingInspection
tokens = [
    'NUMBER'
]
tokens += list(reserved.values())
"""
See Section 4.8 of https://www.dabeaz.com/ply/ply.html.
Literals are single characters that you will use in your language's concrete syntax but do not need
t_ definitions created for them.
Example: literals = ['(', ')', "+", "-", "%", "*", "/", "=", ";"]
"""
literals = []
# helper function to create a Python int or float as needed, tagged with its type name
def string_to_number(s):
    try:
        ans = (int(s), "integer")
    except ValueError:
        ans = (float(s), "float")
    return ans
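# Example: string_to_number("42") returns (42, "integer");
# string_to_number("4.2e1") returns (42.0, "float").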
# noinspection PyPep8Naming
# noinspection PySingleQuotedDocstring
def t_NUMBER(t):
    r'[0-9]*\.?[0-9]+([eE][-+]?[0-9]+)?'  # 2024-02-14, DMW: matches integers and floating point
    # https://www.regular-expressions.info/floatingpoint.html
    # r'\d+'  # original regular expression -- allowed integers only
    t.value = string_to_number(t.value)  # was: int(t.value)
    return t
# See section 4.5 - https://www.dabeaz.com/ply/ply.html - Lexer rules for ignoring text for tokenization
# noinspection PyPep8Naming
def t_COMMENT(t):
    r'\#.*'
    # No return value. Token discarded.
    pass
# See sections 4.6, 4.7 - https://www.dabeaz.com/ply/ply.html
# characters to ignore as whitespace, space, tab, vertical tab, form feed
t_ignore = " \t\v\f"
# noinspection PySingleQuotedDocstring
def t_newline(t):
    r'\n+'
    t.lexer.lineno += t.value.count("\n")
# Compute the column of a token.
# input_text is the input text string
# token is a token instance
def find_column(input_text, token):
    line_start = input_text.rfind('\n', 0, token.lexpos) + 1
    return (token.lexpos - line_start) + 1
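# Usage sketch (hypothetical): report the column of a bad token from inside
# t_error, using the lexer's lexdata attribute, which holds the full input string:
#     col = find_column(t.lexer.lexdata, t)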
# See section 4.9 - Error Handling - https://www.dabeaz.com/ply/ply.html
def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)
# See section 4.10 - EOF Handling - https://www.dabeaz.com/ply/ply.html
# Note: This method is not required and may not be necessary
# EOF handling rule
# def t_eof(t):
#     # Get more input (Example)
#     more = input('... ')
#     if more:
#         t.lexer.input(more)
#         return t.lexer.token()
#     return None
# See sections 4.11 - Building and using the lexer - https://www.dabeaz.com/ply/ply.html
# Build the lexer
lexer = lex.lex()
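# A quick standalone check of the lexer (a sketch; normally the parser below
# drives tokenization). Uncomment to dump the token stream:
# lexer.input("17 4.2e1 # trailing comment")
# for tok in lexer:
#     print(tok.type, tok.value, tok.lineno, tok.lexpos)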
# ------------------------------------------------ STEP 3: SET UP THE PARSER
# Note: an example precedence table is shown below; setting precedence helps resolve ambiguity with binary operators
# noinspection SpellCheckingInspection
# precedence = (
# ('left', 'LT', 'LE', 'EQ', 'NEQ', 'GE', 'GT'),
# ('left', '+', '-'),
# ('left', '*', '/', '%'),
# ('right', 'UMINUS'),
# )
program = None
start = "program" # set the start production, even though the first production is the start by default
# noinspection PyPep8Naming
# noinspection PySingleQuotedDocstring
def p_PROGRAM(p):
    "program : number"
    global program
    program = ASTNODE("program", children=[p[1]])
# noinspection PyPep8Naming
def p_NUMBER(p):
    "number : NUMBER"
    p[0] = ASTNODE("number", value=p[1])
    # print(p[1])  # debugging
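# A hedged sketch of how the grammar could grow (commented out; it assumes '+'
# is added to the literals list and the precedence table above is uncommented
# to resolve the ambiguity this rule introduces):
# def p_NUMBER_PLUS(p):
#     "number : number '+' number"
#     p[0] = ASTNODE("add", children=[p[1], p[3]])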
# a p_error(p) rule is required
def p_error(p):
    if p:
        print("Syntax error at '%s'" % p.value)
    else:
        print("Syntax error at EOF")
parser = yacc.yacc()
# ------------------------------------------------ STEP 4: USE THE PARSER
if __name__ == "__main__":
source_program = "17"
ast = parser.parse(source_program)
print(program)
ASTNODE.render_tree(program)