frontend_pythonre.py
import ply.yacc as yacc
import ast_refined
import re2lexer

tokens = re2lexer.tokens

debug = False
NUM_CHARS = 256
MAX_SEQUENTIAL_TEST = 64

# grammar structure obtained from
# https://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap09.html


def p_regex(p):
    'regex : alternative'
    p[0] = ast_refined.whole_regexp(p[1])
def p_alternative(p):
    '''alternative : alternative OR concatenation
                   | alternative OR
                   | concatenation'''
    if len(p) >= 4:
        # alternative OR concatenation: flatten nested alternatives so a
        # single node keeps the whole list of branches
        if (isinstance(p[1], ast_refined.alternative) and
                isinstance(p[3], ast_refined.alternative)):
            for c in p[3].children:
                p[1].append(c)
            p[0] = p[1]
        elif isinstance(p[1], ast_refined.alternative):
            p[1].append(p[3])
            p[0] = p[1]
        elif isinstance(p[3], ast_refined.alternative):
            p[3].append(p[1])
            p[0] = p[3]
        else:
            p[0] = ast_refined.alternative(p[1], p[3])
    elif len(p) == 3:
        # alternative OR with an empty right branch: model it as an epsilon move
        if isinstance(p[1], ast_refined.alternative):
            p[1].append(ast_refined.epsilon_move())
            p[0] = p[1]
        else:
            p[0] = ast_refined.alternative(p[1], ast_refined.epsilon_move())
    else:
        p[0] = p[1]
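
# Worked example (illustrative only): parsing 'a|b|c' first builds
# alternative(a, b); the next OR appends c to that same node, so the result
# is one alternative with children [a, b, c] rather than the nested
# alternative(alternative(a, b), c). 'a|' instead yields
# alternative(a, epsilon_move()).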
def p_concatenation(p):
    '''concatenation : concatenation repetition
                     | repetition'''
    if len(p) >= 3:
        # flatten nested concatenations, mirroring p_alternative above
        if (isinstance(p[1], ast_refined.concatenation) and
                isinstance(p[2], ast_refined.concatenation)):
            for c in p[2].children:
                p[1].append(c)
            p[0] = p[1]
        elif isinstance(p[1], ast_refined.concatenation):
            p[1].append(p[2])
            p[0] = p[1]
        elif isinstance(p[2], ast_refined.concatenation):
            p[1] = ast_refined.concatenation(p[1])
            for c in p[2].children:
                p[1].append(c)
            p[0] = p[1]
        else:
            p[0] = ast_refined.concatenation(p[1], p[2])
    else:
        p[0] = p[1]
def p_repetition(p):
    '''repetition : subexpr TIMES
                  | subexpr PLUS
                  | subexpr OPT
                  | subexpr LCPAR NUM RCPAR
                  | subexpr LCPAR COMMA NUM RCPAR
                  | subexpr LCPAR NUM COMMA RCPAR
                  | subexpr LCPAR NUM COMMA NUM RCPAR
                  | subexpr '''
    # pay attention: even though some constructs are parsed by the grammar,
    # they may not be suitable to run in the accelerator.
    if len(p) == 3:
        if p[2] == '*':
            p[0] = ast_refined.any_repetition(p[1])
        elif p[2] == '+':
            p[0] = ast_refined.more_than_one_repetition(p[1])
        elif p[2] == '?':
            p[0] = ast_refined.optional_repetition(p[1])
    elif len(p) >= 5 and p.slice[2].type == 'LCPAR':
        if len(p) == 5:
            # case: subexpr LCPAR NUM RCPAR, i.e. regex{n}
            n_rep = int(p[3])
            p[0] = ast_refined.bounded_num_repetition(p[1], min_num=n_rep, max_num=n_rep)
        elif len(p) == 6 and p.slice[3].type == 'COMMA':
            # case: subexpr LCPAR COMMA NUM RCPAR, i.e. regex{,m}
            n_min_rep = 0
            n_max_rep = int(p[4])
            p[0] = ast_refined.bounded_num_repetition(p[1], min_num=n_min_rep, max_num=n_max_rep)
        elif len(p) == 6 and p.slice[4].type == 'COMMA':
            # case: subexpr LCPAR NUM COMMA RCPAR, i.e. regex{n,}
            n_min_rep = int(p[3])
            p[0] = ast_refined.min_bounded_num_repetition(p[1], min_num=n_min_rep)
        else:
            # case: subexpr LCPAR NUM COMMA NUM RCPAR, i.e. regex{n,m}
            n_min_rep = int(p[3])
            n_max_rep = int(p[5])
            p[0] = ast_refined.bounded_num_repetition(p[1], min_num=n_min_rep, max_num=n_max_rep)
    else:
        # bare subexpr: nothing to wrap
        p[0] = p[1]
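
# Worked example (illustrative, not executed): 'x{2,5}' fires the
# len(p) == 7 branch and produces bounded_num_repetition(x, min_num=2,
# max_num=5), while 'x{3,}' takes the len(p) == 6 branch with the COMMA in
# slot 4 and produces min_bounded_num_repetition(x, min_num=3).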
def p_subexpr(p):
    '''subexpr : LPAR alternative RPAR
               | LSPAR HAT group RSPAR
               | LSPAR group RSPAR
               | terminal_sequence
               | metachar
               | ANYCHAR
               | DOLLAR
               | MINUS
    '''
    if debug:  # the lexer token object is reachable through p.slice[i]
        print('try', p.slice[1], p[1])
    if p.slice[1].type == 'LPAR':
        # matched subexpr -> LPAR alternative RPAR: just pass up a reference
        # to the sub ast
        p[0] = p[2]
    elif p.slice[1].type == 'LSPAR' and p.slice[2].type == 'HAT':
        # negative group
        p[0] = match_negative_group(p[3])
    elif p.slice[1].type == 'LSPAR':
        # positive group
        p[0] = match_positive_group(p[2])
    elif p.slice[1].type == 'metachar':
        # a metachar is treated as a positive group of chars to be matched
        p[0] = match_positive_group(p[1])
    elif p.slice[1].type == 'ANYCHAR':
        # match a '.'
        p[0] = ast_refined.any_character()
    elif p.slice[1].type == 'DOLLAR':
        # match a '$'
        p[0] = ast_refined.end_of_string()
    else:
        # a terminal_sequence (or a bare MINUS)
        list_of_matches = [ast_refined.match_character(c) for c in p[1]]
        p[0] = ast_refined.concatenation(*list_of_matches)
def match_negative_group(char_set):
    # decide whether to create a sequential test or a parallel test
    if len(char_set) < MAX_SEQUENTIAL_TEST:
        return turn_into_a_negative_match_seq(char_set)
    else:
        return turn_into_a_positive_match_seq(complement_char_set(char_set))


def match_positive_group(char_set):
    # decide whether to create a sequential test or a parallel test
    if (NUM_CHARS - len(char_set)) < MAX_SEQUENTIAL_TEST:
        return turn_into_a_negative_match_seq(complement_char_set(char_set))
    else:
        return turn_into_a_positive_match_seq(char_set)


def turn_into_a_negative_match_seq(char_set):
    list_of_matches = [ast_refined.match_negative_character(c) for c in char_set]
    return ast_refined.none_of(*list_of_matches)


def turn_into_a_positive_match_seq(char_set):
    list_of_matches = [ast_refined.match_character(c) for c in char_set]
    return ast_refined.alternative(*list_of_matches)


def complement_char_set(char_set):
    char_set = list(map(lambda x: x.encode('utf-8')[0] if isinstance(x, str) else x, char_set))
    return [c for c in range(NUM_CHARS) if c not in char_set]
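
# Cost-model sketch (inferred from MAX_SEQUENTIAL_TEST, shown for
# illustration): '[^ab]' carries a 2-char set and 2 < 64, so it becomes a
# short chain of negative character tests; '[a-z]' carries 26 chars whose
# complement has 256 - 26 = 230 >= 64 entries, so testing the complement
# sequentially would be too long and the group stays a parallel alternative
# over the 26 positive matches.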
def p_group(p):
    ''' group : terminal_sequence MINUS terminal_sequence group
              | terminal_sequence MINUS terminal_sequence
              | terminal_sequence group
              | terminal_sequence
              | metachar group
              | metachar '''
    # group is a special subproduction that does not return an ast but a set
    # of ints representing chars (UTF-8 code points); it is the responsibility
    # of other productions to turn it into a valid ast.
    if debug:
        print('try group', p.slice[1], p[1])
    if len(p) >= 4:
        # the production includes a MINUS.
        # a terminal_sequence may pack multiple chars (e.g. a number): only
        # the chars adjacent to MINUS delimit the range, the others are kept
        # as plain members of the group
        extra = [c for c in p[1][:-1]] + [c for c in p[3][1:]]
        p[1] = p[1][-1]
        p[3] = p[3][0]
        # produce the set of chars ranging from low to high (included)
        # NOTE: a char is described by means of an int
        p[0] = set(chars_between(p[1], p[3]))
        p[0].update(extra)
        if len(p) == 5:
            # case terminal_sequence MINUS terminal_sequence group:
            # merge the set produced by the nested group
            p[0].update(p[4])
    else:
        # in the remaining cases all nonterminals carry a list of symbols:
        # just pass them up as a set
        p[0] = set(p[1])
        if len(p) > 2:
            # case (terminal_sequence|metachar) group:
            # merge the set produced by the nested group
            p[0].update(p[2])
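
# Worked example (illustrative, assuming re2lexer packs a digit run into one
# NUM token): in '[12-4]' the NUM token carries the chars '1' and '2', so
# only the '2' adjacent to MINUS delimits the range; chars_between('2', '4')
# gives {50, 51, 52} and the leftover '1' is merged back in via extra,
# leaving p[0] == {50, 51, 52, '1'}. Plain members stay one-char strings at
# this stage; complement_char_set normalizes them to ints, and the ast match
# nodes appear to accept either form.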
def p_terminal_sequence(p):
    ''' terminal_sequence : CHAR
                          | NUM
                          | HEXA
                          | COMMA
    '''
    if p.slice[1].type == 'NUM':
        # a NUM token may pack several digits: split it into single chars
        p[0] = [c for c in p[1]]
    else:
        # CHAR, HEXA, COMMA
        p[0] = [p[1]]
def p_metachar(p):
    ''' metachar : WHITESPACE
                 | WHITESPACE_COMPLEMENTED
                 | DIGIT
                 | DIGIT_COMPLEMENTED
                 | ALPHANUMERIC
                 | ALPHANUMERIC_COMPLEMENTED
    '''
    if p.slice[1].type == 'WHITESPACE':  # \s
        # from https://docs.python.org/3.8/library/re.html
        # For Unicode (str) patterns: matches Unicode whitespace characters
        # (which includes [ \t\n\r\f\v], and also many other characters, for
        # example the non-breaking spaces mandated by typography rules in
        # many languages). If the ASCII flag is used, only [ \t\n\r\f\v] is
        # matched.
        # For 8-bit (bytes) patterns: matches characters considered
        # whitespace in the ASCII character set; this is equivalent to
        # [ \t\n\r\f\v].
        # see also https://docs.microsoft.com/en-us/cpp/c-language/escape-sequences?view=vs-2019
        space_set = [' ', '\t', '\n', '\r', '\f', '\v']
        p[0] = space_set
    elif p.slice[1].type == 'WHITESPACE_COMPLEMENTED':  # \S
        space_set = [' ', '\t', '\n', '\r', '\f', '\v']
        p[0] = complement_char_set(space_set)
    elif p.slice[1].type == 'DIGIT':  # \d
        digit_set = chars_between('0', '9')
        p[0] = digit_set
    elif p.slice[1].type == 'DIGIT_COMPLEMENTED':  # \D
        digit_set = chars_between('0', '9')
        p[0] = complement_char_set(digit_set)
    elif p.slice[1].type == 'ALPHANUMERIC':  # \w
        alphanum_set = chars_between('0', '9') + chars_between('a', 'z') + chars_between('A', 'Z') + ['_']
        p[0] = alphanum_set
    elif p.slice[1].type == 'ALPHANUMERIC_COMPLEMENTED':  # \W
        alphanum_set = chars_between('0', '9') + chars_between('a', 'z') + chars_between('A', 'Z') + ['_']
        p[0] = complement_char_set(alphanum_set)
    else:
        raise Exception(f'problem with {p}')
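
# Size check (illustrative): \w covers 10 digits + 26 + 26 letters + '_'
# = 63 chars; match_positive_group sees 256 - 63 = 193 >= 64 and keeps it a
# 63-way parallel alternative. \W arrives there already complemented to 193
# chars, and 256 - 193 = 63 < 64, so it is emitted as a sequential chain of
# negative tests over the original 63 \w chars.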
def chars_between(a, b):
    a = a.encode('utf-8')[0] if isinstance(a, str) else a
    b = b.encode('utf-8')[0] if isinstance(b, str) else b
    low = min(a, b)
    high = max(a, b)
    # produce a list of chars ranging from low to high (included)
    # NOTE: a char is described by means of an int
    return [c for c in range(low, high + 1)]
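
# e.g. chars_between('a', 'd') == [97, 98, 99, 100]; the arguments may also
# be given as ints, so chars_between(97, 100) returns the same list.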
def from_num_to_concat(n_string):
    to_concat = []
    for c in n_string:
        to_concat.append(ast_refined.match_character(c))
    return ast_refined.concatenation(*to_concat)
# Error rule for syntax errors
def p_error(p):
    if p:
        # lexpos is an absolute offset into the input: convert it to a column
        # relative to the offending line before drawing the caret
        col = p.lexpos - p.lexer.lexdata.rfind('\n', 0, p.lexpos) - 1
        print(f"Syntax error at line {p.lineno} col {col} token {p.type}")
        line = p.lexer.lexdata.splitlines()[p.lineno - 1]
        print(line)
        print(" " * col + '^')
        # just discard the token and tell the parser it's okay
        # raise Exception(p)
    else:
        print("Syntax error at EOF")
        raise Exception("Syntax error")


# Build the parser
parser = yacc.yacc()
def to_ir(data=None, no_postfix=False, no_prefix=False, dotast=None):
    # https://docs.python.org/3/library/re.html#regular-expression-syntax
    # ^ (Caret.) Matches the start of the string, and in MULTILINE mode also
    #   matches immediately after each newline.
    if data[0] == '^':
        no_prefix = True
        data = data[1:]
    # $ Matches the end of the string or just before the newline at the end
    #   of the string, and in MULTILINE mode also matches before a newline.
    #   foo matches both 'foo' and 'foobar', while foo$ matches only 'foo'.
    if data[-1] == '$':
        no_postfix = True
        data = data[:-1]
    from helper import normalize_regex_input
    tmp = normalize_regex_input(data)
    ast = parser.parse(tmp)
    # by default the generated matcher accepts a string even if the part
    # matching the regex does not end at the end of the string
    ast.set_accept_partial(not no_postfix)
    # by default the start of the match does not have to coincide with the
    # start of the string
    ast.set_ignore_prefix(not no_prefix)
    if dotast is not None:
        dot_file_content = ast.dotty_str()
        with open(dotast, 'w', encoding="utf-8") as f:
            f.write(dot_file_content)
    return ast.to_ir()
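
# Hypothetical usage sketch (the output filename is an arbitrary choice):
#
#     ir = to_ir('^ab+c$', dotast='abc.dot')
#
# Both anchors are stripped before parsing and recorded on the ast, so
# accept_partial and ignore_prefix end up False and the resulting IR must
# match the whole input string; 'abc.dot' receives a Graphviz view of the
# tree.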
if __name__ == "__main__":
    from helper import normalize_regex_input  # used by the checks below

    data = '[^a-zA-Z]\\-'
    ast = parser.parse(data)
    dot_file_content = ast.dotty_str()
    with open('test.dot', 'w', encoding="utf-8") as f:
        f.write(dot_file_content)

    data = 'r12ax\\-'
    ast = parser.parse(data)
    dot_file_content = ast.dotty_str()
    with open('test.dot', 'w', encoding="utf-8") as f:
        f.write(dot_file_content)

    # backreferences are unsupported and must raise; the assert lives in the
    # else clause so that its AssertionError is not swallowed by the except
    try:
        data = 'r12ax\\1'
        ast = to_ir(data)
    except Exception:
        pass
    else:
        assert False
    try:
        data = 'r(?P=ciao)'
        ast = to_ir(data)
    except Exception:
        pass
    else:
        assert False

    # normalize_regex_input rewrites lazy quantifiers as greedy ones
    regex_string = '(([RKX]{2}?)(.)([STX]))'
    regex_string = normalize_regex_input(regex_string)
    assert regex_string == '(([RKX]{2})(.)([STX]))'

    regex_string = '(([RKX]{2,}?)(.)([STX]))'
    regex_string = normalize_regex_input(regex_string)
    assert regex_string == '(([RKX]{2,})(.)([STX]))'

    regex_string = '(([RKX]{,2}?)(.)([STX]))'
    regex_string = normalize_regex_input(regex_string)
    assert regex_string == '(([RKX]{,2})(.)([STX]))'

    regex_string = '(([RKX]{,2})/(.)([STX]))'
    regex_string = normalize_regex_input(regex_string)
    assert regex_string == '(([RKX]{,2})/(.)([STX]))'
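
    # A minimal end-to-end sketch: compile a small anchored pattern down to
    # IR and print it (the printed shape depends on ast_refined.to_ir(),
    # which is not documented here).
    ir = to_ir('^[a-f]{2,4}$')
    print(ir)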