-
Notifications
You must be signed in to change notification settings - Fork 0
/
codestream_token.go
404 lines (374 loc) · 12.2 KB
/
codestream_token.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
package joeson
import (
"errors"
"fmt"
goscanner "go/scanner"
"go/token"
"regexp"
"strconv"
"strings"
"github.com/grepsuzette/joeson/helpers"
)
// TokenStream helps parsing tokenized texts.
//
// Suppose you want to parse a grammar from some pre-tokenized text.
// Tokenization simplifies the grammar, but it introduces two systems of
// source coordinates: offsets in the tokenized text the grammar runs
// against, and offsets in the original text, which are the ones needed
// to report usable errors to the user.
//
// Suppose a sequence of `=` represents tokens:
//
// pos=0 pos=12
// / /
// ======= ===N=====
// ==== ======= ===
// =========== =====
//
// When parsing fails at offset N, TokenStream can find which offset
// of which token that is, and translate it back to a byte offset in the
// original text.
//
// Grammars are intended to parse against the tokenized text (`work`)
// and errors are meant to show the `original` text.
type (
TokenStream struct {
tokens []Token // tokens with their position
original string // original text (not tokenized)
work string // work text: the concatenation of every token's Repr, as built by NewTokenStream
workOffset int // the current working position, a byte offset into `work`
lineStarts []int // byte offsets in `original` at which each line starts; the first entry is 0
}
// Token is one token of the stream together with its position in both
// coordinate systems.
Token struct {
Repr string // text of the token as it is written into `work`
OriginalOffset int // relative to original
WorkOffset int // relative to work
// meta interface{} // useless for now
}
// coord is the result of translating a work offset, see coords().
coord struct {
token Token // token found at requested position
nToken int // number of that token, ∈ [0; len(tokens)[
offsetInToken int // offset of requested position in token.Repr, ∈ [0; len(tokens[nToken].Repr)[
workOffset int // offset relative to `work` text. ∈ [0; len(work)[ . Use toWorkOffset()
originalOffset int // offset relative to `original` text
}
)
// String renders the token for debugging: both offsets followed by the
// quoted representation, separated by tabs.
func (t Token) String() string {
	const layout = "OriginalOffset: %d\tWorkOffset: %d\t%q"
	return fmt.Sprintf(layout, t.OriginalOffset, t.WorkOffset, t.Repr)
}
// NewTokenStream creates a new token stream.
//
// `text` is the original, untokenized text.
// `tokens` must have been generated from `text`; for Go code you could
// use "go/scanner" for instance (see TokenStreamFromGoCode).
//
// The work text is the plain concatenation of every token's Repr, and the
// stream starts at work offset 0.
func NewTokenStream(text string, tokens []Token) *TokenStream {
	// The first line starts at offset 0; each '\n' opens a new line on the
	// byte right after it.
	starts := []int{0}
	for i := 0; i < len(text); i++ {
		if text[i] == '\n' {
			starts = append(starts, i+1)
		}
	}

	var work strings.Builder
	for i := range tokens {
		work.WriteString(tokens[i].Repr)
	}

	return &TokenStream{
		tokens:     tokens,
		original:   text,
		work:       work.String(),
		workOffset: 0,
		lineStarts: starts,
	}
}
// Pos returns the current position, a byte offset into the work
// (tokenized) text.
func (ts *TokenStream) Pos() int {
return ts.workOffset
}
// SetPos sets the current position to `n`, a byte offset into the work
// (tokenized) text.
//
// A value of `n` equal to the length of the work text represents the end
// of the stream (nothing to parse anymore). SetPos panics when `n` is
// negative or greater than that length.
func (ts *TokenStream) SetPos(n int) {
	if inRange := 0 <= n && n <= len(ts.work); !inRange {
		panic(fmt.Sprintf("%d is out of bound", n))
	}
	ts.workOffset = n
}
// Line returns the line, in the original text, of the current position.
// It is PosToLine applied to the current work offset.
// NOTE(review): this comment used to say "first line is 1", but PosToCol and
// PeekLines use the value directly as an index into lineStarts, which
// suggests it is 0-based — confirm against helpers.BisectRight.
func (code *TokenStream) Line() int { return code.PosToLine(code.workOffset) }
// Col returns the column, in the original text, of the current position.
// It is PosToCol applied to the current work offset.
// NOTE(review): this comment used to say "first column is 1", but PosToCol
// subtracts the line start from the original offset, which gives 0 for the
// first byte of a line — confirm which convention callers expect.
func (code *TokenStream) Col() int { return code.PosToCol(code.workOffset) }
// PosToLine converts a position (byte offset into the work text) to a line
// of the original text.
//
// The work offset is first translated to an offset in the original text,
// which is then looked up in lineStarts with helpers.BisectRight; the
// result is that lookup minus one.
// NOTE(review): PosToCol uses the result as an index into lineStarts, so it
// appears to be 0-based rather than starting at 1 — confirm against
// helpers.BisectRight.
func (code *TokenStream) PosToLine(workOffset int) int {
	originalOffset := code.coords(workOffset).originalOffset
	insertionPoint := helpers.BisectRight(code.lineStarts, originalOffset)
	return insertionPoint - 1
}
// PosToCol converts a position (byte offset into the work text) to a column
// of the original text.
//
// The column is the distance in bytes between the corresponding offset in
// the original text and the start of its line.
// NOTE(review): that distance is 0 for the first byte of a line, although
// this comment used to say columns start at 1 — confirm the convention.
func (code *TokenStream) PosToCol(workOffset int) int {
	originalOffset := code.coords(workOffset).originalOffset
	lineStart := code.lineStarts[code.PosToLine(workOffset)]
	return originalOffset - lineStart
}
// Code returns the original (untokenized) text.
func (code *TokenStream) Code() string { return code.original }
// Length returns the length in bytes of the original text.
func (code *TokenStream) Length() int { return len(code.original) }
// workLength returns the length in bytes of the work (tokenized) text.
func (code *TokenStream) workLength() int { return len(code.work) }
// GetUntil searches the work text for `needle`, starting at the current
// position, and consumes everything up to and including it.
//
// It returns the consumed text: from the current position through the end
// of the first occurrence of `needle`, or through the end of the work text
// when `needle` is not found. The current position is moved right after
// the returned text.
func (code *TokenStream) GetUntil(needle string) string {
	start := code.workOffset
	end := len(code.work) // not found: consume the rest of the stream
	if i := strings.Index(code.work[start:], needle); i >= 0 {
		// i is relative to start; stop right after the needle.
		end = start + i + len(needle)
	}
	code.workOffset = end
	return code.work[start:end]
}
// PeekRunes looks `n` runes backwards or forwards in the work text,
// depending on the sign of `n`, without moving the current position.
//
// For n > 0 it returns up to `n` runes starting at the current position
// (fewer when the end of the work text is reached first). For n <= 0 it
// returns helpers.LastNRunes of the text before the current position,
// asked for -n runes.
func (code *TokenStream) PeekRunes(n int) string {
	if n <= 0 {
		return helpers.LastNRunes(code.work[:code.workOffset], -n)
	}
	var ahead strings.Builder
	remaining := n
	for _, r := range code.work[code.workOffset:] {
		ahead.WriteRune(r)
		remaining--
		if remaining <= 0 {
			break
		}
	}
	return ahead.String()
}
// PeekLines extracts the lines
// [least(n...)+currentLine; greatest(n...)+currentLine] of the original
// text, backwards or forwards relative to the current line.
//
// With no argument only the current line is returned. When a single value
// is given, a second value of 0 is implied to create a range.
//
// Both ends of the range are clamped to the lines that actually exist, so
// asking for lines before the first or after the last one never panics.
// The line break ending the last returned line is not included.
//
// For TokenStream, PeekLines() is mostly meant for printing purposes;
// it responds with the original text, not the tokenized one.
func (code *TokenStream) PeekLines(n ...int) string {
	switch len(n) {
	case 0:
		n = []int{0}
	case 1:
		n = []int{n[0], 0} // implied 0
	}
	lo, hi := n[0], n[0]
	for _, v := range n[1:] {
		if v < lo {
			lo = v
		}
		if v > hi {
			hi = v
		}
	}

	lastLine := len(code.lineStarts) - 1
	if lastLine < 0 {
		return "" // no line information at all
	}
	clamp := func(line int) int {
		if line < 0 {
			return 0
		}
		if line > lastLine {
			return lastLine
		}
		return line
	}

	current := code.Line()
	startLine := clamp(current + lo)
	endLine := clamp(current + hi)

	start := code.lineStarts[startLine]
	end := len(code.original)
	if endLine < lastLine {
		// stop just before the '\n' that precedes the next line
		end = code.lineStarts[endLine+1] - 1
	}
	return helpers.SliceString(code.original, start, end)
}
// MatchRune tests the rune at the current position of the work text
// against the predicate `f`.
//
// When `f` accepts the rune, didMatch is true, m is that rune and the
// position is moved past it. When `f` rejects it, the position is left
// untouched and (false, ' ') is returned. At the end of the work text it
// never matches and returns (false, '\x00') without calling `f`.
func (code *TokenStream) MatchRune(f func(rune) bool) (didMatch bool, m rune) {
	if code.workOffset >= len(code.work) {
		return false, '\x00' // never match at EOF
	}
	rest := code.work[code.workOffset:]

	// Decode the first rune. Its width in bytes is the offset at which the
	// second rune starts, or the whole remainder when it is the last one.
	var first rune
	width := len(rest)
	for i, r := range rest {
		if i > 0 {
			width = i
			break
		}
		first = r
	}

	if !f(first) {
		return false, ' '
	}
	code.SetPos(code.workOffset + width)
	return true, first
}
// MatchString tests whether the work text continues with `s` at the
// current position.
//
// On success didMatch is true, m is `s` and the position is moved past
// it. Otherwise (false, "") is returned and the position is left
// untouched. An empty `s` always matches and does not move the position.
func (code *TokenStream) MatchString(s string) (didMatch bool, m string) {
	if !strings.HasPrefix(code.work[code.workOffset:], s) {
		return false, ""
	}
	code.workOffset += len(s)
	return true, s
}
// MatchRegexp tests the regular expression `re` against the work text at
// the current position.
//
// It only succeeds when the leftmost match of `re` in the remaining text
// starts exactly at the current position. In that case didMatch is true,
// m is the full text of the match (usually called match[0]) and the
// position is moved past it. Otherwise (false, "") is returned and the
// position is left untouched.
func (code *TokenStream) MatchRegexp(re regexp.Regexp) (didMatch bool, m string) {
	loc := re.FindStringIndex(code.work[code.workOffset:])
	if loc == nil || loc[0] != 0 {
		return false, ""
	}
	// loc is relative to the current position.
	begin := code.workOffset + loc[0]
	end := helpers.Min(code.workOffset+loc[1], len(code.work))
	code.workOffset += loc[1]
	return true, code.work[begin:end]
}
// Print returns single-line information about the current position, meant
// to be included in parse errors.
//
// The current position is translated to an offset in the original text.
// The line shows that offset, the length of the original text, then a
// window of the original text around it: up to 20 bytes before and up to
// 40 bytes after, separated by a `|`.
func (code *TokenStream) Print() string {
	at := code.coords(code.Pos()).originalOffset
	return strings.Join([]string{
		"Code at offset ",
		BoldYellow(strconv.Itoa(at)),
		"/",
		BoldYellow(strconv.Itoa(len(code.original))),
		": '",
		Cyan(helpers.SliceString(code.original, helpers.Max(0, at-20), at)),
		BoldCyan("|"),
		BoldWhite(helpers.SliceString(code.original, at, at+40)) + "'",
	}, "")
}
// PrintDebug returns a multiline description of the stream, for debugging
// purposes.
//
// It contains, in order: the same single line as Print() (position and
// surrounding window, expressed in original-text coordinates), the current
// work offset, the line starts, the original text with whitespace made
// visible, the work text, and every token prefixed by its original and
// work offsets.
func (code *TokenStream) PrintDebug() string {
	var sb strings.Builder

	// The window is cut out of the original text, so it has to be addressed
	// with an original offset and not with the work offset; Print() already
	// does that translation.
	sb.WriteString(code.Print())
	sb.WriteString("\n")
	sb.WriteString("Work offset: " + strconv.Itoa(code.workOffset) + "/" + strconv.Itoa(len(code.work)) + "\n")

	sb.WriteString("lineStarts:\n")
	fmt.Fprintf(&sb, "%v\n", code.lineStarts)

	sb.WriteString("Original text:\n")
	sb.WriteString(strings.NewReplacer("\n", "<CR>\n", "\t", "<TAB>", " ", "_").Replace(code.original) + "\n\n")

	sb.WriteString(code.PrintWorkText())
	sb.WriteString("\n")

	sb.WriteString("Tokens:\n")
	for _, tok := range code.tokens {
		sb.WriteString(BoldBlack("[o=" + strconv.Itoa(tok.OriginalOffset) + ", w=" + strconv.Itoa(tok.WorkOffset) + "]"))
		sb.WriteString(tok.Repr)
	}
	return sb.String()
}
// PrintWorkText returns the work (tokenized) text preceded by a one-line
// header and followed by a line break.
func (code *TokenStream) PrintWorkText() string {
return "Work text (tokenized):\n" + code.work + "\n"
}
// coords translates workOffset (a byte offset into the tokenized `work`
// text) into every coordinate known about that position: the token it
// falls in, the number of that token, the offset inside the token, and the
// corresponding offset in the original text.
//
// A position at or beyond the end of the last token is attributed to the
// last token; this can make sense when a token was inserted. When the
// stream has no token at all the zero coord is returned.
func (code *TokenStream) coords(workOffset int) coord {
	if len(code.tokens) == 0 {
		return coord{}
	}
	// Find the first token whose end lies after workOffset, stopping at the
	// last token whatever the offset is.
	last := len(code.tokens) - 1
	nToken := 0
	for nToken < last {
		candidate := code.tokens[nToken]
		if workOffset < candidate.WorkOffset+len(candidate.Repr) {
			break
		}
		nToken++
	}
	// Read the token only once the index is final, so that the offsets
	// below are always relative to the token that is actually returned.
	token := code.tokens[nToken]
	offsetInToken := workOffset - token.WorkOffset
	return coord{
		token:          token,
		nToken:         nToken,
		offsetInToken:  offsetInToken,
		workOffset:     workOffset,
		originalOffset: token.OriginalOffset + offsetInToken,
	}
}
// scanErrors collects the errors recorded by handleErrors.
// NOTE(review): it is package-level state and nothing visible here resets
// it, so it keeps growing across uses and is not safe for concurrent use.
var scanErrors []error
// handleErrors records a scanner error, made of a position and a message,
// by appending a scannerError to scanErrors. Its signature is the one
// go/scanner expects for an error handler.
func handleErrors(pos token.Position, msg string) {
scanErrors = append(scanErrors, scannerError{pos, msg})
}
// spaceAfterPattern matches a token representation after which a
// separating space may have to be inserted: one ending with a letter, a
// digit, an underscore or `=`.
var spaceAfterPattern = regexp.MustCompile("[a-zA-Z0-9_=]$")

// TokenStreamFromGoCode is a special function transforming
// some go code into a TokenStream. You can then call
// `yourGrammar.ParseTokens(ts TokenStream)` directly.
//
// The source is tokenized with "go/scanner" (comments are skipped). Each
// token is recorded with its byte offset in `source` and its byte offset
// in the work text, the work text being the concatenation of all token
// representations.
//
// When the scanner reports errors, no stream is returned and the error
// joins all of them (each one is a scannerError).
func TokenStreamFromGoCode(source string) (*TokenStream, error) {
	fset := token.NewFileSet()
	file := fset.AddFile("", fset.Base(), len(source))

	// Errors are collected per call, and can only be checked once scanning
	// is over since the scanner reports them while it scans.
	var scanErrs []error
	onError := func(pos token.Position, msg string) {
		scanErrs = append(scanErrs, scannerError{pos: pos, msg: msg})
	}
	var scan goscanner.Scanner
	scan.Init(file, []byte(source), onError, 0 /*goscanner.ScanComments*/)

	tokens := []Token{}
	workOffset := 0
	prev := ""
	// Go lexer adds an automatic semicolon when the line's last token is:
	// * an identifier
	// * an integer, floating-point, imaginary, rune, or string literal
	// * one of the keywords break, continue, fallthrough, or return
	// * one of the operators and delimiters ++, --, ), ], or }
	for {
		pos, tok, lit := scan.Scan()
		if tok == token.EOF {
			break
		}
		needsSpace := spaceAfterPattern.MatchString(prev)
		var s string
		switch {
		case tok.String() == ";" && lit == "\n":
			// automatic semicolon: keep the line break in the work text
			s = ";\n"
		case lit != "":
			s = lit
			if needsSpace {
				s = " " + lit
			}
		default:
			// Token without literal text.
			// NOTE(review): a space is only inserted in front of a comma;
			// this mirrors the previous condition, confirm it is intended.
			s = tok.String()
			if needsSpace && tok == token.COMMA {
				s = " " + s
			}
		}
		tokens = append(tokens, Token{
			Repr:           s,
			OriginalOffset: file.Offset(pos),
			WorkOffset:     workOffset,
		})
		workOffset += len(s)
		prev = s
	}
	if len(scanErrs) > 0 {
		return nil, errors.Join(scanErrs...)
	}
	return NewTokenStream(source, tokens), nil
}
// scannerError is an error reported while scanning source code: a message
// together with the position it refers to.
type scannerError struct {
pos token.Position // where the error was reported
msg string // message describing the error
}
// Error implements the error interface; the text contains both the
// position and the message.
func (se scannerError) Error() string {
return fmt.Sprintf("there was an error at %s: %s", se.pos.String(), se.msg)
}