refactor tokenizer
jaspervdj-snyk committed Jun 21, 2023
1 parent 7ff9c68 commit 5a6db3d
Showing 1 changed file with 68 additions and 58 deletions.
pkg/input/arm/tokenizer.go (126 changes: 68 additions & 58 deletions)
@@ -15,13 +15,12 @@
 package arm
 
 import (
-	"strings"
 	"unicode"
 )
 
 func tokenize(input string) ([]token, error) {
 	var tokens []token
-	t := &tokenizer{remaining: input}
+	t := &tokenizer{remaining: []rune(input)}
 	for {
 		tkn := t.next()
 		if tkn == nil {
@@ -32,75 +31,94 @@ func tokenize(input string) ([]token, error) {
 }
 
 type tokenizer struct {
-	remaining string
+	remaining []rune
 }
 
-func (t *tokenizer) next() token {
-	t.chopLeadingWhitespace()
+func (t *tokenizer) peek() (rune, bool) {
 	if len(t.remaining) == 0 {
+		return 0, false
+	}
+	return t.remaining[0], true
+}
+
+func (t *tokenizer) pop() (rune, bool) {
+	c, ok := t.peek()
+	if !ok {
+		return c, false
+	}
+	t.remaining = t.remaining[1:]
+	return c, ok
+}
+
+func (t *tokenizer) next() token {
+	// Discard any whitespace
+	c1, ok := t.pop()
+	for ok && unicode.IsSpace(c1) {
+		c1, ok = t.pop()
+	}
+
+	// End of text
+	if !ok {
 		return nil
 	}
 
-	switch []rune(t.remaining)[0] {
+	switch c1 {
 	case '(':
-		t.remaining = t.remaining[1:]
 		return openParen{}
 	case ')':
-		t.remaining = t.remaining[1:]
 		return closeParen{}
 	case ',':
-		t.remaining = t.remaining[1:]
 		return comma{}
 	case '.':
-		t.remaining = t.remaining[1:]
 		return dot{}
 	case '\'':
-		t.remaining = t.remaining[1:]
-		tkn := stringLiteral{content: t.findEndOfStringLiteral()}
-		return tkn
-	}
-
-	// if we reach here, the token is an identifier
-	endOfIdentifierIdx := strCharIndex(t.remaining, func(char rune) bool {
-		switch char {
-		case '(', ')', ',', '.':
-			return true
-		default:
-			return unicode.IsSpace(char)
+		// We are inside a string, parse it completely
+		str := []rune{}
+		for {
+			c, ok := t.pop()
+			if !ok {
+				panic("expected ' to end string")
+			}
+			if c == '\'' {
+				// Could be end of string or an escaped '\', take a peek.
+				c2, ok := t.peek()
+				if ok && c2 == '\'' {
+					t.pop()
+					str = append(str, '\'')
+				} else {
+					return stringLiteral{content: string(str)}
+				}
+			} else {
+				// A normal character inside a string, just add it.
+				str = append(str, c)
+			}
 		}
-	})
-
-	raw := t.remaining[:endOfIdentifierIdx]
-	t.remaining = t.remaining[endOfIdentifierIdx:]
-	return identifier{name: raw}
-}
-
-// https://learn.microsoft.com/en-us/azure/azure-resource-manager/templates/template-expressions#escape-characters
-// 2 single quotes are an escaped single quote
-func (t *tokenizer) findEndOfStringLiteral() string {
-	ret := ""
-	for {
-		nextSingleQuoteIndex := strCharIndex(t.remaining, func(char rune) bool {
-			return char == '\''
-		})
-		ret += t.remaining[:nextSingleQuoteIndex]
-		t.remaining = t.remaining[nextSingleQuoteIndex:]
-		if len(t.remaining) < 2 || t.remaining[1] != '\'' {
-			// no escape sequence detected, pop the closing quote and return the
-			// literal
-			t.remaining = t.remaining[1:]
-			return ret
+	default:
+		// if we reach here, the token is an identifier
+		if validIdentifierStart(c1) {
+			id := []rune{c1}
+			for {
+				// Keep adding valid characters to id while we can
+				c, ok := t.peek()
+				if ok && validIdentifierChar(c) {
+					t.pop()
+					id = append(id, c)
+				} else {
+					return identifier{name: string(id)}
+				}
+			}
 		}
 
-		// escape sequence detected, push a single quote onto the string, pop the 2
-		// quotes, and continue
-		ret += "'"
-		t.remaining = t.remaining[2:]
+		panic("unexpected character")
 	}
 }
 
-func (t *tokenizer) chopLeadingWhitespace() {
-	t.remaining = strings.TrimLeftFunc(t.remaining, unicode.IsSpace)
+func validIdentifierStart(c rune) bool {
+	return unicode.IsLetter(c)
 }
 
+func validIdentifierChar(c rune) bool {
+	return unicode.IsLetter(c) || unicode.IsDigit(c) || c == '_'
+}
+
 type token interface {
@@ -118,11 +136,3 @@ type identifier struct {
 type stringLiteral struct {
 	content string
 }
-
-func strCharIndex(s string, f func(rune) bool) int {
-	idx := strings.IndexFunc(s, f)
-	if idx == -1 {
-		return len(s)
-	}
-	return idx
-}
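
For reference, a minimal test sketch of the escape handling that the refactored string-literal branch implements: two consecutive single quotes inside a literal collapse to one, per the ARM template escaping rule cited in the removed comment. This is not part of the commit; it assumes it lives in the same arm package as the tokenizer, so it can reach the unexported tokenize function and the token types shown in the diff, and the sample expression is illustrative only.

package arm

import (
	"reflect"
	"testing"
)

// Sketch: tokenize an ARM-style expression containing an escaped
// single quote ('' inside a literal should become a single ').
func TestTokenizeEscapedQuote(t *testing.T) {
	tokens, err := tokenize("concat('it''s', parameters('location'))")
	if err != nil {
		t.Fatalf("tokenize returned error: %v", err)
	}
	expected := []token{
		identifier{name: "concat"},
		openParen{},
		stringLiteral{content: "it's"},
		comma{},
		identifier{name: "parameters"},
		openParen{},
		stringLiteral{content: "location"},
		closeParen{},
		closeParen{},
	}
	if !reflect.DeepEqual(tokens, expected) {
		t.Fatalf("unexpected tokens: %#v", tokens)
	}
}

The space after the comma in the sample input is expected to be dropped by the whitespace-skipping loop at the top of next().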
