refactor tokenizer
jaspervdj-snyk committed Jun 21, 2023
1 parent 7ff9c68 commit 5a6db3d
Showing 1 changed file with 68 additions and 58 deletions.
pkg/input/arm/tokenizer.go (126 changes: 68 additions & 58 deletions)
@@ -15,13 +15,12 @@
 package arm
 
 import (
-	"strings"
 	"unicode"
 )
 
 func tokenize(input string) ([]token, error) {
 	var tokens []token
-	t := &tokenizer{remaining: input}
+	t := &tokenizer{remaining: []rune(input)}
 	for {
 		tkn := t.next()
 		if tkn == nil {
@@ -32,75 +31,94 @@ func tokenize(input string) ([]token, error) {
 }
 
 type tokenizer struct {
-	remaining string
+	remaining []rune
 }
 
-func (t *tokenizer) next() token {
-	t.chopLeadingWhitespace()
+func (t *tokenizer) peek() (rune, bool) {
 	if len(t.remaining) == 0 {
+		return 0, false
+	}
+	return t.remaining[0], true
+}
+
+func (t *tokenizer) pop() (rune, bool) {
+	c, ok := t.peek()
+	if !ok {
+		return c, false
+	}
+	t.remaining = t.remaining[1:]
+	return c, ok
+}
+
+func (t *tokenizer) next() token {
+	// Discard any whitespace
+	c1, ok := t.pop()
+	for ok && unicode.IsSpace(c1) {
+		c1, ok = t.pop()
+	}
+
+	// End of text
+	if !ok {
 		return nil
 	}
 
-	switch []rune(t.remaining)[0] {
+	switch c1 {
 	case '(':
-		t.remaining = t.remaining[1:]
 		return openParen{}
 	case ')':
-		t.remaining = t.remaining[1:]
 		return closeParen{}
 	case ',':
-		t.remaining = t.remaining[1:]
 		return comma{}
 	case '.':
-		t.remaining = t.remaining[1:]
 		return dot{}
 	case '\'':
-		t.remaining = t.remaining[1:]
-		tkn := stringLiteral{content: t.findEndOfStringLiteral()}
-		return tkn
-	}
-
-	// if we reach here, the token is an identifier
-	endOfIdentifierIdx := strCharIndex(t.remaining, func(char rune) bool {
-		switch char {
-		case '(', ')', ',', '.':
-			return true
-		default:
-			return unicode.IsSpace(char)
+		// We are inside a string, parse it completely
+		str := []rune{}
+		for {
+			c, ok := t.pop()
+			if !ok {
+				panic("expected ' to end string")
+			}
+			if c == '\'' {
+				// Could be end of string or an escaped '\', take a peek.
+				c2, ok := t.peek()
+				if ok && c2 == '\'' {
+					t.pop()
+					str = append(str, '\'')
+				} else {
+					return stringLiteral{content: string(str)}
+				}
+			} else {
+				// A normal character inside a string, just add it.
+				str = append(str, c)
+			}
 		}
-	})
-
-	raw := t.remaining[:endOfIdentifierIdx]
-	t.remaining = t.remaining[endOfIdentifierIdx:]
-	return identifier{name: raw}
-}
-
-// https://learn.microsoft.com/en-us/azure/azure-resource-manager/templates/template-expressions#escape-characters
-// 2 single quotes are an escaped single quote
-func (t *tokenizer) findEndOfStringLiteral() string {
-	ret := ""
-	for {
-		nextSingleQuoteIndex := strCharIndex(t.remaining, func(char rune) bool {
-			return char == '\''
-		})
-		ret += t.remaining[:nextSingleQuoteIndex]
-		t.remaining = t.remaining[nextSingleQuoteIndex:]
-		if len(t.remaining) < 2 || t.remaining[1] != '\'' {
-			// no escape sequence detected, pop the closing quote and return the
-			// literal
-			t.remaining = t.remaining[1:]
-			return ret
+	default:
+		// if we reach here, the token is an identifier
+		if validIdentifierStart(c1) {
+			id := []rune{c1}
+			for {
+				// Keep adding valid characters to id while we can
+				c, ok := t.peek()
+				if ok && validIdentifierChar(c) {
+					t.pop()
+					id = append(id, c)
+				} else {
+					return identifier{name: string(id)}
+				}
+			}
 		}
 
-		// escape sequence detected, push a single quote onto the string, pop the 2
-		// quotes, and continue
-		ret += "'"
-		t.remaining = t.remaining[2:]
+		panic("unexpected character")
 	}
 }
 
-func (t *tokenizer) chopLeadingWhitespace() {
-	t.remaining = strings.TrimLeftFunc(t.remaining, unicode.IsSpace)
+func validIdentifierStart(c rune) bool {
+	return unicode.IsLetter(c)
 }
 
+func validIdentifierChar(c rune) bool {
+	return unicode.IsLetter(c) || unicode.IsDigit(c) || c == '_'
+}
+
 type token interface {
@@ -118,11 +136,3 @@ type identifier struct {
 type stringLiteral struct {
 	content string
 }
-
-func strCharIndex(s string, f func(rune) bool) int {
-	idx := strings.IndexFunc(s, f)
-	if idx == -1 {
-		return len(s)
-	}
-	return idx
-}
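
For reference, a minimal test sketch of the escape handling that the refactored string-literal branch implements: two consecutive single quotes inside a literal collapse to one, per the ARM template escaping rule cited in the removed comment. This is not part of the commit; it assumes it lives in the same arm package as the tokenizer, so it can reach the unexported tokenize function and the token types shown in the diff, and the sample expression is illustrative only.

package arm

import (
	"reflect"
	"testing"
)

// Sketch: tokenize an ARM-style expression containing an escaped
// single quote ('' inside a literal should become a single ').
func TestTokenizeEscapedQuote(t *testing.T) {
	tokens, err := tokenize("concat('it''s', parameters('location'))")
	if err != nil {
		t.Fatalf("tokenize returned error: %v", err)
	}
	expected := []token{
		identifier{name: "concat"},
		openParen{},
		stringLiteral{content: "it's"},
		comma{},
		identifier{name: "parameters"},
		openParen{},
		stringLiteral{content: "location"},
		closeParen{},
		closeParen{},
	}
	if !reflect.DeepEqual(tokens, expected) {
		t.Fatalf("unexpected tokens: %#v", tokens)
	}
}

The space after the comma in the sample input is expected to be dropped by the whitespace-skipping loop at the top of next().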
