From dc7b24ec0f3e60215faebd6a9a339103262cd96a Mon Sep 17 00:00:00 2001 From: mavolin Date: Mon, 16 Oct 2023 20:20:19 +0200 Subject: [PATCH] Fix backslash escapes not being properly handled Closes #134 --- Makefile | 3 + grammar/pigeon.peg | 4 +- pigeon.go | 102 +-- test/issue_134/issue_134.go | 1376 ++++++++++++++++++++++++++++++++++ test/issue_134/issue_134.peg | 9 + 5 files changed, 1447 insertions(+), 47 deletions(-) create mode 100644 test/issue_134/issue_134.go create mode 100644 test/issue_134/issue_134.peg diff --git a/Makefile b/Makefile index 9891ada..d83e885 100644 --- a/Makefile +++ b/Makefile @@ -179,6 +179,9 @@ $(TEST_DIR)/issue_80/issue_80.go: $(TEST_DIR)/issue_80/issue_80.peg $(BINDIR)/pi $(TEST_DIR)/issue_115/issue_115.go: $(TEST_DIR)/issue_115/issue_115.peg $(BINDIR)/pigeon $(BINDIR)/pigeon -nolint $< > $@ +$(TEST_DIR)/issue_134/issue_134.go: $(TEST_DIR)/issue_134/issue_134.peg $(BINDIR)/pigeon + $(BINDIR)/pigeon -nolint $< > $@ + $(TEST_DIR)/left_recursion/left_recursion.go: \ $(TEST_DIR)/left_recursion/standart/leftrecursion/left_recursion.go \ $(TEST_DIR)/left_recursion/optimized/leftrecursion/left_recursion.go \ diff --git a/grammar/pigeon.peg b/grammar/pigeon.peg index 30fc71b..800d4a2 100644 --- a/grammar/pigeon.peg +++ b/grammar/pigeon.peg @@ -324,9 +324,9 @@ CodeBlock ← '{' Code '}' { Code ← ( ( Comment / CodeStringLiteral / ![{}] SourceChar )+ / '{' Code '}' )* -CodeStringLiteral ← '"' (`\"` / [^"\r\n])* '"' / +CodeStringLiteral ← '"' (`\"` / `\\` / [^"\r\n])* '"' / '`' [^`]* '`' / - '\'' (`\'` / [^']+) '\'' + '\'' (`\'` / `\\` / [^']+) '\'' __ ← ( Whitespace / EOL / Comment )* _ ← ( Whitespace / MultiLineCommentNoLineTerminator )* diff --git a/pigeon.go b/pigeon.go index 1887468..f4e6adc 100644 --- a/pigeon.go +++ b/pigeon.go @@ -2543,8 +2543,14 @@ var g = &grammar{ ignoreCase: false, want: "\"\\\\\\\"\"", }, - &charClassMatcher{ + &litMatcher{ pos: position{line: 327, col: 33, offset: 10007}, + val: "\\\\", + ignoreCase: false, + want: "\"\\\\\\\\\"", + }, + &charClassMatcher{ + pos: position{line: 327, col: 40, offset: 10014}, val: "[^\"\\r\\n]", chars: []rune{'"', '\r', '\n'}, ignoreCase: false, @@ -2554,7 +2560,7 @@ var g = &grammar{ }, }, &litMatcher{ - pos: position{line: 327, col: 44, offset: 10018}, + pos: position{line: 327, col: 51, offset: 10025}, val: "\"", ignoreCase: false, want: "\"\\\"\"", @@ -2562,18 +2568,18 @@ var g = &grammar{ }, }, &seqExpr{ - pos: position{line: 328, col: 21, offset: 10044}, + pos: position{line: 328, col: 21, offset: 10051}, exprs: []any{ &litMatcher{ - pos: position{line: 328, col: 21, offset: 10044}, + pos: position{line: 328, col: 21, offset: 10051}, val: "`", ignoreCase: false, want: "\"`\"", }, &zeroOrMoreExpr{ - pos: position{line: 328, col: 25, offset: 10048}, + pos: position{line: 328, col: 25, offset: 10055}, expr: &charClassMatcher{ - pos: position{line: 328, col: 25, offset: 10048}, + pos: position{line: 328, col: 25, offset: 10055}, val: "[^`]", chars: []rune{'`'}, ignoreCase: false, @@ -2581,7 +2587,7 @@ var g = &grammar{ }, }, &litMatcher{ - pos: position{line: 328, col: 31, offset: 10054}, + pos: position{line: 328, col: 31, offset: 10061}, val: "`", ignoreCase: false, want: "\"`\"", @@ -2589,27 +2595,33 @@ var g = &grammar{ }, }, &seqExpr{ - pos: position{line: 329, col: 21, offset: 10080}, + pos: position{line: 329, col: 21, offset: 10087}, exprs: []any{ &litMatcher{ - pos: position{line: 329, col: 21, offset: 10080}, + pos: position{line: 329, col: 21, offset: 10087}, val: "'", ignoreCase: false, want: "\"'\"", }, &choiceExpr{ - pos: position{line: 329, col: 27, offset: 10086}, + pos: position{line: 329, col: 27, offset: 10093}, alternatives: []any{ &litMatcher{ - pos: position{line: 329, col: 27, offset: 10086}, + pos: position{line: 329, col: 27, offset: 10093}, val: "\\'", ignoreCase: false, want: "\"\\\\'\"", }, + &litMatcher{ + pos: position{line: 329, col: 34, offset: 10100}, + val: "\\\\", + ignoreCase: false, + want: "\"\\\\\\\\\"", + }, &oneOrMoreExpr{ - pos: position{line: 329, col: 34, offset: 10093}, + pos: position{line: 329, col: 41, offset: 10107}, expr: &charClassMatcher{ - pos: position{line: 329, col: 34, offset: 10093}, + pos: position{line: 329, col: 41, offset: 10107}, val: "[^']", chars: []rune{'\''}, ignoreCase: false, @@ -2619,7 +2631,7 @@ var g = &grammar{ }, }, &litMatcher{ - pos: position{line: 329, col: 41, offset: 10100}, + pos: position{line: 329, col: 48, offset: 10114}, val: "'", ignoreCase: false, want: "\"'\"", @@ -2631,22 +2643,22 @@ var g = &grammar{ }, { name: "__", - pos: position{line: 331, col: 1, offset: 10106}, + pos: position{line: 331, col: 1, offset: 10120}, expr: &zeroOrMoreExpr{ - pos: position{line: 331, col: 6, offset: 10113}, + pos: position{line: 331, col: 6, offset: 10127}, expr: &choiceExpr{ - pos: position{line: 331, col: 8, offset: 10115}, + pos: position{line: 331, col: 8, offset: 10129}, alternatives: []any{ &ruleRefExpr{ - pos: position{line: 331, col: 8, offset: 10115}, + pos: position{line: 331, col: 8, offset: 10129}, name: "Whitespace", }, &ruleRefExpr{ - pos: position{line: 331, col: 21, offset: 10128}, + pos: position{line: 331, col: 21, offset: 10142}, name: "EOL", }, &ruleRefExpr{ - pos: position{line: 331, col: 27, offset: 10134}, + pos: position{line: 331, col: 27, offset: 10148}, name: "Comment", }, }, @@ -2655,18 +2667,18 @@ var g = &grammar{ }, { name: "_", - pos: position{line: 332, col: 1, offset: 10145}, + pos: position{line: 332, col: 1, offset: 10159}, expr: &zeroOrMoreExpr{ - pos: position{line: 332, col: 5, offset: 10151}, + pos: position{line: 332, col: 5, offset: 10165}, expr: &choiceExpr{ - pos: position{line: 332, col: 7, offset: 10153}, + pos: position{line: 332, col: 7, offset: 10167}, alternatives: []any{ &ruleRefExpr{ - pos: position{line: 332, col: 7, offset: 10153}, + pos: position{line: 332, col: 7, offset: 10167}, name: "Whitespace", }, &ruleRefExpr{ - pos: position{line: 332, col: 20, offset: 10166}, + pos: position{line: 332, col: 20, offset: 10180}, name: "MultiLineCommentNoLineTerminator", }, }, @@ -2675,9 +2687,9 @@ var g = &grammar{ }, { name: "Whitespace", - pos: position{line: 334, col: 1, offset: 10203}, + pos: position{line: 334, col: 1, offset: 10217}, expr: &charClassMatcher{ - pos: position{line: 334, col: 14, offset: 10218}, + pos: position{line: 334, col: 14, offset: 10232}, val: "[ \\t\\r]", chars: []rune{' ', '\t', '\r'}, ignoreCase: false, @@ -2686,9 +2698,9 @@ var g = &grammar{ }, { name: "EOL", - pos: position{line: 335, col: 1, offset: 10226}, + pos: position{line: 335, col: 1, offset: 10240}, expr: &litMatcher{ - pos: position{line: 335, col: 7, offset: 10234}, + pos: position{line: 335, col: 7, offset: 10248}, val: "\n", ignoreCase: false, want: "\"\\n\"", @@ -2696,19 +2708,19 @@ var g = &grammar{ }, { name: "EOS", - pos: position{line: 336, col: 1, offset: 10239}, + pos: position{line: 336, col: 1, offset: 10253}, expr: &choiceExpr{ - pos: position{line: 336, col: 7, offset: 10247}, + pos: position{line: 336, col: 7, offset: 10261}, alternatives: []any{ &seqExpr{ - pos: position{line: 336, col: 7, offset: 10247}, + pos: position{line: 336, col: 7, offset: 10261}, exprs: []any{ &ruleRefExpr{ - pos: position{line: 336, col: 7, offset: 10247}, + pos: position{line: 336, col: 7, offset: 10261}, name: "__", }, &litMatcher{ - pos: position{line: 336, col: 10, offset: 10250}, + pos: position{line: 336, col: 10, offset: 10264}, val: ";", ignoreCase: false, want: "\";\"", @@ -2716,34 +2728,34 @@ var g = &grammar{ }, }, &seqExpr{ - pos: position{line: 336, col: 16, offset: 10256}, + pos: position{line: 336, col: 16, offset: 10270}, exprs: []any{ &ruleRefExpr{ - pos: position{line: 336, col: 16, offset: 10256}, + pos: position{line: 336, col: 16, offset: 10270}, name: "_", }, &zeroOrOneExpr{ - pos: position{line: 336, col: 18, offset: 10258}, + pos: position{line: 336, col: 18, offset: 10272}, expr: &ruleRefExpr{ - pos: position{line: 336, col: 18, offset: 10258}, + pos: position{line: 336, col: 18, offset: 10272}, name: "SingleLineComment", }, }, &ruleRefExpr{ - pos: position{line: 336, col: 37, offset: 10277}, + pos: position{line: 336, col: 37, offset: 10291}, name: "EOL", }, }, }, &seqExpr{ - pos: position{line: 336, col: 43, offset: 10283}, + pos: position{line: 336, col: 43, offset: 10297}, exprs: []any{ &ruleRefExpr{ - pos: position{line: 336, col: 43, offset: 10283}, + pos: position{line: 336, col: 43, offset: 10297}, name: "__", }, &ruleRefExpr{ - pos: position{line: 336, col: 46, offset: 10286}, + pos: position{line: 336, col: 46, offset: 10300}, name: "EOF", }, }, @@ -2753,11 +2765,11 @@ var g = &grammar{ }, { name: "EOF", - pos: position{line: 338, col: 1, offset: 10291}, + pos: position{line: 338, col: 1, offset: 10305}, expr: ¬Expr{ - pos: position{line: 338, col: 7, offset: 10299}, + pos: position{line: 338, col: 7, offset: 10313}, expr: &anyMatcher{ - line: 338, col: 8, offset: 10300, + line: 338, col: 8, offset: 10314, }, }, }, diff --git a/test/issue_134/issue_134.go b/test/issue_134/issue_134.go new file mode 100644 index 0000000..eb35ce4 --- /dev/null +++ b/test/issue_134/issue_134.go @@ -0,0 +1,1376 @@ +// Code generated by pigeon; DO NOT EDIT. + +package issue134 + +import ( + "bytes" + "errors" + "fmt" + "io" + "math" + "os" + "sort" + "strconv" + "strings" + "sync" + "unicode" + "unicode/utf8" +) + +var g = &grammar{ + rules: []*rule{ + { + name: "Test", + pos: position{line: 6, col: 1, offset: 70}, + expr: &actionExpr{ + pos: position{line: 6, col: 9, offset: 78}, + run: (*parser).callonTest1, + expr: &anyMatcher{ + line: 6, col: 9, offset: 78, + }, + }, + }, + }, +} + +func (c *current) onTest1() (any, error) { + return "\\" + "{", nil +} + +func (p *parser) callonTest1() (any, error) { + stack := p.vstack[len(p.vstack)-1] + _ = stack + return p.cur.onTest1() +} + +var ( + // errNoRule is returned when the grammar to parse has no rule. + errNoRule = errors.New("grammar has no rule") + + // errInvalidEntrypoint is returned when the specified entrypoint rule + // does not exit. + errInvalidEntrypoint = errors.New("invalid entrypoint") + + // errInvalidEncoding is returned when the source is not properly + // utf8-encoded. + errInvalidEncoding = errors.New("invalid encoding") + + // errMaxExprCnt is used to signal that the maximum number of + // expressions have been parsed. + errMaxExprCnt = errors.New("max number of expresssions parsed") +) + +// Option is a function that can set an option on the parser. It returns +// the previous setting as an Option. +type Option func(*parser) Option + +// MaxExpressions creates an Option to stop parsing after the provided +// number of expressions have been parsed, if the value is 0 then the parser will +// parse for as many steps as needed (possibly an infinite number). +// +// The default for maxExprCnt is 0. +func MaxExpressions(maxExprCnt uint64) Option { + return func(p *parser) Option { + oldMaxExprCnt := p.maxExprCnt + p.maxExprCnt = maxExprCnt + return MaxExpressions(oldMaxExprCnt) + } +} + +// Entrypoint creates an Option to set the rule name to use as entrypoint. +// The rule name must have been specified in the -alternate-entrypoints +// if generating the parser with the -optimize-grammar flag, otherwise +// it may have been optimized out. Passing an empty string sets the +// entrypoint to the first rule in the grammar. +// +// The default is to start parsing at the first rule in the grammar. +func Entrypoint(ruleName string) Option { + return func(p *parser) Option { + oldEntrypoint := p.entrypoint + p.entrypoint = ruleName + if ruleName == "" { + p.entrypoint = g.rules[0].name + } + return Entrypoint(oldEntrypoint) + } +} + +// Statistics adds a user provided Stats struct to the parser to allow +// the user to process the results after the parsing has finished. +// Also the key for the "no match" counter is set. +// +// Example usage: +// +// input := "input" +// stats := Stats{} +// _, err := Parse("input-file", []byte(input), Statistics(&stats, "no match")) +// if err != nil { +// log.Panicln(err) +// } +// b, err := json.MarshalIndent(stats.ChoiceAltCnt, "", " ") +// if err != nil { +// log.Panicln(err) +// } +// fmt.Println(string(b)) +func Statistics(stats *Stats, choiceNoMatch string) Option { + return func(p *parser) Option { + oldStats := p.Stats + p.Stats = stats + oldChoiceNoMatch := p.choiceNoMatch + p.choiceNoMatch = choiceNoMatch + if p.Stats.ChoiceAltCnt == nil { + p.Stats.ChoiceAltCnt = make(map[string]map[string]int) + } + return Statistics(oldStats, oldChoiceNoMatch) + } +} + +// Debug creates an Option to set the debug flag to b. When set to true, +// debugging information is printed to stdout while parsing. +// +// The default is false. +func Debug(b bool) Option { + return func(p *parser) Option { + old := p.debug + p.debug = b + return Debug(old) + } +} + +// Memoize creates an Option to set the memoize flag to b. When set to true, +// the parser will cache all results so each expression is evaluated only +// once. This guarantees linear parsing time even for pathological cases, +// at the expense of more memory and slower times for typical cases. +// +// The default is false. +func Memoize(b bool) Option { + return func(p *parser) Option { + old := p.memoize + p.memoize = b + return Memoize(old) + } +} + +// AllowInvalidUTF8 creates an Option to allow invalid UTF-8 bytes. +// Every invalid UTF-8 byte is treated as a utf8.RuneError (U+FFFD) +// by character class matchers and is matched by the any matcher. +// The returned matched value, c.text and c.offset are NOT affected. +// +// The default is false. +func AllowInvalidUTF8(b bool) Option { + return func(p *parser) Option { + old := p.allowInvalidUTF8 + p.allowInvalidUTF8 = b + return AllowInvalidUTF8(old) + } +} + +// Recover creates an Option to set the recover flag to b. When set to +// true, this causes the parser to recover from panics and convert it +// to an error. Setting it to false can be useful while debugging to +// access the full stack trace. +// +// The default is true. +func Recover(b bool) Option { + return func(p *parser) Option { + old := p.recover + p.recover = b + return Recover(old) + } +} + +// GlobalStore creates an Option to set a key to a certain value in +// the globalStore. +func GlobalStore(key string, value any) Option { + return func(p *parser) Option { + old := p.cur.globalStore[key] + p.cur.globalStore[key] = value + return GlobalStore(key, old) + } +} + +// InitState creates an Option to set a key to a certain value in +// the global "state" store. +func InitState(key string, value any) Option { + return func(p *parser) Option { + old := p.cur.state[key] + p.cur.state[key] = value + return InitState(key, old) + } +} + +// ParseFile parses the file identified by filename. +func ParseFile(filename string, opts ...Option) (i any, err error) { // nolint: deadcode + f, err := os.Open(filename) + if err != nil { + return nil, err + } + defer func() { + if closeErr := f.Close(); closeErr != nil { + err = closeErr + } + }() + return ParseReader(filename, f, opts...) +} + +// ParseReader parses the data from r using filename as information in the +// error messages. +func ParseReader(filename string, r io.Reader, opts ...Option) (any, error) { // nolint: deadcode + b, err := io.ReadAll(r) + if err != nil { + return nil, err + } + + return Parse(filename, b, opts...) +} + +// Parse parses the data from b using filename as information in the +// error messages. +func Parse(filename string, b []byte, opts ...Option) (any, error) { + return newParser(filename, b, opts...).parse(g) +} + +// position records a position in the text. +type position struct { + line, col, offset int +} + +func (p position) String() string { + return strconv.Itoa(p.line) + ":" + strconv.Itoa(p.col) + " [" + strconv.Itoa(p.offset) + "]" +} + +// savepoint stores all state required to go back to this point in the +// parser. +type savepoint struct { + position + rn rune + w int +} + +type current struct { + pos position // start position of the match + text []byte // raw text of the match + + // state is a store for arbitrary key,value pairs that the user wants to be + // tied to the backtracking of the parser. + // This is always rolled back if a parsing rule fails. + state storeDict + + // globalStore is a general store for the user to store arbitrary key-value + // pairs that they need to manage and that they do not want tied to the + // backtracking of the parser. This is only modified by the user and never + // rolled back by the parser. It is always up to the user to keep this in a + // consistent state. + globalStore storeDict +} + +type storeDict map[string]any + +// the AST types... + +// nolint: structcheck +type grammar struct { + pos position + rules []*rule +} + +// nolint: structcheck +type rule struct { + pos position + name string + displayName string + expr any +} + +// nolint: structcheck +type choiceExpr struct { + pos position + alternatives []any +} + +// nolint: structcheck +type actionExpr struct { + pos position + expr any + run func(*parser) (any, error) +} + +// nolint: structcheck +type recoveryExpr struct { + pos position + expr any + recoverExpr any + failureLabel []string +} + +// nolint: structcheck +type seqExpr struct { + pos position + exprs []any +} + +// nolint: structcheck +type throwExpr struct { + pos position + label string +} + +// nolint: structcheck +type labeledExpr struct { + pos position + label string + expr any +} + +// nolint: structcheck +type expr struct { + pos position + expr any +} + +type ( + andExpr expr // nolint: structcheck + notExpr expr // nolint: structcheck + zeroOrOneExpr expr // nolint: structcheck + zeroOrMoreExpr expr // nolint: structcheck + oneOrMoreExpr expr // nolint: structcheck +) + +// nolint: structcheck +type ruleRefExpr struct { + pos position + name string +} + +// nolint: structcheck +type stateCodeExpr struct { + pos position + run func(*parser) error +} + +// nolint: structcheck +type andCodeExpr struct { + pos position + run func(*parser) (bool, error) +} + +// nolint: structcheck +type notCodeExpr struct { + pos position + run func(*parser) (bool, error) +} + +// nolint: structcheck +type litMatcher struct { + pos position + val string + ignoreCase bool + want string +} + +// nolint: structcheck +type charClassMatcher struct { + pos position + val string + basicLatinChars [128]bool + chars []rune + ranges []rune + classes []*unicode.RangeTable + ignoreCase bool + inverted bool +} + +type anyMatcher position // nolint: structcheck + +// errList cumulates the errors found by the parser. +type errList []error + +func (e *errList) add(err error) { + *e = append(*e, err) +} + +func (e errList) err() error { + if len(e) == 0 { + return nil + } + e.dedupe() + return e +} + +func (e *errList) dedupe() { + var cleaned []error + set := make(map[string]bool) + for _, err := range *e { + if msg := err.Error(); !set[msg] { + set[msg] = true + cleaned = append(cleaned, err) + } + } + *e = cleaned +} + +func (e errList) Error() string { + switch len(e) { + case 0: + return "" + case 1: + return e[0].Error() + default: + var buf bytes.Buffer + + for i, err := range e { + if i > 0 { + buf.WriteRune('\n') + } + buf.WriteString(err.Error()) + } + return buf.String() + } +} + +// parserError wraps an error with a prefix indicating the rule in which +// the error occurred. The original error is stored in the Inner field. +type parserError struct { + Inner error + pos position + prefix string + expected []string +} + +// Error returns the error message. +func (p *parserError) Error() string { + return p.prefix + ": " + p.Inner.Error() +} + +// newParser creates a parser with the specified input source and options. +func newParser(filename string, b []byte, opts ...Option) *parser { + stats := Stats{ + ChoiceAltCnt: make(map[string]map[string]int), + } + + p := &parser{ + filename: filename, + errs: new(errList), + data: b, + pt: savepoint{position: position{line: 1}}, + recover: true, + cur: current{ + state: make(storeDict), + globalStore: make(storeDict), + }, + maxFailPos: position{col: 1, line: 1}, + maxFailExpected: make([]string, 0, 20), + Stats: &stats, + // start rule is rule [0] unless an alternate entrypoint is specified + entrypoint: g.rules[0].name, + } + p.setOptions(opts) + + if p.maxExprCnt == 0 { + p.maxExprCnt = math.MaxUint64 + } + + return p +} + +// setOptions applies the options to the parser. +func (p *parser) setOptions(opts []Option) { + for _, opt := range opts { + opt(p) + } +} + +// nolint: structcheck,deadcode +type resultTuple struct { + v any + b bool + end savepoint +} + +// nolint: varcheck +const choiceNoMatch = -1 + +// Stats stores some statistics, gathered during parsing +type Stats struct { + // ExprCnt counts the number of expressions processed during parsing + // This value is compared to the maximum number of expressions allowed + // (set by the MaxExpressions option). + ExprCnt uint64 + + // ChoiceAltCnt is used to count for each ordered choice expression, + // which alternative is used how may times. + // These numbers allow to optimize the order of the ordered choice expression + // to increase the performance of the parser + // + // The outer key of ChoiceAltCnt is composed of the name of the rule as well + // as the line and the column of the ordered choice. + // The inner key of ChoiceAltCnt is the number (one-based) of the matching alternative. + // For each alternative the number of matches are counted. If an ordered choice does not + // match, a special counter is incremented. The name of this counter is set with + // the parser option Statistics. + // For an alternative to be included in ChoiceAltCnt, it has to match at least once. + ChoiceAltCnt map[string]map[string]int +} + +// nolint: structcheck,maligned +type parser struct { + filename string + pt savepoint + cur current + + data []byte + errs *errList + + depth int + recover bool + debug bool + + memoize bool + // memoization table for the packrat algorithm: + // map[offset in source] map[expression or rule] {value, match} + memo map[int]map[any]resultTuple + + // rules table, maps the rule identifier to the rule node + rules map[string]*rule + // variables stack, map of label to value + vstack []map[string]any + // rule stack, allows identification of the current rule in errors + rstack []*rule + + // parse fail + maxFailPos position + maxFailExpected []string + maxFailInvertExpected bool + + // max number of expressions to be parsed + maxExprCnt uint64 + // entrypoint for the parser + entrypoint string + + allowInvalidUTF8 bool + + *Stats + + choiceNoMatch string + // recovery expression stack, keeps track of the currently available recovery expression, these are traversed in reverse + recoveryStack []map[string]any +} + +// push a variable set on the vstack. +func (p *parser) pushV() { + if cap(p.vstack) == len(p.vstack) { + // create new empty slot in the stack + p.vstack = append(p.vstack, nil) + } else { + // slice to 1 more + p.vstack = p.vstack[:len(p.vstack)+1] + } + + // get the last args set + m := p.vstack[len(p.vstack)-1] + if m != nil && len(m) == 0 { + // empty map, all good + return + } + + m = make(map[string]any) + p.vstack[len(p.vstack)-1] = m +} + +// pop a variable set from the vstack. +func (p *parser) popV() { + // if the map is not empty, clear it + m := p.vstack[len(p.vstack)-1] + if len(m) > 0 { + // GC that map + p.vstack[len(p.vstack)-1] = nil + } + p.vstack = p.vstack[:len(p.vstack)-1] +} + +// push a recovery expression with its labels to the recoveryStack +func (p *parser) pushRecovery(labels []string, expr any) { + if cap(p.recoveryStack) == len(p.recoveryStack) { + // create new empty slot in the stack + p.recoveryStack = append(p.recoveryStack, nil) + } else { + // slice to 1 more + p.recoveryStack = p.recoveryStack[:len(p.recoveryStack)+1] + } + + m := make(map[string]any, len(labels)) + for _, fl := range labels { + m[fl] = expr + } + p.recoveryStack[len(p.recoveryStack)-1] = m +} + +// pop a recovery expression from the recoveryStack +func (p *parser) popRecovery() { + // GC that map + p.recoveryStack[len(p.recoveryStack)-1] = nil + + p.recoveryStack = p.recoveryStack[:len(p.recoveryStack)-1] +} + +func (p *parser) print(prefix, s string) string { + if !p.debug { + return s + } + + fmt.Printf("%s %d:%d:%d: %s [%#U]\n", + prefix, p.pt.line, p.pt.col, p.pt.offset, s, p.pt.rn) + return s +} + +func (p *parser) printIndent(mark string, s string) string { + return p.print(strings.Repeat(" ", p.depth)+mark, s) +} + +func (p *parser) in(s string) string { + res := p.printIndent(">", s) + p.depth++ + return res +} + +func (p *parser) out(s string) string { + p.depth-- + return p.printIndent("<", s) +} + +func (p *parser) addErr(err error) { + p.addErrAt(err, p.pt.position, []string{}) +} + +func (p *parser) addErrAt(err error, pos position, expected []string) { + var buf bytes.Buffer + if p.filename != "" { + buf.WriteString(p.filename) + } + if buf.Len() > 0 { + buf.WriteString(":") + } + buf.WriteString(fmt.Sprintf("%d:%d (%d)", pos.line, pos.col, pos.offset)) + if len(p.rstack) > 0 { + if buf.Len() > 0 { + buf.WriteString(": ") + } + rule := p.rstack[len(p.rstack)-1] + if rule.displayName != "" { + buf.WriteString("rule " + rule.displayName) + } else { + buf.WriteString("rule " + rule.name) + } + } + pe := &parserError{Inner: err, pos: pos, prefix: buf.String(), expected: expected} + p.errs.add(pe) +} + +func (p *parser) failAt(fail bool, pos position, want string) { + // process fail if parsing fails and not inverted or parsing succeeds and invert is set + if fail == p.maxFailInvertExpected { + if pos.offset < p.maxFailPos.offset { + return + } + + if pos.offset > p.maxFailPos.offset { + p.maxFailPos = pos + p.maxFailExpected = p.maxFailExpected[:0] + } + + if p.maxFailInvertExpected { + want = "!" + want + } + p.maxFailExpected = append(p.maxFailExpected, want) + } +} + +// read advances the parser to the next rune. +func (p *parser) read() { + p.pt.offset += p.pt.w + rn, n := utf8.DecodeRune(p.data[p.pt.offset:]) + p.pt.rn = rn + p.pt.w = n + p.pt.col++ + if rn == '\n' { + p.pt.line++ + p.pt.col = 0 + } + + if rn == utf8.RuneError && n == 1 { // see utf8.DecodeRune + if !p.allowInvalidUTF8 { + p.addErr(errInvalidEncoding) + } + } +} + +// restore parser position to the savepoint pt. +func (p *parser) restore(pt savepoint) { + if p.debug { + defer p.out(p.in("restore")) + } + if pt.offset == p.pt.offset { + return + } + p.pt = pt +} + +// Cloner is implemented by any value that has a Clone method, which returns a +// copy of the value. This is mainly used for types which are not passed by +// value (e.g map, slice, chan) or structs that contain such types. +// +// This is used in conjunction with the global state feature to create proper +// copies of the state to allow the parser to properly restore the state in +// the case of backtracking. +type Cloner interface { + Clone() any +} + +var statePool = &sync.Pool{ + New: func() any { return make(storeDict) }, +} + +func (sd storeDict) Discard() { + for k := range sd { + delete(sd, k) + } + statePool.Put(sd) +} + +// clone and return parser current state. +func (p *parser) cloneState() storeDict { + if p.debug { + defer p.out(p.in("cloneState")) + } + + state := statePool.Get().(storeDict) + for k, v := range p.cur.state { + if c, ok := v.(Cloner); ok { + state[k] = c.Clone() + } else { + state[k] = v + } + } + return state +} + +// restore parser current state to the state storeDict. +// every restoreState should applied only one time for every cloned state +func (p *parser) restoreState(state storeDict) { + if p.debug { + defer p.out(p.in("restoreState")) + } + p.cur.state.Discard() + p.cur.state = state +} + +// get the slice of bytes from the savepoint start to the current position. +func (p *parser) sliceFrom(start savepoint) []byte { + return p.data[start.position.offset:p.pt.position.offset] +} + +func (p *parser) getMemoized(node any) (resultTuple, bool) { + if len(p.memo) == 0 { + return resultTuple{}, false + } + m := p.memo[p.pt.offset] + if len(m) == 0 { + return resultTuple{}, false + } + res, ok := m[node] + return res, ok +} + +func (p *parser) setMemoized(pt savepoint, node any, tuple resultTuple) { + if p.memo == nil { + p.memo = make(map[int]map[any]resultTuple) + } + m := p.memo[pt.offset] + if m == nil { + m = make(map[any]resultTuple) + p.memo[pt.offset] = m + } + m[node] = tuple +} + +func (p *parser) buildRulesTable(g *grammar) { + p.rules = make(map[string]*rule, len(g.rules)) + for _, r := range g.rules { + p.rules[r.name] = r + } +} + +// nolint: gocyclo +func (p *parser) parse(g *grammar) (val any, err error) { + if len(g.rules) == 0 { + p.addErr(errNoRule) + return nil, p.errs.err() + } + + // TODO : not super critical but this could be generated + p.buildRulesTable(g) + + if p.recover { + // panic can be used in action code to stop parsing immediately + // and return the panic as an error. + defer func() { + if e := recover(); e != nil { + if p.debug { + defer p.out(p.in("panic handler")) + } + val = nil + switch e := e.(type) { + case error: + p.addErr(e) + default: + p.addErr(fmt.Errorf("%v", e)) + } + err = p.errs.err() + } + }() + } + + startRule, ok := p.rules[p.entrypoint] + if !ok { + p.addErr(errInvalidEntrypoint) + return nil, p.errs.err() + } + + p.read() // advance to first rune + val, ok = p.parseRuleWrap(startRule) + if !ok { + if len(*p.errs) == 0 { + // If parsing fails, but no errors have been recorded, the expected values + // for the farthest parser position are returned as error. + maxFailExpectedMap := make(map[string]struct{}, len(p.maxFailExpected)) + for _, v := range p.maxFailExpected { + maxFailExpectedMap[v] = struct{}{} + } + expected := make([]string, 0, len(maxFailExpectedMap)) + eof := false + if _, ok := maxFailExpectedMap["!."]; ok { + delete(maxFailExpectedMap, "!.") + eof = true + } + for k := range maxFailExpectedMap { + expected = append(expected, k) + } + sort.Strings(expected) + if eof { + expected = append(expected, "EOF") + } + p.addErrAt(errors.New("no match found, expected: "+listJoin(expected, ", ", "or")), p.maxFailPos, expected) + } + + return nil, p.errs.err() + } + return val, p.errs.err() +} + +func listJoin(list []string, sep string, lastSep string) string { + switch len(list) { + case 0: + return "" + case 1: + return list[0] + default: + return strings.Join(list[:len(list)-1], sep) + " " + lastSep + " " + list[len(list)-1] + } +} + +func (p *parser) parseRuleMemoize(rule *rule) (any, bool) { + res, ok := p.getMemoized(rule) + if ok { + p.restore(res.end) + return res.v, res.b + } + + startMark := p.pt + val, ok := p.parseRule(rule) + p.setMemoized(startMark, rule, resultTuple{val, ok, p.pt}) + + return val, ok +} + +func (p *parser) parseRuleWrap(rule *rule) (any, bool) { + if p.debug { + defer p.out(p.in("parseRule " + rule.name)) + } + var ( + val any + ok bool + startMark = p.pt + ) + + if p.memoize { + val, ok = p.parseRuleMemoize(rule) + } else { + val, ok = p.parseRule(rule) + } + + if ok && p.debug { + p.printIndent("MATCH", string(p.sliceFrom(startMark))) + } + return val, ok +} + +func (p *parser) parseRule(rule *rule) (any, bool) { + p.rstack = append(p.rstack, rule) + p.pushV() + val, ok := p.parseExprWrap(rule.expr) + p.popV() + p.rstack = p.rstack[:len(p.rstack)-1] + return val, ok +} + +func (p *parser) parseExprWrap(expr any) (any, bool) { + var pt savepoint + + if p.memoize { + res, ok := p.getMemoized(expr) + if ok { + p.restore(res.end) + return res.v, res.b + } + pt = p.pt + } + + val, ok := p.parseExpr(expr) + + if p.memoize { + p.setMemoized(pt, expr, resultTuple{val, ok, p.pt}) + } + return val, ok +} + +// nolint: gocyclo +func (p *parser) parseExpr(expr any) (any, bool) { + p.ExprCnt++ + if p.ExprCnt > p.maxExprCnt { + panic(errMaxExprCnt) + } + + var val any + var ok bool + switch expr := expr.(type) { + case *actionExpr: + val, ok = p.parseActionExpr(expr) + case *andCodeExpr: + val, ok = p.parseAndCodeExpr(expr) + case *andExpr: + val, ok = p.parseAndExpr(expr) + case *anyMatcher: + val, ok = p.parseAnyMatcher(expr) + case *charClassMatcher: + val, ok = p.parseCharClassMatcher(expr) + case *choiceExpr: + val, ok = p.parseChoiceExpr(expr) + case *labeledExpr: + val, ok = p.parseLabeledExpr(expr) + case *litMatcher: + val, ok = p.parseLitMatcher(expr) + case *notCodeExpr: + val, ok = p.parseNotCodeExpr(expr) + case *notExpr: + val, ok = p.parseNotExpr(expr) + case *oneOrMoreExpr: + val, ok = p.parseOneOrMoreExpr(expr) + case *recoveryExpr: + val, ok = p.parseRecoveryExpr(expr) + case *ruleRefExpr: + val, ok = p.parseRuleRefExpr(expr) + case *seqExpr: + val, ok = p.parseSeqExpr(expr) + case *stateCodeExpr: + val, ok = p.parseStateCodeExpr(expr) + case *throwExpr: + val, ok = p.parseThrowExpr(expr) + case *zeroOrMoreExpr: + val, ok = p.parseZeroOrMoreExpr(expr) + case *zeroOrOneExpr: + val, ok = p.parseZeroOrOneExpr(expr) + default: + panic(fmt.Sprintf("unknown expression type %T", expr)) + } + return val, ok +} + +func (p *parser) parseActionExpr(act *actionExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseActionExpr")) + } + + start := p.pt + val, ok := p.parseExprWrap(act.expr) + if ok { + p.cur.pos = start.position + p.cur.text = p.sliceFrom(start) + state := p.cloneState() + actVal, err := act.run(p) + if err != nil { + p.addErrAt(err, start.position, []string{}) + } + p.restoreState(state) + + val = actVal + } + if ok && p.debug { + p.printIndent("MATCH", string(p.sliceFrom(start))) + } + return val, ok +} + +func (p *parser) parseAndCodeExpr(and *andCodeExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseAndCodeExpr")) + } + + state := p.cloneState() + + ok, err := and.run(p) + if err != nil { + p.addErr(err) + } + p.restoreState(state) + + return nil, ok +} + +func (p *parser) parseAndExpr(and *andExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseAndExpr")) + } + + pt := p.pt + state := p.cloneState() + p.pushV() + _, ok := p.parseExprWrap(and.expr) + p.popV() + p.restoreState(state) + p.restore(pt) + + return nil, ok +} + +func (p *parser) parseAnyMatcher(any *anyMatcher) (any, bool) { + if p.debug { + defer p.out(p.in("parseAnyMatcher")) + } + + if p.pt.rn == utf8.RuneError && p.pt.w == 0 { + // EOF - see utf8.DecodeRune + p.failAt(false, p.pt.position, ".") + return nil, false + } + start := p.pt + p.read() + p.failAt(true, start.position, ".") + return p.sliceFrom(start), true +} + +// nolint: gocyclo +func (p *parser) parseCharClassMatcher(chr *charClassMatcher) (any, bool) { + if p.debug { + defer p.out(p.in("parseCharClassMatcher")) + } + + cur := p.pt.rn + start := p.pt + + // can't match EOF + if cur == utf8.RuneError && p.pt.w == 0 { // see utf8.DecodeRune + p.failAt(false, start.position, chr.val) + return nil, false + } + + if chr.ignoreCase { + cur = unicode.ToLower(cur) + } + + // try to match in the list of available chars + for _, rn := range chr.chars { + if rn == cur { + if chr.inverted { + p.failAt(false, start.position, chr.val) + return nil, false + } + p.read() + p.failAt(true, start.position, chr.val) + return p.sliceFrom(start), true + } + } + + // try to match in the list of ranges + for i := 0; i < len(chr.ranges); i += 2 { + if cur >= chr.ranges[i] && cur <= chr.ranges[i+1] { + if chr.inverted { + p.failAt(false, start.position, chr.val) + return nil, false + } + p.read() + p.failAt(true, start.position, chr.val) + return p.sliceFrom(start), true + } + } + + // try to match in the list of Unicode classes + for _, cl := range chr.classes { + if unicode.Is(cl, cur) { + if chr.inverted { + p.failAt(false, start.position, chr.val) + return nil, false + } + p.read() + p.failAt(true, start.position, chr.val) + return p.sliceFrom(start), true + } + } + + if chr.inverted { + p.read() + p.failAt(true, start.position, chr.val) + return p.sliceFrom(start), true + } + p.failAt(false, start.position, chr.val) + return nil, false +} + +func (p *parser) incChoiceAltCnt(ch *choiceExpr, altI int) { + choiceIdent := fmt.Sprintf("%s %d:%d", p.rstack[len(p.rstack)-1].name, ch.pos.line, ch.pos.col) + m := p.ChoiceAltCnt[choiceIdent] + if m == nil { + m = make(map[string]int) + p.ChoiceAltCnt[choiceIdent] = m + } + // We increment altI by 1, so the keys do not start at 0 + alt := strconv.Itoa(altI + 1) + if altI == choiceNoMatch { + alt = p.choiceNoMatch + } + m[alt]++ +} + +func (p *parser) parseChoiceExpr(ch *choiceExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseChoiceExpr")) + } + + for altI, alt := range ch.alternatives { + // dummy assignment to prevent compile error if optimized + _ = altI + + state := p.cloneState() + + p.pushV() + val, ok := p.parseExprWrap(alt) + p.popV() + if ok { + p.incChoiceAltCnt(ch, altI) + return val, ok + } + p.restoreState(state) + } + p.incChoiceAltCnt(ch, choiceNoMatch) + return nil, false +} + +func (p *parser) parseLabeledExpr(lab *labeledExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseLabeledExpr")) + } + + p.pushV() + val, ok := p.parseExprWrap(lab.expr) + p.popV() + if ok && lab.label != "" { + m := p.vstack[len(p.vstack)-1] + m[lab.label] = val + } + return val, ok +} + +func (p *parser) parseLitMatcher(lit *litMatcher) (any, bool) { + if p.debug { + defer p.out(p.in("parseLitMatcher")) + } + + start := p.pt + for _, want := range lit.val { + cur := p.pt.rn + if lit.ignoreCase { + cur = unicode.ToLower(cur) + } + if cur != want { + p.failAt(false, start.position, lit.want) + p.restore(start) + return nil, false + } + p.read() + } + p.failAt(true, start.position, lit.want) + return p.sliceFrom(start), true +} + +func (p *parser) parseNotCodeExpr(not *notCodeExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseNotCodeExpr")) + } + + state := p.cloneState() + + ok, err := not.run(p) + if err != nil { + p.addErr(err) + } + p.restoreState(state) + + return nil, !ok +} + +func (p *parser) parseNotExpr(not *notExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseNotExpr")) + } + + pt := p.pt + state := p.cloneState() + p.pushV() + p.maxFailInvertExpected = !p.maxFailInvertExpected + _, ok := p.parseExprWrap(not.expr) + p.maxFailInvertExpected = !p.maxFailInvertExpected + p.popV() + p.restoreState(state) + p.restore(pt) + + return nil, !ok +} + +func (p *parser) parseOneOrMoreExpr(expr *oneOrMoreExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseOneOrMoreExpr")) + } + + var vals []any + + for { + p.pushV() + val, ok := p.parseExprWrap(expr.expr) + p.popV() + if !ok { + if len(vals) == 0 { + // did not match once, no match + return nil, false + } + return vals, true + } + vals = append(vals, val) + } +} + +func (p *parser) parseRecoveryExpr(recover *recoveryExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseRecoveryExpr (" + strings.Join(recover.failureLabel, ",") + ")")) + } + + p.pushRecovery(recover.failureLabel, recover.recoverExpr) + val, ok := p.parseExprWrap(recover.expr) + p.popRecovery() + + return val, ok +} + +func (p *parser) parseRuleRefExpr(ref *ruleRefExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseRuleRefExpr " + ref.name)) + } + + if ref.name == "" { + panic(fmt.Sprintf("%s: invalid rule: missing name", ref.pos)) + } + + rule := p.rules[ref.name] + if rule == nil { + p.addErr(fmt.Errorf("undefined rule: %s", ref.name)) + return nil, false + } + return p.parseRuleWrap(rule) +} + +func (p *parser) parseSeqExpr(seq *seqExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseSeqExpr")) + } + + vals := make([]any, 0, len(seq.exprs)) + + pt := p.pt + state := p.cloneState() + for _, expr := range seq.exprs { + val, ok := p.parseExprWrap(expr) + if !ok { + p.restoreState(state) + p.restore(pt) + return nil, false + } + vals = append(vals, val) + } + return vals, true +} + +func (p *parser) parseStateCodeExpr(state *stateCodeExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseStateCodeExpr")) + } + + err := state.run(p) + if err != nil { + p.addErr(err) + } + return nil, true +} + +func (p *parser) parseThrowExpr(expr *throwExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseThrowExpr")) + } + + for i := len(p.recoveryStack) - 1; i >= 0; i-- { + if recoverExpr, ok := p.recoveryStack[i][expr.label]; ok { + if val, ok := p.parseExprWrap(recoverExpr); ok { + return val, ok + } + } + } + + return nil, false +} + +func (p *parser) parseZeroOrMoreExpr(expr *zeroOrMoreExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseZeroOrMoreExpr")) + } + + var vals []any + + for { + p.pushV() + val, ok := p.parseExprWrap(expr.expr) + p.popV() + if !ok { + return vals, true + } + vals = append(vals, val) + } +} + +func (p *parser) parseZeroOrOneExpr(expr *zeroOrOneExpr) (any, bool) { + if p.debug { + defer p.out(p.in("parseZeroOrOneExpr")) + } + + p.pushV() + val, _ := p.parseExprWrap(expr.expr) + p.popV() + // whether it matched or not, consider it a match + return val, true +} diff --git a/test/issue_134/issue_134.peg b/test/issue_134/issue_134.peg new file mode 100644 index 0000000..92d57d6 --- /dev/null +++ b/test/issue_134/issue_134.peg @@ -0,0 +1,9 @@ +{ + package issue134 +} + +// this shouldn't compile if #133 isn't fixed. +Test <- . { + return "\\"+"{", nil +} +