From 64fba2305b03943dbe255df6b8e64746675ce4ab Mon Sep 17 00:00:00 2001 From: Michael Dowling Date: Sun, 31 May 2015 12:46:20 -0700 Subject: [PATCH 1/6] No longer using a regex based lexer --- jmespath/lexer.py | 382 +++++++++++++++++++++++++++++++--------------- 1 file changed, 256 insertions(+), 126 deletions(-) diff --git a/jmespath/lexer.py b/jmespath/lexer.py index 6368851e..d534a91e 100644 --- a/jmespath/lexer.py +++ b/jmespath/lexer.py @@ -1,148 +1,278 @@ -import re +import string import warnings from json import loads from jmespath.exceptions import LexerError, EmptyExpressionError -class Lexer(object): - TOKENS = ( - r'(?P-?\d+)|' - r'(?P([a-zA-Z_][a-zA-Z_0-9]*))|' - r'(?P("(?:\\\\|\\"|[^"])*"))|' - r'(?P(\'(?:\\\\|\\\'|[^\'])*\'))|' - r'(?P(`(?:\\\\|\\`|[^`])*`))|' - r'(?P\[\?)|' - r'(?P\|\|)|' - r'(?P\|)|' - r'(?P!=)|' - r'(?P\})|' - r'(?P==)|' - r'(?P\.)|' - r'(?P\*)|' - r'(?P>=)|' - r'(?P\()|' - r'(?P\{)|' - r'(?P<=)|' - r'(?P\[\])|' - r'(?P\])|' - r'(?P\[)|' - r'(?P\))|' - r'(?P,)|' - r'(?P:)|' - r'(?P<)|' - r'(?P&)|' - r'(?P>)|' - r'(?P@)|' - r'(?P[ \t]+)' - ) +VALID_NUMBER = set(string.digits) +VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_') +STATE_IDENTIFIER = 0; +STATE_NUMBER = 1; +STATE_SINGLE_CHAR = 2; +STATE_WHITESPACE = 3; +STATE_STRING_LITERAL = 4; +STATE_QUOTED_STRING = 5; +STATE_JSON_LITERAL = 6; +STATE_LBRACKET = 7; +STATE_PIPE = 8; +STATE_LT = 9; +STATE_GT = 10; +STATE_EQ = 11; +STATE_NOT = 12; +TRANSITION_TABLE = { + '<': STATE_LT, + '>': STATE_GT, + '=': STATE_EQ, + '!': STATE_NOT, + '[': STATE_LBRACKET, + '|': STATE_PIPE, + '`': STATE_JSON_LITERAL, + '"': STATE_QUOTED_STRING, + "'": STATE_STRING_LITERAL, + '-': STATE_NUMBER, + '0': STATE_NUMBER, + '1': STATE_NUMBER, + '2': STATE_NUMBER, + '3': STATE_NUMBER, + '4': STATE_NUMBER, + '5': STATE_NUMBER, + '6': STATE_NUMBER, + '7': STATE_NUMBER, + '8': STATE_NUMBER, + '9': STATE_NUMBER, + '.': STATE_SINGLE_CHAR, + '*': STATE_SINGLE_CHAR, + ']': STATE_SINGLE_CHAR, + ',': STATE_SINGLE_CHAR, + ':': STATE_SINGLE_CHAR, + '@': STATE_SINGLE_CHAR, + '&': STATE_SINGLE_CHAR, + '(': STATE_SINGLE_CHAR, + ')': STATE_SINGLE_CHAR, + '{': STATE_SINGLE_CHAR, + '}': STATE_SINGLE_CHAR, + '_': STATE_IDENTIFIER, + 'A': STATE_IDENTIFIER, + 'B': STATE_IDENTIFIER, + 'C': STATE_IDENTIFIER, + 'D': STATE_IDENTIFIER, + 'E': STATE_IDENTIFIER, + 'F': STATE_IDENTIFIER, + 'G': STATE_IDENTIFIER, + 'H': STATE_IDENTIFIER, + 'I': STATE_IDENTIFIER, + 'J': STATE_IDENTIFIER, + 'K': STATE_IDENTIFIER, + 'L': STATE_IDENTIFIER, + 'M': STATE_IDENTIFIER, + 'N': STATE_IDENTIFIER, + 'O': STATE_IDENTIFIER, + 'P': STATE_IDENTIFIER, + 'Q': STATE_IDENTIFIER, + 'R': STATE_IDENTIFIER, + 'S': STATE_IDENTIFIER, + 'T': STATE_IDENTIFIER, + 'U': STATE_IDENTIFIER, + 'V': STATE_IDENTIFIER, + 'W': STATE_IDENTIFIER, + 'X': STATE_IDENTIFIER, + 'Y': STATE_IDENTIFIER, + 'Z': STATE_IDENTIFIER, + 'a': STATE_IDENTIFIER, + 'b': STATE_IDENTIFIER, + 'c': STATE_IDENTIFIER, + 'd': STATE_IDENTIFIER, + 'e': STATE_IDENTIFIER, + 'f': STATE_IDENTIFIER, + 'g': STATE_IDENTIFIER, + 'h': STATE_IDENTIFIER, + 'i': STATE_IDENTIFIER, + 'j': STATE_IDENTIFIER, + 'k': STATE_IDENTIFIER, + 'l': STATE_IDENTIFIER, + 'm': STATE_IDENTIFIER, + 'n': STATE_IDENTIFIER, + 'o': STATE_IDENTIFIER, + 'p': STATE_IDENTIFIER, + 'q': STATE_IDENTIFIER, + 'r': STATE_IDENTIFIER, + 's': STATE_IDENTIFIER, + 't': STATE_IDENTIFIER, + 'u': STATE_IDENTIFIER, + 'v': STATE_IDENTIFIER, + 'w': STATE_IDENTIFIER, + 'x': STATE_IDENTIFIER, + 'y': STATE_IDENTIFIER, + 'z': STATE_IDENTIFIER, + ' ': STATE_WHITESPACE, + "\t": 
STATE_WHITESPACE, + "\n": STATE_WHITESPACE, + "\r": STATE_WHITESPACE +} +SIMPLE_TOKENS = { + '.': 'dot', + '*': 'star', + ']': 'rbracket', + ',': 'comma', + ':': 'colon', + '@': 'current', + '&': 'expref', + '(': 'lparen', + ')': 'rparen', + '{': 'lbrace', + '}': 'rbrace' +} - def __init__(self): - self.master_regex = re.compile(self.TOKENS) - def tokenize(self, expression): +class Scanner(object): + def __init__(self, expression): if not expression: raise EmptyExpressionError() - previous_column = 0 - for match in self.master_regex.finditer(expression): - value = match.group() - start = match.start() - end = match.end() - if match.lastgroup == 'skip': - # Ignore whitespace. - previous_column = end - continue - if start != previous_column: - bad_value = expression[previous_column:start] - # Try to give a good error message. - if bad_value == '"': - raise LexerError( - lexer_position=previous_column, - lexer_value=value, - message='Starting quote is missing the ending quote', - expression=expression) - raise LexerError(lexer_position=previous_column, - lexer_value=value, - message='Unknown character', - expression=expression) - previous_column = end - token_type = match.lastgroup - handler = getattr(self, '_token_%s' % token_type.lower(), None) - if handler is not None: - value = handler(value, start, end) - yield {'type': token_type, 'value': value, - 'start': start, 'end': end} - # At the end of the loop make sure we've consumed all the input. - # If we haven't then we have unidentified characters. - if end != len(expression): - msg = "Unknown characters at the end of the expression" - raise LexerError(lexer_position=end, - lexer_value='', - message=msg, expression=expression) + self.expression = expression + self.pos = 0 + self.chars = list(self.expression) + self.len = len(self.expression) + self.current = self.chars[self.pos] + + def next(self): + if self.pos == self.len - 1: + self.current = None else: - yield {'type': 'eof', 'value': '', - 'start': len(expression), 'end': len(expression)} + self.pos += 1 + self.current = self.chars[self.pos] + return self.current - def _token_number(self, value, start, end): - return int(value) + def in_delimter(self, delimiter): + start = self.pos + buffer = '' + self.next() + while self.current != delimiter: + if self.current == '\\': + buffer += '\\' + self.next() + if self.current is None: + print(buffer) + raise LexerError(lexer_position=start, + lexer_value=self.expression, + message="Unclosed delimiter: %s" % buffer) + buffer += self.current + self.next() + self.next() + return buffer - def _token_quoted_identifier(self, value, start, end): - try: - return loads(value) - except ValueError as e: - error_message = str(e).split(':')[0] - raise LexerError(lexer_position=start, - lexer_value=value, - message=error_message) - def _token_string_literal(self, value, start, end): - return value[1:-1] +class Lexer(object): + def tokenize(self, expression): + scanner = Scanner(expression) + while scanner.current is not None: + if not scanner.current in TRANSITION_TABLE: + # The current char must be in the transition table to + # be valid. 
+ yield {'type': 'unknown', 'value': scanner.current, + 'start': scanner.pos, 'end': scanner.pos} + scanner.next() + continue + state = TRANSITION_TABLE[scanner.current] + if state == STATE_SINGLE_CHAR: + yield {'type': SIMPLE_TOKENS[scanner.current], + 'value': scanner.current, + 'start': scanner.pos, 'end': scanner.pos} + scanner.next() + elif state == STATE_IDENTIFIER: + start = scanner.pos + buffer = scanner.current + while scanner.next() in VALID_IDENTIFIER: + buffer += scanner.current + yield {'type': 'identifier', 'value': buffer, + 'start': start, 'end': len(buffer)} + elif state == STATE_WHITESPACE: + scanner.next() + elif state == STATE_LBRACKET: + start = scanner.pos + next_char = scanner.next() + if next_char == ']': + scanner.next() + yield {'type': 'flatten', 'value': '[]', + 'start': start, 'end': start + 1} + elif next_char == '?': + scanner.next() + yield {'type': 'filter', 'value': '[?', + 'start': start, 'end': start + 1} + else: + yield {'type': 'lbracket', 'value': '[', + 'start': start, 'end': start} + elif state == STATE_STRING_LITERAL: + yield self._consume_raw_string_literal(scanner) + elif state == STATE_PIPE: + yield self._match_or_else(scanner, '|', 'or', 'pipe') + elif state == STATE_JSON_LITERAL: + yield self._consume_literal(scanner) + elif state == STATE_NUMBER: + start = scanner.pos + buffer = scanner.current + while scanner.next() in VALID_NUMBER: + buffer += scanner.current + yield {'type': 'number', 'value': int(buffer), + 'start': start, 'end': len(buffer)} + elif state == STATE_QUOTED_STRING: + yield self._consume_quoted_identifier(scanner) + elif state == STATE_LT: + yield self._match_or_else(scanner, '=', 'lte', 'lt') + elif state == STATE_GT: + yield self._match_or_else(scanner, '=', 'gte', 'gt') + elif state == STATE_EQ: + yield self._match_or_else(scanner, '=', 'eq', 'unknown') + elif state == STATE_NOT: + yield self._match_or_else(scanner, '=', 'ne', 'unknown') + yield {'type': 'eof', 'value': '', + 'start': len(expression), 'end': len(expression)} - def _token_literal(self, value, start, end): - actual_value = value[1:-1] - actual_value = actual_value.replace('\\`', '`').lstrip() - # First, if it looks like JSON then we parse it as - # JSON and any json parsing errors propogate as lexing - # errors. - if self._looks_like_json(actual_value): - try: - return loads(actual_value) - except ValueError: - raise LexerError(lexer_position=start, - lexer_value=value, - message="Bad token %s" % value) - else: - potential_value = '"%s"' % actual_value + def _consume_literal(self, scanner): + start = scanner.pos + lexeme = scanner.in_delimter('`') + try: + # Assume it is valid JSON and attempt to parse. + parsed_json = loads(lexeme) + except ValueError: try: - # There's a shortcut syntax where string literals - # don't have to be quoted. This is only true if the - # string doesn't start with chars that could start a valid - # JSON value. - value = loads(potential_value) + # Invalid JSON values should be converted to quoted + # JSON strings during the JEP-12 deprecation period. + parsed_json = loads('"%s"' % lexeme) warnings.warn("deprecated string literal syntax", PendingDeprecationWarning) - return value except ValueError: raise LexerError(lexer_position=start, - lexer_value=value, + lexer_value=lexeme, message="Bad token %s" % value) + return {'type': 'literal', 'value': parsed_json, + 'start': start, 'end': len(lexeme)} - def _looks_like_json(self, value): - # Figure out if the string "value" starts with something - # that looks like json. 
- if not value: - return False - elif value[0] in ['"', '{', '[']: - return True - elif value in ['true', 'false', 'null']: - return True - elif value[0] in ['-', '0', '1', '2', '3', '4', '5', - '6', '7', '8', '9']: - # Then this is JSON, return True. - try: - loads(value) - return True - except ValueError: - return False - else: - return False + def _consume_quoted_identifier(self, scanner): + start = scanner.pos + lexeme = scanner.in_delimter('"') + try: + return {'type': 'identifier', 'value': loads(lexeme), + 'start': start, 'end': len(lexeme)} + except ValueError as e: + error_message = str(e).split(':')[0] + raise LexerError(lexer_position=start, + lexer_value=lexeme, + message=error_message) + + def _consume_raw_string_literal(self, scanner): + start = scanner.pos + lexeme = scanner.in_delimter("'") + return {'type': 'literal', 'value': lexeme, + 'start': start, 'end': len(lexeme)} + + def _match_or_else(self, scanner, expected, match_type, else_type): + start = scanner.pos + current = scanner.current + next_char = scanner.next() + if next_char == expected: + scanner.next() + return {'type': match_type, 'value': current + next_char, + 'start': start, 'end': start + 1} + return {'type': else_type, 'value': current, + 'start': start, 'end': start} From 9db14086d6777a79448f16f811e1d29c3643fa69 Mon Sep 17 00:00:00 2001 From: Michael Dowling Date: Sun, 31 May 2015 12:54:13 -0700 Subject: [PATCH 2/6] Removing the computed table is just as fast but less code --- jmespath/lexer.py | 149 +++++++--------------------------------------- 1 file changed, 23 insertions(+), 126 deletions(-) diff --git a/jmespath/lexer.py b/jmespath/lexer.py index d534a91e..2d704163 100644 --- a/jmespath/lexer.py +++ b/jmespath/lexer.py @@ -5,111 +5,11 @@ from jmespath.exceptions import LexerError, EmptyExpressionError -VALID_NUMBER = set(string.digits) +START_IDENTIFIER = set(string.ascii_letters + '_') VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_') -STATE_IDENTIFIER = 0; -STATE_NUMBER = 1; -STATE_SINGLE_CHAR = 2; -STATE_WHITESPACE = 3; -STATE_STRING_LITERAL = 4; -STATE_QUOTED_STRING = 5; -STATE_JSON_LITERAL = 6; -STATE_LBRACKET = 7; -STATE_PIPE = 8; -STATE_LT = 9; -STATE_GT = 10; -STATE_EQ = 11; -STATE_NOT = 12; -TRANSITION_TABLE = { - '<': STATE_LT, - '>': STATE_GT, - '=': STATE_EQ, - '!': STATE_NOT, - '[': STATE_LBRACKET, - '|': STATE_PIPE, - '`': STATE_JSON_LITERAL, - '"': STATE_QUOTED_STRING, - "'": STATE_STRING_LITERAL, - '-': STATE_NUMBER, - '0': STATE_NUMBER, - '1': STATE_NUMBER, - '2': STATE_NUMBER, - '3': STATE_NUMBER, - '4': STATE_NUMBER, - '5': STATE_NUMBER, - '6': STATE_NUMBER, - '7': STATE_NUMBER, - '8': STATE_NUMBER, - '9': STATE_NUMBER, - '.': STATE_SINGLE_CHAR, - '*': STATE_SINGLE_CHAR, - ']': STATE_SINGLE_CHAR, - ',': STATE_SINGLE_CHAR, - ':': STATE_SINGLE_CHAR, - '@': STATE_SINGLE_CHAR, - '&': STATE_SINGLE_CHAR, - '(': STATE_SINGLE_CHAR, - ')': STATE_SINGLE_CHAR, - '{': STATE_SINGLE_CHAR, - '}': STATE_SINGLE_CHAR, - '_': STATE_IDENTIFIER, - 'A': STATE_IDENTIFIER, - 'B': STATE_IDENTIFIER, - 'C': STATE_IDENTIFIER, - 'D': STATE_IDENTIFIER, - 'E': STATE_IDENTIFIER, - 'F': STATE_IDENTIFIER, - 'G': STATE_IDENTIFIER, - 'H': STATE_IDENTIFIER, - 'I': STATE_IDENTIFIER, - 'J': STATE_IDENTIFIER, - 'K': STATE_IDENTIFIER, - 'L': STATE_IDENTIFIER, - 'M': STATE_IDENTIFIER, - 'N': STATE_IDENTIFIER, - 'O': STATE_IDENTIFIER, - 'P': STATE_IDENTIFIER, - 'Q': STATE_IDENTIFIER, - 'R': STATE_IDENTIFIER, - 'S': STATE_IDENTIFIER, - 'T': STATE_IDENTIFIER, - 'U': STATE_IDENTIFIER, - 'V': 
STATE_IDENTIFIER, - 'W': STATE_IDENTIFIER, - 'X': STATE_IDENTIFIER, - 'Y': STATE_IDENTIFIER, - 'Z': STATE_IDENTIFIER, - 'a': STATE_IDENTIFIER, - 'b': STATE_IDENTIFIER, - 'c': STATE_IDENTIFIER, - 'd': STATE_IDENTIFIER, - 'e': STATE_IDENTIFIER, - 'f': STATE_IDENTIFIER, - 'g': STATE_IDENTIFIER, - 'h': STATE_IDENTIFIER, - 'i': STATE_IDENTIFIER, - 'j': STATE_IDENTIFIER, - 'k': STATE_IDENTIFIER, - 'l': STATE_IDENTIFIER, - 'm': STATE_IDENTIFIER, - 'n': STATE_IDENTIFIER, - 'o': STATE_IDENTIFIER, - 'p': STATE_IDENTIFIER, - 'q': STATE_IDENTIFIER, - 'r': STATE_IDENTIFIER, - 's': STATE_IDENTIFIER, - 't': STATE_IDENTIFIER, - 'u': STATE_IDENTIFIER, - 'v': STATE_IDENTIFIER, - 'w': STATE_IDENTIFIER, - 'x': STATE_IDENTIFIER, - 'y': STATE_IDENTIFIER, - 'z': STATE_IDENTIFIER, - ' ': STATE_WHITESPACE, - "\t": STATE_WHITESPACE, - "\n": STATE_WHITESPACE, - "\r": STATE_WHITESPACE -} +START_NUMBER = set(string.digits) +VALID_NUMBER = set(string.digits) +WHITESPACE = set(" \t\n\r") SIMPLE_TOKENS = { '.': 'dot', '*': 'star', @@ -166,29 +66,22 @@ class Lexer(object): def tokenize(self, expression): scanner = Scanner(expression) while scanner.current is not None: - if not scanner.current in TRANSITION_TABLE: - # The current char must be in the transition table to - # be valid. - yield {'type': 'unknown', 'value': scanner.current, - 'start': scanner.pos, 'end': scanner.pos} - scanner.next() - continue - state = TRANSITION_TABLE[scanner.current] - if state == STATE_SINGLE_CHAR: + + if scanner.current in SIMPLE_TOKENS: yield {'type': SIMPLE_TOKENS[scanner.current], 'value': scanner.current, 'start': scanner.pos, 'end': scanner.pos} scanner.next() - elif state == STATE_IDENTIFIER: + elif scanner.current in START_IDENTIFIER: start = scanner.pos buffer = scanner.current while scanner.next() in VALID_IDENTIFIER: buffer += scanner.current yield {'type': 'identifier', 'value': buffer, 'start': start, 'end': len(buffer)} - elif state == STATE_WHITESPACE: + elif scanner.current in WHITESPACE: scanner.next() - elif state == STATE_LBRACKET: + elif scanner.current == '[': start = scanner.pos next_char = scanner.next() if next_char == ']': @@ -202,29 +95,33 @@ def tokenize(self, expression): else: yield {'type': 'lbracket', 'value': '[', 'start': start, 'end': start} - elif state == STATE_STRING_LITERAL: + elif scanner.current == "'": yield self._consume_raw_string_literal(scanner) - elif state == STATE_PIPE: + elif scanner.current == '|': yield self._match_or_else(scanner, '|', 'or', 'pipe') - elif state == STATE_JSON_LITERAL: + elif scanner.current == '`': yield self._consume_literal(scanner) - elif state == STATE_NUMBER: + elif scanner.current in START_NUMBER: start = scanner.pos buffer = scanner.current while scanner.next() in VALID_NUMBER: buffer += scanner.current yield {'type': 'number', 'value': int(buffer), 'start': start, 'end': len(buffer)} - elif state == STATE_QUOTED_STRING: + elif scanner.current == '"': yield self._consume_quoted_identifier(scanner) - elif state == STATE_LT: + elif scanner.current == '<': yield self._match_or_else(scanner, '=', 'lte', 'lt') - elif state == STATE_GT: + elif scanner.current == '>': yield self._match_or_else(scanner, '=', 'gte', 'gt') - elif state == STATE_EQ: - yield self._match_or_else(scanner, '=', 'eq', 'unknown') - elif state == STATE_NOT: + elif scanner.current == '!': yield self._match_or_else(scanner, '=', 'ne', 'unknown') + elif scanner.current == '=': + yield self._match_or_else(scanner, '=', 'eq', 'unknown') + else: + yield {'type': 'unknown', 'value': scanner.current, + 
'start': scanner.pos, 'end': scanner.pos} + scanner.next() yield {'type': 'eof', 'value': '', 'start': len(expression), 'end': len(expression)} From 4c87771b8d3bc88efd945943b5438ea28603e054 Mon Sep 17 00:00:00 2001 From: Michael Dowling Date: Mon, 1 Jun 2015 16:56:40 -0700 Subject: [PATCH 3/6] Lexer bug fixes --- jmespath/lexer.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/jmespath/lexer.py b/jmespath/lexer.py index 2d704163..9ff35572 100644 --- a/jmespath/lexer.py +++ b/jmespath/lexer.py @@ -7,7 +7,7 @@ START_IDENTIFIER = set(string.ascii_letters + '_') VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_') -START_NUMBER = set(string.digits) +START_NUMBER = set(string.digits + '-') VALID_NUMBER = set(string.digits) WHITESPACE = set(" \t\n\r") SIMPLE_TOKENS = { @@ -43,7 +43,7 @@ def next(self): self.current = self.chars[self.pos] return self.current - def in_delimter(self, delimiter): + def in_delimiter(self, delimiter): start = self.pos buffer = '' self.next() @@ -66,7 +66,6 @@ class Lexer(object): def tokenize(self, expression): scanner = Scanner(expression) while scanner.current is not None: - if scanner.current in SIMPLE_TOKENS: yield {'type': SIMPLE_TOKENS[scanner.current], 'value': scanner.current, @@ -77,7 +76,7 @@ def tokenize(self, expression): buffer = scanner.current while scanner.next() in VALID_IDENTIFIER: buffer += scanner.current - yield {'type': 'identifier', 'value': buffer, + yield {'type': 'unquoted_identifier', 'value': buffer, 'start': start, 'end': len(buffer)} elif scanner.current in WHITESPACE: scanner.next() @@ -127,7 +126,7 @@ def tokenize(self, expression): def _consume_literal(self, scanner): start = scanner.pos - lexeme = scanner.in_delimter('`') + lexeme = scanner.in_delimiter('`') try: # Assume it is valid JSON and attempt to parse. parsed_json = loads(lexeme) @@ -135,21 +134,21 @@ def _consume_literal(self, scanner): try: # Invalid JSON values should be converted to quoted # JSON strings during the JEP-12 deprecation period. 
- parsed_json = loads('"%s"' % lexeme) + parsed_json = loads('"%s"' % lexeme.lstrip()) warnings.warn("deprecated string literal syntax", PendingDeprecationWarning) except ValueError: raise LexerError(lexer_position=start, lexer_value=lexeme, - message="Bad token %s" % value) + message="Bad token %s" % lexeme) return {'type': 'literal', 'value': parsed_json, 'start': start, 'end': len(lexeme)} def _consume_quoted_identifier(self, scanner): start = scanner.pos - lexeme = scanner.in_delimter('"') + lexeme = scanner.in_delimiter('"') try: - return {'type': 'identifier', 'value': loads(lexeme), + return {'type': 'quoted_identifier', 'value': loads(lexeme), 'start': start, 'end': len(lexeme)} except ValueError as e: error_message = str(e).split(':')[0] @@ -159,7 +158,7 @@ def _consume_quoted_identifier(self, scanner): def _consume_raw_string_literal(self, scanner): start = scanner.pos - lexeme = scanner.in_delimter("'") + lexeme = scanner.in_delimiter("'") return {'type': 'literal', 'value': lexeme, 'start': start, 'end': len(lexeme)} From 1f0ad9f038d82ec0d13f08b0c29929bde30378a7 Mon Sep 17 00:00:00 2001 From: Michael Dowling Date: Mon, 1 Jun 2015 19:41:47 -0700 Subject: [PATCH 4/6] Lexer cleanup and test fixes --- jmespath/lexer.py | 107 ++++++++++++++++++++++--------------------- tests/test_lexer.py | 11 +++++ tests/test_parser.py | 8 +--- 3 files changed, 67 insertions(+), 59 deletions(-) diff --git a/jmespath/lexer.py b/jmespath/lexer.py index 9ff35572..8f12a070 100644 --- a/jmespath/lexer.py +++ b/jmespath/lexer.py @@ -5,26 +5,6 @@ from jmespath.exceptions import LexerError, EmptyExpressionError -START_IDENTIFIER = set(string.ascii_letters + '_') -VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_') -START_NUMBER = set(string.digits + '-') -VALID_NUMBER = set(string.digits) -WHITESPACE = set(" \t\n\r") -SIMPLE_TOKENS = { - '.': 'dot', - '*': 'star', - ']': 'rbracket', - ',': 'comma', - ':': 'colon', - '@': 'current', - '&': 'expref', - '(': 'lparen', - ')': 'rparen', - '{': 'lbrace', - '}': 'rbrace' -} - - class Scanner(object): def __init__(self, expression): if not expression: @@ -45,40 +25,59 @@ def next(self): def in_delimiter(self, delimiter): start = self.pos - buffer = '' + buff = '' self.next() while self.current != delimiter: if self.current == '\\': - buffer += '\\' + buff += '\\' self.next() if self.current is None: - print(buffer) raise LexerError(lexer_position=start, lexer_value=self.expression, - message="Unclosed delimiter: %s" % buffer) - buffer += self.current + message="Unclosed %s delimiter" % delimiter) + buff += self.current self.next() + # Skip the closing delimiter. 
self.next() - return buffer + return buff class Lexer(object): + START_IDENTIFIER = set(string.ascii_letters + '_') + VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_') + START_NUMBER = set(string.digits + '-') + VALID_NUMBER = set(string.digits) + WHITESPACE = set(" \t\n\r") + SIMPLE_TOKENS = { + '.': 'dot', + '*': 'star', + ']': 'rbracket', + ',': 'comma', + ':': 'colon', + '@': 'current', + '&': 'expref', + '(': 'lparen', + ')': 'rparen', + '{': 'lbrace', + '}': 'rbrace' + } + def tokenize(self, expression): scanner = Scanner(expression) while scanner.current is not None: - if scanner.current in SIMPLE_TOKENS: - yield {'type': SIMPLE_TOKENS[scanner.current], + if scanner.current in self.SIMPLE_TOKENS: + yield {'type': self.SIMPLE_TOKENS[scanner.current], 'value': scanner.current, - 'start': scanner.pos, 'end': scanner.pos} + 'start': scanner.pos, 'end': scanner.pos + 1} scanner.next() - elif scanner.current in START_IDENTIFIER: + elif scanner.current in self.START_IDENTIFIER: start = scanner.pos - buffer = scanner.current - while scanner.next() in VALID_IDENTIFIER: - buffer += scanner.current - yield {'type': 'unquoted_identifier', 'value': buffer, - 'start': start, 'end': len(buffer)} - elif scanner.current in WHITESPACE: + buff = scanner.current + while scanner.next() in self.VALID_IDENTIFIER: + buff += scanner.current + yield {'type': 'unquoted_identifier', 'value': buff, + 'start': start, 'end': start + len(buff)} + elif scanner.current in self.WHITESPACE: scanner.next() elif scanner.current == '[': start = scanner.pos @@ -86,27 +85,27 @@ def tokenize(self, expression): if next_char == ']': scanner.next() yield {'type': 'flatten', 'value': '[]', - 'start': start, 'end': start + 1} + 'start': start, 'end': start + 2} elif next_char == '?': scanner.next() yield {'type': 'filter', 'value': '[?', - 'start': start, 'end': start + 1} + 'start': start, 'end': start + 2} else: yield {'type': 'lbracket', 'value': '[', - 'start': start, 'end': start} + 'start': start, 'end': start + 1} elif scanner.current == "'": yield self._consume_raw_string_literal(scanner) elif scanner.current == '|': yield self._match_or_else(scanner, '|', 'or', 'pipe') elif scanner.current == '`': yield self._consume_literal(scanner) - elif scanner.current in START_NUMBER: + elif scanner.current in self.START_NUMBER: start = scanner.pos - buffer = scanner.current - while scanner.next() in VALID_NUMBER: - buffer += scanner.current - yield {'type': 'number', 'value': int(buffer), - 'start': start, 'end': len(buffer)} + buff = scanner.current + while scanner.next() in self.VALID_NUMBER: + buff += scanner.current + yield {'type': 'number', 'value': int(buff), + 'start': start, 'end': start + len(buff)} elif scanner.current == '"': yield self._consume_quoted_identifier(scanner) elif scanner.current == '<': @@ -118,15 +117,16 @@ def tokenize(self, expression): elif scanner.current == '=': yield self._match_or_else(scanner, '=', 'eq', 'unknown') else: - yield {'type': 'unknown', 'value': scanner.current, - 'start': scanner.pos, 'end': scanner.pos} - scanner.next() + raise LexerError(lexer_position=scanner.pos, + lexer_value=scanner.current, + message="Unknown token %s" % scanner.current) yield {'type': 'eof', 'value': '', 'start': len(expression), 'end': len(expression)} def _consume_literal(self, scanner): start = scanner.pos lexeme = scanner.in_delimiter('`') + lexeme = lexeme.replace('\\`', '`') try: # Assume it is valid JSON and attempt to parse. 
parsed_json = loads(lexeme) @@ -141,15 +141,17 @@ def _consume_literal(self, scanner): raise LexerError(lexer_position=start, lexer_value=lexeme, message="Bad token %s" % lexeme) + token_len = scanner.pos - start return {'type': 'literal', 'value': parsed_json, - 'start': start, 'end': len(lexeme)} + 'start': start, 'end': token_len} def _consume_quoted_identifier(self, scanner): start = scanner.pos - lexeme = scanner.in_delimiter('"') + lexeme = '"' + scanner.in_delimiter('"') + '"' try: + token_len = scanner.pos - start return {'type': 'quoted_identifier', 'value': loads(lexeme), - 'start': start, 'end': len(lexeme)} + 'start': start, 'end': token_len} except ValueError as e: error_message = str(e).split(':')[0] raise LexerError(lexer_position=start, @@ -159,8 +161,9 @@ def _consume_quoted_identifier(self, scanner): def _consume_raw_string_literal(self, scanner): start = scanner.pos lexeme = scanner.in_delimiter("'") + token_len = scanner.pos - start return {'type': 'literal', 'value': lexeme, - 'start': start, 'end': len(lexeme)} + 'start': start, 'end': token_len} def _match_or_else(self, scanner, expected, match_type, else_type): start = scanner.pos diff --git a/tests/test_lexer.py b/tests/test_lexer.py index d33522c8..54d973e2 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -132,6 +132,17 @@ def test_position_multiple_tokens(self): ] ) + def test_adds_quotes_when_invalid_json(self): + tokens = list(self.lexer.tokenize('`{{}`')) + self.assertEqual( + tokens, + [{'type': 'literal', 'value': '{{}', + 'start': 0, 'end': 4}, + {'type': 'eof', 'value': '', + 'start': 5, 'end': 5} + ] + ) + def test_unknown_character(self): with self.assertRaises(LexerError): tokens = list(self.lexer.tokenize('foo[0^]')) diff --git a/tests/test_parser.py b/tests/test_parser.py index 20fff821..a7856f2e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -144,18 +144,12 @@ def test_incomplete_expression_with_missing_paren(self): def test_bad_lexer_values(self): error_message = ( 'Bad jmespath expression: ' - 'Starting quote is missing the ending quote:\n' + 'Unclosed " delimiter:\n' 'foo."bar\n' ' ^') self.assert_error_message('foo."bar', error_message, exception=exceptions.LexerError) - def test_bad_lexer_literal_value_with_json_object(self): - error_message = ('Bad jmespath expression: ' - 'Bad token `{{}`:\n`{{}`\n^') - self.assert_error_message('`{{}`', error_message, - exception=exceptions.LexerError) - def test_bad_unicode_string(self): # This error message is straight from the JSON parser # and pypy has a slightly different error message, From a234f7367afdeb906b2d16a89516d5f0e0150f8f Mon Sep 17 00:00:00 2001 From: Michael Dowling Date: Wed, 3 Jun 2015 17:01:44 -0700 Subject: [PATCH 5/6] Using a stateful lexer rather than a Scanner object --- jmespath/lexer.py | 196 +++++++++++++++++++++++----------------------- 1 file changed, 97 insertions(+), 99 deletions(-) diff --git a/jmespath/lexer.py b/jmespath/lexer.py index 8f12a070..3ff40119 100644 --- a/jmespath/lexer.py +++ b/jmespath/lexer.py @@ -5,43 +5,6 @@ from jmespath.exceptions import LexerError, EmptyExpressionError -class Scanner(object): - def __init__(self, expression): - if not expression: - raise EmptyExpressionError() - self.expression = expression - self.pos = 0 - self.chars = list(self.expression) - self.len = len(self.expression) - self.current = self.chars[self.pos] - - def next(self): - if self.pos == self.len - 1: - self.current = None - else: - self.pos += 1 - self.current = self.chars[self.pos] - return 
self.current - - def in_delimiter(self, delimiter): - start = self.pos - buff = '' - self.next() - while self.current != delimiter: - if self.current == '\\': - buff += '\\' - self.next() - if self.current is None: - raise LexerError(lexer_position=start, - lexer_value=self.expression, - message="Unclosed %s delimiter" % delimiter) - buff += self.current - self.next() - # Skip the closing delimiter. - self.next() - return buff - - class Lexer(object): START_IDENTIFIER = set(string.ascii_letters + '_') VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_') @@ -63,69 +26,104 @@ class Lexer(object): } def tokenize(self, expression): - scanner = Scanner(expression) - while scanner.current is not None: - if scanner.current in self.SIMPLE_TOKENS: - yield {'type': self.SIMPLE_TOKENS[scanner.current], - 'value': scanner.current, - 'start': scanner.pos, 'end': scanner.pos + 1} - scanner.next() - elif scanner.current in self.START_IDENTIFIER: - start = scanner.pos - buff = scanner.current - while scanner.next() in self.VALID_IDENTIFIER: - buff += scanner.current + self._init_expr(expression) + while self._current is not None: + if self._current in self.SIMPLE_TOKENS: + yield {'type': self.SIMPLE_TOKENS[self._current], + 'value': self._current, + 'start': self._pos, 'end': self._pos + 1} + self._next() + elif self._current in self.START_IDENTIFIER: + start = self._pos + buff = self._current + while self._next() in self.VALID_IDENTIFIER: + buff += self._current yield {'type': 'unquoted_identifier', 'value': buff, 'start': start, 'end': start + len(buff)} - elif scanner.current in self.WHITESPACE: - scanner.next() - elif scanner.current == '[': - start = scanner.pos - next_char = scanner.next() + elif self._current in self.WHITESPACE: + self._next() + elif self._current == '[': + start = self._pos + next_char = self._next() if next_char == ']': - scanner.next() + self._next() yield {'type': 'flatten', 'value': '[]', 'start': start, 'end': start + 2} elif next_char == '?': - scanner.next() + self._next() yield {'type': 'filter', 'value': '[?', 'start': start, 'end': start + 2} else: yield {'type': 'lbracket', 'value': '[', 'start': start, 'end': start + 1} - elif scanner.current == "'": - yield self._consume_raw_string_literal(scanner) - elif scanner.current == '|': - yield self._match_or_else(scanner, '|', 'or', 'pipe') - elif scanner.current == '`': - yield self._consume_literal(scanner) - elif scanner.current in self.START_NUMBER: - start = scanner.pos - buff = scanner.current - while scanner.next() in self.VALID_NUMBER: - buff += scanner.current + elif self._current == "'": + yield self._consume_raw_string_literal() + elif self._current == '|': + yield self._match_or_else('|', 'or', 'pipe') + elif self._current == '`': + yield self._consume_literal() + elif self._current in self.START_NUMBER: + start = self._pos + buff = self._current + while self._next() in self.VALID_NUMBER: + buff += self._current yield {'type': 'number', 'value': int(buff), 'start': start, 'end': start + len(buff)} - elif scanner.current == '"': - yield self._consume_quoted_identifier(scanner) - elif scanner.current == '<': - yield self._match_or_else(scanner, '=', 'lte', 'lt') - elif scanner.current == '>': - yield self._match_or_else(scanner, '=', 'gte', 'gt') - elif scanner.current == '!': - yield self._match_or_else(scanner, '=', 'ne', 'unknown') - elif scanner.current == '=': - yield self._match_or_else(scanner, '=', 'eq', 'unknown') + elif self._current == '"': + yield self._consume_quoted_identifier() + elif 
self._current == '<': + yield self._match_or_else('=', 'lte', 'lt') + elif self._current == '>': + yield self._match_or_else('=', 'gte', 'gt') + elif self._current == '!': + yield self._match_or_else('=', 'ne', 'unknown') + elif self._current == '=': + yield self._match_or_else('=', 'eq', 'unknown') else: - raise LexerError(lexer_position=scanner.pos, - lexer_value=scanner.current, - message="Unknown token %s" % scanner.current) + raise LexerError(lexer_position=self._pos, + lexer_value=self._current, + message="Unknown token %s" % self._current) yield {'type': 'eof', 'value': '', - 'start': len(expression), 'end': len(expression)} + 'start': self._len, 'end': self._len} + + def _init_expr(self, expression): + if not expression: + raise EmptyExpressionError() + self._pos = 0 + self._expression = expression + self._chars = list(self._expression) + self._current = self._chars[self._pos] + self._len = len(self._expression) + + def _next(self): + if self._pos == self._len - 1: + self._current = None + else: + self._pos += 1 + self._current = self._chars[self._pos] + return self._current + + def _in_delimiter(self, delimiter): + start = self._pos + buff = '' + self._next() + while self._current != delimiter: + if self._current == '\\': + buff += '\\' + self._next() + if self._current is None: + raise LexerError(lexer_position=start, + lexer_value=self._expression, + message="Unclosed %s delimiter" % delimiter) + buff += self._current + self._next() + # Skip the closing delimiter. + self._next() + return buff - def _consume_literal(self, scanner): - start = scanner.pos - lexeme = scanner.in_delimiter('`') + def _consume_literal(self): + start = self._pos + lexeme = self._in_delimiter('`') lexeme = lexeme.replace('\\`', '`') try: # Assume it is valid JSON and attempt to parse. 
@@ -139,17 +137,17 @@ def _consume_literal(self, scanner): PendingDeprecationWarning) except ValueError: raise LexerError(lexer_position=start, - lexer_value=lexeme, + lexer_value=self._expression, message="Bad token %s" % lexeme) - token_len = scanner.pos - start + token_len = self._pos - start return {'type': 'literal', 'value': parsed_json, 'start': start, 'end': token_len} - def _consume_quoted_identifier(self, scanner): - start = scanner.pos - lexeme = '"' + scanner.in_delimiter('"') + '"' + def _consume_quoted_identifier(self): + start = self._pos + lexeme = '"' + self._in_delimiter('"') + '"' try: - token_len = scanner.pos - start + token_len = self._pos - start return {'type': 'quoted_identifier', 'value': loads(lexeme), 'start': start, 'end': token_len} except ValueError as e: @@ -158,19 +156,19 @@ def _consume_quoted_identifier(self, scanner): lexer_value=lexeme, message=error_message) - def _consume_raw_string_literal(self, scanner): - start = scanner.pos - lexeme = scanner.in_delimiter("'") - token_len = scanner.pos - start + def _consume_raw_string_literal(self): + start = self._pos + lexeme = self._in_delimiter("'") + token_len = self._pos - start return {'type': 'literal', 'value': lexeme, 'start': start, 'end': token_len} - def _match_or_else(self, scanner, expected, match_type, else_type): - start = scanner.pos - current = scanner.current - next_char = scanner.next() + def _match_or_else(self, expected, match_type, else_type): + start = self._pos + current = self._current + next_char = self._next() if next_char == expected: - scanner.next() + self._next() return {'type': match_type, 'value': current + next_char, 'start': start, 'end': start + 1} return {'type': else_type, 'value': current, From e7c337e7f385219a2fad6185a4f1d52f14d06195 Mon Sep 17 00:00:00 2001 From: James Saryerwinnie Date: Mon, 8 Jun 2015 22:51:50 -0700 Subject: [PATCH 6/6] Rename variables to be full words NFC, just small stylistic changes. 
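For reference, the reworked lexer keeps the same external shape throughout the series: tokenize() is a generator that yields plain dict tokens and always terminates with an explicit 'eof' token. A rough smoke-test sketch (not part of this change; it only assumes the package layout already in this repo):

    from jmespath.lexer import Lexer

    # Each token is a dict with 'type', 'value', 'start', and 'end' keys.
    # The stream always ends with an 'eof' token.
    for token in Lexer().tokenize('foo[0].bar'):
        print(token['type'], repr(token['value']))
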
--- jmespath/lexer.py | 54 ++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/jmespath/lexer.py b/jmespath/lexer.py index 3ff40119..3c11d1d4 100644 --- a/jmespath/lexer.py +++ b/jmespath/lexer.py @@ -26,15 +26,15 @@ class Lexer(object): } def tokenize(self, expression): - self._init_expr(expression) + self._initialize_for_expression(expression) while self._current is not None: if self._current in self.SIMPLE_TOKENS: yield {'type': self.SIMPLE_TOKENS[self._current], 'value': self._current, - 'start': self._pos, 'end': self._pos + 1} + 'start': self._position, 'end': self._position + 1} self._next() elif self._current in self.START_IDENTIFIER: - start = self._pos + start = self._position buff = self._current while self._next() in self.VALID_IDENTIFIER: buff += self._current @@ -43,7 +43,7 @@ def tokenize(self, expression): elif self._current in self.WHITESPACE: self._next() elif self._current == '[': - start = self._pos + start = self._position next_char = self._next() if next_char == ']': self._next() @@ -63,7 +63,7 @@ def tokenize(self, expression): elif self._current == '`': yield self._consume_literal() elif self._current in self.START_NUMBER: - start = self._pos + start = self._position buff = self._current while self._next() in self.VALID_NUMBER: buff += self._current @@ -80,31 +80,33 @@ def tokenize(self, expression): elif self._current == '=': yield self._match_or_else('=', 'eq', 'unknown') else: - raise LexerError(lexer_position=self._pos, + raise LexerError(lexer_position=self._position, lexer_value=self._current, message="Unknown token %s" % self._current) yield {'type': 'eof', 'value': '', - 'start': self._len, 'end': self._len} + 'start': self._length, 'end': self._length} - def _init_expr(self, expression): + def _initialize_for_expression(self, expression): if not expression: raise EmptyExpressionError() - self._pos = 0 + self._position = 0 self._expression = expression self._chars = list(self._expression) - self._current = self._chars[self._pos] - self._len = len(self._expression) + self._current = self._chars[self._position] + self._length = len(self._expression) def _next(self): - if self._pos == self._len - 1: + if self._position == self._length - 1: self._current = None else: - self._pos += 1 - self._current = self._chars[self._pos] + self._position += 1 + self._current = self._chars[self._position] return self._current - def _in_delimiter(self, delimiter): - start = self._pos + def _consume_until(self, delimiter): + # Consume until the delimiter is reached, + # allowing for the delimiter to be escaped with "\". + start = self._position buff = '' self._next() while self._current != delimiter: @@ -122,8 +124,8 @@ def _in_delimiter(self, delimiter): return buff def _consume_literal(self): - start = self._pos - lexeme = self._in_delimiter('`') + start = self._position + lexeme = self._consume_until('`') lexeme = lexeme.replace('\\`', '`') try: # Assume it is valid JSON and attempt to parse. 
@@ -139,15 +141,15 @@ def _consume_literal(self): raise LexerError(lexer_position=start, lexer_value=self._expression, message="Bad token %s" % lexeme) - token_len = self._pos - start + token_len = self._position - start return {'type': 'literal', 'value': parsed_json, 'start': start, 'end': token_len} def _consume_quoted_identifier(self): - start = self._pos - lexeme = '"' + self._in_delimiter('"') + '"' + start = self._position + lexeme = '"' + self._consume_until('"') + '"' try: - token_len = self._pos - start + token_len = self._position - start return {'type': 'quoted_identifier', 'value': loads(lexeme), 'start': start, 'end': token_len} except ValueError as e: @@ -157,14 +159,14 @@ def _consume_quoted_identifier(self): message=error_message) def _consume_raw_string_literal(self): - start = self._pos - lexeme = self._in_delimiter("'") - token_len = self._pos - start + start = self._position + lexeme = self._consume_until("'") + token_len = self._position - start return {'type': 'literal', 'value': lexeme, 'start': start, 'end': token_len} def _match_or_else(self, expected, match_type, else_type): - start = self._pos + start = self._position current = self._current next_char = self._next() if next_char == expected:
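
The _match_or_else helper shown above implements the single-character look-ahead that distinguishes two-character operators ('<=', '>=', '==', '!=', '||') from their one-character prefixes. A simplified standalone sketch of that pattern (illustrative only; it uses exclusive end offsets rather than the offsets emitted by this patch):

    def match_or_else(text, pos, expected, match_type, else_type):
        # Look at the character after pos; if it completes the two-character
        # operator, emit the combined token, otherwise fall back to else_type.
        current = text[pos]
        next_char = text[pos + 1] if pos + 1 < len(text) else None
        if next_char == expected:
            return {'type': match_type, 'value': current + next_char,
                    'start': pos, 'end': pos + 2}
        return {'type': else_type, 'value': current,
                'start': pos, 'end': pos + 1}

    print(match_or_else('a<=b', 1, '=', 'lte', 'lt'))  # -> 'lte' token for '<='
    print(match_or_else('a<b', 1, '=', 'lte', 'lt'))   # -> 'lt' token for '<'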