
Commit

Merge branch 'mtdowling-lexer-perf' into develop
* mtdowling-lexer-perf:
  Rename variables to be full words
  Using a stateful lexer rather than a Scanner object
  Lexer cleanup and test fixes
  Lexer bug fixes
  Removing the computed table is just as fast but less code
  No longer using a regex based lexer
jamesls committed Jun 9, 2015
2 parents 2c6d516 + e7c337e commit b0c1004
Showing 3 changed files with 168 additions and 134 deletions.
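
Note: the branch name ('mtdowling-lexer-perf') marks this as a performance change. A rough way to compare the lexer before and after the merge is a micro-benchmark like the sketch below; it is not part of the commit, and the sample expression, iteration count, and import path are illustrative assumptions. Absolute numbers will vary by machine and interpreter.

# Rough micro-benchmark sketch (not part of the commit). Run once on the
# first parent (2c6d516, regex-based lexer) and once on b0c1004 (stateful
# lexer), then compare the reported timings.
import timeit

setup = "from jmespath.lexer import Lexer; lexer = Lexer()"
stmt = "list(lexer.tokenize('foo.bar[0].baz[?a == `1`] | [0]'))"

# Total seconds for 10,000 tokenize passes over the expression above.
print(timeit.timeit(stmt, setup=setup, number=10000))
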
283 changes: 156 additions & 127 deletions jmespath/lexer.py
@@ -1,148 +1,177 @@
-import re
+import string
 import warnings
 from json import loads

 from jmespath.exceptions import LexerError, EmptyExpressionError


 class Lexer(object):
-    TOKENS = (
-        r'(?P<number>-?\d+)|'
-        r'(?P<unquoted_identifier>([a-zA-Z_][a-zA-Z_0-9]*))|'
-        r'(?P<quoted_identifier>("(?:\\\\|\\"|[^"])*"))|'
-        r'(?P<string_literal>(\'(?:\\\\|\\\'|[^\'])*\'))|'
-        r'(?P<literal>(`(?:\\\\|\\`|[^`])*`))|'
-        r'(?P<filter>\[\?)|'
-        r'(?P<or>\|\|)|'
-        r'(?P<pipe>\|)|'
-        r'(?P<ne>!=)|'
-        r'(?P<rbrace>\})|'
-        r'(?P<eq>==)|'
-        r'(?P<dot>\.)|'
-        r'(?P<star>\*)|'
-        r'(?P<gte>>=)|'
-        r'(?P<lparen>\()|'
-        r'(?P<lbrace>\{)|'
-        r'(?P<lte><=)|'
-        r'(?P<flatten>\[\])|'
-        r'(?P<rbracket>\])|'
-        r'(?P<lbracket>\[)|'
-        r'(?P<rparen>\))|'
-        r'(?P<comma>,)|'
-        r'(?P<colon>:)|'
-        r'(?P<lt><)|'
-        r'(?P<expref>&)|'
-        r'(?P<gt>>)|'
-        r'(?P<current>@)|'
-        r'(?P<skip>[ \t]+)'
-    )
-
-    def __init__(self):
-        self.master_regex = re.compile(self.TOKENS)
+    START_IDENTIFIER = set(string.ascii_letters + '_')
+    VALID_IDENTIFIER = set(string.ascii_letters + string.digits + '_')
+    START_NUMBER = set(string.digits + '-')
+    VALID_NUMBER = set(string.digits)
+    WHITESPACE = set(" \t\n\r")
+    SIMPLE_TOKENS = {
+        '.': 'dot',
+        '*': 'star',
+        ']': 'rbracket',
+        ',': 'comma',
+        ':': 'colon',
+        '@': 'current',
+        '&': 'expref',
+        '(': 'lparen',
+        ')': 'rparen',
+        '{': 'lbrace',
+        '}': 'rbrace'
+    }

     def tokenize(self, expression):
+        self._initialize_for_expression(expression)
+        while self._current is not None:
+            if self._current in self.SIMPLE_TOKENS:
+                yield {'type': self.SIMPLE_TOKENS[self._current],
+                       'value': self._current,
+                       'start': self._position, 'end': self._position + 1}
+                self._next()
+            elif self._current in self.START_IDENTIFIER:
+                start = self._position
+                buff = self._current
+                while self._next() in self.VALID_IDENTIFIER:
+                    buff += self._current
+                yield {'type': 'unquoted_identifier', 'value': buff,
+                       'start': start, 'end': start + len(buff)}
+            elif self._current in self.WHITESPACE:
+                self._next()
+            elif self._current == '[':
+                start = self._position
+                next_char = self._next()
+                if next_char == ']':
+                    self._next()
+                    yield {'type': 'flatten', 'value': '[]',
+                           'start': start, 'end': start + 2}
+                elif next_char == '?':
+                    self._next()
+                    yield {'type': 'filter', 'value': '[?',
+                           'start': start, 'end': start + 2}
+                else:
+                    yield {'type': 'lbracket', 'value': '[',
+                           'start': start, 'end': start + 1}
+            elif self._current == "'":
+                yield self._consume_raw_string_literal()
+            elif self._current == '|':
+                yield self._match_or_else('|', 'or', 'pipe')
+            elif self._current == '`':
+                yield self._consume_literal()
+            elif self._current in self.START_NUMBER:
+                start = self._position
+                buff = self._current
+                while self._next() in self.VALID_NUMBER:
+                    buff += self._current
+                yield {'type': 'number', 'value': int(buff),
+                       'start': start, 'end': start + len(buff)}
+            elif self._current == '"':
+                yield self._consume_quoted_identifier()
+            elif self._current == '<':
+                yield self._match_or_else('=', 'lte', 'lt')
+            elif self._current == '>':
+                yield self._match_or_else('=', 'gte', 'gt')
+            elif self._current == '!':
+                yield self._match_or_else('=', 'ne', 'unknown')
+            elif self._current == '=':
+                yield self._match_or_else('=', 'eq', 'unknown')
+            else:
+                raise LexerError(lexer_position=self._position,
+                                 lexer_value=self._current,
+                                 message="Unknown token %s" % self._current)
+        yield {'type': 'eof', 'value': '',
+               'start': self._length, 'end': self._length}
+
+    def _initialize_for_expression(self, expression):
         if not expression:
             raise EmptyExpressionError()
-        previous_column = 0
-        for match in self.master_regex.finditer(expression):
-            value = match.group()
-            start = match.start()
-            end = match.end()
-            if match.lastgroup == 'skip':
-                # Ignore whitespace.
-                previous_column = end
-                continue
-            if start != previous_column:
-                bad_value = expression[previous_column:start]
-                # Try to give a good error message.
-                if bad_value == '"':
-                    raise LexerError(
-                        lexer_position=previous_column,
-                        lexer_value=value,
-                        message='Starting quote is missing the ending quote',
-                        expression=expression)
-                raise LexerError(lexer_position=previous_column,
-                                 lexer_value=value,
-                                 message='Unknown character',
-                                 expression=expression)
-            previous_column = end
-            token_type = match.lastgroup
-            handler = getattr(self, '_token_%s' % token_type.lower(), None)
-            if handler is not None:
-                value = handler(value, start, end)
-            yield {'type': token_type, 'value': value,
-                   'start': start, 'end': end}
-        # At the end of the loop make sure we've consumed all the input.
-        # If we haven't then we have unidentified characters.
-        if end != len(expression):
-            msg = "Unknown characters at the end of the expression"
-            raise LexerError(lexer_position=end,
-                             lexer_value='',
-                             message=msg, expression=expression)
+        self._position = 0
+        self._expression = expression
+        self._chars = list(self._expression)
+        self._current = self._chars[self._position]
+        self._length = len(self._expression)
+
+    def _next(self):
+        if self._position == self._length - 1:
+            self._current = None
         else:
-            yield {'type': 'eof', 'value': '',
-                   'start': len(expression), 'end': len(expression)}
+            self._position += 1
+            self._current = self._chars[self._position]
+        return self._current

-    def _token_number(self, value, start, end):
-        return int(value)
+    def _consume_until(self, delimiter):
+        # Consume until the delimiter is reached,
+        # allowing for the delimiter to be escaped with "\".
+        start = self._position
+        buff = ''
+        self._next()
+        while self._current != delimiter:
+            if self._current == '\\':
+                buff += '\\'
+                self._next()
+            if self._current is None:
+                raise LexerError(lexer_position=start,
+                                 lexer_value=self._expression,
+                                 message="Unclosed %s delimiter" % delimiter)
+            buff += self._current
+            self._next()
+        # Skip the closing delimiter.
+        self._next()
+        return buff

-    def _token_quoted_identifier(self, value, start, end):
+    def _consume_literal(self):
+        start = self._position
+        lexeme = self._consume_until('`')
+        lexeme = lexeme.replace('\\`', '`')
         try:
-            return loads(value)
-        except ValueError as e:
-            error_message = str(e).split(':')[0]
-            raise LexerError(lexer_position=start,
-                             lexer_value=value,
-                             message=error_message)
-
-    def _token_string_literal(self, value, start, end):
-        return value[1:-1]
-
-    def _token_literal(self, value, start, end):
-        actual_value = value[1:-1]
-        actual_value = actual_value.replace('\\`', '`').lstrip()
-        # First, if it looks like JSON then we parse it as
-        # JSON and any json parsing errors propogate as lexing
-        # errors.
-        if self._looks_like_json(actual_value):
-            try:
-                return loads(actual_value)
-            except ValueError:
-                raise LexerError(lexer_position=start,
-                                 lexer_value=value,
-                                 message="Bad token %s" % value)
-        else:
-            potential_value = '"%s"' % actual_value
+            # Assume it is valid JSON and attempt to parse.
+            parsed_json = loads(lexeme)
+        except ValueError:
             try:
-                # There's a shortcut syntax where string literals
-                # don't have to be quoted. This is only true if the
-                # string doesn't start with chars that could start a valid
-                # JSON value.
-                value = loads(potential_value)
+                # Invalid JSON values should be converted to quoted
+                # JSON strings during the JEP-12 deprecation period.
+                parsed_json = loads('"%s"' % lexeme.lstrip())
                 warnings.warn("deprecated string literal syntax",
                               PendingDeprecationWarning)
-                return value
             except ValueError:
                 raise LexerError(lexer_position=start,
-                                 lexer_value=value,
-                                 message="Bad token %s" % value)
+                                 lexer_value=self._expression,
+                                 message="Bad token %s" % lexeme)
+        token_len = self._position - start
+        return {'type': 'literal', 'value': parsed_json,
+                'start': start, 'end': token_len}

-    def _looks_like_json(self, value):
-        # Figure out if the string "value" starts with something
-        # that looks like json.
-        if not value:
-            return False
-        elif value[0] in ['"', '{', '[']:
-            return True
-        elif value in ['true', 'false', 'null']:
-            return True
-        elif value[0] in ['-', '0', '1', '2', '3', '4', '5',
-                          '6', '7', '8', '9']:
-            # Then this is JSON, return True.
-            try:
-                loads(value)
-                return True
-            except ValueError:
-                return False
-        else:
-            return False
+    def _consume_quoted_identifier(self):
+        start = self._position
+        lexeme = '"' + self._consume_until('"') + '"'
+        try:
+            token_len = self._position - start
+            return {'type': 'quoted_identifier', 'value': loads(lexeme),
+                    'start': start, 'end': token_len}
+        except ValueError as e:
+            error_message = str(e).split(':')[0]
+            raise LexerError(lexer_position=start,
+                             lexer_value=lexeme,
+                             message=error_message)
+
+    def _consume_raw_string_literal(self):
+        start = self._position
+        lexeme = self._consume_until("'")
+        token_len = self._position - start
+        return {'type': 'literal', 'value': lexeme,
+                'start': start, 'end': token_len}
+
+    def _match_or_else(self, expected, match_type, else_type):
+        start = self._position
+        current = self._current
+        next_char = self._next()
+        if next_char == expected:
+            self._next()
+            return {'type': match_type, 'value': current + next_char,
+                    'start': start, 'end': start + 1}
+        return {'type': else_type, 'value': current,
+                'start': start, 'end': start}
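
For reference, a minimal usage sketch of the reworked lexer as it stands after this commit; it is not part of the diff, the expression is arbitrary, and the printed fields simply mirror the token dicts produced by tokenize() above.

# Usage sketch (not part of the diff): tokenize() is a generator that yields
# dicts with 'type', 'value', 'start', and 'end' keys and ends with an
# 'eof' token.
from jmespath.lexer import Lexer

lexer = Lexer()
for token in lexer.tokenize('foo.bar[0] | baz'):
    print(token['type'], repr(token['value']), token['start'], token['end'])
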
11 changes: 11 additions & 0 deletions tests/test_lexer.py
@@ -132,6 +132,17 @@ def test_position_multiple_tokens(self):
             ]
         )

+    def test_adds_quotes_when_invalid_json(self):
+        tokens = list(self.lexer.tokenize('`{{}`'))
+        self.assertEqual(
+            tokens,
+            [{'type': 'literal', 'value': '{{}',
+              'start': 0, 'end': 4},
+             {'type': 'eof', 'value': '',
+              'start': 5, 'end': 5}
+            ]
+        )
+
     def test_unknown_character(self):
         with self.assertRaises(LexerError):
            tokens = list(self.lexer.tokenize('foo[0^]'))
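
The new test above exercises the JEP-12 fallback in _consume_literal: a backtick literal that is not valid JSON is re-parsed as a quoted JSON string and a PendingDeprecationWarning is emitted. A quick interactive check of that behaviour (an illustrative sketch, assuming the post-commit package is importable) might look like:

# Illustrative check (not part of the diff) of the deprecated literal
# syntax path in _consume_literal.
import warnings
from jmespath.lexer import Lexer

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    tokens = list(Lexer().tokenize('`{{}`'))

print(tokens[0]['value'])             # '{{}' -- parsed via loads('"{{}"')
print(caught[0].category.__name__)    # PendingDeprecationWarning
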
8 changes: 1 addition & 7 deletions tests/test_parser.py
@@ -144,18 +144,12 @@ def test_incomplete_expression_with_missing_paren(self):
     def test_bad_lexer_values(self):
         error_message = (
             'Bad jmespath expression: '
-            'Starting quote is missing the ending quote:\n'
+            'Unclosed " delimiter:\n'
             'foo."bar\n'
             ' ^')
         self.assert_error_message('foo."bar', error_message,
                                   exception=exceptions.LexerError)

-    def test_bad_lexer_literal_value_with_json_object(self):
-        error_message = ('Bad jmespath expression: '
-                         'Bad token `{{}`:\n`{{}`\n^')
-        self.assert_error_message('`{{}`', error_message,
-                                  exception=exceptions.LexerError)
-
     def test_bad_unicode_string(self):
         # This error message is straight from the JSON parser
         # and pypy has a slightly different error message,
