Tokenizer refactorings required by #619 (#623)

Closed · wants to merge 4 commits
14 changes: 7 additions & 7 deletions packaging/_parser.py
@@ -12,7 +12,7 @@
from ast import literal_eval
from typing import Any, List, NamedTuple, Tuple, Union

from ._tokenizer import Tokenizer
from ._tokenizer import MARKER_VAR_SUITABLE_TOKENS, Tokenizer


class Node:
@@ -74,15 +74,15 @@ def parse_named_requirement(requirement: str) -> Requirement:
specifier = ""
url = ""
if tokens.match("URL_SPEC"):
url = tokens.read().text[1:].strip()
url = tokens.read(tokens.rules).text[1:].strip()
elif not tokens.match("END"):
specifier = parse_specifier(tokens)
if tokens.try_read("SEMICOLON"):
marker = ""
while not tokens.match("END"):
# we don't validate markers here, it's done later as part of
# packaging/requirements.py
marker += tokens.read().text
marker += tokens.read(tokens.rules).text
else:
marker = ""
tokens.expect(
@@ -130,7 +130,7 @@ def parse_version_many(tokens: Tokenizer) -> str:
while tokens.match("OP"):
parsed_specifiers += tokens.read("OP").text
if tokens.match("VERSION"):
parsed_specifiers += tokens.read("VERSION").text
parsed_specifiers += tokens.read("VERSION").text.strip()
else:
tokens.raise_syntax_error(message="Missing version")
if not tokens.match("COMMA"):
@@ -178,7 +178,7 @@ def parse_marker_var(tokens: Tokenizer) -> MarkerVar:
"""
marker_var: env_var | python_str
"""
if tokens.match("VARIABLE"):
if tokens.match(MARKER_VAR_SUITABLE_TOKENS):
return parse_env_var(tokens)
else:
return parse_python_str(tokens)
@@ -188,7 +188,7 @@ def parse_env_var(tokens: Tokenizer) -> Variable:
"""
env_var: VARIABLE
"""
env_var = tokens.read("VARIABLE").text.replace(".", "_")
env_var = tokens.read(MARKER_VAR_SUITABLE_TOKENS).text.strip().replace(".", "_")
if (
env_var == "platform_python_implementation"
or env_var == "python_implementation"
@@ -220,7 +220,7 @@ def parse_marker_op(tokens: Tokenizer) -> Op:
tokens.read("IN", error_message="NOT token must be follewed by IN token")
return Op("not in")
elif tokens.match("OP"):
return Op(tokens.read().text)
return Op(tokens.read(tokens.rules).text)
else:
return tokens.raise_syntax_error(
message='Couldn\'t parse marker operator. Expecting one of \
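For orientation, a minimal sketch (not part of the diff) of how the changed parser entry point might be exercised after this refactoring; the requirement string below is an illustrative example of mine, while parse_named_requirement and the module path come from the file above:

from packaging._parser import parse_named_requirement

# The marker text after the semicolon is collected token by token via
# tokens.read(tokens.rules) and only validated later in requirements.py,
# as noted in the diff above.
req = parse_named_requirement('name >= 1.0 ; python_version >= "3.7"')
print(req)

The added .strip() on VERSION tokens presumably matters here because leading whitespace is no longer consumed by a per-rule \s* and can end up captured inside the version regex itself.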
97 changes: 61 additions & 36 deletions packaging/_tokenizer.py
@@ -1,19 +1,25 @@
import re
from typing import Dict, Generator, NoReturn, Optional
from typing import Any, Dict, Generator, NoReturn, Optional, Set, Union

from .specifiers import Specifier

TokenNameMatchT = Union[str, Set[str], Dict[str, Any]]


class Token:
def __init__(self, name: str, text: str, position: int) -> None:
self.name = name
self.text = text
self.position = position

def matches(self, name: str = "") -> bool:
if name and self.name != name:
return False
return True
def matches(self, name: TokenNameMatchT = "") -> bool:
if isinstance(name, str):
name = {name}

if self.name in name:
return True

return False


class ParseExceptionError(Exception):
@@ -27,15 +33,14 @@ def __init__(self, message: str, position: int) -> None:


DEFAULT_RULES = {
"LPAREN": r"\s*\(",
"RPAREN": r"\s*\)",
"LBRACKET": r"\s*\[",
"RBRACKET": r"\s*\]",
"SEMICOLON": r"\s*;",
"COMMA": r"\s*,",
"LPAREN": r"\(",
"RPAREN": r"\)",
"LBRACKET": r"\[",
"RBRACKET": r"\]",
"SEMICOLON": r";",
"COMMA": r",",
"QUOTED_STRING": re.compile(
r"""
\s*
(
('[^']*')
|
@@ -44,13 +49,12 @@ def __init__(self, message: str, position: int) -> None:
""",
re.VERBOSE,
),
"OP": r"\s*(===|==|~=|!=|<=|>=|<|>)",
"BOOLOP": r"\s*(or|and)",
"IN": r"\s*in",
"NOT": r"\s*not",
"OP": "(===|==|~=|!=|<=|>=|<|>)",
"BOOLOP": "(or|and)",
"IN": "in",
"NOT": "not",
"VARIABLE": re.compile(
r"""
\s*
(
python_version
|python_full_version
@@ -66,9 +70,12 @@ def __init__(self, message: str, position: int) -> None:
re.VERBOSE,
),
"VERSION": re.compile(Specifier._version_regex_str, re.VERBOSE | re.IGNORECASE),
"URL_SPEC": r"\s*@ *[^ ]+",
"IDENTIFIER": r"\s*[a-zA-Z0-9._-]+",
"URL_SPEC": "@ *[^ ]+",
"IDENTIFIER": "[a-zA-Z0-9._-]+",
"WS": "\\s+",
}
WHITESPACE_TOKENS = "WS"
MARKER_VAR_SUITABLE_TOKENS = {"VARIABLE", "VERSION"}


class Tokenizer:
@@ -82,51 +89,69 @@ class Tokenizer:

def __init__(self, source: str, rules: Dict[str, object] = DEFAULT_RULES) -> None:
self.source = source
self.rules = {name: re.compile(pattern) for name, pattern in rules.items()}
self.rules = {name: re.compile(pattern) for (name, pattern) in rules.items()}
self.next_token = None
self.generator = self._tokenize()
self.position = 0

def peek(self) -> Token:
def peek(self, skip: TokenNameMatchT = WHITESPACE_TOKENS) -> Token:
"""
Return the next token to be read.
"""
if not self.next_token:
while not self.next_token:
self.next_token = next(self.generator)
if self.next_token and self.next_token.matches(skip):
# print("skip", self.next_token)
self.next_token = None
# print("peek", self.next_token)
return self.next_token

def match(self, *name: str) -> bool:
def match(
self, name: TokenNameMatchT, skip: TokenNameMatchT = WHITESPACE_TOKENS
) -> bool:
"""
Return True if the next token matches the given arguments.
"""
token = self.peek()
return token.matches(*name)

def expect(self, *name: str, error_message: str) -> Token:
token = self.peek(skip)
return token.matches(name)

def expect(
self,
name: TokenNameMatchT,
error_message: str,
skip: TokenNameMatchT = WHITESPACE_TOKENS,
) -> Token:
"""
Raise SyntaxError if the next token doesn't match given arguments.
"""
token = self.peek()
if not token.matches(*name):
token = self.peek(skip)
if not token.matches(name):
raise self.raise_syntax_error(message=error_message)
return token

def read(self, *name: str, error_message: str = "") -> Token:
def read(
self,
name: TokenNameMatchT,
error_message: str = "",
skip: TokenNameMatchT = WHITESPACE_TOKENS,
) -> Token:
"""Return the next token and advance to the next token.

Raise SyntaxError if the token doesn't match.
"""
result = self.expect(*name, error_message=error_message)
result = self.expect(name, error_message=error_message, skip=skip)
self.next_token = None
return result

def try_read(self, *name: str) -> Optional[Token]:
def try_read(
self, name: TokenNameMatchT, skip: TokenNameMatchT = WHITESPACE_TOKENS
) -> Optional[Token]:
"""read() if the next token matches the given arguments.

Do nothing if it does not match.
"""
if self.match(*name):
return self.read()
if self.match(name, skip=skip):
return self.read(self.rules, skip=skip)
return None

def raise_syntax_error(self, *, message: str) -> NoReturn:
@@ -140,7 +165,7 @@ def raise_syntax_error(self, *, message: str) -> NoReturn:
self.position,
)

def _make_token(self, name: str, text: str) -> Token:
def _make_token(self, name: TokenNameMatchT, text: str) -> Token:
"""
Make a token with the current position.
"""
@@ -156,7 +181,7 @@ def _tokenize(self) -> Generator[Token, Token, None]:
if match:
token_text = match[0]

yield self._make_token(name, token_text.strip())
yield self._make_token(name, token_text)
self.position += len(token_text)
break
else:
Expand Down