From 15d95d7ac66cf4ebca69ced7896e054eb5cdeaa7 Mon Sep 17 00:00:00 2001 From: Jack Cherng Date: Wed, 18 Sep 2024 11:16:03 +0800 Subject: [PATCH] refactor: use "python-vendorize" for 3rd-party libs Signed-off-by: Jack Cherng --- .gitattributes | 1 + .gitignore | 1 + Makefile | 4 + plugin/_vendor/__init__.py | 0 plugin/{libs => _vendor}/trie/LICENSE.md | 0 plugin/{libs => _vendor}/trie/__init__.py | 19 +-- plugin/_vendor/triegex/__init__.py | 146 +++++++++++++++++++ plugin/libs/triegex/LICENSE | 21 --- plugin/libs/triegex/__init__.py | 170 ---------------------- plugin/utils.py | 5 +- pyproject.toml | 6 +- vendorize.toml | 4 + 12 files changed, 171 insertions(+), 206 deletions(-) create mode 100644 plugin/_vendor/__init__.py rename plugin/{libs => _vendor}/trie/LICENSE.md (100%) rename plugin/{libs => _vendor}/trie/__init__.py (84%) create mode 100644 plugin/_vendor/triegex/__init__.py delete mode 100644 plugin/libs/triegex/LICENSE delete mode 100644 plugin/libs/triegex/__init__.py create mode 100644 vendorize.toml diff --git a/.gitattributes b/.gitattributes index b8a5b0f0..40edee50 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,4 @@ +*.dist-info/ export-ignore *.pyi export-ignore .dependabot export-ignore .editorconfig export-ignore diff --git a/.gitignore b/.gitignore index 6d11cacc..055cc491 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ Desktop.ini Thumbs.db # Python +*.dist-info/ *.py[cod] .mypy_cache/ .ruff_cache/ diff --git a/Makefile b/Makefile index d67cec49..0955e816 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,10 @@ pip-compile: uv pip compile --upgrade requirements-dev.in -o requirements-dev.txt uv pip compile --upgrade requirements-docs.in -o requirements-docs.txt +.PHONY: vendorize +vendorize: + python-vendorize + .PHONY: ci-check ci-check: @echo "========== check: mypy ==========" diff --git a/plugin/_vendor/__init__.py b/plugin/_vendor/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/plugin/libs/trie/LICENSE.md b/plugin/_vendor/trie/LICENSE.md similarity index 100% rename from plugin/libs/trie/LICENSE.md rename to plugin/_vendor/trie/LICENSE.md diff --git a/plugin/libs/trie/__init__.py b/plugin/_vendor/trie/__init__.py similarity index 84% rename from plugin/libs/trie/__init__.py rename to plugin/_vendor/trie/__init__.py index 21bf1704..175be277 100644 --- a/plugin/libs/trie/__init__.py +++ b/plugin/_vendor/trie/__init__.py @@ -1,16 +1,17 @@ -from typing import Dict, Generator, Iterable +""" +A Trie/Prefix Tree is a kind of search tree used to provide quick lookup +of words/patterns in a set of words. A basic Trie however has O(n^2) space complexity +making it impractical in practice. It however provides O(max(search_string, length of +longest word)) lookup time making it an optimal approach when space is not an issue. +@see https://github.com/TheAlgorithms/Python/blob/master/data_structures/trie/trie.py +This file has been modified by @jfcherng to fit his own use. +""" -class TrieNode: - """ - A Trie/Prefix Tree is a kind of search tree used to provide quick lookup - of words/patterns in a set of words. A basic Trie however has O(n^2) space complexity - making it impractical in practice. It however provides O(max(search_string, length of - longest word)) lookup time making it an optimal approach when space is not an issue. +from typing import Dict, Generator, Iterable - This file has been modified by @jfcherng to fit his own use. - """ +class TrieNode: def __init__(self) -> None: self.nodes: Dict[str, TrieNode] = dict() # Mapping from char to TrieNode self.is_leaf = False diff --git a/plugin/_vendor/triegex/__init__.py b/plugin/_vendor/triegex/__init__.py new file mode 100644 index 00000000..0b628409 --- /dev/null +++ b/plugin/_vendor/triegex/__init__.py @@ -0,0 +1,146 @@ +import collections + +__all__ = ('Triegex',) + +OR = r'|' + +# regex below matches nothing https://stackoverflow.com/a/940840/2183102. 
"""
Build one regular expression that matches any word from a set of words.

Words are stored character by character in a trie of :class:`TriegexNode`
objects.  :meth:`Triegex.to_regex` renders that trie as nested non-capturing
groups, so a prefix shared by several words appears only once in the pattern.
"""

import collections.abc
from typing import Dict, Iterator, List, Optional, Tuple

__all__ = ('Triegex',)

OR = r'|'

# regex below matches nothing https://stackoverflow.com/a/940840/2183102. We
# use '~' to ensure it comes last when lexicographically sorted:
# max(string.printable) is '~'
NOTHING = r'~^(?#match nothing)'
GROUP = r'(?:{0})'
WORD_BOUNDARY = r'\b'


class TriegexNode:
    """
    A single trie node: one character plus the nodes that may follow it.

    :param char: Text this node contributes to the pattern; ``None`` is
        stored as the empty string (used for the root).
    :param end: ``True`` when a stored word ends at this node.
    :param childrens: Initial child nodes, indexed by their own ``char``.
    """

    def __init__(self, char: Optional[str], end: bool, *childrens: 'TriegexNode') -> None:
        self.char = char if char is not None else ''
        self.end = end
        self.childrens: Dict[str, TriegexNode] = {child.char: child for child in childrens}

    def __iter__(self) -> Iterator['TriegexNode']:
        """Iterate over the child nodes, ordered by their ``char``."""
        return iter(sorted(self.childrens.values(), key=lambda child: child.char))

    def __len__(self) -> int:
        """Return the number of direct children."""
        return len(self.childrens)

    def __repr__(self) -> str:
        return f'TriegexNode({self.char!r}, {self.end!r})'

    def __contains__(self, key: str) -> bool:
        return key in self.childrens

    def __getitem__(self, key: str) -> 'TriegexNode':
        return self.childrens[key]

    def __delitem__(self, key: str) -> None:
        del self.childrens[key]

    def to_regex(self) -> str:
        """
        Render the subtree rooted at this node as a regex fragment.

        Each node contributes its ``char`` followed by its alternatives: the
        fragments of its children (sorted by ``char``) and, when a word ends
        here, a trailing ``\\b``.  Two or more alternatives are wrapped in a
        non-capturing group; a single one is appended directly.

        The walk is iterative, so long words cannot exhaust the recursion
        limit.
        """
        # Pre-order walk: every parent is recorded before its descendants,
        # so the reversed list visits all children before their parent.
        order: List[TriegexNode] = []
        pending: List[TriegexNode] = [self]
        while pending:
            node = pending.pop()
            order.append(node)
            pending.extend(node.childrens.values())

        rendered: Dict[int, str] = {}
        for node in reversed(order):
            alternatives = [rendered.pop(id(child)) for child in node]
            if node.end:
                # A word may end here even when longer words continue below.
                alternatives.append(WORD_BOUNDARY)

            if len(alternatives) > 1:
                suffix = GROUP.format(OR.join(alternatives))
            else:
                # zero or one alternative: plain concatenation, no nesting
                suffix = ''.join(alternatives)
            rendered[id(node)] = node.char + suffix

        return rendered[id(self)]


class Triegex(collections.abc.MutableSet):
    """
    A mutable set of words that can be exported as one regular expression.

    The root always owns a child holding :data:`NOTHING`, so an empty set
    still yields a valid pattern, one that matches nothing.
    """

    _root: Optional[TriegexNode] = None

    def __init__(self, *words: str) -> None:
        """
        Triegex constructor.

        :param words: Words to add to the set initially.
        """

        # make sure we match nothing when no words are added
        self._root = TriegexNode(None, False, TriegexNode(NOTHING, False))

        for word in words:
            self.add(word)

    def add(self, word: str) -> None:
        """
        Add ``word`` to the set; adding a word twice has no further effect.

        Existing nodes are reused and only flagged as word ends, so words
        that are prefixes of (or extend) already stored words do not
        disturb them.

        :raises IndexError: If ``word`` is empty.
        """
        if not word:
            # Same exception type the previous ``word[-1]`` lookup produced.
            raise IndexError('cannot add an empty word to a Triegex')

        current = self._root
        for letter in word:
            node = current.childrens.get(letter)
            if node is None:
                node = current.childrens[letter] = TriegexNode(letter, False)
            current = node
        # this will ensure that we correctly match the word boundary
        current.end = True

    def to_regex(self) -> str:
        r"""
        Produce regular expression that will match each word in the
        internal trie.

        >>> t = Triegex('foo', 'bar', 'baz')
        >>> t.to_regex()
        '(?:ba(?:r\\b|z\\b)|foo\\b|~^(?#match nothing))'
        >>> Triegex('foo', 'foobar').to_regex()
        '(?:foo(?:bar\\b|\\b)|~^(?#match nothing))'
        """
        return self._root.to_regex()

    def _traverse(self) -> Iterator[TriegexNode]:
        """Yield every node of the trie exactly once, starting at the root."""
        stack = [self._root]
        while stack:
            current = stack.pop()
            yield current
            stack.extend(current.childrens.values())

    def __iter__(self) -> Iterator[str]:
        """Yield each stored word once, in lexicographic order."""
        # Carry the accumulated prefix with each node instead of looking it
        # up by character, which clashes when branches share a letter.
        stack: List[Tuple[TriegexNode, str]] = [(self._root, '')]
        while stack:
            node, prefix = stack.pop()
            word = prefix + node.char
            if node.end:
                yield word
            # Push in reverse so the smallest child is popped first.
            stack.extend((child, word) for child in reversed(list(node)))

    def __len__(self) -> int:
        """Return the number of stored words."""
        return sum(1 for _ in self)

    def __contains__(self, word) -> bool:
        """Return ``True`` when exactly ``word`` (not just a prefix) is stored."""
        current = self._root
        for char in word:
            if char not in current:
                return False
            current = current[char]
        return current.end  # word has to end with the last char

    def discard(self, word: str) -> None:
        """
        Remove ``word`` if it is stored; otherwise do nothing.

        Only nodes that no longer lead to any word are pruned, so longer
        words and shorter prefix words stay intact.
        """
        path = [self._root]
        for char in word:
            current = path[-1]
            if char not in current:
                return
            path.append(current[char])

        if not path[-1].end:
            return
        path[-1].end = False

        # Walk back towards the root, dropping nodes that became dead ends.
        while len(path) > 1:
            node = path.pop()
            if node.end or len(node):
                break
            del path[-1][node.char]
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/plugin/libs/triegex/__init__.py b/plugin/libs/triegex/__init__.py deleted file mode 100644 index 526c8a15..00000000 --- a/plugin/libs/triegex/__init__.py +++ /dev/null @@ -1,170 +0,0 @@ -# @see https://github.com/ZhukovAlexander/triegex -import collections - -__all__ = ("Triegex",) - -OR = r"|" - -# regex below matches nothing https://stackoverflow.com/a/940840/2183102. We -# use '~' to ensure it comes last when lexicographically sorted: -# max(string.printable) is '~' -NOTHING = r"~^(?#match nothing)" -GROUP = r"(?:{0})" -WORD_BOUNDARY = r"\b" - - -class TriegexNode: - def __init__(self, char: str, end: bool, *children): - self.char = char if char is not None else "" - self.end = end - self.children = {child.char: child for child in children} - - def __iter__(self): - return iter(sorted(self.children.values(), key=lambda x: x.char)) - - def __len__(self): - return len(self.children) - - def __repr__(self): - return "".format(self) - - def __contains__(self, key): - return key in self.children - - def __getitem__(self, key): - return self.children[key] - - def __delitem__(self, key): - del self.children[key] - - def to_regex(self) -> str: - """ - RECURSIVE IMPLEMENTATION FOR REFERENCE - suffixes = [v.to_regex() for k, v in self.children.items()] - if self.end: - suffixes += [WORD_BOUNDARY] - - if len(suffixes) > 1: - return self.char + GROUP.format(OR.join(suffixes)) - elif len(suffixes) == 1: - return self.char + suffixes[0] - else: - return self.char - """ - - stack = [self] - # marks 
starting indices of children of a node - lookup = [] - - # Creates an ordered list of nodes starting with root and ending with leaves by using BFS - i = 0 - j = 1 - while i < len(stack): - stack.extend(sorted(stack[i].children.values(), key=lambda node: node.char)) - lookup.append(j) - j += len(stack[i].children) - i += 1 - - i = len(stack) - # temp value array - sub_regexes = [None] * i - while i > 0: - # We start with leaves and end at root thus we decrement - i -= 1 - node = stack[i] - # Get regexes of child nodes and make a root regex - suffixes = [sub_regexes[child] for child in range(lookup[i], lookup[i] + len(node.children))] - if node.end: - # if the node is an ending node we add a \b character - suffixes += [WORD_BOUNDARY] - # If we arrive at the root node we have to add the NOTHING expression - if i == 0: - suffixes += [NOTHING] - if len(suffixes) > 1: - sub_regexes[i] = node.char + GROUP.format(OR.join(suffixes)) - elif len(suffixes) == 1: - sub_regexes[i] = node.char + suffixes[0] - else: - sub_regexes[i] = node.char - # return the top Regex - return sub_regexes[0] - - -class Triegex(collections.MutableSet): - def __init__(self, *words): - """ - Trigex constructor. - """ - - self._root = TriegexNode(None, False) - - for word in words: - self.add(word) - - def add(self, word: str): - current = self._root - for letter in word[:-1]: - if letter in current.children: - current = current.children[letter] - else: - current = current.children.setdefault(letter, TriegexNode(letter, False)) - # this will ensure that we correctly match the word boundary - if word[-1] in current.children: - current.children[word[-1]].end = True - else: - current.children[word[-1]] = TriegexNode(word[-1], True) - - def to_regex(self): - r""" - Produce regular expression that will match each word in the - internal trie. 
- - >>> t = Triegex('foo', 'bar', 'baz') - >>> t.to_regex() - '(?:ba(?:r\\b|z\\b)|foo\\b|~^(?#match nothing))' - """ - return self._root.to_regex() - - def _traverse(self): - stack = [self._root] - current = self._root - while stack: - yield current - current = stack.pop() - stack.extend(current.children.values()) - - def __iter__(self): - paths = {self._root.char: []} - for node in self._traverse(): - for child in node: - paths[child.char] = [node.char] + paths[node.char] - if child.end: - char = child.char - yield "".join(reversed([char] + paths[char])) - - def __len__(self): - return sum(1 for _ in self.__iter__()) - - def __contains__(self, word): - current = self._root - for char in word: - if char not in current: - return False - current = current[char] - return True and current.end # word has to end with the last char - - def discard(self, word): - to_delete = [self._root] - current = self._root - for char in word: - if char not in current: - return - current = current[char] - to_delete.append(current) - if not to_delete[-1].end: - return - while len(to_delete) > 1: - node = to_delete.pop() - if len(node) == 0: - del to_delete[-1][node.char] - return diff --git a/plugin/utils.py b/plugin/utils.py index 014f5e21..b16fad8e 100644 --- a/plugin/utils.py +++ b/plugin/utils.py @@ -17,8 +17,9 @@ import sublime from more_itertools import first_true, unique_everseen +from ._vendor.trie import TrieNode +from ._vendor.triegex import Triegex from .cache import clearable_lru_cache -from .libs.trie import TrieNode from .types import SyntaxLike _T = TypeVar("_T") @@ -81,8 +82,6 @@ def merge_literals_to_regex(literals: Iterable[str]) -> str: The returned regex is enclosed as `(?:...)`. 
""" - from .libs.triegex import Triegex - # this regex is enclosed by "(?:)" return Triegex(*map(re.escape, literals)).to_regex().replace(R"\b", "").replace(r"|~^(?#match nothing)", "") diff --git a/pyproject.toml b/pyproject.toml index 137f31d3..471444b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ mypy_path = 'typings:stubs' python_version = '3.8' [[tool.mypy.overrides]] -module = ["plugin.libs.*"] +module = ["plugin._vendor.*"] ignore_errors = true ignore_missing_imports = true @@ -20,7 +20,7 @@ exclude = [ '**/br-*/', '**/branch-*/', ] -ignore = ['**/.venv', '**/libs'] +ignore = ['**/.venv', '**/_vendor'] stubPath = 'typings' pythonVersion = '3.8' @@ -34,11 +34,11 @@ exclude = [ ".mypy_cache", ".venv", ".venv-*", + "_vendor", "branch-*", "stubs", "tests/files", "typings", - "vendor", "venv", "venv-*", ] diff --git a/vendorize.toml b/vendorize.toml new file mode 100644 index 00000000..7f97fdc8 --- /dev/null +++ b/vendorize.toml @@ -0,0 +1,4 @@ +target = "plugin/_vendor" +packages = [ + "triegex", +]