Skip to content

Commit

Permalink
Merge pull request #295 from cvzi/lrucache
Browse files Browse the repository at this point in the history
Lrucache
  • Loading branch information
TahirJalilov committed May 18, 2024
2 parents fd4230c + 39f00a8 commit 77624cd
Show file tree
Hide file tree
Showing 14 changed files with 196 additions and 131 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/pythonTests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ on:
- 'gh-pages'
jobs:
pytest:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
strategy:
max-parallel: 8
matrix:
python-version: [3.7, 3.8, 3.9, "3.10", "3.11", "3.12", "3.13-dev"]
python-version: [3.7, 3.8, 3.9, "3.10", "pypy-3.10", "3.11", "3.12", "3.13-dev"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
Expand All @@ -26,6 +26,9 @@ jobs:
- name: Test with pytest
run: |
pytest
- name: Test with pytest (random test order)
run: |
pytest --shuffle
lint:
runs-on: ubuntu-latest
Expand Down
21 changes: 8 additions & 13 deletions emoji/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,20 +120,16 @@ def emojize(
"""

if language == 'alias':
language_pack = unicode_codes.get_aliases_unicode_dict()
else:
language_pack = unicode_codes.get_emoji_unicode_dict(language)

pattern = re.compile('(%s[%s]+%s)' %
(re.escape(delimiters[0]), _EMOJI_NAME_PATTERN, re.escape(delimiters[1])))

def replace(match: Match[str]) -> str:
name = match.group(1)[len(delimiters[0]):-len(delimiters[1])]
emj = language_pack.get(
emj = unicode_codes.get_emoji_by_name(
_DEFAULT_DELIMITER +
unicodedata.normalize('NFKC', name) +
_DEFAULT_DELIMITER)
_DEFAULT_DELIMITER, language)

if emj is None:
return match.group(1)

Expand Down Expand Up @@ -360,11 +356,10 @@ def version(string: str) -> float:
if string in unicode_codes.EMOJI_DATA:
return unicode_codes.EMOJI_DATA[string]['E']

language_pack = unicode_codes.get_emoji_unicode_dict('en')
if string in language_pack:
emj_code = language_pack[string]
if emj_code in unicode_codes.EMOJI_DATA:
return unicode_codes.EMOJI_DATA[emj_code]['E']
# Try name lookup
emj_code = unicode_codes.get_emoji_by_name(string, 'en')
if emj_code and emj_code in unicode_codes.EMOJI_DATA:
return unicode_codes.EMOJI_DATA[emj_code]['E']

# Try to find first emoji in string
version: List[float] = []
Expand All @@ -378,7 +373,7 @@ def f(e: str, emoji_data: Dict[str, Any]) -> str:
emojize(string, language='alias', version=-1, handle_version=f)
if version:
return version[0]
for lang_code in unicode_codes._EMOJI_UNICODE: # type: ignore
for lang_code in unicode_codes.LANGUAGES:
emojize(string, language=lang_code, version=-1, handle_version=f)
if version:
return version[0]
Expand Down
15 changes: 6 additions & 9 deletions emoji/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
Components for detecting and tokenizing emoji in strings.
"""
from typing import List, NamedTuple, Dict, Optional, Union, Iterator, Any
from typing import List, NamedTuple, Dict, Union, Iterator, Any
from emoji import unicode_codes


Expand All @@ -15,7 +15,7 @@
]

_ZWJ = '\u200D'
_SEARCH_TREE: Optional[Dict[str, Any]] = None
_SEARCH_TREE: Dict[str, Any] = {}


class EmojiMatch:
Expand Down Expand Up @@ -273,16 +273,15 @@ def filter_tokens(matches: Iterator[Token], emoji_only: bool, join_emoji: bool)
elif isinstance(token.value, EmojiMatch):
if pre_previous_is_emoji and previous_is_zwj:
if isinstance(accumulator[-1].value, EmojiMatchZWJNonRGI):
accumulator[-1].value._add(token.value) # type: ignore
accumulator[-1].value._add(token.value) # pyright: ignore [reportPrivateUsage]
accumulator[-1] = Token(accumulator[-1].chars +
_ZWJ + token.chars, accumulator[-1].value)
else:
prev = accumulator.pop()
assert isinstance(prev.value, EmojiMatch)
accumulator.append(
Token(prev.chars + _ZWJ + token.chars,
EmojiMatchZWJNonRGI(
prev.value, # type: ignore
token.value)))
EmojiMatchZWJNonRGI(prev.value, token.value)))
else:
accumulator.append(token)
previous_is_emoji = True
Expand Down Expand Up @@ -346,9 +345,7 @@ def get_search_tree() -> Dict[str, Any]:
"""
global _SEARCH_TREE
if _SEARCH_TREE is None:
_SEARCH_TREE = {} # type: ignore
if not _SEARCH_TREE:
for emj in unicode_codes.EMOJI_DATA:
sub_tree = _SEARCH_TREE
lastidx = len(emj) - 1
Expand Down
48 changes: 22 additions & 26 deletions emoji/unicode_codes/__init__.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,33 @@
from typing import Any, Dict
from emoji.unicode_codes.data_dict import *
from typing import Optional
from functools import lru_cache
from emoji.unicode_codes.data_dict import EMOJI_DATA, STATUS, LANGUAGES

__all__ = [
'get_emoji_unicode_dict', 'get_aliases_unicode_dict',
'get_emoji_by_name',
'EMOJI_DATA', 'STATUS', 'LANGUAGES'
]


_EMOJI_UNICODE: Dict[str, Any] = {lang: None for lang in LANGUAGES} # Cache for the language dicts
@lru_cache(maxsize=4000)
def get_emoji_by_name(name: str, language: str) -> Optional[str]:
"""
Find emoji by short-name in a specific language.
Returns None if not found
_ALIASES_UNICODE: Dict[str, str] = {} # Cache for the aliases dict
:param name: emoji short code e.g. ":banana:"
:param language: language-code e.g. 'es', 'de', etc. or 'alias'
"""

fully_qualified = STATUS['fully_qualified']

def get_emoji_unicode_dict(lang: str) -> Dict[str, Any]:
"""Generate dict containing all fully-qualified and component emoji name for a language
The dict is only generated once per language and then cached in _EMOJI_UNICODE[lang]"""

if _EMOJI_UNICODE[lang] is None:
_EMOJI_UNICODE[lang] = {data[lang]: emj for emj, data in EMOJI_DATA.items()
if lang in data and data['status'] <= STATUS['fully_qualified']}

return _EMOJI_UNICODE[lang]


def get_aliases_unicode_dict() -> Dict[str, str]:
"""Generate dict containing all fully-qualified and component aliases
The dict is only generated once and then cached in _ALIASES_UNICODE"""

if not _ALIASES_UNICODE:
_ALIASES_UNICODE.update(get_emoji_unicode_dict('en'))
if language == 'alias':
for emj, data in EMOJI_DATA.items():
if 'alias' in data and data['status'] <= STATUS['fully_qualified']:
for alias in data['alias']:
_ALIASES_UNICODE[alias] = emj
if name in data.get('alias', []) and data['status'] <= fully_qualified:
return emj
language = 'en'

for emj, data in EMOJI_DATA.items():
if data.get(language) == name and data['status'] <= fully_qualified:
return emj

return _ALIASES_UNICODE
return None
13 changes: 9 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ repository = "https://github.com/carpedm20/emoji/"

[project.optional-dependencies]
dev = [
"pytest",
"pytest>=7.4.4",
"coverage",
"coveralls",
]
Expand All @@ -57,15 +57,20 @@ emoji = ["py.typed"]
[tool.setuptools.dynamic]
version = {attr = "emoji.__version__"}

[tool.pytest.ini_options]
pythonpath = [".", "utils"]
testpaths = ["tests"]

[tool.pyright]
pythonVersion = "3.6"
pythonVersion = "3.7"
pythonPlatform = "All"
typeCheckingMode = "strict"

extraPaths = ["utils"]
exclude = [
"**/__pycache__",
".git",
".venv",
"build",
"utils",
"utils/gh-pages",
"utils/get_codes_from_unicode_emoji_data_files.py",
]
4 changes: 0 additions & 4 deletions tests/__init__.py

This file was deleted.

23 changes: 23 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import List
import random
import pytest
import functools


def pytest_sessionstart():
    """Swap the bounded LRU cache on emoji name lookup for an unbounded one.

    During the test suite every name may be looked up many times; an
    unlimited cache avoids evictions (and therefore cache misses) skewing
    the tests.
    """
    import emoji.unicode_codes
    # __wrapped__ is the undecorated lookup function; re-wrap it without a size limit.
    uncached = emoji.unicode_codes.get_emoji_by_name.__wrapped__
    emoji.unicode_codes.get_emoji_by_name = functools.lru_cache(maxsize=None)(uncached)


def pytest_addoption(parser: pytest.Parser):
    """Register the ``--shuffle`` command line flag with pytest."""
    parser.addoption(
        "--shuffle",
        dest="shuffle",
        action='store_true',
        default=False,
        help="Run tests in random order",
    )


def pytest_collection_modifyitems(session: pytest.Session, items: List[pytest.Item]):
    """Randomize test order in place when ``--shuffle`` was requested."""
    if not session.config.getoption("shuffle"):
        return
    print("")
    print("Shuffling items for a random test order")
    random.shuffle(items)
51 changes: 15 additions & 36 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,9 @@
import re
from typing import Any, Callable, Dict, List, Tuple, Union
from typing_extensions import Literal
import emoji.unicode_codes
import pytest
import unicodedata

_NormalizationForm = Literal['NFC', 'NFD', 'NFKC', 'NFKD']

# Build all language packs (i.e. fill the cache):
emoji.emojize("", language="alias")
for lang_code in emoji.LANGUAGES:
emoji.emojize("", language=lang_code)


def ascii(s: str) -> str:
# return escaped Code points \U000AB123
return s.encode("unicode-escape").decode()


def all_language_and_alias_packs():
yield ('alias', emoji.unicode_codes.get_aliases_unicode_dict())

for lang_code in emoji.LANGUAGES:
yield (lang_code, emoji.unicode_codes.get_emoji_unicode_dict(lang_code))


def normalize(form: _NormalizationForm, s: str) -> str:
return unicodedata.normalize(form, s)
import emoji.unicode_codes
from testutils import ascii, normalize, all_language_packs, all_language_and_alias_packs, get_emoji_unicode_dict


def test_emojize_name_only():
Expand Down Expand Up @@ -116,13 +93,13 @@ def test_emojize_complicated_string():


def test_emojize_languages():
for lang_code, emoji_pack in emoji.unicode_codes._EMOJI_UNICODE.items(): # pyright: ignore [reportPrivateUsage]
for lang_code, emoji_pack in all_language_packs():
for name, emj in emoji_pack.items():
assert emoji.emojize(name, language=lang_code) == emj


def test_demojize_languages():
for lang_code, emoji_pack in emoji.unicode_codes._EMOJI_UNICODE.items(): # pyright: ignore [reportPrivateUsage]
for lang_code, emoji_pack in all_language_packs():
for name, emj in emoji_pack.items():
assert emoji.demojize(emj, language=lang_code) == name

Expand All @@ -131,31 +108,33 @@ def test_emojize_variant():
def remove_variant(s: str) -> str:
return re.sub('[\ufe0e\ufe0f]$', '', s)

english_pack = get_emoji_unicode_dict('en')

assert emoji.emojize(
':Taurus:', variant=None) == emoji.unicode_codes._EMOJI_UNICODE['en'][':Taurus:'] # pyright: ignore [reportPrivateUsage]
':Taurus:', variant=None) == english_pack[':Taurus:']
assert emoji.emojize(':Taurus:', variant=None) == emoji.emojize(':Taurus:')
assert emoji.emojize(':Taurus:', variant='text_type') == remove_variant(
emoji.unicode_codes._EMOJI_UNICODE['en'][':Taurus:']) + '\ufe0e' # pyright: ignore [reportPrivateUsage]
english_pack[':Taurus:']) + '\ufe0e'
assert emoji.emojize(':Taurus:', variant='emoji_type') == remove_variant(
emoji.unicode_codes._EMOJI_UNICODE['en'][':Taurus:']) + '\ufe0f' # pyright: ignore [reportPrivateUsage]
english_pack[':Taurus:']) + '\ufe0f'

assert emoji.emojize(
':admission_tickets:', variant=None) == emoji.unicode_codes._EMOJI_UNICODE['en'][':admission_tickets:'] # pyright: ignore [reportPrivateUsage]
':admission_tickets:', variant=None) == english_pack[':admission_tickets:']
assert emoji.emojize(':admission_tickets:', variant=None) == emoji.emojize(
':admission_tickets:')
assert emoji.emojize(':admission_tickets:', variant='text_type') == remove_variant(
emoji.unicode_codes._EMOJI_UNICODE['en'][':admission_tickets:']) + '\ufe0e' # pyright: ignore [reportPrivateUsage]
english_pack[':admission_tickets:']) + '\ufe0e'
assert emoji.emojize(':admission_tickets:', variant='emoji_type') == remove_variant(
emoji.unicode_codes._EMOJI_UNICODE['en'][':admission_tickets:']) + '\ufe0f' # pyright: ignore [reportPrivateUsage]
english_pack[':admission_tickets:']) + '\ufe0f'

with pytest.raises(ValueError):
emoji.emojize(':admission_tickets:', variant=False) # pyright: ignore [reportArgumentType]
emoji.emojize(':admission_tickets:', variant=False) # type: ignore[arg-type]

with pytest.raises(ValueError):
emoji.emojize(':admission_tickets:', variant=True) # pyright: ignore [reportArgumentType]
emoji.emojize(':admission_tickets:', variant=True) # type: ignore[arg-type]

with pytest.raises(ValueError):
emoji.emojize(':admission_tickets:', variant='wrong') # pyright: ignore [reportArgumentType]
emoji.emojize(':admission_tickets:', variant='wrong') # type: ignore[arg-type]

assert emoji.emojize(":football:") == ':football:'
assert emoji.emojize(":football:", variant="text_type") == ':football:'
Expand Down
6 changes: 3 additions & 3 deletions tests/test_dict.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
"""Unittests for the big dict of dicts containing all emoji"""


from typing import Set, Dict
import re
import emoji


def test_all_languages_list():
"""Compare all language keys in EMOJI_DATA with the emoji.LANGUAGES list"""

langs: set[str] = set()
langs: Set[str] = set()
for item in emoji.EMOJI_DATA.values():
langs.update(item.keys())
all_languages = {lang for lang in langs if len(lang) == 2 and lang.lower() == lang}
Expand All @@ -27,7 +27,7 @@ def test_emoji_versions():

def check_duplicate_names(lang: str):
"""Check that there are no duplicate names in the fully_qualified except for different variants"""
seen = {}
seen: Dict[str, int] = {}
for item in emoji.EMOJI_DATA.values():
if item["status"] > emoji.STATUS["fully_qualified"]:
continue
Expand Down
14 changes: 1 addition & 13 deletions tests/test_nfkc.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,7 @@
"""Unittests for canonically equivalent Unicode sequences"""

import sys
import unicodedata
import emoji
from typing_extensions import Literal


_NormalizationForm = Literal['NFC', 'NFD', 'NFKC', 'NFKD']


def is_normalized(form: _NormalizationForm, s: str) -> bool:
    """Return True when *s* is already in Unicode normalization form *form*.

    On Python 3.8+ this delegates to the C-level
    ``unicodedata.is_normalized``; older interpreters fall back to
    normalizing the string and comparing it with the original.
    """
    if sys.version_info < (3, 8):
        return unicodedata.normalize(form, s) == s
    return unicodedata.is_normalized(form, s)

from testutils import is_normalized

def test_database_normalized():
# Test if all names in EMOJI_DATA are in NFKC form
Expand Down
Loading

0 comments on commit 77624cd

Please sign in to comment.