Skip to content

Commit

Permalink
Merge pull request #295 from cvzi/lrucache
Browse files Browse the repository at this point in the history
Lrucache
  • Loading branch information
TahirJalilov committed May 18, 2024
2 parents fd4230c + 39f00a8 commit 77624cd
Show file tree
Hide file tree
Showing 14 changed files with 196 additions and 131 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/pythonTests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ on:
- 'gh-pages'
jobs:
pytest:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
strategy:
max-parallel: 8
matrix:
python-version: [3.7, 3.8, 3.9, "3.10", "3.11", "3.12", "3.13-dev"]
python-version: [3.7, 3.8, 3.9, "3.10", "pypy-3.10", "3.11", "3.12", "3.13-dev"]
steps:
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
Expand All @@ -26,6 +26,9 @@ jobs:
- name: Test with pytest
run: |
pytest
- name: Test with pytest (random test order)
run: |
pytest --shuffle
lint:
runs-on: ubuntu-latest
Expand Down
21 changes: 8 additions & 13 deletions emoji/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,20 +120,16 @@ def emojize(
"""

if language == 'alias':
language_pack = unicode_codes.get_aliases_unicode_dict()
else:
language_pack = unicode_codes.get_emoji_unicode_dict(language)

pattern = re.compile('(%s[%s]+%s)' %
(re.escape(delimiters[0]), _EMOJI_NAME_PATTERN, re.escape(delimiters[1])))

def replace(match: Match[str]) -> str:
name = match.group(1)[len(delimiters[0]):-len(delimiters[1])]
emj = language_pack.get(
emj = unicode_codes.get_emoji_by_name(
_DEFAULT_DELIMITER +
unicodedata.normalize('NFKC', name) +
_DEFAULT_DELIMITER)
_DEFAULT_DELIMITER, language)

if emj is None:
return match.group(1)

Expand Down Expand Up @@ -360,11 +356,10 @@ def version(string: str) -> float:
if string in unicode_codes.EMOJI_DATA:
return unicode_codes.EMOJI_DATA[string]['E']

language_pack = unicode_codes.get_emoji_unicode_dict('en')
if string in language_pack:
emj_code = language_pack[string]
if emj_code in unicode_codes.EMOJI_DATA:
return unicode_codes.EMOJI_DATA[emj_code]['E']
# Try name lookup
emj_code = unicode_codes.get_emoji_by_name(string, 'en')
if emj_code and emj_code in unicode_codes.EMOJI_DATA:
return unicode_codes.EMOJI_DATA[emj_code]['E']

# Try to find first emoji in string
version: List[float] = []
Expand All @@ -378,7 +373,7 @@ def f(e: str, emoji_data: Dict[str, Any]) -> str:
emojize(string, language='alias', version=-1, handle_version=f)
if version:
return version[0]
for lang_code in unicode_codes._EMOJI_UNICODE: # type: ignore
for lang_code in unicode_codes.LANGUAGES:
emojize(string, language=lang_code, version=-1, handle_version=f)
if version:
return version[0]
Expand Down
15 changes: 6 additions & 9 deletions emoji/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
Components for detecting and tokenizing emoji in strings.
"""
from typing import List, NamedTuple, Dict, Optional, Union, Iterator, Any
from typing import List, NamedTuple, Dict, Union, Iterator, Any
from emoji import unicode_codes


Expand All @@ -15,7 +15,7 @@
]

_ZWJ = '\u200D'
_SEARCH_TREE: Optional[Dict[str, Any]] = None
_SEARCH_TREE: Dict[str, Any] = {}


class EmojiMatch:
Expand Down Expand Up @@ -273,16 +273,15 @@ def filter_tokens(matches: Iterator[Token], emoji_only: bool, join_emoji: bool)
elif isinstance(token.value, EmojiMatch):
if pre_previous_is_emoji and previous_is_zwj:
if isinstance(accumulator[-1].value, EmojiMatchZWJNonRGI):
accumulator[-1].value._add(token.value) # type: ignore
accumulator[-1].value._add(token.value) # pyright: ignore [reportPrivateUsage]
accumulator[-1] = Token(accumulator[-1].chars +
_ZWJ + token.chars, accumulator[-1].value)
else:
prev = accumulator.pop()
assert isinstance(prev.value, EmojiMatch)
accumulator.append(
Token(prev.chars + _ZWJ + token.chars,
EmojiMatchZWJNonRGI(
prev.value, # type: ignore
token.value)))
EmojiMatchZWJNonRGI(prev.value, token.value)))
else:
accumulator.append(token)
previous_is_emoji = True
Expand Down Expand Up @@ -346,9 +345,7 @@ def get_search_tree() -> Dict[str, Any]:
"""
global _SEARCH_TREE
if _SEARCH_TREE is None:
_SEARCH_TREE = {} # type: ignore
if not _SEARCH_TREE:
for emj in unicode_codes.EMOJI_DATA:
sub_tree = _SEARCH_TREE
lastidx = len(emj) - 1
Expand Down
48 changes: 22 additions & 26 deletions emoji/unicode_codes/__init__.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,33 @@
from typing import Any, Dict
from emoji.unicode_codes.data_dict import *
from typing import Optional
from functools import lru_cache
from emoji.unicode_codes.data_dict import EMOJI_DATA, STATUS, LANGUAGES

__all__ = [
'get_emoji_unicode_dict', 'get_aliases_unicode_dict',
'get_emoji_by_name',
'EMOJI_DATA', 'STATUS', 'LANGUAGES'
]


_EMOJI_UNICODE: Dict[str, Any] = {lang: None for lang in LANGUAGES} # Cache for the language dicts
@lru_cache(maxsize=4000)
def get_emoji_by_name(name: str, language: str) -> Optional[str]:
"""
Find emoji by short-name in a specific language.
Returns None if not found
_ALIASES_UNICODE: Dict[str, str] = {} # Cache for the aliases dict
:param name: emoji short code e.g. ":banana:"
:param language: language-code e.g. 'es', 'de', etc. or 'alias'
"""

fully_qualified = STATUS['fully_qualified']

def get_emoji_unicode_dict(lang: str) -> Dict[str, Any]:
"""Generate dict containing all fully-qualified and component emoji name for a language
The dict is only generated once per language and then cached in _EMOJI_UNICODE[lang]"""

if _EMOJI_UNICODE[lang] is None:
_EMOJI_UNICODE[lang] = {data[lang]: emj for emj, data in EMOJI_DATA.items()
if lang in data and data['status'] <= STATUS['fully_qualified']}

return _EMOJI_UNICODE[lang]


def get_aliases_unicode_dict() -> Dict[str, str]:
"""Generate dict containing all fully-qualified and component aliases
The dict is only generated once and then cached in _ALIASES_UNICODE"""

if not _ALIASES_UNICODE:
_ALIASES_UNICODE.update(get_emoji_unicode_dict('en'))
if language == 'alias':
for emj, data in EMOJI_DATA.items():
if 'alias' in data and data['status'] <= STATUS['fully_qualified']:
for alias in data['alias']:
_ALIASES_UNICODE[alias] = emj
if name in data.get('alias', []) and data['status'] <= fully_qualified:
return emj
language = 'en'

for emj, data in EMOJI_DATA.items():
if data.get(language) == name and data['status'] <= fully_qualified:
return emj

return _ALIASES_UNICODE
return None
13 changes: 9 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ repository = "https://github.com/carpedm20/emoji/"

[project.optional-dependencies]
dev = [
"pytest",
"pytest>=7.4.4",
"coverage",
"coveralls",
]
Expand All @@ -57,15 +57,20 @@ emoji = ["py.typed"]
[tool.setuptools.dynamic]
version = {attr = "emoji.__version__"}

[tool.pytest.ini_options]
pythonpath = [".", "utils"]
testpaths = ["tests"]

[tool.pyright]
pythonVersion = "3.6"
pythonVersion = "3.7"
pythonPlatform = "All"
typeCheckingMode = "strict"

extraPaths = ["utils"]
exclude = [
"**/__pycache__",
".git",
".venv",
"build",
"utils",
"utils/gh-pages",
"utils/get_codes_from_unicode_emoji_data_files.py",
]
4 changes: 0 additions & 4 deletions tests/__init__.py

This file was deleted.

23 changes: 23 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from typing import List
import random
import pytest
import functools


def pytest_sessionstart():
    """Swap the bounded LRU cache on emoji name lookup for an unbounded one.

    During the test suite every name may be looked up many times; an
    unlimited cache avoids evictions (and therefore cache misses) skewing
    the tests.
    """
    import emoji.unicode_codes
    # __wrapped__ is the undecorated lookup function; re-wrap it without a size limit.
    uncached = emoji.unicode_codes.get_emoji_by_name.__wrapped__
    emoji.unicode_codes.get_emoji_by_name = functools.lru_cache(maxsize=None)(uncached)


def pytest_addoption(parser: pytest.Parser):
    """Register the ``--shuffle`` command line flag with pytest."""
    parser.addoption(
        "--shuffle",
        dest="shuffle",
        action='store_true',
        default=False,
        help="Run tests in random order",
    )


def pytest_collection_modifyitems(session: pytest.Session, items: List[pytest.Item]):
    """Randomize test order in place when ``--shuffle`` was requested."""
    if not session.config.getoption("shuffle"):
        return
    print("")
    print("Shuffling items for a random test order")
    random.shuffle(items)
51 changes: 15 additions & 36 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,9 @@
import re
from typing import Any, Callable, Dict, List, Tuple, Union
from typing_extensions import Literal
import emoji.unicode_codes
import pytest
import unicodedata

_NormalizationForm = Literal['NFC', 'NFD', 'NFKC', 'NFKD']

# Build all language packs (i.e. fill the cache):
emoji.emojize("", language="alias")
for lang_code in emoji.LANGUAGES:
emoji.emojize("", language=lang_code)


def ascii(s: str) -> str:
# return escaped Code points \U000AB123
return s.encode("unicode-escape").decode()


def all_language_and_alias_packs():
yield ('alias', emoji.unicode_codes.get_aliases_unicode_dict())

for lang_code in emoji.LANGUAGES:
yield (lang_code, emoji.unicode_codes.get_emoji_unicode_dict(lang_code))


def normalize(form: _NormalizationForm, s: str) -> str:
return unicodedata.normalize(form, s)
import emoji.unicode_codes
from testutils import ascii, normalize, all_language_packs, all_language_and_alias_packs, get_emoji_unicode_dict


def test_emojize_name_only():
Expand Down Expand Up @@ -116,13 +93,13 @@ def test_emojize_complicated_string():


def test_emojize_languages():
for lang_code, emoji_pack in emoji.unicode_codes._EMOJI_UNICODE.items(): # pyright: ignore [reportPrivateUsage]
for lang_code, emoji_pack in all_language_packs():
for name, emj in emoji_pack.items():
assert emoji.emojize(name, language=lang_code) == emj


def test_demojize_languages():
for lang_code, emoji_pack in emoji.unicode_codes._EMOJI_UNICODE.items(): # pyright: ignore [reportPrivateUsage]
for lang_code, emoji_pack in all_language_packs():
for name, emj in emoji_pack.items():
assert emoji.demojize(emj, language=lang_code) == name

Expand All @@ -131,31 +108,33 @@ def test_emojize_variant():
def remove_variant(s: str) -> str:
return re.sub('[\ufe0e\ufe0f]$', '', s)

english_pack = get_emoji_unicode_dict('en')

assert emoji.emojize(
':Taurus:', variant=None) == emoji.unicode_codes._EMOJI_UNICODE['en'][':Taurus:'] # pyright: ignore [reportPrivateUsage]
':Taurus:', variant=None) == english_pack[':Taurus:']
assert emoji.emojize(':Taurus:', variant=None) == emoji.emojize(':Taurus:')
assert emoji.emojize(':Taurus:', variant='text_type') == remove_variant(
emoji.unicode_codes._EMOJI_UNICODE['en'][':Taurus:']) + '\ufe0e' # pyright: ignore [reportPrivateUsage]
english_pack[':Taurus:']) + '\ufe0e'
assert emoji.emojize(':Taurus:', variant='emoji_type') == remove_variant(
emoji.unicode_codes._EMOJI_UNICODE['en'][':Taurus:']) + '\ufe0f' # pyright: ignore [reportPrivateUsage]
english_pack[':Taurus:']) + '\ufe0f'

assert emoji.emojize(
':admission_tickets:', variant=None) == emoji.unicode_codes._EMOJI_UNICODE['en'][':admission_tickets:'] # pyright: ignore [reportPrivateUsage]
':admission_tickets:', variant=None) == english_pack[':admission_tickets:']
assert emoji.emojize(':admission_tickets:', variant=None) == emoji.emojize(
':admission_tickets:')
assert emoji.emojize(':admission_tickets:', variant='text_type') == remove_variant(
emoji.unicode_codes._EMOJI_UNICODE['en'][':admission_tickets:']) + '\ufe0e' # pyright: ignore [reportPrivateUsage]
english_pack[':admission_tickets:']) + '\ufe0e'
assert emoji.emojize(':admission_tickets:', variant='emoji_type') == remove_variant(
emoji.unicode_codes._EMOJI_UNICODE['en'][':admission_tickets:']) + '\ufe0f' # pyright: ignore [reportPrivateUsage]
english_pack[':admission_tickets:']) + '\ufe0f'

with pytest.raises(ValueError):
emoji.emojize(':admission_tickets:', variant=False) # pyright: ignore [reportArgumentType]
emoji.emojize(':admission_tickets:', variant=False) # type: ignore[arg-type]

with pytest.raises(ValueError):
emoji.emojize(':admission_tickets:', variant=True) # pyright: ignore [reportArgumentType]
emoji.emojize(':admission_tickets:', variant=True) # type: ignore[arg-type]

with pytest.raises(ValueError):
emoji.emojize(':admission_tickets:', variant='wrong') # pyright: ignore [reportArgumentType]
emoji.emojize(':admission_tickets:', variant='wrong') # type: ignore[arg-type]

assert emoji.emojize(":football:") == ':football:'
assert emoji.emojize(":football:", variant="text_type") == ':football:'
Expand Down
6 changes: 3 additions & 3 deletions tests/test_dict.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
"""Unittests for the big dict of dicts containing all emoji"""


from typing import Set, Dict
import re
import emoji


def test_all_languages_list():
"""Compare all language keys in EMOJI_DATA with the emoji.LANGUAGES list"""

langs: set[str] = set()
langs: Set[str] = set()
for item in emoji.EMOJI_DATA.values():
langs.update(item.keys())
all_languages = {lang for lang in langs if len(lang) == 2 and lang.lower() == lang}
Expand All @@ -27,7 +27,7 @@ def test_emoji_versions():

def check_duplicate_names(lang: str):
"""Check that there are no duplicate names in the fully_qualified except for different variants"""
seen = {}
seen: Dict[str, int] = {}
for item in emoji.EMOJI_DATA.values():
if item["status"] > emoji.STATUS["fully_qualified"]:
continue
Expand Down
14 changes: 1 addition & 13 deletions tests/test_nfkc.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,7 @@
"""Unittests for canonically equivalent Unicode sequences"""

import sys
import unicodedata
import emoji
from typing_extensions import Literal


_NormalizationForm = Literal['NFC', 'NFD', 'NFKC', 'NFKD']


def is_normalized(form: _NormalizationForm, s: str) -> bool:
    """Return True when *s* is already in Unicode normalization form *form*.

    On Python 3.8+ this delegates to the C-level
    ``unicodedata.is_normalized``; older interpreters fall back to
    normalizing the string and comparing it with the original.
    """
    if sys.version_info < (3, 8):
        return unicodedata.normalize(form, s) == s
    return unicodedata.is_normalized(form, s)

from testutils import is_normalized

def test_database_normalized():
# Test if all names in EMOJI_DATA are in NFKC form
Expand Down
Loading

0 comments on commit 77624cd

Please sign in to comment.