Skip to content

Commit

Permalink
fix(tokens): incorrect caching of async tokenizer (#47)
Browse files Browse the repository at this point in the history
The async tokenizer would crash if it was called multiple times in the same process.
  • Loading branch information
RobertCraigie authored Jun 29, 2023
1 parent 31c7256 commit 089a3f4
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 11 deletions.
32 changes: 26 additions & 6 deletions examples/tokens.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,32 @@
#!/usr/bin/env poetry run python

from anthropic import Anthropic
import asyncio

client = Anthropic()
from anthropic import Anthropic, AsyncAnthropic

text = "hello world!"

tokens = client.count_tokens(text)
print(f"'{text}' is {tokens} tokens")
def sync_tokens() -> None:
    """Count tokens with the synchronous client and print the result."""
    client = Anthropic()

    text = "hello world!"

    tokens = client.count_tokens(text)
    print(f"'{text}' is {tokens} tokens")

    # "hello world!" is expected to tokenize to exactly 3 tokens.
    assert tokens == 3


async def async_tokens() -> None:
    """Count tokens with the asynchronous client.

    Calls count_tokens() twice on purpose: the async tokenizer used to crash
    when invoked more than once in the same process, so the second call is
    the regression check.
    """
    anthropic = AsyncAnthropic()

    # Fixed typo: was "fist message".
    text = "first message"
    tokens = await anthropic.count_tokens(text)
    print(f"'{text}' is {tokens} tokens")

    text = "second message"
    tokens = await anthropic.count_tokens(text)
    print(f"'{text}' is {tokens} tokens")


# Run the sync example first, then drive the async example to completion.
# The async run exercises the tokenizer twice in one process (regression
# check for the async-tokenizer caching crash this example was written for).
sync_tokens()
asyncio.run(async_tokens())
26 changes: 21 additions & 5 deletions src/anthropic/_tokenizers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from __future__ import annotations

from typing import cast
from pathlib import Path
from functools import lru_cache

from anyio import Path as AsyncPath

Expand All @@ -13,15 +15,29 @@ def _get_tokenizer_cache_path() -> Path:
return Path(__file__).parent / "tokenizer.json"


@lru_cache(maxsize=None)
# Process-wide cache of the parsed tokenizer. Populated lazily by
# _load_tokenizer() and read by both the sync and async accessors below;
# a plain module global is used instead of lru_cache because lru_cache
# cannot safely wrap an async function.
_tokenizer: Tokenizer | None = None


def _load_tokenizer(raw: str) -> Tokenizer:
    """Parse *raw* tokenizer JSON, store it in the module cache, and return it."""
    global _tokenizer

    # cast() because Tokenizer.from_str lacks a precise return annotation.
    _tokenizer = cast(Tokenizer, Tokenizer.from_str(raw))
    return _tokenizer


def sync_get_tokenizer() -> Tokenizer:
    """Return the shared Tokenizer, reading tokenizer.json on first use.

    Subsequent calls return the module-level cached instance set by
    _load_tokenizer().
    """
    if _tokenizer is not None:
        return _tokenizer

    tokenizer_path = _get_tokenizer_cache_path()
    text = tokenizer_path.read_text()
    # Removed stale duplicate `return Tokenizer.from_str(text)` that made this
    # line unreachable and bypassed the module-level cache.
    return _load_tokenizer(text)


async def async_get_tokenizer() -> Tokenizer:
    """Async variant of sync_get_tokenizer(); reads the file via anyio.

    Deliberately NOT wrapped in @lru_cache: caching an async function caches
    the coroutine object itself, which can only be awaited once — a second
    call would crash with "cannot reuse already awaited coroutine". Caching
    is done through the module-level _tokenizer instead.
    """
    if _tokenizer is not None:
        return _tokenizer

    tokenizer_path = AsyncPath(_get_tokenizer_cache_path())
    text = await tokenizer_path.read_text()
    # Removed stale duplicate `return Tokenizer.from_str(text)` that made this
    # line unreachable and bypassed the module-level cache.
    return _load_tokenizer(text)

0 comments on commit 089a3f4

Please sign in to comment.