from __future__ import annotations
from typing import TYPE_CHECKING, cast
from pathlib import Path
from anyio import Path as AsyncPath
# tokenizers is untyped, https://github.com/huggingface/tokenizers/issues/811
# note: this comment affects the entire file
# pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false
if TYPE_CHECKING:
# we only import this at the type-level as deferring the import
# avoids issues like this: https://github.com/anthropics/anthropic-sdk-python/issues/280
from tokenizers import Tokenizer as TokenizerType # type: ignore[import]
else:
TokenizerType = None
def _get_tokenizer_cache_path() -> Path:
return Path(__file__).parent / "tokenizer.json"
# Module-level cache: holds the parsed tokenizer after the first load so the
# (relatively expensive) JSON parse in `_load_tokenizer` happens at most once.
_tokenizer: TokenizerType | None = None
def _load_tokenizer(raw: str) -> TokenizerType:
    """Parse *raw* (a serialized tokenizer definition) and store it in the
    module-level cache, returning the parsed tokenizer.

    The ``tokenizers`` import is deferred to call time (see the note on the
    TYPE_CHECKING guard above) so merely importing this module stays cheap.
    """
    global _tokenizer

    from tokenizers import Tokenizer

    tokenizer = cast(TokenizerType, Tokenizer.from_str(raw))
    _tokenizer = tokenizer
    return tokenizer
def sync_get_tokenizer() -> TokenizerType:
    """Return the tokenizer, reading and parsing it from disk on first use.

    Subsequent calls return the cached instance without touching the
    filesystem.
    """
    cached = _tokenizer
    if cached is not None:
        return cached

    raw = _get_tokenizer_cache_path().read_text(encoding="utf-8")
    return _load_tokenizer(raw)
async def async_get_tokenizer() -> TokenizerType:
    """Asynchronous variant of :func:`sync_get_tokenizer`.

    Uses :class:`anyio.Path` so the file read does not block the event loop;
    subsequent calls return the cached instance without touching the
    filesystem.
    """
    cached = _tokenizer
    if cached is not None:
        return cached

    async_path = AsyncPath(_get_tokenizer_cache_path())
    raw = await async_path.read_text(encoding="utf-8")
    return _load_tokenizer(raw)