from typing import List

import tiktoken

from indexer import AbstractTokenizer
class TikTokenTokenizer(AbstractTokenizer):
    def __init__(self, model_name: str):
        # Resolve the BPE encoding for a model name (e.g. "gpt-4"); use
        # tiktoken.get_encoding for raw encoding names like "cl100k_base".
        self.enc = tiktoken.encoding_for_model(model_name)

    def tokenize(self, text: str) -> List[str]:
        # Encode the given text (not a hardcoded string) and return the
        # token IDs rendered as strings.
        return [str(x) for x in self.enc.encode(text)]
    def tokenize_multiple(self, text_list: List[str]) -> List[List[str]]:
        """
        Tokenizes a list of strings into a list of lists of token strings.

        Args:
            text_list: The input list of strings.

        Returns:
            A list of lists of strings, where each inner list contains the
            tokens for the corresponding input string.
        """
        if not text_list:
            return []
        # Applies `tokenize` to each string in the list; tiktoken performs
        # byte pair encoding (BPE) subword tokenization.
        return [self.tokenize(text) for text in text_list]
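

# A minimal usage sketch, not part of the original module: it assumes
# tiktoken is installed and that "gpt-4" is a model name recognized by
# tiktoken.encoding_for_model.
if __name__ == "__main__":
    tokenizer = TikTokenTokenizer("gpt-4")
    print(tokenizer.tokenize("hello world"))                 # token IDs as strings
    print(tokenizer.tokenize_multiple(["hello world", ""]))  # per-string token lists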