from typing import List
import torch
from transformers import AutoTokenizer, AutoModel
from indexer.abstract_tokenizer import AbstractTokenizer


class HuggingFaceTokenizer(AbstractTokenizer):
"""
A tokenizer implementation using Hugging Face transformers library.
This class tokenizes text into token strings.
The model is loaded to allow for potential future extensions like embedding generation.
"""

    def __init__(self, model_name: str):
        """
        Initializes the tokenizer from Hugging Face.

        The model is also loaded to support potential future embedding functionality.

        Args:
            model_name: The name of the pre-trained model whose tokenizer to use
                (e.g., 'BAAI/bge-large-en-v1.5').
        """
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Load the model as well: embedding generation is a likely future need for a
        # class built around an embedding model such as bge-large-en-v1.5 (see the
        # illustrative embed() sketch below). If only string tokenization is ever
        # needed, self.model and self.device could be omitted for a lighter class.
try:
self.model = AutoModel.from_pretrained(model_name)
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.model.to(self.device)
self.model.eval() # Set the model to evaluation mode
except Exception as e:
print(f"Warning: Could not load the full model for {model_name}. "
f"Only string tokenization will be reliably available. Error: {e}")
self.model = None
self.device = None
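
    def embed(self, text: str) -> List[float]:
        """
        Minimal sketch of the embedding extension hinted at in __init__; this method
        is illustrative and not part of the original interface. It assumes CLS-token
        pooling followed by L2 normalization, the convention commonly used with
        BGE-style models; other models may call for mean pooling instead.
        """
        if self.model is None:
            raise RuntimeError(
                "Model weights were not loaded; only string tokenization is available."
            )
        # Encode the text and move the tensors to the same device as the model.
        inputs = self.tokenizer(text, return_tensors="pt", truncation=True).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Use the hidden state of the first ([CLS]) token as the sentence embedding.
        cls_embedding = outputs.last_hidden_state[:, 0]
        cls_embedding = torch.nn.functional.normalize(cls_embedding, p=2, dim=1)
        return cls_embedding[0].cpu().tolist()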

    def tokenize(self, text: str) -> List[str]:
"""
Tokenizes a given string into a list of token strings.
Args:
text: The input string to be tokenized.
Returns:
A list of strings, where each string is a token.
"""
return self.tokenizer.tokenize(text)

    def tokenize_multiple(self, text_list: List[str]) -> List[List[str]]:
"""
Tokenizes a list of strings into a list of lists of token strings.
Args:
text_list: The input list of strings.
Returns:
A list of lists of strings, where each inner list contains the tokens
for the corresponding input string.
"""
if not text_list:
return []
# Uses the tokenizer's `tokenize` method for each string in the list.
# This method typically performs subword tokenization (e.g., WordPiece, BPE, SentencePiece).
return [self.tokenizer.tokenize(text) for text in text_list]
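

# Illustrative usage sketch (not part of the original module). Assumes the
# 'BAAI/bge-large-en-v1.5' checkpoint from the docstring example can be downloaded.
if __name__ == "__main__":
    tokenizer = HuggingFaceTokenizer("BAAI/bge-large-en-v1.5")
    print(tokenizer.tokenize("Dense retrieval pairs queries with relevant passages."))
    print(tokenizer.tokenize_multiple(["first document", "second document"]))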