import unittest
from indexer.hf_tokenizer import HuggingFaceTokenizer


class TestHuggingFaceTokenizer(unittest.TestCase):
    def setUp(self):
        # This model's tokenizer will be used.
        # The full model is also loaded by default in HuggingFaceTokenizer.
        self.model_name = "BAAI/bge-large-en-v1.5"  # Using the specified model
        # self.model_name = "bert-base-uncased"  # A smaller alternative for faster tests if needed
        try:
            self.tokenizer_impl = HuggingFaceTokenizer(model_name=self.model_name)
        except Exception as e:
            self.fail(f"Failed to initialize HuggingFaceTokenizer: {e}")

    def test_tokenize_single_sentence(self):
        """Test tokenization of a single sentence into token strings."""
        text = "Hello, world!"
        # The BAAI/bge-large-en-v1.5 tokenizer is BERT WordPiece based: encoding adds
        # [CLS] and [SEP], but .tokenize() returns only the raw lowercased tokens,
        # possibly split into subword pieces such as ['hel', '##lo'].
        tokens = self.tokenizer_impl.tokenize(text)
        self.assertIsInstance(tokens, list)
        self.assertTrue(all(isinstance(token, str) for token in tokens))
        # Expected raw tokens for "Hello, world!" with this uncased BERT-style tokenizer.
        self.assertEqual(tokens, ['hello', ',', 'world', '!'])
        print(f"\nTest 'tokenize' output for '{text}': {tokens}")

    def test_tokenize_empty_string(self):
        """Test tokenization of an empty string."""
        text = ""
        tokens = self.tokenizer_impl.tokenize(text)
        self.assertIsInstance(tokens, list)
        self.assertEqual(len(tokens), 0)
        print(f"\nTest 'tokenize' output for empty string: {tokens}")

    def test_tokenize_multiple_sentences(self):
        """Test tokenizing multiple sentences into lists of token strings."""
        text_list = [
            "This is the first sentence.",
            "Another one for testing."
        ]
        tokenized_list = self.tokenizer_impl.tokenize_multiple(text_list)
        self.assertIsInstance(tokenized_list, list)
        self.assertEqual(len(tokenized_list), len(text_list))
        expected_tokens_sentence1 = ['this', 'is', 'the', 'first', 'sentence', '.']
        expected_tokens_sentence2 = ['another', 'one', 'for', 'testing', '.']
        self.assertEqual(tokenized_list[0], expected_tokens_sentence1)
        self.assertTrue(all(isinstance(token, str) for token in tokenized_list[0]))
        self.assertEqual(tokenized_list[1], expected_tokens_sentence2)
        self.assertTrue(all(isinstance(token, str) for token in tokenized_list[1]))
        print(f"\nTest 'tokenize_multiple' output for {text_list}: {tokenized_list}")

    def test_tokenize_multiple_empty_list(self):
        """Test tokenize_multiple with an empty list."""
        text_list = []
        tokenized_list = self.tokenizer_impl.tokenize_multiple(text_list)
        self.assertIsInstance(tokenized_list, list)
        self.assertEqual(len(tokenized_list), 0)
        print("\nTest 'tokenize_multiple' with empty list processed correctly.")

    def test_tokenize_multiple_with_empty_string_in_list(self):
        """Test tokenize_multiple with a list containing an empty string."""
        text_list = ["Hello", "", "world"]
        tokenized_list = self.tokenizer_impl.tokenize_multiple(text_list)
        self.assertIsInstance(tokenized_list, list)
        self.assertEqual(len(tokenized_list), 3)
        self.assertEqual(tokenized_list[0], ['hello'])
        self.assertEqual(tokenized_list[1], [])
        self.assertEqual(tokenized_list[2], ['world'])
        print(f"\nTest 'tokenize_multiple' with an empty string in list: {tokenized_list}")


if __name__ == '__main__':
    unittest.main()