Skip to main content
Glama
testhuggingface.py (3.07 kB)
"""
Huggingface module tests
"""

import os
import unittest

import numpy as np

from txtai.vectors import VectorsFactory


class TestHFVectors(unittest.TestCase):
    """
    HFVectors tests
    """

    @classmethod
    def setUpClass(cls):
        """
        Create HFVectors instance.

        NOTE: the model is shared across all tests in this class; tests that
        toggle `cls.model.tokenize` mutate shared state.
        """

        cls.model = VectorsFactory.create({"path": "sentence-transformers/nli-mpnet-base-v2"}, None)

    def testIndex(self):
        """
        Test transformers indexing
        """

        # Generate enough volume to test batching
        documents = [(x, "This is a test", None) for x in range(1000)]

        ids, dimension, batches, stream = self.model.index(documents)

        self.assertEqual(len(ids), 1000)
        self.assertEqual(dimension, 768)
        self.assertEqual(batches, 2)
        # Fix: assertIsNotNone(os.path.exists(...)) was vacuous — os.path.exists
        # returns a bool (never None), so the assertion could never fail.
        # assertTrue actually validates that the serialized stream file exists.
        self.assertTrue(os.path.exists(stream))

        # Test shape of serialized embeddings (1000 docs / 2 batches = 500 per batch)
        with open(stream, "rb") as queue:
            self.assertEqual(np.load(queue).shape, (500, 768))

    def testText(self):
        """
        Test transformers text conversion
        """

        # With tokenization enabled, stop words/noise tokens are stripped
        self.model.tokenize = True
        self.assertEqual(self.model.prepare("Y 123 This is a test!"), "test")
        self.assertEqual(self.model.prepare(["This", "is", "a", "test"]), "This is a test")

        # With tokenization disabled, text passes through unchanged
        self.model.tokenize = False
        self.assertEqual(self.model.prepare("Y 123 This is a test!"), "Y 123 This is a test!")
        self.assertEqual(self.model.prepare(["This", "is", "a", "test"]), "This is a test")

    def testTransform(self):
        """
        Test transformers transform
        """

        # Sample documents: one where tokenizer changes text and one with no changes to text
        documents = [(0, "This is a test and has no tokenization", None), (1, "test tokenization", None)]

        # Run with tokenization enabled
        self.model.tokenize = True
        embeddings1 = [self.model.transform(d) for d in documents]

        # Run with tokenization disabled
        self.model.tokenize = False
        embeddings2 = [self.model.transform(d) for d in documents]

        # Tokenization altered document 0's text, so its embedding differs;
        # document 1 is unchanged by tokenization, so its embedding matches
        self.assertFalse(np.array_equal(embeddings1[0], embeddings2[0]))
        self.assertTrue(np.array_equal(embeddings1[1], embeddings2[1]))

    def testTransformArray(self):
        """
        Test transformers skips transforming NumPy arrays
        """

        # Generate data and run through vector model
        data1 = np.random.rand(5, 5).astype(np.float32)
        data2 = self.model.transform((0, data1, None))

        # Test transform method returns original data untouched
        self.assertTrue(np.array_equal(data1, data2))

    def testTransformLong(self):
        """
        Test transformers transform on long text
        """

        # Sample documents: short text and longer text (512 repetitions exceeds model max length)
        documents = [(0, "This is long text " * 512, None), (1, "This is short text", None)]

        # Run transform and ensure it completes without errors
        embeddings = [self.model.transform(d) for d in documents]
        self.assertIsNotNone(embeddings)

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/neuml/txtai'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.