Skip to main content
Glama
testwordvectors.py4.72 kB
""" WordVectors module tests """ import os import tempfile import unittest from unittest.mock import patch import numpy as np from huggingface_hub.errors import HFValidationError from txtai.vectors import VectorsFactory from txtai.vectors.dense.words import create, transform class TestWordVectors(unittest.TestCase): """ Vectors tests. """ @classmethod def setUpClass(cls): """ Sets the pretrained model to use """ # Test with pretrained glove quantized vectors cls.path = "neuml/glove-6B-quantized" @patch("os.cpu_count") def testIndex(self, cpucount): """ Test word vectors indexing """ # Mock CPU count cpucount.return_value = 1 # Generate data documents = [(x, "This is a test", None) for x in range(1000)] model = VectorsFactory.create({"path": self.path, "parallel": True}, None) ids, dimension, batches, stream = model.index(documents, 1) self.assertEqual(len(ids), 1000) self.assertEqual(dimension, 300) self.assertEqual(batches, 1000) self.assertIsNotNone(os.path.exists(stream)) # Test shape of serialized embeddings with open(stream, "rb") as queue: self.assertEqual(np.load(queue).shape, (1, 300)) @patch("os.cpu_count") def testIndexBatch(self, cpucount): """ Test word vectors indexing with batch size set """ # Mock CPU count cpucount.return_value = 1 # Generate data documents = [(x, "This is a test", None) for x in range(1000)] model = VectorsFactory.create({"path": self.path, "parallel": True}, None) ids, dimension, batches, stream = model.index(documents, 512) self.assertEqual(len(ids), 1000) self.assertEqual(dimension, 300) self.assertEqual(batches, 2) self.assertIsNotNone(os.path.exists(stream)) # Test shape of serialized embeddings with open(stream, "rb") as queue: self.assertEqual(np.load(queue).shape, (512, 300)) self.assertEqual(np.load(queue).shape, (488, 300)) def testIndexSerial(self): """ Test word vector indexing in single process mode """ # Generate data documents = [(x, "This is a test", None) for x in range(1000)] model = VectorsFactory.create({"path": self.path, "parallel": False}, None) ids, dimension, batches, stream = model.index(documents, 1) self.assertEqual(len(ids), 1000) self.assertEqual(dimension, 300) self.assertEqual(batches, 1000) self.assertIsNotNone(os.path.exists(stream)) # Test shape of serialized embeddings with open(stream, "rb") as queue: self.assertEqual(np.load(queue).shape, (1, 300)) def testIndexSerialBatch(self): """ Test word vector indexing in single process mode with batch size set """ # Generate data documents = [(x, "This is a test", None) for x in range(1000)] model = VectorsFactory.create({"path": self.path, "parallel": False}, None) ids, dimension, batches, stream = model.index(documents, 512) self.assertEqual(len(ids), 1000) self.assertEqual(dimension, 300) self.assertEqual(batches, 2) self.assertIsNotNone(os.path.exists(stream)) # Test shape of serialized embeddings with open(stream, "rb") as queue: self.assertEqual(np.load(queue).shape, (512, 300)) self.assertEqual(np.load(queue).shape, (488, 300)) def testLookup(self): """ Test word vector lookup """ model = VectorsFactory.create({"path": self.path}, None) self.assertEqual(model.lookup(["txtai", "embeddings", "sentence"]).shape, (3, 300)) def testMultiprocess(self): """ Test multiprocess helper methods """ create({"path": self.path}, None) uid, vector = transform((0, "test", None)) self.assertEqual(uid, 0) self.assertEqual(vector.shape, (300,)) def testNoExist(self): """ Test loading model that doesn't exist """ # Test non-existent path raises an exception with self.assertRaises((IOError, HFValidationError)): VectorsFactory.create({"method": "words", "path": os.path.join(tempfile.gettempdir(), "noexist")}, None) def testTransform(self): """ Test word vector transform """ model = VectorsFactory.create({"path": self.path}, None) self.assertEqual(len(model.transform((None, ["txtai"], None))), 300)

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/neuml/txtai'

If you have feedback or need assistance with the MCP directory API, please join our Discord server