Code-Index-MCP

test_metadata_extractor.py•17.2 KiB

""" Unit tests for metadata extraction functionality. Tests the metadata extractor including: - Frontmatter parsing (YAML and TOML) - Title detection with various patterns - Author and date extraction - Language detection - Keyword extraction using TF-IDF - Summary generation - File metadata extraction - Code-specific metadata """ import os import tempfile from pathlib import Path import pytest from mcp_server.document_processing.metadata_extractor import MetadataExtractor class TestMetadataExtractor: """Test metadata extraction functionality.""" def setup_method(self): """Set up test fixtures.""" self.extractor = MetadataExtractor() def test_extract_yaml_frontmatter(self): """Test extracting YAML frontmatter.""" content = """--- title: Test Document author: John Doe date: 2024-01-15 tags: - python - testing - documentation description: | This is a multi-line description of the document. custom_field: value --- # Main Content This is the document body.""" frontmatter = self.extractor.extract_frontmatter(content) assert frontmatter is not None assert frontmatter["title"] == "Test Document" assert frontmatter["author"] == "John Doe" # Handle both string and date object if isinstance(frontmatter["date"], str): assert frontmatter["date"] == "2024-01-15" else: assert str(frontmatter["date"]) == "2024-01-15" assert len(frontmatter["tags"]) == 3 assert "python" in frontmatter["tags"] assert "This is a multi-line" in frontmatter["description"] assert frontmatter["custom_field"] == "value" def test_extract_toml_frontmatter(self): """Test extracting TOML frontmatter.""" content = """+++ title = "Test Document" author = "Jane Doe" date = "2024-01-15" tags = ["rust", "systems", "programming"] [metadata] version = "1.0.0" status = "draft" +++ # Main Content""" frontmatter = self.extractor.extract_frontmatter(content) # Will be None if tomllib not available if frontmatter: assert frontmatter["title"] == "Test Document" assert frontmatter["author"] == "Jane Doe" assert "rust" in 
frontmatter.get("tags", []) if "metadata" in frontmatter: assert frontmatter["metadata"]["version"] == "1.0.0" def test_remove_frontmatter(self): """Test removing frontmatter from content.""" yaml_content = """--- title: Test --- Content here""" toml_content = """+++ title = "Test" +++ Content here""" # Test YAML removal cleaned_yaml = self.extractor.remove_frontmatter(yaml_content) assert not cleaned_yaml.startswith("---") assert cleaned_yaml.strip() == "Content here" # Test TOML removal cleaned_toml = self.extractor.remove_frontmatter(toml_content) assert not cleaned_toml.startswith("+++") assert cleaned_toml.strip() == "Content here" def test_detect_title_patterns(self): """Test title detection with various patterns.""" # Markdown heading assert self.extractor.detect_title("# Main Title\n\nContent") == "Main Title" # Underlined title assert ( self.extractor.detect_title("Document Title\n==============\n\nContent") == "Document Title" ) # Explicit title field assert self.extractor.detect_title("Title: My Document\n\nContent") == "My Document" # HTML title assert self.extractor.detect_title("<h1>HTML Title</h1>\n\nContent") == "HTML Title" # Template syntax assert ( self.extractor.detect_title('{{ title: "Template Title" }}\n\nContent') == "Template Title" ) def test_detect_title_from_html(self): """Test title detection from HTML documents.""" html_content = """<!DOCTYPE html> <html> <head> <title>Page Title</title> </head> <body> <h1>Main Heading</h1> <p>Content</p> </body> </html>""" title = self.extractor.detect_title(html_content) assert title == "Page Title" def test_detect_title_from_docstring(self): """Test title detection from Python docstrings.""" python_content = '''""" Module Title: Data Processing Utilities This module provides utility functions for data processing. 
Author: Developer Date: 2024-01-15 """ import pandas as pd def process_data(): pass''' title = self.extractor.detect_title(python_content, "utils.py") assert title == "Module Title: Data Processing Utilities" def test_detect_title_fallback(self): """Test title detection fallback mechanisms.""" # First non-empty line content = """ Introduction to Programming This document covers basic programming concepts.""" title = self.extractor.detect_title(content) assert title == "Introduction to Programming" # From filename title = self.extractor.detect_title("", "my-awesome-document.txt") assert title == "My Awesome Document" # Snake case filename title = self.extractor.detect_title("", "data_processing_guide.md") assert title == "Data Processing Guide" def test_extract_author(self): """Test author extraction.""" # Standard patterns assert self.extractor.extract_author("Author: John Smith\n") == "John Smith" assert self.extractor.extract_author("By: Jane Doe\n") == "Jane Doe" assert self.extractor.extract_author("Written by: Dr. Bob\n") == "Dr. 
Bob" # Code style assert self.extractor.extract_author("@author Alice Johnson\n") == "Alice Johnson" assert self.extractor.extract_author('__author__ = "Python Dev"') == "Python Dev" # Case insensitive assert self.extractor.extract_author("AUTHOR: Test User\n") == "Test User" def test_extract_date(self): """Test date extraction.""" # ISO format assert self.extractor.extract_date("Date: 2024-01-15\n") == "2024-01-15" assert self.extractor.extract_date("Published: 2024-12-31\n") == "2024-12-31" # US format assert self.extractor.extract_date("Updated: 1/15/2024\n") == "1/15/2024" assert self.extractor.extract_date("Date: 12/31/2024\n") == "12/31/2024" # Code style assert self.extractor.extract_date("@date 2024-01-15\n") == "2024-01-15" def test_detect_language(self): """Test language/format detection.""" # Python python_code = """ def calculate_sum(numbers): total = 0 for num in numbers: total += num return total if __name__ == "__main__": print(calculate_sum([1, 2, 3])) """ assert self.extractor.detect_language(python_code) == "python" # JavaScript js_code = """ const greeting = "Hello"; let count = 0; function sayHello(name) { return `${greeting}, ${name}!`; } const arrowFunc = () => console.log("Arrow function"); """ assert self.extractor.detect_language(js_code) == "javascript" # Java java_code = """ package com.example; public class HelloWorld { private String message; public static void main(String[] args) { System.out.println("Hello, World!"); } } """ assert self.extractor.detect_language(java_code) == "java" # Markdown markdown_content = """ # Main Title ## Section 1 * List item 1 * List item 2 1. Numbered item 2. 
Another item [Link text](https://example.com) """ assert self.extractor.detect_language(markdown_content) == "markdown" # HTML html_content = """ <html> <head> <title>Test</title> </head> <body> <div class="container"> <h1>Hello</h1> </div> </body> </html> """ assert self.extractor.detect_language(html_content) == "html" def test_extract_keywords(self): """Test keyword extraction using TF-IDF.""" content = """ Machine learning is a subset of artificial intelligence that focuses on building applications that learn from data and improve their accuracy over time without being programmed to do so. Machine learning algorithms build a model based on training data, known as "training" or "learning" phase. Deep learning is a subset of machine learning that uses artificial neural networks with multiple layers. Deep learning models can achieve state-of-the-art accuracy in many tasks including computer vision and natural language processing. The key difference between traditional machine learning and deep learning is the ability of deep learning models to automatically learn representations from data without manual feature engineering. """ keywords = self.extractor.extract_keywords(content, max_keywords=10) assert len(keywords) <= 10 assert len(keywords) > 0 # Check that important terms are extracted assert any("learning" in kw for kw in keywords) assert any("machine" in kw or "deep" in kw for kw in keywords) assert any("data" in kw for kw in keywords) # Check that stop words are not included assert not any(kw in ["the", "is", "and", "or", "but"] for kw in keywords) def test_generate_summary(self): """Test summary generation.""" content = """--- title: Test Document --- # Introduction This document provides a comprehensive guide to Python programming. It covers basic syntax, data structures, functions, classes, and advanced topics like decorators and generators. ## Getting Started To begin with Python, you need to install it from python.org. 
Python is known for its simple and readable syntax, making it perfect for beginners. ```python def hello_world(): print("Hello, World!") ``` ## Data Types Python supports various data types including integers, floats, strings, lists, tuples, dictionaries, and sets. Each data type has its own characteristics and use cases. ## Conclusion Python is a versatile language suitable for web development, data science, automation, and more.""" summary = self.extractor.generate_summary(content, max_length=200) assert summary is not None assert len(summary) <= 210 # Allow some flexibility assert "comprehensive guide to Python programming" in summary assert "```" not in summary # Code blocks should be removed assert "---" not in summary # Frontmatter should be removed def test_generate_summary_edge_cases(self): """Test summary generation edge cases.""" # Only code code_only = """```python def test(): pass ```""" summary = self.extractor.generate_summary(code_only) assert summary is None or summary == "" # Only headings headings_only = """# Title ## Subtitle ### Sub-subtitle""" summary = self.extractor.generate_summary(headings_only) assert summary is None or summary == "" # Very short content short_content = "This is short." summary = self.extractor.generate_summary(short_content) # The implementation might return None or the content itself for very short text assert summary is None or summary == "This is short." 
def test_extract_file_metadata(self): """Test file metadata extraction.""" # Create a temporary file with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as tmp: tmp.write("Test content") tmp_path = tmp.name try: metadata = self.extractor.extract_file_metadata(tmp_path) assert metadata["file_name"] == Path(tmp_path).name assert metadata["file_path"] == str(Path(tmp_path).absolute()) assert metadata["file_size"] > 0 assert metadata["file_extension"] == ".txt" assert "created_at" in metadata assert "modified_at" in metadata finally: os.unlink(tmp_path) def test_extract_file_metadata_nonexistent(self): """Test file metadata extraction for non-existent file.""" metadata = self.extractor.extract_file_metadata("/nonexistent/file.txt") assert metadata == {} def test_extract_code_metadata_python(self): """Test code metadata extraction for Python.""" python_code = """ import os import sys from pathlib import Path import numpy as np from sklearn.model_selection import train_test_split class DataProcessor: def __init__(self): pass def process(self, data): return data class ModelTrainer: def train(self): pass def load_data(filepath): pass def preprocess_data(data): pass def save_results(results, output_path): pass """ metadata = self.extractor.extract_code_metadata(python_code, "python") assert "dependencies" in metadata assert "os" in metadata["dependencies"] # Check if numpy is in dependencies (might be 'numpy' or 'numpy as np') assert any("numpy" in dep for dep in metadata["dependencies"]) # sklearn.model_selection becomes just 'sklearn' assert "sklearn" in metadata["dependencies"] assert "classes" in metadata assert "DataProcessor" in metadata["classes"] assert "ModelTrainer" in metadata["classes"] assert "functions" in metadata assert "load_data" in metadata["functions"] assert "preprocess_data" in metadata["functions"] assert "save_results" in metadata["functions"] def test_extract_code_metadata_javascript(self): """Test code metadata extraction for 
JavaScript.""" js_code = """ import React from 'react'; import { useState, useEffect } from 'react'; const axios = require('axios'); import './styles.css'; class ComponentClass { render() { return null; } } function processData(input) { return input; } const arrowFunction = (param) => { return param * 2; }; export const namedExport = async () => { await fetch('/api/data'); }; """ metadata = self.extractor.extract_code_metadata(js_code, "javascript") assert "dependencies" in metadata # The regex extracts the full module path/name # Check if any dependency contains 'axios' or is exactly 'axios' assert any("axios" in dep for dep in metadata["dependencies"]) assert "./styles.css" in metadata["dependencies"] assert "functions" in metadata assert "processData" in metadata["functions"] assert "arrowFunction" in metadata["functions"] assert "namedExport" in metadata["functions"] assert "classes" in metadata assert "ComponentClass" in metadata["classes"] def test_extract_metadata_complete(self): """Test complete metadata extraction.""" content = """--- title: Python Best Practices Guide author: Expert Developer date: 2024-01-15 tags: [python, best-practices, coding-standards] version: 2.0 --- # Python Best Practices Guide This comprehensive guide covers Python programming best practices, including code organization, error handling, testing, and performance optimization. ## Table of Contents 1. Code Organization 2. Error Handling 3. Testing Strategies 4. Performance Tips ## Code Organization Always structure your Python projects with clear module separation... 
""" with tempfile.NamedTemporaryFile(mode="w", suffix=".md", delete=False) as tmp: tmp.write(content) tmp_path = tmp.name try: metadata = self.extractor.extract_metadata(content, tmp_path) # From frontmatter assert metadata["title"] == "Python Best Practices Guide" assert metadata["author"] == "Expert Developer" # Handle both string and date object if isinstance(metadata["date"], str): assert metadata["date"] == "2024-01-15" else: assert str(metadata["date"]) == "2024-01-15" assert "python" in metadata["tags"] assert metadata["version"] == 2.0 # Detected assert metadata["language"] == "markdown" assert "keywords" in metadata assert len(metadata["keywords"]) > 0 assert "summary" in metadata # File metadata assert metadata["file_name"] == Path(tmp_path).name assert metadata["file_extension"] == ".md" finally: os.unlink(tmp_path) def test_metadata_extraction_empty_content(self): """Test metadata extraction with empty content.""" metadata = self.extractor.extract_metadata("") # Should return some default/empty metadata assert isinstance(metadata, dict) assert "title" not in metadata or metadata["title"] is None def test_metadata_extraction_no_patterns(self): """Test metadata extraction when no patterns match.""" content = "Just plain text without any metadata patterns or structure." metadata = self.extractor.extract_metadata(content) # Should still extract what it can assert "keywords" in metadata assert "summary" in metadata assert metadata["summary"] == content # Short enough to be the summary if __name__ == "__main__": pytest.main([__file__, "-v"])

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ViperJuice/Code-Index-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_metadata_extractor.py•17.2 KiB