"""
Metadata extractor for document processing.
This module provides functionality to extract metadata from various document types,
including title detection, author information, timestamps, keywords, and summaries.
"""
import logging
import re
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml

try:
    import tomllib  # Python 3.11+ standard library
except ImportError:
    try:
        import toml as tomllib  # Third-party fallback with a compatible loads()
    except ImportError:
        tomllib = None

logger = logging.getLogger(__name__)
class MetadataExtractor:
"""Extracts metadata from documents using various strategies."""
def __init__(self):
"""Initialize the metadata extractor."""
# Common title patterns
self.title_patterns = [
r"^#\s+(.+)$", # Markdown h1
r"^(.+)\n={3,}$", # Underlined title
r"^Title:\s*(.+)$", # Explicit title field
r"^<h1[^>]*>(.+?)</h1>", # HTML h1
r'^{{\s*title:\s*"(.+?)"\s*}}', # Common template syntax
]
# Author patterns
self.author_patterns = [
r"(?:Author|By|Written by):\s*(.+?)(?:\n|$)",
r"@author\s+(.+?)(?:\n|$)", # Javadoc style
r'__author__\s*=\s*["\'](.+?)["\']', # Python style
]
# Date patterns
self.date_patterns = [
r"(?:Date|Published|Updated):\s*(\d{4}-\d{2}-\d{2})",
r"(?:Date|Published|Updated):\s*(\d{1,2}/\d{1,2}/\d{4})",
r"@date\s+(.+?)(?:\n|$)",
]
# Language detection patterns (basic)
self.language_indicators = {
"python": [
r"def\s+\w+\s*\(",
r"import\s+\w+",
r"class\s+\w+",
r"if\s+__name__",
],
"javascript": [
r"function\s+\w+\s*\(",
r"const\s+\w+\s*=",
r"let\s+\w+\s*=",
r"=>",
],
"java": [
r"public\s+class",
r"private\s+\w+",
r"package\s+\w+",
r"import\s+java\.",
],
"markdown": [r"^#{1,6}\s+", r"^\*\s+", r"^\d+\.\s+", r"\[.+\]\(.+\)"],
"html": [r"<html", r"<body", r"<div", r"<head>"],
}
def extract_metadata(self, content: str, file_path: Optional[str] = None) -> Dict[str, Any]:
"""
Extract metadata from document content.
Args:
content: The document content
file_path: Optional file path for additional metadata
Returns:
Dictionary containing extracted metadata
"""
metadata = {}
# Extract frontmatter if present
frontmatter = self.extract_frontmatter(content)
if frontmatter:
metadata.update(frontmatter)
# Remove frontmatter from content for further processing
content = self.remove_frontmatter(content)
# Extract title
if "title" not in metadata:
title = self.detect_title(content, file_path)
if title:
metadata["title"] = title
# Extract author
if "author" not in metadata:
author = self.extract_author(content)
if author:
metadata["author"] = author
# Extract date
if "date" not in metadata:
date = self.extract_date(content)
if date:
metadata["date"] = date
# Detect language
if "language" not in metadata:
language = self.detect_language(content)
if language:
metadata["language"] = language
# Extract keywords
if "keywords" not in metadata:
keywords = self.extract_keywords(content)
if keywords:
metadata["keywords"] = keywords
# Generate summary
if "summary" not in metadata and "description" not in metadata:
summary = self.generate_summary(content)
if summary:
metadata["summary"] = summary
# Add file metadata if available
if file_path:
file_metadata = self.extract_file_metadata(file_path)
metadata.update(file_metadata)
return metadata
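    # Illustrative sketch (not a captured run; the sample text is made up):
    #
    #   extractor = MetadataExtractor()
    #   meta = extractor.extract_metadata("# My Notes\n\nAuthor: Ada\n\nSome body text.")
    #   # meta would contain e.g. {"title": "My Notes", "author": "Ada",
    #   #                          "language": "markdown", "keywords": [...], ...}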
def extract_frontmatter(self, content: str) -> Optional[Dict[str, Any]]:
"""
Extract YAML or TOML frontmatter from content.
Args:
content: The document content
Returns:
Parsed frontmatter as dictionary or None
"""
# YAML frontmatter pattern
yaml_pattern = r"^---\s*\n(.*?)\n---\s*\n"
yaml_match = re.match(yaml_pattern, content, re.DOTALL)
if yaml_match:
try:
return yaml.safe_load(yaml_match.group(1))
except yaml.YAMLError:
logger.warning("Failed to parse YAML frontmatter")
# TOML frontmatter pattern
if tomllib:
toml_pattern = r"^\+\+\+\s*\n(.*?)\n\+\+\+\s*\n"
toml_match = re.match(toml_pattern, content, re.DOTALL)
if toml_match:
try:
if hasattr(tomllib, "loads"):
return tomllib.loads(toml_match.group(1))
else:
# Python 3.11+ tomllib only has load() for file objects
import io
return tomllib.load(io.StringIO(toml_match.group(1)))
except Exception:
logger.warning("Failed to parse TOML frontmatter")
return None
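    # Example (sketch): a document that begins with
    #
    #   ---
    #   title: Demo
    #   tags: [a, b]
    #   ---
    #
    # yields {"title": "Demo", "tags": ["a", "b"]} from extract_frontmatter().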
def remove_frontmatter(self, content: str) -> str:
"""Remove frontmatter from content."""
# Remove YAML frontmatter
content = re.sub(r"^---\s*\n.*?\n---\s*\n", "", content, flags=re.DOTALL)
# Remove TOML frontmatter
content = re.sub(r"^\+\+\+\s*\n.*?\n\+\+\+\s*\n", "", content, flags=re.DOTALL)
return content
def detect_title(self, content: str, file_path: Optional[str] = None) -> Optional[str]:
"""
Detect document title using various heuristics.
Args:
content: The document content
file_path: Optional file path as fallback
Returns:
Detected title or None
"""
# Try each title pattern
for pattern in self.title_patterns:
match = re.search(pattern, content, re.MULTILINE | re.IGNORECASE)
if match:
title = match.group(1).strip()
# Clean up common markdown/HTML artifacts
title = re.sub(r"[#*`]", "", title)
title = re.sub(r"<[^>]+>", "", title)
return title
# Special handling for HTML title tag
html_title_match = re.search(r"<title>(.+?)</title>", content, re.IGNORECASE | re.DOTALL)
if html_title_match:
return html_title_match.group(1).strip()
# Look for Python/code docstring title
docstring_match = re.search(r'(?:"""|\'\'\')(.*?)(?:"""|\'\'\')', content, re.DOTALL)
if docstring_match and file_path and file_path.endswith((".py", ".js", ".java")):
docstring_content = docstring_match.group(1).strip()
# First line of docstring is often the title
first_line = docstring_content.split("\n")[0].strip()
if first_line and not first_line.lower().startswith(("author:", "date:", "copyright:")):
return first_line
# Fallback: use first non-empty line
lines = content.strip().split("\n")
for i, line in enumerate(lines[:10]): # Check first 10 lines
line = line.strip()
# Skip common non-title patterns
if (
line
and not line.startswith(
("#", "-", "*", "```", "/*", "//", "<!--", "<!DOCTYPE", "<?xml")
)
and not line.lower().startswith(
("author:", "date:", "by:", "written by:", "copyright:")
)
and line not in ('"""', "'''", "*/")
and len(line) > 3
and len(line) < 100
):
# For Python docstrings, skip lines that look like metadata
if file_path and file_path.endswith(".py"):
# Skip if inside a docstring and looks like metadata
if i > 0 and i < len(lines) - 1:
prev_line = lines[i - 1].strip()
next_line = lines[i + 1].strip() if i + 1 < len(lines) else ""
# Check if we're inside a docstring
if (
prev_line in ('"""', "'''")
or next_line in ('"""', "'''")
or (i >= 2 and lines[i - 2].strip() in ('"""', "'''"))
):
# This might be inside a docstring, check if it's metadata
if ":" in line:
continue
return line
# Last resort: use filename
if file_path:
filename = Path(file_path).stem
# Convert snake_case or kebab-case to title case
title = filename.replace("_", " ").replace("-", " ")
return title.title()
return None
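    # Detection precedence (sketch): explicit title patterns first, then the
    # HTML <title> tag, then a code docstring's first line, then the first
    # plausible content line, and finally the filename, e.g.
    #   detect_title("", "my_design_doc.md")  ->  "My Design Doc"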
def extract_author(self, content: str) -> Optional[str]:
"""Extract author information from content."""
for pattern in self.author_patterns:
match = re.search(pattern, content, re.IGNORECASE)
if match:
return match.group(1).strip()
return None
def extract_date(self, content: str) -> Optional[str]:
"""Extract date information from content."""
for pattern in self.date_patterns:
match = re.search(pattern, content, re.IGNORECASE)
if match:
return match.group(1).strip()
return None
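    # Examples (sketch):
    #   extract_author('__author__ = "Grace Hopper"')  ->  "Grace Hopper"
    #   extract_date("Published: 2024-01-15")          ->  "2024-01-15"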
def detect_language(self, content: str) -> Optional[str]:
"""
Detect the primary language/format of the document.
Args:
content: The document content
Returns:
Detected language or None
"""
scores = {}
for language, patterns in self.language_indicators.items():
score = 0
for pattern in patterns:
matches = len(re.findall(pattern, content, re.MULTILINE))
score += matches
if score > 0:
scores[language] = score
if scores:
# Return language with highest score
return max(scores, key=scores.get)
return None
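    # Scoring sketch: every regex hit counts one point and the top-scoring
    # language wins. For "def foo():\n    import os", the python indicators
    # match twice and the rest match zero times, so "python" is returned.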
def extract_keywords(self, content: str, max_keywords: int = 10) -> List[str]:
"""
        Extract keywords by term frequency (a simplified TF-IDF without a corpus IDF term).
Args:
content: The document content
max_keywords: Maximum number of keywords to extract
Returns:
List of extracted keywords
"""
# Simple tokenization (can be improved with proper NLP)
words = re.findall(r"\b[a-zA-Z]{3,}\b", content.lower())
# Remove common stop words (basic list)
        stop_words = {
            "the", "is", "at", "which", "on", "and", "a", "an", "as", "are",
            "been", "by", "for", "from", "has", "he", "in", "it", "its", "of",
            "that", "to", "was", "will", "with", "be", "have", "this", "can",
            "but", "not", "you", "all", "they", "their", "what", "when",
            "where", "who", "why", "how", "these", "those", "some", "many",
            "much", "very", "such", "only", "other", "into", "after",
            "before", "then", "also", "just", "more", "most", "than",
        }
words = [w for w in words if w not in stop_words]
if not words:
return []
# Calculate term frequency
word_count = Counter(words)
total_words = len(words)
        # Calculate term-frequency scores (no corpus available, so no IDF component)
tf_scores = {}
for word, count in word_count.items():
tf = count / total_words
# Boost longer words (they tend to be more specific)
length_boost = 1 + (len(word) - 3) * 0.1
tf_scores[word] = tf * length_boost
# Get top keywords
sorted_keywords = sorted(tf_scores.items(), key=lambda x: x[1], reverse=True)
keywords = [word for word, score in sorted_keywords[:max_keywords]]
return keywords
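    # Worked example of the scoring (sketch): in a 100-word document where
    # "pipeline" (8 letters) appears 4 times:
    #   tf = 4 / 100 = 0.04
    #   length_boost = 1 + (8 - 3) * 0.1 = 1.5
    #   score = 0.04 * 1.5 = 0.06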
def generate_summary(self, content: str, max_length: int = 200) -> Optional[str]:
"""
Generate a summary from the document content.
Args:
content: The document content
max_length: Maximum length of the summary
Returns:
Generated summary or None
"""
# Remove frontmatter and code blocks for summary
clean_content = self.remove_frontmatter(content)
clean_content = re.sub(r"```.*?```", "", clean_content, flags=re.DOTALL)
clean_content = re.sub(r"`[^`]+`", "", clean_content)
# Remove HTML tags
clean_content = re.sub(r"<[^>]+>", "", clean_content)
# Remove language-specific comment blocks
clean_content = re.sub(r"/\*.*?\*/", "", clean_content, flags=re.DOTALL)
clean_content = re.sub(r'""".*?"""', "", clean_content, flags=re.DOTALL)
clean_content = re.sub(r"'''.*?'''", "", clean_content, flags=re.DOTALL)
# Split into paragraphs
paragraphs = re.split(r"\n\s*\n", clean_content)
# Find first substantial paragraph
for para in paragraphs:
para = para.strip()
# Skip headers, lists, imports, etc.
            if (
                para
                and not para.startswith(
                    ("#", "*", "-", "1.", "import ", "from ", "class ", "def ", "//", "/*")
                )
                and len(para) > 50
            ):
# Truncate if needed
if len(para) > max_length:
# Try to cut at sentence boundary
sentences = re.split(r"[.!?]\s+", para)
summary = ""
for sentence in sentences:
if len(summary) + len(sentence) < max_length:
summary += sentence + ". "
else:
break
return summary.strip() or para[:max_length] + "..."
return para
# Fallback: use beginning of clean content
if clean_content:
# Skip any leading whitespace or short lines
lines = clean_content.strip().split("\n")
for line in lines:
line = line.strip()
if len(line) > 20:
return line[:max_length] + "..." if len(line) > max_length else line
return None
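    # Truncation behavior (sketch): an overlong first paragraph is rebuilt
    # sentence by sentence until the next sentence would exceed max_length;
    # if even the first sentence is too long, a hard cut with "..." is used.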
def extract_file_metadata(self, file_path: str) -> Dict[str, Any]:
"""
Extract metadata from file system.
Args:
file_path: Path to the file
Returns:
Dictionary with file metadata
"""
metadata = {}
try:
path = Path(file_path)
if path.exists():
stat = path.stat()
metadata["file_name"] = path.name
metadata["file_path"] = str(path.absolute())
metadata["file_size"] = stat.st_size
metadata["created_at"] = datetime.fromtimestamp(stat.st_ctime).isoformat()
metadata["modified_at"] = datetime.fromtimestamp(stat.st_mtime).isoformat()
metadata["file_extension"] = path.suffix.lower()
except Exception as e:
logger.warning(f"Failed to extract file metadata: {e}")
return metadata
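    # Example (sketch) for an existing file; exact values depend on the system:
    #   extract_file_metadata("notes.md")
    #   # -> {"file_name": "notes.md", "file_size": <bytes>,
    #   #     "file_extension": ".md", "modified_at": "2024-...", ...}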
def extract_code_metadata(self, content: str, language: str) -> Dict[str, Any]:
"""
Extract metadata specific to code files.
Args:
content: The code content
language: The programming language
Returns:
Dictionary with code-specific metadata
"""
metadata = {}
# Extract imports/dependencies
if language == "python":
imports = re.findall(r"^(?:from\s+(\S+)\s+)?import\s+(.+)$", content, re.MULTILINE)
dependencies = []
for from_module, import_names in imports:
if from_module:
dependencies.append(from_module.split(".")[0])
else:
for name in import_names.split(","):
dependencies.append(name.strip().split(".")[0])
metadata["dependencies"] = list(set(dependencies))
# Extract classes and functions
classes = re.findall(r"^class\s+(\w+)", content, re.MULTILINE)
functions = re.findall(r"^def\s+(\w+)", content, re.MULTILINE)
metadata["classes"] = classes
metadata["functions"] = functions
elif language == "javascript":
# Extract imports
imports = re.findall(r'(?:import|require)\s*\(?[\'"]([^\'"]+)[\'"]\)?', content)
metadata["dependencies"] = list(set(imports))
# Extract functions and classes
functions = re.findall(
r"(?:function\s+(\w+)|const\s+(\w+)\s*=\s*(?:async\s+)?(?:\([^)]*\)\s*)?=>)",
content,
)
functions = [f for group in functions for f in group if f]
metadata["functions"] = list(set(functions))
classes = re.findall(r"class\s+(\w+)", content)
metadata["classes"] = classes
return metadata
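

if __name__ == "__main__":
    # Minimal usage sketch; the sample document below is made up for
    # illustration and exercises frontmatter, author, and date extraction.
    sample = (
        "---\n"
        "title: Release Notes\n"
        "---\n"
        "Author: Jane Doe\n"
        "Date: 2024-01-15\n"
        "\n"
        "This release improves document parsing and adds metadata extraction "
        "for markdown, HTML, and source files.\n"
    )
    extractor = MetadataExtractor()
    for key, value in extractor.extract_metadata(sample).items():
        print(f"{key}: {value}")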