import os
import re
from datetime import datetime
from typing import Dict
class MetadataExtractor:
    """Stateless helpers that derive metadata from a file's name, stats and text."""

    # Date shapes recognised in filenames, tried in this order:
    # hyphenated YYYY-MM-DD first, then compact YYYYMMDD.
    _DATE_PATTERNS = (
        re.compile(r'(\d{4}-\d{2}-\d{2})'),
        re.compile(r'(\d{8})'),
    )

    # Words never reported as keywords.
    _STOP_WORDS = frozenset(
        {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for'}
    )

    @staticmethod
    def _is_real_date(candidate: str) -> bool:
        """Return True if *candidate* (YYYY-MM-DD or YYYYMMDD) is a calendar date."""
        digits = candidate.replace('-', '')
        try:
            # The datetime constructor rejects out-of-range year/month/day.
            datetime(int(digits[:4]), int(digits[4:6]), int(digits[6:8]))
        except ValueError:
            return False
        return True

    @staticmethod
    def extract_from_filename(filepath: str) -> Dict:
        """Extract a date from the base name of *filepath*.

        Only the final path component is inspected. The first substring that
        matches YYYY-MM-DD (preferred) or YYYYMMDD *and* denotes a real
        calendar date is stored under ``'date'``, in exactly the form it has
        in the filename. Digit runs that merely look like dates (for example
        an 8-digit ID such as ``12345678``) are skipped.

        Args:
            filepath: Path or bare filename to inspect.

        Returns:
            ``{'date': <matched text>}`` if a valid date was found, otherwise
            an empty dict.
        """
        filename = os.path.basename(filepath)
        metadata = {}
        for pattern in MetadataExtractor._DATE_PATTERNS:
            # Walk every candidate so an invalid early hit does not hide a
            # valid date later in the name.
            for match in pattern.finditer(filename):
                candidate = match.group(1)
                if MetadataExtractor._is_real_date(candidate):
                    metadata['date'] = candidate
                    return metadata
        return metadata

    @staticmethod
    def extract_keywords(text: str, max_words: int = 10) -> list:
        """Return the most frequent non-trivial words near the start of *text*.

        Only the first 1000 characters are examined, lower-cased. Words are
        ``\\w+`` runs; stop words and words of three characters or fewer are
        dropped.

        Args:
            text: Source text. ``None`` or an empty string yields ``[]``.
            max_words: Maximum number of keywords to return.

        Returns:
            Up to *max_words* words, most frequent first.
        """
        if not text:
            return []

        # Function-scope import keeps this block independent of the
        # module's import header.
        from collections import Counter

        sample = text[:1000].lower()
        stop_words = MetadataExtractor._STOP_WORDS
        word_freq = Counter(
            word
            for word in re.findall(r'\b\w+\b', sample)
            if len(word) > 3 and word not in stop_words
        )
        return [word for word, _ in word_freq.most_common(max_words)]

    @staticmethod
    def enhance_metadata(doc_metadata: Dict, filepath: str, text: str) -> Dict:
        """Combine document, filename, file-stat and keyword metadata.

        The input dict is copied, not mutated. Later sources overwrite
        earlier ones on key collisions: a ``'date'`` found in the filename
        replaces any ``'date'`` in *doc_metadata*, and ``'last_modified'``,
        ``'file_size'`` and ``'keywords'`` are always set here.

        Args:
            doc_metadata: Existing metadata; ``None`` is treated as empty.
            filepath: Path to the file on disk; it is passed to ``os.stat``.
            text: Document text used for keyword extraction.

        Returns:
            A new dict with the merged metadata. ``'last_modified'`` is the
            ISO-8601 form of the file's mtime as a naive local-time datetime;
            ``'file_size'`` is ``st_size``; ``'keywords'`` is a single
            ``', '``-joined string.

        Raises:
            OSError: If ``os.stat(filepath)`` fails (e.g. the file is missing).
        """
        enhanced = dict(doc_metadata or {})

        # Filename-derived info
        enhanced.update(MetadataExtractor.extract_from_filename(filepath))

        # File stats
        stat = os.stat(filepath)
        enhanced['last_modified'] = datetime.fromtimestamp(stat.st_mtime).isoformat()
        enhanced['file_size'] = stat.st_size

        # Keywords are stored as one comma-separated string, not a list
        enhanced['keywords'] = ', '.join(MetadataExtractor.extract_keywords(text))
        return enhanced