# indexer.py • 5.1 kB
"""Indexer for Jekyll blog posts."""
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional

from jekyll_mcp.parser import PostParser
class PostIndexer:
    """Index Jekyll blog posts for searching.

    Parsed posts are kept in memory in ``self.posts`` (one dict per post, as
    returned by ``PostParser.parse_post`` plus a ``'published'`` flag).
    ``self.categories`` and ``self.tags`` map each name to the number of
    indexed posts that carry it.
    """

    def __init__(self, posts_dir: Path, drafts_dir: Optional[Path] = None):
        """
        Initialize the indexer.

        Args:
            posts_dir: Path to _posts directory
            drafts_dir: Optional path to _drafts directory
        """
        self.posts_dir = Path(posts_dir)
        self.drafts_dir = Path(drafts_dir) if drafts_dir else None
        self.parser = PostParser()
        self.posts: List[Dict[str, Any]] = []
        self.categories = defaultdict(int)
        self.tags = defaultdict(int)

    def index_all(self):
        """Index all posts and drafts, replacing any previous index."""
        # Start from an empty list so that calling index_all() again
        # re-indexes instead of appending every post a second time.
        self.posts = []
        self._index_directory(self.posts_dir, published=True)
        if self.drafts_dir and self.drafts_dir.exists():
            self._index_directory(self.drafts_dir, published=False)
        self._build_category_tag_counts()

    def _index_directory(self, directory: Path, published: bool):
        """Parse every valid post in ``directory`` and append it to the index.

        Args:
            directory: Directory whose entries are offered to the parser.
            published: Value stored under the ``'published'`` key of each post.
        """
        # iterdir() yields entries in arbitrary order; sorting keeps the index
        # (and therefore limit-truncated search results) deterministic.
        for file_path in sorted(directory.iterdir()):
            if not self.parser.is_valid_post(file_path):
                continue
            try:
                post_data = self.parser.parse_post(file_path)
                post_data['published'] = published
            except Exception as e:
                # Best effort: one unparsable file must not abort indexing.
                # Reported on stderr so diagnostics never mix with stdout
                # (NOTE(review): presumably stdout carries MCP protocol
                # traffic - confirm against the server entry point).
                print(f"Error parsing {file_path}: {e}", file=sys.stderr)
            else:
                self.posts.append(post_data)

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Normalize a front-matter field to a list.

        ``None`` becomes an empty list, a single string (or other scalar)
        becomes a one-element list, and any other iterable is copied into a
        list. A string is treated as one name; it is not split on whitespace.
        """
        if value is None:
            return []
        if isinstance(value, (str, bytes)):
            return [value]
        try:
            return list(value)
        except TypeError:
            # Non-iterable scalar such as an int or a date.
            return [value]

    @staticmethod
    def _searchable_text(post: Dict[str, Any]) -> str:
        """Return the lower-cased title, content and slug joined by spaces."""
        metadata = post['metadata']
        parts = (metadata.get('title', ''), post['content'], metadata.get('slug', ''))
        # Front-matter values may be None or non-strings (e.g. a numeric
        # title), so coerce each part instead of calling .lower() on it.
        return ' '.join('' if part is None else str(part) for part in parts).lower()

    def _build_category_tag_counts(self):
        """Recompute category and tag counts from ``self.posts``."""
        # Reset first so repeated calls do not inflate the counts.
        self.categories.clear()
        self.tags.clear()
        for post in self.posts:
            metadata = post['metadata']
            for cat in self._as_list(metadata.get('categories')):
                self.categories[cat] += 1
            for tag in self._as_list(metadata.get('tags')):
                self.tags[tag] += 1

    def search_posts(
        self,
        query: Optional[str] = None,
        category: Optional[str] = None,
        tags: Optional[List[str]] = None,
        limit: Optional[int] = 10,
        published_only: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Search for posts.

        All given filters must match. Posts are visited in index order and
        the search stops as soon as ``limit`` results have been collected.

        Args:
            query: Case-insensitive substring searched in title, content, and slug
            category: Filter by category (exact match)
            tags: Filter by tags; a post matches if it has at least one of
                them. A single string is treated as one tag.
            limit: Maximum number of results; ``None`` means no limit and a
                value of zero or less yields no results
            published_only: Only return published posts

        Returns:
            List of matching posts with metadata
        """
        results: List[Dict[str, Any]] = []
        if limit is not None and limit <= 0:
            return results

        # Loop-invariant normalisation of the filters.
        query_lower = query.lower() if query else None
        wanted_tags = self._as_list(tags)

        for post in self.posts:
            # Skip drafts if published_only
            if published_only and not post['published']:
                continue

            metadata = post['metadata']

            # Filter by category
            if category and category not in self._as_list(metadata.get('categories')):
                continue

            # Filter by tags (any requested tag is enough)
            if wanted_tags:
                post_tags = self._as_list(metadata.get('tags'))
                if not any(tag in post_tags for tag in wanted_tags):
                    continue

            # Search query
            if query_lower and query_lower not in self._searchable_text(post):
                continue

            results.append(self._format_result(post))
            if limit is not None and len(results) >= limit:
                break
        return results

    def get_post_by_slug(self, slug: str) -> Optional[Dict[str, Any]]:
        """Get a post by its slug.

        Returns:
            The first indexed post whose metadata ``'slug'`` equals ``slug``
            (the full post dict, not the search-result summary), or ``None``
            if no post matches.
        """
        for post in self.posts:
            if post['metadata'].get('slug') == slug:
                return post
        return None

    def _format_result(self, post: Dict[str, Any]) -> Dict[str, Any]:
        """Format a post for return.

        Returns:
            A summary dict with title, slug, date (as a string), categories,
            tags, file path, published flag and an excerpt of the content.
        """
        metadata = post['metadata']
        content = post['content'] or ''
        # Create excerpt (first 200 chars, with an ellipsis when truncated)
        if len(content) > 200:
            excerpt = content[:200] + '...'
        else:
            excerpt = content
        return {
            'title': metadata.get('title', 'Untitled'),
            'slug': metadata.get('slug', ''),
            'date': str(metadata.get('date', '')),
            'categories': metadata.get('categories', []),
            'tags': metadata.get('tags', []),
            'file_path': post['file_path'],
            'published': post['published'],
            'excerpt': excerpt,
        }