md-pdf-mcp
by seanivore
Verified
- md_pdf_mcp
"""Core PDF conversion functionality for markdown to PDF conversion."""
import os
import tempfile
import urllib.request
import urllib.error
from pathlib import Path
from typing import Optional, Dict
from urllib.parse import urlparse
import markdown
from xml.etree import ElementTree
from PIL import Image as PILImage
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import inch
from reportlab.lib.styles import ParagraphStyle
from .vscode_styles import get_vscode_stylesheet, em_to_pt, THEME_COLORS
from markdown.extensions import fenced_code, codehilite, attr_list, tables, toc, extra
class MDPDFError(Exception):
"""Base exception for MD-PDF-MCP"""
pass
class InvalidMarkdownError(MDPDFError):
"""Raised when markdown cannot be parsed"""
pass
class PDFGenerationError(MDPDFError):
"""Raised when PDF generation fails"""
pass
class ImageError(MDPDFError):
"""Raised when image processing fails"""
pass
def is_url(path: str) -> bool:
"""Check if a path is a URL."""
try:
result = urlparse(path)
return all([result.scheme, result.netloc])
except ValueError:
return False
def download_image(url: str, temp_dir: str) -> str:
"""Download an image to a temporary file."""
try:
filename = os.path.join(temp_dir, os.path.basename(url))
urllib.request.urlretrieve(url, filename)
return filename
except (urllib.error.URLError, OSError) as e:
raise ImageError(f"Failed to download image {url}: {str(e)}")
def get_image_size(image_path: str, max_width: float) -> tuple[float, float]:
"""Calculate image dimensions constrained to max width."""
try:
with PILImage.open(image_path) as img:
orig_width, orig_height = img.size
if orig_width <= max_width:
return orig_width, orig_height
scale_factor = max_width / orig_width
new_height = orig_height * scale_factor
return max_width, new_height
except Exception as e:
raise ImageError(f"Failed to process image {image_path}: {str(e)}")
def process_inline_text(element) -> str:
"""Process inline text formatting (bold, italic, etc.)"""
if element.text is None:
element.text = ''
text = element.text
for child in element:
if child.text:
if child.tag == 'strong' or child.tag == 'b':
text += f'<b>{child.text}</b>'
elif child.tag == 'em' or child.tag == 'i':
text += f'<i>{child.text}</i>'
else:
text += child.text
for nested in child:
if nested.text:
if nested.tag == 'strong' or nested.tag == 'b':
text += f'<b>{nested.text}</b>'
elif nested.tag == 'em' or nested.tag == 'i':
text += f'<i>{nested.text}</i>'
else:
text += nested.text
if nested.tail:
text += nested.tail
if child.tail:
text += child.tail
return text.strip()
def validate_markdown(text: str) -> None:
"""Validate markdown syntax."""
stack = []
for i, char in enumerate(text):
if char in '[(':
stack.append((char, i))
elif char in '])':
if not stack:
raise InvalidMarkdownError(f"Unmatched closing bracket at position {i}")
last_char, _ = stack.pop()
if (char == ']' and last_char != '[') or (char == ')' and last_char != '('):
raise InvalidMarkdownError(f"Mismatched brackets at position {i}")
if stack:
pos = stack[-1][1]
raise InvalidMarkdownError(f"Unclosed bracket at position {pos}")
def convert_markdown_to_pdf(
markdown_text: str,
output_path: str,
theme: str = 'light',
progress_callback: Optional[callable] = None
) -> bool:
"""Convert markdown to PDF using VS Code styling."""
try:
with tempfile.TemporaryDirectory() as temp_dir:
if progress_callback:
progress_callback(0, "Starting conversion...")
# Handle empty content
if not markdown_text.strip():
doc = SimpleDocTemplate(
output_path,
pagesize=A4,
rightMargin=72,
leftMargin=72,
topMargin=72,
bottomMargin=72
)
doc.build([])
return True
# Split content but preserve header and signature newlines
lines = markdown_text.split('\n')
processed_lines = []
in_header = True
in_signature = False
for line in lines:
if line.strip() == '':
processed_lines.append('') # Keep empty lines
if len(processed_lines) > 4: # After title, role, blank line, and date
in_header = False
elif in_header:
processed_lines.append(line) # Keep header lines as-is
elif 'Hope to hear from you soon' in line: # Start of signature
in_signature = True
processed_lines.append(line)
elif in_signature:
processed_lines.append(line) # Preserve signature line breaks
else:
processed_lines.append(line.rstrip()) # Outside header/signature, replace single newlines
processed_text = '\n'.join(processed_lines)
# Validate markdown syntax
validate_markdown(processed_text)
try:
# Parse markdown to HTML with extensions
html = markdown.markdown(
processed_text,
extensions=[
'fenced_code',
'codehilite',
'attr_list',
'tables',
'toc',
'extra',
],
output_format='xhtml'
)
except Exception as e:
raise InvalidMarkdownError(f"Failed to parse markdown: {str(e)}")
if progress_callback:
progress_callback(25, "Markdown parsed...")
# Create PDF document with styles
doc = SimpleDocTemplate(
output_path,
pagesize=A4,
rightMargin=72,
leftMargin=72,
topMargin=72,
bottomMargin=72
)
page_width = A4[0] - 144 # Width minus margins
styles = get_vscode_stylesheet(theme)
if progress_callback:
progress_callback(50, "Styles applied...")
# Convert HTML to flowables
elements = []
try:
root = ElementTree.fromstring(f"<root>{html}</root>")
except ElementTree.ParseError as e:
raise InvalidMarkdownError(f"Generated HTML is invalid: {str(e)}")
# Track document sections
in_header = False
in_signature = False
last_was_heading = False
for element in root.iter():
if element.tag == 'root':
continue
if element.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
text = process_inline_text(element)
style = f'Heading{element.tag[1]}'
elements.append(Paragraph(text, styles[style]))
# Update section tracking
if element.tag == 'h1':
in_header = True
else:
in_header = False
last_was_heading = True
elif element.tag == 'p':
text = process_inline_text(element)
# Check for signature section
if 'Hope to hear from you soon' in text:
in_signature = True
# Use special styles for different sections
if in_header:
if 'ITALICS' in text: # Date line
text = text.replace('ITALICS', '').strip()
elements.append(Paragraph(text, styles['Heading3']))
else: # Role line
elements.append(Paragraph(text, styles['Heading2']))
if last_was_heading:
elements.append(Spacer(1, em_to_pt(0.3)))
elif in_signature:
# Split signature into lines and add each as separate paragraph
sig_lines = text.split('\n')
for line in sig_lines:
if line.strip():
elements.append(Paragraph(line.strip(), styles['Signature']))
else:
elements.append(Paragraph(text, styles['Body']))
last_was_heading = False
elif element.tag == 'ul':
list_items = []
for li in element.findall('li'):
text = process_inline_text(li)
if text.strip():
list_items.append(Paragraph('• ' + text.strip(), styles['ListItem']))
# Add all list items
elements.extend(list_items)
# Add space after the whole list
if list_items: # Only add space if list wasn't empty
elements.append(Spacer(1, em_to_pt(0.8)))
last_was_heading = False
elif element.tag == 'pre':
# Handle code blocks properly
code = element.find('code')
if code is not None:
# Get the code text
text = code.text.strip('`') if code.text else ''
# Split into lines and process each line
lines = text.split('\n')
processed_lines = []
for line in lines:
line = line.rstrip() # Remove trailing whitespace
if line.lstrip().startswith('#'): # Python comment
processed_lines.append(Paragraph(line, styles['CodeComment']))
else:
processed_lines.append(Paragraph(line, styles['Pre']))
elements.extend(processed_lines)
else:
text = element.text.strip('`') if element.text else ''
elements.append(Paragraph(text, styles['Pre']))
last_was_heading = False
elif element.tag == 'img':
src = element.get('src')
if not src:
continue
# Handle remote images
if is_url(src):
try:
src = download_image(src, temp_dir)
except ImageError as e:
print(f"Warning: Failed to download image {src}: {e}")
continue
# Calculate image size
try:
width, height = get_image_size(src, page_width)
image = Image(src, width=width, height=height)
elements.append(image)
except ImageError as e:
print(f"Warning: Failed to process image {src}: {e}")
continue
last_was_heading = False
if progress_callback:
progress_callback(75, "Content processed...")
# Generate PDF
doc.build(elements)
if progress_callback:
progress_callback(100, "PDF generated successfully!")
return True
except Exception as e:
raise PDFGenerationError(f"Failed to generate PDF: {str(e)}")