#!/usr/bin/env python3
"""
Utility functions for converting between Markdown and PDF formats.
"""
import os
import tempfile
import subprocess
from pathlib import Path
from typing import Tuple, Optional
import pdfplumber
import markdown
def resolve_path(file_path: str) -> Path:
    """
    Turn a possibly-relative file path into an absolute ``Path``.

    Args:
        file_path: The path to resolve; may be relative or absolute.

    Returns:
        The path unchanged (as a ``Path``) when it is already absolute,
        otherwise the path joined onto the current working directory.
    """
    candidate = Path(file_path)
    # Relative paths are anchored at the process's current working directory.
    return candidate if candidate.is_absolute() else Path.cwd() / candidate
def markdown_to_pdf(
    markdown_content: str,
    output_path: str,
    size: str = "l",
    pdf_engine: str = "weasyprint"
) -> Tuple[bool, str]:
    """
    Convert Markdown content to PDF using pandoc with mermaid support.

    The markdown and a size-specific stylesheet are written to temporary
    files, pandoc is invoked on them with the ``mermaid-filter`` filter, and
    the temporary files are removed again on every exit path.

    Args:
        markdown_content: The markdown content to convert
        output_path: Path where the PDF should be saved
        size: Size option - 's' (small), 'm' (medium), 'l' (large). Any
            other value gets a 1in margin and a TOC depth of 3.
        pdf_engine: PDF engine to use (weasyprint or pdflatex)

    Returns:
        Tuple of (success: bool, message: str). A non-zero pandoc exit code
        yields ``(False, "Pandoc error: ...")`` with pandoc's stderr; any
        ``Exception`` raised along the way yields
        ``(False, "Error during conversion: ...")``.
    """
    # Each temp file is registered as soon as it exists so the ``finally``
    # clause can remove it even when a later step raises (for example when
    # the pandoc executable cannot be started).
    temp_paths = []
    try:
        # Create temporary markdown file. The encoding is pinned to UTF-8
        # rather than the locale default because pandoc reads its input as
        # UTF-8.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.md', delete=False, encoding='utf-8'
        ) as temp_md:
            temp_md_path = temp_md.name
            temp_paths.append(temp_md_path)
            temp_md.write(markdown_content)

        # Create temporary CSS file
        css_content = _get_css_for_size(size)
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.css', delete=False, encoding='utf-8'
        ) as temp_css:
            temp_css_path = temp_css.name
            temp_paths.append(temp_css_path)
            temp_css.write(css_content)

        # Set margin based on size
        margins = {"s": "0.35in", "m": "0.5in", "l": "1in"}
        margin = margins.get(size, "1in")

        # Set TOC depth based on size
        # NOTE(review): an unrecognised size falls back to the *large* margin
        # above but to the *medium* TOC depth here - confirm this is intended.
        toc_depths = {"s": 2, "m": 3, "l": 4}
        toc_depth = toc_depths.get(size, 3)

        # Build pandoc command (argument list, no shell involved)
        pandoc_cmd = [
            "pandoc", temp_md_path,
            "--filter", "mermaid-filter",
            "--pdf-engine", pdf_engine,
            f"--variable=geometry:margin={margin}",
            f"--css={temp_css_path}",
            "--toc",
            f"--toc-depth={toc_depth}",
            "-o", output_path
        ]

        # Execute pandoc command
        result = subprocess.run(pandoc_cmd, capture_output=True, text=True)

        if result.returncode == 0:
            return True, f"Successfully converted to PDF: {output_path}"
        return False, f"Pandoc error: {result.stderr}"
    except Exception as e:
        return False, f"Error during conversion: {str(e)}"
    finally:
        # Best-effort cleanup: a temp file that cannot be removed must not
        # change the reported conversion result.
        for temp_path in temp_paths:
            try:
                os.unlink(temp_path)
            except OSError:
                pass
def pdf_to_markdown(pdf_path: str, output_path: str) -> Tuple[bool, str]:
    """
    Convert PDF content to Markdown format.

    Text is extracted page by page, passed through
    ``_process_text_to_markdown``, and the resulting pages are written to
    ``output_path`` separated by a horizontal-rule marker. Pages that yield
    no text are skipped.

    Args:
        pdf_path: Path to the PDF file
        output_path: Path where the Markdown should be saved

    Returns:
        Tuple of (success: bool, message: str). Any ``Exception`` raised
        while reading, processing or writing is reported as
        ``(False, "Error during PDF to Markdown conversion: ...")``.
    """
    page_separator = "\n\n---\n\n"
    try:
        page_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract text from the page
                text = page.extract_text()
                if text:
                    # Basic text processing to improve markdown formatting
                    page_texts.append(_process_text_to_markdown(text))

        # Joining puts the marker only *between* pages that produced text,
        # so a text-less first page no longer causes the output to start
        # with a stray separator.
        full_markdown = page_separator.join(page_texts)

        # Write to output file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(full_markdown)

        return True, f"Successfully converted to Markdown: {output_path}"
    except Exception as e:
        return False, f"Error during PDF to Markdown conversion: {str(e)}"
def _process_text_to_markdown(text: str) -> str:
"""
Process extracted PDF text to improve markdown formatting.
Args:
text: Raw text extracted from PDF
Returns:
Processed text with basic markdown formatting
"""
# Remove null bytes and other binary characters
text = text.replace('\x00', '').replace('\x01', '').replace('\x02', '').replace('\x03', '')
# Clean up other problematic characters
text = ''.join(char for char in text if ord(char) >= 32 or char in ['\n', '\t', '\r'])
lines = text.split('\n')
processed_lines = []
for line in lines:
line = line.strip()
if not line:
processed_lines.append("")
continue
# Try to detect headings (lines that are all caps or have specific patterns)
if len(line) < 100 and (line.isupper() or line.endswith(':')):
# Make it a heading
processed_lines.append(f"## {line}")
else:
processed_lines.append(line)
return '\n'.join(processed_lines)
def _get_css_for_size(size: str) -> str:
"""
Get CSS content based on the size parameter.
Args:
size: Size option - 's' (small), 'm' (medium), 'l' (large)
Returns:
CSS content as string
"""
if size == "s":
return """
/* Main document styles - Small/Compact Size */
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
font-size: 8pt;
line-height: 1.2;
color: #333;
max-width: 100%;
margin: 0;
padding: 0;
}
@page {
size: letter;
margin: 0.35in;
}
/* Heading styles - compact */
h1, h2, h3, h4, h5, h6 {
color: #0066cc;
margin-top: 8px;
margin-bottom: 4px;
line-height: 1.1;
page-break-after: avoid;
}
h1 { font-size: 14pt; margin-top: 10px; }
h2 { font-size: 12pt; margin-top: 8px; }
h3 { font-size: 10pt; }
h4, h5, h6 { font-size: 9pt; }
/* Code blocks and inline code */
code {
font-family: 'Courier New', monospace;
font-size: 7pt;
padding: 0.1em 0.2em;
background-color: rgba(27, 31, 35, 0.05);
border-radius: 2px;
}
pre {
background-color: #f6f8fa;
border-radius: 2px;
padding: 5px;
margin: 5px 0;
overflow: auto;
font-size: 7pt;
line-height: 1.1;
white-space: pre-wrap;
}
pre code {
background-color: transparent;
padding: 0;
}
/* Tables - compact */
table {
border-collapse: collapse;
width: 100%;
margin: 5px 0;
font-size: 7pt;
line-height: 1.1;
}
table, th, td {
border: 0.5px solid #ddd;
}
th, td {
padding: 2px 4px;
text-align: left;
}
th {
background-color: #f5f5f5;
font-size: 7pt;
font-weight: bold;
}
/* Lists - compact */
ul, ol {
margin-top: 3px;
margin-bottom: 3px;
padding-left: 15px;
}
li {
margin-bottom: 1px;
line-height: 1.1;
}
blockquote {
border-left: 2px solid #ddd;
padding-left: 8px;
margin: 5px 0;
color: #666;
font-size: 7pt;
}
/* Mermaid diagrams - compact */
.mermaid {
text-align: center;
font-size: 7pt;
width: 100%;
max-width: 100%;
transform: scale(0.7);
transform-origin: top center;
margin: 0 auto -30px auto;
max-height: 250px;
}
/* Images - compact */
img {
max-width: 70%;
max-height: 250px;
height: auto;
display: block;
margin: 4px auto;
}
p {
margin-top: 3px;
margin-bottom: 3px;
}
/* TOC adjustments */
#toc {
font-size: 7pt;
line-height: 1.1;
margin-bottom: 10px;
}
#toc ul {
padding-left: 12px;
margin: 0;
}
#toc li {
margin-bottom: 0;
}
/* Two-column layout */
.twoColumn {
column-count: 2;
column-gap: 15px;
column-rule: 1px solid #ddd;
}
/* Code highlighting syntax */
.sourceCode {
font-size: 7pt;
}
/* Page break controls */
figure, table {
page-break-inside: avoid;
}
hr {
height: 1px;
background-color: #ddd;
border: none;
margin: 5px 0;
}
"""
elif size == "m":
return """
/* Main document styles - Medium Size */
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
font-size: 9pt;
line-height: 1.3;
color: #333;
max-width: 100%;
margin: 0;
padding: 0;
}
@page {
size: letter;
margin: 0.5in;
}
/* Heading styles */
h1, h2, h3, h4, h5, h6 {
color: #0066cc;
margin-top: 12px;
margin-bottom: 8px;
line-height: 1.2;
page-break-after: avoid;
}
h1 { font-size: 16pt; }
h2 { font-size: 14pt; }
h3 { font-size: 12pt; }
h4, h5, h6 { font-size: 10pt; }
/* Code blocks and inline code */
code {
font-family: Monaco, Consolas, "Courier New", monospace;
font-size: 8pt;
padding: 0.1em 0.2em;
background-color: rgba(27, 31, 35, 0.05);
border-radius: 2px;
}
pre {
background-color: #f6f8fa;
border-radius: 2px;
padding: 8px;
margin: 8px 0;
overflow: auto;
font-size: 8pt;
line-height: 1.3;
}
/* Black & White overrides */
body { color: #111; }
h1, h2, h3, h4, h5, h6 { color: #111; }
table, th, td { border-color: #444; }
code, pre { background-color: #f8f8f8; color: #111; }
img { filter: grayscale(100%) contrast(120%); }
"""
else: # size == "l" (large)
return """
/* Main document styles - Large Size */
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif;
font-size: 11pt;
line-height: 1.4;
color: #333;
max-width: 100%;
margin: 0;
padding: 0;
}
@page {
size: letter;
margin: 1in;
}
/* Heading styles */
h1, h2, h3, h4, h5, h6 {
color: #0066cc;
margin-top: 20px;
margin-bottom: 12px;
line-height: 1.3;
page-break-after: avoid;
}
h1 { font-size: 20pt; }
h2 { font-size: 16pt; }
h3 { font-size: 14pt; }
h4 { font-size: 12pt; }
h5, h6 { font-size: 11pt; }
/* Code blocks and inline code */
code {
font-family: Monaco, Consolas, "Courier New", monospace;
font-size: 10pt;
padding: 0.2em 0.4em;
background-color: rgba(27, 31, 35, 0.05);
border-radius: 3px;
}
pre {
background-color: #f6f8fa;
border-radius: 3px;
padding: 16px;
margin: 16px 0;
overflow: auto;
font-size: 10pt;
line-height: 1.4;
}
/* Tables */
table {
border-collapse: collapse;
width: 100%;
margin: 15px 0;
font-size: 10pt;
}
table, th, td {
border: 1px solid #ddd;
}
th, td {
padding: 8px 12px;
text-align: left;
}
th {
background-color: #f6f8fa;
font-weight: bold;
}
/* Black & White overrides */
body { color: #111; }
h1, h2, h3, h4, h5, h6 { color: #111; }
table, th, td { border-color: #444; }
code, pre { background-color: #f8f8f8; color: #111; }
img { filter: grayscale(100%) contrast(120%); }
"""