#!/usr/bin/env python3
"""
Utility script to manually convert PDFs to markdown
Usage: python convert_pdfs.py [pdf_folder] [markdown_folder]
"""
import sys
import json
from pathlib import Path
from datetime import datetime
import PyPDF2
import hashlib
def get_file_hash(file_path: Path) -> str:
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is consumed in 4096-byte blocks, so it is never held in
    memory as a whole. The digest serves as a fingerprint for noticing
    that a file's contents have changed.

    Args:
        file_path: Location of the file to fingerprint.

    Returns:
        The MD5 digest as a lowercase hexadecimal string.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read signals end of file.
                break
            digest.update(block)
    return digest.hexdigest()
def pdf_to_markdown(pdf_path: Path) -> str:
    """Render the extracted text of a PDF as a markdown document.

    The output starts with a title taken from the file stem and a short
    metadata header (source name, page count, processing time). Each page
    with non-empty text then gets its own ``## Page N`` section. Progress
    and errors are printed to stdout while the pages are processed.

    Failures never propagate to the caller: a page that cannot be
    extracted is replaced by an inline error note, and a failure to open
    or read the PDF as a whole turns the entire result into an error
    document.

    Args:
        pdf_path: Location of the PDF file to convert.

    Returns:
        The markdown text, or an error document if the PDF could not be
        processed at all.
    """
    output = []

    try:
        with open(pdf_path, 'rb') as handle:
            reader = PyPDF2.PdfReader(handle)
            page_count = len(reader.pages)

            # Document title followed by the metadata header.
            output.append(f"# {pdf_path.stem}")
            output.append(f"\n*Source: {pdf_path.name}*")
            output.append(f"*Total Pages: {page_count}*")
            stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            output.append(f"*Processed: {stamp}*")
            output.append("\n---\n")

            for index in range(page_count):
                number = index + 1
                try:
                    # The page is fetched inside the try so that a page
                    # which fails to load is reported individually.
                    body = reader.pages[index].extract_text().strip()

                    if body:
                        output.append(f"\n## Page {number}\n")
                        for piece in body.split('\n'):
                            piece = piece.strip()
                            if not piece:
                                continue
                            output.append(piece)
                            # A blank line after a sentence end or a
                            # label starts a new markdown paragraph.
                            if piece.endswith(('.', ':')):
                                output.append("")

                    print(f" Processed page {number}/{page_count}", end='\r')
                except Exception as exc:
                    print(f"\n Error on page {number}: {exc}")
                    output.append(f"\n## Page {number}\n")
                    output.append(f"*[Error extracting page: {exc}]*\n")

            # Move past the carriage-return progress line.
            print()
    except Exception as exc:
        print(f"Error processing PDF: {exc}")
        # Anything collected so far is discarded in favor of the error text.
        output = [f"# Error Processing {pdf_path.name}\n\n{exc!s}"]

    return '\n'.join(output)
def _load_cache(cache_file: Path) -> dict:
    """Read the conversion cache from *cache_file*, or return {} if it is missing or unusable."""
    if not cache_file.exists():
        return {}
    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            cache = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        print(f"Warning: Could not load cache: {e}")
        return {}
    if not isinstance(cache, dict):
        # Valid JSON of the wrong shape (e.g. a list) would break lookups later.
        print("Warning: Could not load cache: expected a JSON object")
        return {}
    return cache


def _save_cache(cache_file: Path, cache: dict) -> None:
    """Write *cache* to *cache_file* as indented JSON."""
    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(cache, f, indent=2)


def main():
    """Convert every ``*.pdf`` in a folder to a markdown file.

    Command-line arguments (both optional):
        1. PDF folder; defaults to ``./quantconnect-docs``.
        2. Markdown output folder; defaults to ``<pdf_folder>/markdown``.

    A JSON cache (``.pdf_cache.json`` in the output folder) records the
    content hash of each converted PDF. A PDF is skipped when its hash
    matches the cache and its markdown file still exists. The cache is
    written back even if the run is interrupted, so finished conversions
    are not repeated on the next run.

    Problems are reported on stdout; the function returns ``None`` in
    all cases.
    """
    # Get folders from arguments or use defaults
    pdf_folder = Path(sys.argv[1] if len(sys.argv) > 1 else "./quantconnect-docs")
    markdown_folder = Path(sys.argv[2] if len(sys.argv) > 2 else pdf_folder / "markdown")

    print(f"PDF folder: {pdf_folder}")
    print(f"Markdown folder: {markdown_folder}")

    if not pdf_folder.exists():
        print(f"Error: PDF folder {pdf_folder} does not exist")
        return
    if not pdf_folder.is_dir():
        print(f"Error: PDF folder {pdf_folder} is not a directory")
        return

    # parents=True lets a nested output path be given on the command line.
    markdown_folder.mkdir(parents=True, exist_ok=True)

    cache_file = markdown_folder / ".pdf_cache.json"
    cache = _load_cache(cache_file)

    # Sorted so that the processing order is the same on every run.
    pdf_files = sorted(pdf_folder.glob("*.pdf"))
    print(f"\nFound {len(pdf_files)} PDF files")

    try:
        for i, pdf_file in enumerate(pdf_files, 1):
            print(f"\n[{i}/{len(pdf_files)}] Processing {pdf_file.name}...")

            # Hashing sits inside the try so that one unreadable file does
            # not abort the rest of the batch.
            try:
                file_hash = get_file_hash(pdf_file)

                # Check if already processed
                cached_info = cache.get(pdf_file.name)
                if not isinstance(cached_info, dict):
                    cached_info = {}
                if cached_info.get('hash') == file_hash:
                    markdown_path = markdown_folder / cached_info.get(
                        'markdown_file', f"{pdf_file.stem}.md"
                    )
                    if markdown_path.exists():
                        print(" Already processed (skipping)")
                        continue

                # NOTE: pdf_to_markdown reports a failed conversion inside
                # the returned text instead of raising, so such output is
                # saved and cached like any other result.
                markdown_content = pdf_to_markdown(pdf_file)

                # Save markdown
                markdown_filename = f"{pdf_file.stem}.md"
                markdown_path = markdown_folder / markdown_filename
                with open(markdown_path, 'w', encoding='utf-8') as f:
                    f.write(markdown_content)

                # The entry is recorded only after the markdown file has
                # been written in full.
                cache[pdf_file.name] = {
                    'hash': file_hash,
                    'markdown_file': markdown_filename,
                    'processed_date': datetime.now().isoformat(),
                    'size': pdf_file.stat().st_size,
                }
                print(f" Saved to {markdown_path}")
            except Exception as e:
                print(f" Error: {e}")
    finally:
        # Persist the cache even on Ctrl-C or an unexpected crash.
        _save_cache(cache_file, cache)

    print(f"\nProcessing complete. Markdown files saved to {markdown_folder}")
# Run the conversion only when this file is executed as a script, so that
# importing the module does not trigger any processing.
if __name__ == "__main__":
    main()