#!/usr/bin/env python3
"""
Extract base64-encoded PDFs from Playwright MCP tool output.

Usage:
    python extract_pdfs.py <input_file> <output_dir> <prefix>

Example:
    python extract_pdfs.py \
        ~/.claude/projects/.../tool-results/mcp-playwright-xxx.txt \
        facturen/github-2025 \
        github-2025

The input file should contain the JSON output of a browser_run_code call that
fetched PDFs as base64. The expected payload is an array of {month, data}
objects, where data is the base64-encoded PDF content. Each PDF is written to
<output_dir>/<prefix>-<month>.pdf.
"""
import json
import base64
import sys
import os
def _extract_pdf_items(text):
    """Return the PDF records embedded in a tool-result text blob.

    The records are expected as a JSON array that was itself serialised as a
    quoted JSON string (a ``"[{`` ... ``}]"`` span somewhere inside *text*).

    Raises:
        ValueError: if no such span exists or its content is not valid JSON.
    """
    start = text.find('"[{')
    close = text.rfind('}]"')
    # Test the raw rfind() result: a missing terminator (-1) must never be
    # mistaken for a valid end offset.
    if start < 0 or close <= start:
        raise ValueError("Could not find JSON array in text")

    # The span including both surrounding double quotes.
    literal = text[start:close + 3]
    try:
        # Decode the span as a JSON string literal so that every escape
        # sequence (\" \\ \n \uXXXX ...) is resolved correctly.
        inner = json.loads(literal)
    except json.JSONDecodeError:
        # Fallback for output that is not a strict JSON string literal:
        # strip the outer quotes and only unescape the double quotes.
        inner = literal[1:-1].replace('\\"', '"')

    try:
        return json.loads(inner)
    except json.JSONDecodeError as err:
        raise ValueError(f"Embedded PDF array is not valid JSON: {err}") from err


def main():
    """Command-line entry point: write each embedded PDF to its own file.

    Reads ``<input_file> <output_dir> <prefix>`` from ``sys.argv``. The input
    file must hold a JSON list whose first element has a ``text`` field; that
    text contains the quoted JSON array of ``{month, data}`` records. Every
    record is base64-decoded and saved as ``<output_dir>/<prefix>-<month>.pdf``.

    Exits with status 1 (message on stderr) when arguments are missing, the
    wrapper has an unexpected shape, or no PDF array can be extracted.
    """
    if len(sys.argv) < 4:
        print("Usage: python extract_pdfs.py <input_file> <output_dir> <prefix>",
              file=sys.stderr)
        print("Example: python extract_pdfs.py input.txt facturen/github-2025 github-2025",
              file=sys.stderr)
        sys.exit(1)

    input_file, output_dir, prefix = sys.argv[1:4]

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(input_file, 'r', encoding='utf-8') as f:
        # Parse outer JSON wrapper (Claude tool output format)
        outer = json.load(f)

    try:
        text = outer[0]['text']
    except (IndexError, KeyError, TypeError):
        print("Unexpected input format: expected a JSON list whose first item "
              "has a 'text' field", file=sys.stderr)
        sys.exit(1)

    try:
        pdfs = _extract_pdf_items(text)
    except ValueError as err:
        print(err, file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(pdfs)} PDFs")

    # Create the output directory only once the input is known to be usable,
    # so a failed run does not leave an empty directory behind.
    os.makedirs(output_dir, exist_ok=True)

    for item in pdfs:
        month = item['month']
        # NOTE(review): `month` is used verbatim in the file name; confirm the
        # producer only ever emits plain values such as "01".
        filename = os.path.join(output_dir, f"{prefix}-{month}.pdf")
        pdf_bytes = base64.b64decode(item['data'])
        with open(filename, 'wb') as f:
            f.write(pdf_bytes)
        print(f"Saved: {filename} ({len(pdf_bytes)} bytes)")

    print(f"\nDone! {len(pdfs)} PDFs saved to {output_dir}")
# Run the extractor only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()