Canadian Building Code MCP Server

batch_process.py•10.1 KiB

""" Batch Process - 여러 PDF를 일괄 처리 """ import subprocess import json from pathlib import Path from dataclasses import dataclass from typing import List, Dict import time @dataclass class BatchConfig: """배치 처리 설정""" sources_dir: str = "sources" marker_dir: str = "marker" output_dir: str = "output" run_marker: bool = True run_pipeline: bool = True dry_run: bool = False @dataclass class BatchResult: """배치 처리 결과""" pdf_name: str marker_success: bool = False pipeline_success: bool = False tables_total: int = 0 tables_fixed: int = 0 error_message: str = "" skipped: bool = False def normalize_folder_name(name: str) -> str: """폴더명 정규화: 공백→언더스코어, 소문자""" return name.lower().replace(" ", "_") def is_already_processed(marker_dir: str, pdf_name: str) -> bool: """marker/ 폴더에 이미 출력물 있으면 True""" pdf_stem = normalize_folder_name(Path(pdf_name).stem) marker_path = Path(marker_dir) / pdf_stem if marker_path.exists(): md_files = list(marker_path.glob("*.md")) meta_files = list(marker_path.glob("*_meta.json")) return bool(md_files and meta_files) return False def get_processing_order(pdf_files: List[Path]) -> List[Path]: """ 처리 순서 지정: 1. National (NBC, NFC, NPC, NECB) 2. BC (BCBC) 3. Alberta (ABC/NBCAE) 4. 나머지 Provincial (OBC, QCC, QECB, QPC, QSC) 5. User Guides (UGP4, UGNECB, IUGP9) """ # (패턴, 우선순위) - 순서대로 매칭, 먼저 매칭되면 사용 order_rules = [ # National (1-4) - 정확한 패턴 ('nbc2025', 1), ('nfc2025', 2), ('npc2025', 3), ('necb2025', 4), # BC (5) ('bcbc', 5), # Alberta (6) ('nbcae', 6), # Provincial (7-11) ('obc', 7), ('qcc', 8), ('qecb', 9), ('qpc', 10), ('qsc', 11), # User Guides (12-14) ('ugp4', 12), ('ugnecb', 13), ('iugp9', 14), ('ugp9', 14), ] def get_priority(pdf_path: Path) -> int: name_lower = pdf_path.stem.lower() for pattern, priority in order_rules: if pattern in name_lower: return priority return 99 # 알 수 없는 파일은 마지막 return sorted(pdf_files, key=get_priority) def run_marker(pdf_path: str, output_dir: str) -> bool: """ Marker PDF 파서 실행 Returns: 성공 여부 """ try: # marker_single 경로 (Windows Python Scripts) marker_exe = r"C:\Users\A\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\Scripts\marker_single.exe" cmd = [ marker_exe, pdf_path, "--output_dir", output_dir, "--output_format", "markdown" ] result = subprocess.run( cmd, capture_output=True, text=True, timeout=10800 # 3시간 타임아웃 ) return result.returncode == 0 except subprocess.TimeoutExpired: print(f"Marker timeout for {pdf_path}") return False except FileNotFoundError: print("Marker not installed. Run: pip install marker-pdf") return False except Exception as e: print(f"Marker error: {e}") return False def find_marker_outputs(marker_dir: str, pdf_name: str) -> Dict[str, str]: """ Marker 출력 파일 찾기 Returns: {"md": "path/to/file.md", "meta": "path/to/meta.json"} """ marker_path = Path(marker_dir) pdf_stem = normalize_folder_name(Path(pdf_name).stem) # Marker는 보통 pdf_name/ 하위에 출력 possible_dirs = [ marker_path / pdf_stem, marker_path, ] result = {"md": None, "meta": None} for dir_path in possible_dirs: if not dir_path.exists(): continue # MD 파일 찾기 md_files = list(dir_path.glob("*.md")) if md_files: result["md"] = str(md_files[0]) # meta.json 찾기 meta_files = list(dir_path.glob("*_meta.json")) + list(dir_path.glob("meta.json")) if meta_files: result["meta"] = str(meta_files[0]) if result["md"] and result["meta"]: break return result def process_single_pdf( pdf_path: str, config: BatchConfig ) -> BatchResult: """단일 PDF 처리""" pdf_name = Path(pdf_path).name result = BatchResult(pdf_name=pdf_name) print(f"\n{'='*60}") print(f"Processing: {pdf_name}") print(f"{'='*60}") # Step 1: Marker 실행 if config.run_marker: print("\n[Marker] Converting PDF to Markdown...") marker_output = Path(config.marker_dir) / normalize_folder_name(Path(pdf_path).stem) if config.dry_run: print(" (dry-run) Would run marker") result.marker_success = True else: result.marker_success = run_marker(pdf_path, str(marker_output)) if not result.marker_success: result.error_message = "Marker failed" return result else: result.marker_success = True # Step 2: Marker 출력 찾기 outputs = find_marker_outputs(config.marker_dir, pdf_name) if not outputs["md"]: result.error_message = "MD file not found" return result # Step 3: Hybrid Pipeline 실행 if config.run_pipeline: print("\n[Pipeline] Running hybrid pipeline...") from smart_hybrid_pipeline import run_pipeline, PipelineConfig, print_summary pipeline_config = PipelineConfig( pdf_path=pdf_path, md_path=outputs["md"], meta_json_path=outputs["meta"], output_format="markdown", dry_run=config.dry_run, verbose=True ) pipeline_result = run_pipeline(pipeline_config) print_summary(pipeline_result) result.pipeline_success = True result.tables_total = pipeline_result.total_tables result.tables_fixed = pipeline_result.fixed_tables return result def batch_process(config: BatchConfig) -> List[BatchResult]: """ sources/ 폴더의 모든 PDF 일괄 처리 """ # 절대 경로로 변환 (marker_single이 상대 경로 처리 못함) sources_path = Path(config.sources_dir).resolve() marker_dir_abs = str(Path(config.marker_dir).resolve()) config.marker_dir = marker_dir_abs if not sources_path.exists(): print(f"Sources directory not found: {config.sources_dir}") return [] pdf_files = list(sources_path.glob("*.pdf")) if not pdf_files: print(f"No PDF files found in {config.sources_dir}") return [] # 처리 순서 정렬 pdf_files = get_processing_order(pdf_files) print(f"Found {len(pdf_files)} PDF files") print("\nProcessing order:") for i, pdf in enumerate(pdf_files, 1): print(f" {i:2d}. {pdf.name}") results = [] start_time = time.time() skipped_count = 0 for i, pdf_path in enumerate(pdf_files, 1): # 이미 처리된 파일 건너뛰기 if config.run_marker and is_already_processed(config.marker_dir, pdf_path.name): print(f"\n[{i}/{len(pdf_files)}] [SKIP] Already processed: {pdf_path.name}") result = BatchResult(pdf_name=pdf_path.name, skipped=True, marker_success=True) results.append(result) skipped_count += 1 continue print(f"\n[{i}/{len(pdf_files)}]") result = process_single_pdf(str(pdf_path), config) results.append(result) elapsed = time.time() - start_time # 최종 요약 print("\n" + "="*60) print("BATCH SUMMARY") print("="*60) print(f"Total PDFs: {len(results)}") print(f"Skipped: {skipped_count}") print(f"Marker OK: {sum(1 for r in results if r.marker_success and not r.skipped)}") print(f"Pipeline OK: {sum(1 for r in results if r.pipeline_success)}") print(f"Tables fixed: {sum(r.tables_fixed for r in results)}") print(f"Time elapsed: {elapsed:.1f}s") print("="*60) # 실패 목록 (skipped 제외) failed = [r for r in results if not r.skipped and (not r.marker_success or not r.pipeline_success)] if failed: print("\nFailed:") for r in failed: print(f" - {r.pdf_name}: {r.error_message}") return results def save_batch_report(results: List[BatchResult], output_path: str): """배치 결과를 JSON으로 저장""" report = [] for r in results: report.append({ "pdf": r.pdf_name, "marker": r.marker_success, "pipeline": r.pipeline_success, "tables_total": r.tables_total, "tables_fixed": r.tables_fixed, "error": r.error_message }) with open(output_path, 'w', encoding='utf-8') as f: json.dump(report, f, indent=2, ensure_ascii=False) print(f"\nReport saved: {output_path}") if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Batch PDF Processing") parser.add_argument("mode", nargs="?", default="all", choices=["all", "marker", "pipeline"], help="Processing mode") parser.add_argument("--sources", default="sources", help="PDF sources directory") parser.add_argument("--marker-dir", default="marker", help="Marker output directory") parser.add_argument("--dry-run", action="store_true", help="Don't modify files") parser.add_argument("--report", default="batch_report.json", help="Output report path") args = parser.parse_args() config = BatchConfig( sources_dir=args.sources, marker_dir=args.marker_dir, run_marker=(args.mode in ["all", "marker"]), run_pipeline=(args.mode in ["all", "pipeline"]), dry_run=args.dry_run ) results = batch_process(config) if results: save_batch_report(results, args.report)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/DavidCho1999/Canada-AEC-Code-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

batch_process.py•10.1 KiB