Canadian Building Code MCP Server

surgery.py•5.66 KiB

"""
Surgery - re-extract broken tables with pdfplumber.
"""

import html
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

import pdfplumber


@dataclass
class ExtractedTable:
    """Table data extracted from a PDF."""

    table_id: str
    page_num: int
    headers: List[str]
    rows: List[List[str]]
    # Source data. extract_table_from_pdf stores the extractor's output as-is
    # (before the `cell or ""` normalisation, hence Optional); the multi-page
    # variant stores [headers] + rows.
    raw_data: List[List[Optional[str]]]


def _clean_row(row: List[Optional[str]]) -> List[str]:
    """Return a copy of *row* with falsy cells (None, "") replaced by ""."""
    return [cell or "" for cell in row]


def _fit_row(row: List[str], width: int) -> List[str]:
    """Pad *row* with "" or truncate it so it has exactly *width* cells."""
    return (row + [""] * (width - len(row)))[:width]


def _markdown_cell(text: str) -> str:
    """Make *text* safe to place inside a single Markdown table cell.

    A raw ``|`` would start a new column and a raw line break would end the
    row, so pipes are backslash-escaped and line breaks become ``<br>``.
    """
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    return text.replace("|", "\\|").replace("\n", "<br>")


def extract_table_from_pdf(
    pdf_path: str,
    page_num: int,
    table_id: str = ""
) -> Optional[ExtractedTable]:
    """
    Extract a table from a specific page.

    Args:
        pdf_path: Path to the PDF file
        page_num: Page number (1-indexed)
        table_id: Table ID (stored on the result; useful for logging)

    Returns:
        ExtractedTable, or None when the page number is out of range, the
        page has no table, the table has fewer than two rows, or any
        exception is raised while reading the PDF (a message is printed in
        each case).
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            if page_num < 1 or page_num > len(pdf.pages):
                print(f"Invalid page number: {page_num}")
                return None

            page = pdf.pages[page_num - 1]  # 0-indexed
            tables = page.extract_tables()

            if not tables:
                print(f"No tables found on page {page_num}")
                return None

            # Use the first table (most pages contain a single table).
            raw_data = tables[0]

            # A usable table needs a header row plus at least one data row.
            if not raw_data or len(raw_data) < 2:
                print(f"Table too small on page {page_num}")
                return None

            # Split header and data rows.
            headers = _clean_row(raw_data[0])
            rows = [_clean_row(row) for row in raw_data[1:]]

            return ExtractedTable(
                table_id=table_id,
                page_num=page_num,
                headers=headers,
                rows=rows,
                raw_data=raw_data
            )

    except Exception as e:
        # Best-effort extraction: report the problem and signal failure.
        print(f"Error extracting table from page {page_num}: {e}")
        return None


def extract_multipage_table(
    pdf_path: str,
    start_page: int,
    end_page: int,
    table_id: str = ""
) -> Optional[ExtractedTable]:
    """
    Extract and merge a table that spans several pages.

    Multi-page handling:
    - First page with a table: header + data
    - Remaining pages: data only (a repeated header row is dropped)

    Pages outside the document, pages without tables, and pages whose first
    table is empty are skipped rather than aborting the whole merge.

    Args:
        pdf_path: Path to the PDF file
        start_page: First page of the range (1-indexed, inclusive)
        end_page: Last page of the range (1-indexed, inclusive)
        table_id: Table ID (stored on the result)

    Returns:
        ExtractedTable whose page_num is start_page, or None when no table
        was found in the range or an exception is raised while reading the
        PDF.
    """
    all_rows: List[List[str]] = []
    headers: Optional[List[str]] = None

    try:
        with pdfplumber.open(pdf_path) as pdf:
            page_count = len(pdf.pages)

            for page_num in range(start_page, end_page + 1):
                if page_num < 1 or page_num > page_count:
                    continue

                tables = pdf.pages[page_num - 1].extract_tables()
                if not tables:
                    continue

                raw_data = tables[0]
                if not raw_data:
                    # An empty table has no row 0; indexing it would raise
                    # and throw away every page collected so far.
                    continue

                first_row = _clean_row(raw_data[0])

                if headers is None:
                    # First page with a table: remember its header row.
                    headers = first_row
                    body = raw_data[1:]
                elif first_row == headers:
                    # Later page repeating the header: skip that row.
                    body = raw_data[1:]
                else:
                    # Later page without a header: every row is data.
                    body = raw_data

                all_rows.extend(_clean_row(row) for row in body)

        if headers is None:
            return None

        return ExtractedTable(
            table_id=table_id,
            page_num=start_page,
            headers=headers,
            rows=all_rows,
            raw_data=[headers] + all_rows
        )

    except Exception as e:
        # Best-effort extraction: report the problem and signal failure.
        print(f"Error extracting multipage table: {e}")
        return None


def table_to_markdown(table: ExtractedTable) -> str:
    """
    Convert an ExtractedTable to a Markdown table string.

    Rows are padded with empty cells or truncated to the number of headers.
    Pipes inside cells are escaped and line breaks inside cells are replaced
    with ``<br>`` so that every table row stays on one line.
    """
    width = len(table.headers)

    # Header
    lines = ["| " + " | ".join(_markdown_cell(h) for h in table.headers) + " |"]

    # Separator
    lines.append("|" + "|".join(["---"] * width) + "|")

    # Data rows, each fitted to the header column count
    for row in table.rows:
        cells = [_markdown_cell(cell) for cell in _fit_row(row, width)]
        lines.append("| " + " | ".join(cells) + " |")

    return "\n".join(lines)


def table_to_html(table: ExtractedTable) -> str:
    """
    Convert an ExtractedTable to an HTML table.

    Header and cell text is HTML-escaped, and rows are padded with empty
    cells or truncated to the number of headers so every row has the same
    number of columns.
    """
    width = len(table.headers)
    lines = ['<table class="obc-table">']

    # Header
    lines.append(' <thead>')
    lines.append(' <tr>')
    for header in table.headers:
        lines.append(f' <th>{html.escape(header)}</th>')
    lines.append(' </tr>')
    lines.append(' </thead>')

    # Body
    lines.append(' <tbody>')
    for row in table.rows:
        lines.append(' <tr>')
        for cell in _fit_row(row, width):
            lines.append(f' <td>{html.escape(cell)}</td>')
        lines.append(' </tr>')
    lines.append(' </tbody>')

    lines.append('</table>')

    return "\n".join(lines)


def _main(argv: List[str]) -> int:
    """Command-line entry point; returns the process exit status."""
    usage = "Usage: python surgery.py <pdf_path> <page_num> [table_id]"

    if len(argv) < 3:
        print(usage)
        return 1

    pdf_path = argv[1]
    try:
        page_num = int(argv[2])
    except ValueError:
        # A non-numeric page argument is a usage error, not a crash.
        print(f"Invalid page number: {argv[2]}")
        print(usage)
        return 1
    table_id = argv[3] if len(argv) > 3 else ""

    table = extract_table_from_pdf(pdf_path, page_num, table_id)

    if table:
        print(f"Extracted table from page {page_num}:")
        print(f" Headers: {table.headers}")
        print(f" Rows: {len(table.rows)}")
        print()
        print("Markdown output:")
        print(table_to_markdown(table))
    else:
        print("Failed to extract table")

    return 0


if __name__ == "__main__":
    import sys

    sys.exit(_main(sys.argv))

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/DavidCho1999/Canada-AEC-Code-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

surgery.py•5.66 KiB