Canadian Building Code MCP Server

extractor.py•12.5 KiB

# extractor.py - Canadian Building Code GPT Helper
# Upload this file to GPT Knowledge for faster PDF extraction

import re

# Heading of the *next* section: a newline followed by a dotted number such
# as "9.8.8.3 " or "3.2.2." -- used to decide where the current section ends.
_NEXT_SECTION_RE = re.compile(r'\n\d+\.\d+\.\d+\.?\d*[\s\.]')

_TABLE_PREFIX = "Table-"


class _PdfReader:
    """Read page text from a PDF via PyMuPDF, falling back to pdfplumber.

    Use as a context manager so the underlying document is always closed.
    """

    def __init__(self, pdf_path):
        self._doc = None  # PyMuPDF document, when that backend is in use
        self._pdf = None  # pdfplumber PDF, when the fallback is in use
        try:
            import fitz
            self._doc = fitz.open(pdf_path)
        except Exception:
            # Deliberate best-effort: PyMuPDF may be missing or may fail to
            # open the file. Any error raised by the fallback propagates to
            # the caller, which turns it into an error result.
            import pdfplumber
            self._pdf = pdfplumber.open(pdf_path)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def close(self):
        """Close whichever backend document is open."""
        if self._doc is not None:
            self._doc.close()
            self._doc = None
        if self._pdf is not None:
            self._pdf.close()
            self._pdf = None

    def page_text(self, page_num):
        """Return the text of page ``page_num`` (1-indexed), "" if none.

        Raises:
            IndexError: if ``page_num`` is outside 1..page count. Without
                this check, 0 or a negative number would silently index
                from the end of the document.
        """
        if self._doc is not None:
            page_count = len(self._doc)
        else:
            page_count = len(self._pdf.pages)
        if not 1 <= page_num <= page_count:
            raise IndexError(f"page {page_num} out of range (1-{page_count})")
        if self._doc is not None:
            return self._doc[page_num - 1].get_text("text") or ""
        return self._pdf.pages[page_num - 1].extract_text() or ""


def _slice_section(text, section_id, max_chars):
    """Return the text from ``section_id`` up to the next section heading.

    Returns None when ``section_id`` does not occur in ``text``. When no
    following heading is found, at most ``max_chars`` characters are kept.
    """
    start = text.find(section_id)
    if start < 0:
        return None
    remaining = text[start:]
    # Start searching after the ID itself so it cannot match as its own end.
    next_match = _NEXT_SECTION_RE.search(remaining, len(section_id))
    if next_match:
        return remaining[:next_match.start()].strip()
    return remaining[:max_chars].strip()


def _normalize_table_id(table_id):
    """Return ``table_id`` with the "Table-" prefix added if it is missing."""
    if table_id.startswith(_TABLE_PREFIX):
        return table_id
    return f"{_TABLE_PREFIX}{table_id}"


def _code_maps(json_data):
    """Return the per-code dicts contained in a single- or multi-code map.

    Accepts a single-code map (has a top-level "tables" key), a
    ``{"codes": {name: map}}`` wrapper, or a bare ``{name: map}`` dict.
    Non-dict values (e.g. a version string) are skipped.
    """
    if not isinstance(json_data, dict):
        return []
    if "tables" in json_data:
        return [json_data]
    container = json_data.get("codes", json_data)
    if not isinstance(container, dict):
        return []
    return [value for value in container.values() if isinstance(value, dict)]


def extract_section(pdf_path, page_num, section_id):
    """
    Extract section text from PDF by Section ID.

    Args:
        pdf_path: Path to uploaded PDF file
        page_num: Page number (1-indexed)
        section_id: Section ID like "9.8.8.2", "3.2.2.55"

    Returns:
        Extracted text or error message
    """
    try:
        with _PdfReader(pdf_path) as reader:
            text = reader.page_text(page_num)
    except Exception as e:
        return f"Error reading PDF: {e}"

    if not text:
        return f"Could not extract text from page {page_num}"

    if section_id not in text:
        # Only the "9. 8. 8. 2" spelling can still match here: any variant
        # that merely appends to the ID contains the ID itself, which we
        # already know is absent.
        spaced_id = section_id.replace(".", ". ")
        if spaced_id not in text:
            return f"Section {section_id} not found on page {page_num}.\n\nPage preview:\n{text[:1000]}"
        section_id = spaced_id

    # Extract from section ID to next section (or up to 2500 chars)
    return _slice_section(text, section_id, 2500)


def search_json(json_data, query, code_filter=None):
    """
    Search JSON for sections matching query.

    Each query word scores 10 for appearing in the section title, 5 for
    exactly matching a keyword and 2 for every keyword that contains it.
    Matching is case-insensitive for both titles and keywords.

    Args:
        json_data: Loaded JSON data (dict)
        query: Search keywords
        code_filter: Optional code name filter (e.g., "OBC", "NBC")

    Returns:
        List of matching sections (top 10, highest score first)
    """
    query_words = query.lower().split()

    # Get codes to search
    if "codes" in json_data:
        codes = json_data["codes"]
    else:
        codes = {json_data.get("code", "unknown"): json_data}

    results = []
    for code_name, code_data in codes.items():
        # Apply code filter
        if code_filter and code_filter.upper() not in code_name.upper():
            continue

        for section in code_data.get("sections", []):
            # "or" guards against explicit nulls in the JSON.
            title = (section.get("title") or "").lower()
            # Query words are lowercased, so keywords must be as well.
            keywords = [str(kw).lower() for kw in section.get("keywords") or []]

            score = 0
            for word in query_words:
                if word in title:
                    score += 10
                if word in keywords:
                    score += 5
                score += 2 * sum(1 for kw in keywords if word in kw)

            if score > 0:
                results.append({
                    "id": section.get("id"),
                    "title": section.get("title"),
                    "page": section.get("page"),
                    "code": code_name,
                    "score": score,
                })

    # Sort by score (stable, so ties keep document order)
    results.sort(key=lambda item: item["score"], reverse=True)
    return results[:10]  # Top 10


def extract_sections_batch(pdf_path, sections):
    """
    Extract multiple sections in ONE PDF open (faster!).

    Args:
        pdf_path: Path to uploaded PDF file
        sections: List of {"page": int, "id": str}
                  e.g., [{"page": 452, "id": "4.1.3.2"}, ...]

    Returns:
        Dict of {section_id: extracted_text}, or {"error": message} when
        the PDF cannot be opened by either backend
    """
    try:
        reader = _PdfReader(pdf_path)
    except Exception as e:
        return {"error": f"Could not open PDF: {e}"}

    results = {}
    with reader:
        for spec in sections:
            page_num = spec["page"]
            section_id = spec["id"]
            try:
                extracted = _slice_section(
                    reader.page_text(page_num), section_id, 2000
                )
            except Exception as e:
                # One bad page must not abort the rest of the batch.
                results[section_id] = f"[Error extracting {section_id}: {e}]"
                continue
            if extracted is None:
                results[section_id] = f"[Section {section_id} not found on page {page_num}]"
            else:
                results[section_id] = extracted
    return results


def extract_table(json_data, table_id):
    """
    Extract table markdown from JSON maps (fast, primary method).

    Args:
        json_data: Loaded JSON map data (single code or multi-code dict,
                   with or without a top-level "codes" wrapper)
        table_id: Table ID (e.g., "4.1.5.3" or "Table-4.1.5.3")

    Returns:
        Dict with table info or error message
        {
            "id": "Table-4.1.5.3",
            "title": "Live Loads on Floors...",
            "page": 452,
            "markdown": "| Col1 | Col2 |\\n|---|---|\\n...",
            "table_info": {"rows": 15, "cols": 4},
            "source": "JSON"
        }
    """
    # Normalize ID - add "Table-" prefix if missing
    table_id = _normalize_table_id(table_id)

    for code_data in _code_maps(json_data):
        for table in code_data.get("tables") or []:
            if isinstance(table, dict) and table.get("id") == table_id:
                return {
                    "id": table["id"],
                    "title": table.get("title", ""),
                    "page": table.get("page", 0),
                    "markdown": table.get("markdown", ""),
                    "table_info": table.get("table_info", {}),
                    "source": "JSON",
                }

    return {"error": f"Table {table_id} not found in JSON"}


def extract_table_from_pdf(pdf_path, page, table_id):
    """
    Attempt to extract table from PDF by searching for table ID.

    Note: No bbox available, uses text search + heuristics.
    This is a FALLBACK method - JSON markdown is preferred.

    Args:
        pdf_path: Path to PDF file
        page: Page number (1-indexed)
        table_id: Table ID to search for (e.g., "Table-4.1.5.3")

    Returns:
        Dict with extracted text or error
    """
    # Normalize table ID
    table_id = _normalize_table_id(table_id)

    try:
        with _PdfReader(pdf_path) as reader:
            text = reader.page_text(page)
    except Exception as e:
        return {"error": f"PDF extraction failed: {e}"}

    # Search for table ID: "Table-X", then "Table X", then bare "X"
    search_patterns = (
        table_id,
        table_id.replace("Table-", "Table "),
        table_id.replace("Table-", ""),
    )
    for pattern in search_patterns:
        start_idx = text.find(pattern)
        if start_idx >= 0:
            return {
                "id": table_id,
                "page": page,
                # Extract ~2000 chars after table ID
                "text": text[start_idx:start_idx + 2000],
                "source": "PDF (heuristic)",
                "note": "⚠️ Table extracted from PDF text. Format may not match original.",
            }

    return {"error": f"Table {table_id} not found on page {page}"}


def extract_tables_batch(json_data, table_ids):
    """
    Extract multiple tables from JSON in one call (fast).

    Args:
        json_data: Loaded JSON map
        table_ids: List of table IDs
                   e.g., ["4.1.5.3", "9.10.14.4"] or ["Table-4.1.5.3", ...]

    Returns:
        Dict of {table_id: table_data}, keyed by the normalized
        ("Table-"-prefixed) ID
        e.g., {
            "Table-4.1.5.3": {"id": "...", "markdown": "...", ...},
            "Table-9.10.14.4": {"id": "...", "markdown": "...", ...}
        }
    """
    return {
        _normalize_table_id(tid): extract_table(json_data, tid)
        for tid in table_ids
    }


# Quick usage examples:
#
# 1. Extract single section from PDF:
#    text = extract_section("/path/to/obc.pdf", 127, "9.8.8.2")
#
# 2. Extract multiple sections (FASTER!):
#    texts = extract_sections_batch("/path/to/nbc.pdf", [
#        {"page": 452, "id": "4.1.3.2"},
#        {"page": 538, "id": "4.3.3.1"},
#        {"page": 537, "id": "4.2.7.2"}
#    ])
#
# 3. Search JSON:
#    import json
#    with open("OBC_Vol1.json") as f:
#        data = json.load(f)
#    results = search_json(data, "guard height stairs")
#
# 4. Extract table from JSON (NEW!):
#    table = extract_table(data, "4.1.5.3")
#    print(table["markdown"])  # Display formatted table
#
# 5. Extract multiple tables (BATCH!):
#    tables = extract_tables_batch(data, ["4.1.5.3", "9.10.14.4"])
#    for tid, table in tables.items():
#        print(f"## {table['title']}")
#        print(table['markdown'])
#
# 6. Extract table from PDF (FALLBACK!):
#    table = extract_table_from_pdf("/path/to/nbc.pdf", 452, "Table-4.1.5.3")
#    print(table["text"])

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/DavidCho1999/Canada-AEC-Code-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

extractor.py•12.5 KiB