#!/usr/bin/env python3
"""
Map Generator v2 for Canadian Building Codes
JSON-First approach: Directly parse Docling JSON output
Key improvements over v1:
- No MD parsing needed
- Accurate page numbers and bbox from JSON
- Division prefix for unique section IDs (A-1.1.1.1, B-1.1.1.1)
- TOC filtering via duplicate detection
v2.1 Updates:
- Table indexing: Parses tables from Docling output
- Table markdown: Converts tables to searchable markdown format
"""
import re
import json
import argparse
from pathlib import Path
from collections import Counter
from typing import List, Dict, Optional, Tuple
from datetime import datetime
# Building code stopwords
STOPWORDS = {
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these',
'those', 'it', 'its', 'if', 'then', 'than', 'so', 'such', 'no', 'not',
'only', 'same', 'other', 'into', 'over', 'under', 'after', 'before',
'between', 'each', 'all', 'both', 'any', 'some', 'more', 'most', 'less',
'least', 'own', 'up', 'down', 'out', 'off', 'about', 'also', 'just',
'how', 'when', 'where', 'which', 'who', 'whom', 'what', 'why',
# Building code specific
    'shall', 'must', 'except', 'unless', 'provided', 'required',
'article', 'section', 'sentence', 'clause', 'subsection', 'part',
'division', 'table', 'figure', 'note', 'appendix', 'annex',
'accordance', 'compliance', 'requirement', 'provision', 'applicable',
'apply', 'applies', 'applied', 'described', 'specified', 'permitted',
'see', 'refer', 'reference', 'following', 'listed', 'contained',
'include', 'including', 'includes', 'included', 'conform', 'conforms',
}
def extract_keywords(text: str, max_keywords: int = 15) -> List[str]:
"""Extract meaningful keywords using simple TF approach."""
if not text:
return []
words = re.findall(r'[a-z][a-z0-9-]*[a-z0-9]|[a-z]', text.lower())
words = [w for w in words if w not in STOPWORDS and len(w) > 2]
counter = Counter(words)
return [word for word, _ in counter.most_common(max_keywords)]
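# Illustrative sketch of extract_keywords behaviour (hypothetical input text):
#   extract_keywords('Fire separations shall conform to fire-resistance ratings')
#   -> ['fire', 'separations', 'fire-resistance', 'ratings']
# ('shall', 'conform', and 'to' are dropped as stopwords; hyphenated terms
#  survive because the token regex allows interior hyphens)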
# ============================================
# Table Parsing Functions
# ============================================
# Regex pattern for Table IDs: "Table 4.1.5.3", "Table A-9.10.14.4", "Table D-1.1.2"
TABLE_ID_PATTERN = re.compile(
r'Table\s+([A-D]-)?([\d]+(?:\.[\d]+)*(?:-[A-Z])?)',
re.IGNORECASE
)
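# Expected match groups (illustrative captions):
#   "Table 4.1.5.3"      -> prefix=None, number="4.1.5.3"
#   "Table A-9.10.14.4"  -> prefix="A-", number="9.10.14.4"
#   "Table D-1.1.2"      -> prefix="D-", number="1.1.2"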
# Additional patterns for tables without "Table X.X.X" format
# "Forming Part of Sentence 3.3.1.5.(1)" or "Forming Part of Article 9.33.6.5."
FORMING_PART_PATTERN = re.compile(
r'Forming Part of (Article|Sentences?)\s+([\d.()]+)',
re.IGNORECASE
)
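# Expected match groups (illustrative captions):
#   "Forming Part of Sentence 3.3.1.5.(1)" -> ("Sentence", "3.3.1.5.(1)")
#   "Forming Part of Article 9.33.6.5."    -> ("Article", "9.33.6.5.")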
# "Detail EA-1", "Detail EB-2" (OBC Vol2 detail drawings)
DETAIL_PATTERN = re.compile(
r'Detail\s+([A-Z]{1,2}-\d+)',
re.IGNORECASE
)
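# Expected match group (illustrative): "Detail EA-1" -> "EA-1"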
def table_to_markdown(table: Dict, max_chars: int = 2000) -> str:
"""
Convert Docling table data to markdown format.
Args:
table: Docling table object with 'data' containing grid
max_chars: Maximum characters for markdown output
Returns:
Markdown formatted table string
"""
data = table.get('data', {})
# Use grid directly (preferred) or fall back to table_cells
grid = data.get('grid', [])
if not grid:
return ""
num_rows = len(grid)
num_cols = len(grid[0]) if grid else 0
if num_rows == 0 or num_cols == 0:
return ""
# Build markdown from grid
lines = []
for i, row in enumerate(grid):
row_cells = []
for cell in row:
text = cell.get('text', '') if isinstance(cell, dict) else str(cell)
text = text.replace('\n', ' ').replace('|', '/').strip()
# Truncate long cell text
if len(text) > 50:
text = text[:47] + "..."
row_cells.append(text if text else ' ')
line = '| ' + ' | '.join(row_cells) + ' |'
lines.append(line)
if i == 0: # Header separator after first row
lines.append('|' + '---|' * len(row_cells))
markdown = '\n'.join(lines)
# Truncate if too long
if len(markdown) > max_chars:
markdown = markdown[:max_chars] + "\n...(truncated)"
return markdown
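# Minimal sketch of the expected Docling grid shape (hypothetical cell data):
#   table_to_markdown({'data': {'grid': [
#       [{'text': 'Zone'}, {'text': 'Load'}],
#       [{'text': 'A'},    {'text': '1.5 kPa'}],
#   ]}})
# produces:
#   | Zone | Load |
#   |---|---|
#   | A | 1.5 kPa |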
def build_page_caption_map(texts: List[Dict]) -> Dict[int, List[Dict]]:
"""
Build a mapping of page numbers to table caption elements.
Checks both 'caption' AND 'section_header' labels for Table patterns.
Also captures "Forming Part of" and "Detail" patterns.
Includes bbox for proximity matching.
Returns:
Dict mapping page_no -> list of {text, table_id, title, bbox, ...}
"""
    page_captions = {}
    seen_caption_keys = set()  # (page, table_id, bbox top) keys already recorded
# Check both caption and section_header elements
for t in texts:
label = t.get('label', '')
# Check both caption and section_header for Table patterns
if label not in ('caption', 'section_header'):
continue
text = t.get('text', '')
prov = t.get('prov', [{}])[0]
page = prov.get('page_no', 0)
bbox = prov.get('bbox', None)
# Skip "Notes to Table" entries - these are not table captions
if text.lower().startswith('notes to table'):
continue
table_id = None
title = None
is_continued = False
# Pattern 1: Standard "Table X.X.X" pattern
match = TABLE_ID_PATTERN.search(text)
if match:
prefix = match.group(1) or ''
number = match.group(2)
table_id = f"Table-{prefix}{number}"
# Extract title
title_start = match.end()
title = text[title_start:].strip()
title = re.sub(r'^[\.\s]+', '', title)
title = re.sub(r'\(Continued\).*$', '', title, flags=re.IGNORECASE).strip()
if not title:
title = f"Table {prefix}{number}"
else:
title = f"Table {prefix}{number}. {title[:100]}"
# Check if this is a (Continued) entry
is_continued = bool(re.search(r'\(Continued\)', text, re.IGNORECASE))
# Pattern 2: "Forming Part of Sentence/Article X.X.X"
if not table_id:
match = FORMING_PART_PATTERN.search(text)
if match:
ref_type = match.group(1) # Article or Sentence
ref_num = match.group(2).rstrip('.')
table_id = f"Table-FP-{ref_num}"
# Title is the text before "Forming Part of"
title_text = text[:match.start()].strip()
if title_text:
title = f"{title_text[:80]} (Forming Part of {ref_type} {ref_num})"
else:
title = f"Table Forming Part of {ref_type} {ref_num}"
# Pattern 3: "Detail EA-1" etc (OBC Vol2)
if not table_id:
match = DETAIL_PATTERN.search(text)
if match:
detail_num = match.group(1)
table_id = f"Detail-{detail_num}"
title = text[:100].strip()
# Skip if no pattern matched
if not table_id:
continue
        # Avoid duplicate captions on the same page (same table_id at the same position)
        page_key = (page, table_id, bbox.get('t', 0) if bbox else 0)
        if page_key in seen_caption_keys:
            continue
        seen_caption_keys.add(page_key)
if page not in page_captions:
page_captions[page] = []
page_captions[page].append({
'text': text,
'table_id': table_id,
'title': title,
'is_continued': is_continued,
'source_label': label,
'bbox': bbox # Include bbox for proximity matching
})
# Sort captions on each page by vertical position (top to bottom)
for page in page_captions:
page_captions[page].sort(key=lambda x: x['bbox'].get('t', 0) if x['bbox'] else 0)
return page_captions
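# Shape of the returned mapping (page number and values are illustrative):
#   {412: [{'text': 'Table 9.10.14.4 ...', 'table_id': 'Table-9.10.14.4',
#           'title': 'Table 9.10.14.4. ...', 'is_continued': False,
#           'source_label': 'caption', 'bbox': {...}}]}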
def extract_table_id_enhanced(table: Dict, texts: List[Dict], page_captions: Dict[int, List[Dict]]) -> Tuple[Optional[str], str, bool]:
"""
Extract table ID and title using multiple strategies.
Returns:
(table_id, title, is_continued) tuple
"""
prov = table.get('prov', [{}])[0]
page = prov.get('page_no', 0)
# Strategy 1: Check linked captions (original method)
captions = table.get('captions', [])
for cap in captions:
if isinstance(cap, dict) and '$ref' in cap:
ref = cap['$ref']
try:
idx = int(ref.split('/')[-1])
if idx < len(texts):
caption_text = texts[idx].get('text', '')
# Pattern 1: Standard "Table X.X.X"
match = TABLE_ID_PATTERN.search(caption_text)
if match:
prefix = match.group(1) or ''
number = match.group(2)
table_id = f"Table-{prefix}{number}"
title_start = match.end()
title = caption_text[title_start:].strip()
title = re.sub(r'^[\.\s]+', '', title)
is_continued = bool(re.search(r'\(Continued\)', caption_text, re.IGNORECASE))
title = re.sub(r'\(Continued\).*$', '', title, flags=re.IGNORECASE).strip()
if not title:
title = f"Table {prefix}{number}"
else:
title = f"Table {prefix}{number}. {title[:100]}"
return table_id, title, is_continued
# Pattern 2: "Forming Part of Sentence/Article X.X.X"
match = FORMING_PART_PATTERN.search(caption_text)
if match:
ref_type = match.group(1)
ref_num = match.group(2).rstrip('.')
table_id = f"Table-FP-{ref_num}"
title_text = caption_text[:match.start()].strip()
if title_text:
title = f"{title_text[:80]} (Forming Part of {ref_type} {ref_num})"
else:
title = f"Table Forming Part of {ref_type} {ref_num}"
return table_id, title, False
# Pattern 3: "Detail EA-1" etc
match = DETAIL_PATTERN.search(caption_text)
if match:
detail_num = match.group(1)
table_id = f"Detail-{detail_num}"
title = caption_text[:100].strip()
return table_id, title, False
except (ValueError, IndexError):
pass
# Strategy 2: Check nearby caption text elements on the same page using bbox proximity
if page in page_captions:
captions_on_page = page_captions[page]
if captions_on_page:
table_bbox = prov.get('bbox', {})
table_top = table_bbox.get('t', 0) if table_bbox else 0
# First pass: Find non-continued caption closest above the table
best_caption = None
best_continued = None # Track best (Continued) caption as fallback
for cap in captions_on_page:
cap_bbox = cap.get('bbox', {})
cap_bottom = cap_bbox.get('b', 0) if cap_bbox else 0
# Caption should be above the table (or very close)
if cap_bottom <= table_top + 50: # 50 units tolerance
if cap['is_continued']:
best_continued = cap
else:
best_caption = cap
else:
# Once we pass the table position, stop looking
break
# If no non-continued caption found, try to find one by closest distance
if not best_caption:
non_continued = [c for c in captions_on_page if not c['is_continued']]
if non_continued:
def distance(cap):
cap_bbox = cap.get('bbox', {})
cap_top = cap_bbox.get('t', 0) if cap_bbox else 0
return abs(cap_top - table_top)
best_caption = min(non_continued, key=distance)
# Use non-continued caption if found, otherwise use continued caption
if best_caption:
return best_caption['table_id'], best_caption['title'], False
elif best_continued:
# This is a continuation page - return with is_continued=True
return best_continued['table_id'], best_continued['title'], True
# No valid table ID found
return None, f"Table (page {page})", False
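# Possible return values (illustrative):
#   ('Table-9.10.14.4', 'Table 9.10.14.4. Spatial Separation', False)  # caption matched
#   ('Table-9.10.14.4', 'Table 9.10.14.4. Spatial Separation', True)   # continuation page
#   (None, 'Table (page 412)', False)                                  # no pattern matched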
def extract_table_keywords(table: Dict) -> List[str]:
"""Extract keywords from table cells using grid data."""
data = table.get('data', {})
grid = data.get('grid', [])
all_text = []
for row in grid:
for cell in row:
if isinstance(cell, dict):
all_text.append(cell.get('text', ''))
else:
all_text.append(str(cell))
return extract_keywords(' '.join(all_text), max_keywords=20)
def parse_tables(data: Dict, page_division: Dict) -> List[Dict]:
"""
Parse tables from Docling JSON and convert to section format.
Uses enhanced caption matching for better table identification.
Args:
data: Full Docling JSON data
page_division: Mapping of page numbers to Division letters
Returns:
List of table sections
"""
tables = data.get('tables', [])
texts = data.get('texts', [])
# Build page -> caption mapping for enhanced matching
page_captions = build_page_caption_map(texts)
table_sections = []
seen_ids = {} # table_id -> first page seen
continued_tables = [] # Tables marked as (Continued)
for table in tables:
prov = table.get('prov', [{}])[0]
page = prov.get('page_no', 0)
bbox = prov.get('bbox', None)
# Skip tables on early pages (TOC, preface)
if page < 20:
continue
# Skip very small tables (likely formatting artifacts)
grid = table.get('data', {}).get('grid', [])
if len(grid) < 2:
continue
# Get table ID and title using enhanced method
table_id, title, is_continued = extract_table_id_enhanced(table, texts, page_captions)
# Skip if no valid ID
if not table_id:
continue
# Handle duplicates and continued tables
if table_id in seen_ids:
# If this is a later page, it might have additional data
# We'll keep the first occurrence but note the page range
if is_continued:
continued_tables.append({
'table_id': table_id,
'page': page
})
continue
seen_ids[table_id] = page
# Get division for this page
division = page_division.get(page, None)
# Convert to markdown
markdown = table_to_markdown(table)
# Extract keywords
keywords = extract_table_keywords(table)
# Get table dimensions from grid
num_rows = len(grid)
num_cols = len(grid[0]) if grid else 0
table_section = {
"id": table_id,
"title": title,
"page": page,
"bbox": bbox,
"level": "table",
"type": "table",
"division": division,
"keywords": keywords,
"table_info": {
"rows": num_rows,
"cols": num_cols,
},
"markdown": markdown,
}
table_sections.append(table_section)
# Mark tables that span multiple pages
for cont in continued_tables:
for section in table_sections:
if section['id'] == cont['table_id']:
if 'page_end' not in section:
section['page_end'] = cont['page']
else:
section['page_end'] = max(section['page_end'], cont['page'])
break # Found the matching section, no need to continue inner loop
# Count multi-page tables
multi_page = sum(1 for s in table_sections if 'page_end' in s)
print(f" Found {len(table_sections)} tables with valid IDs (from {len(tables)} detected, {multi_page} multi-page)")
return table_sections
def get_level_from_id(section_id: str) -> str:
"""Determine hierarchy level from ID pattern."""
# Division A, B, C
if re.match(r'^[A-C]$', section_id):
return 'division'
# Part pattern: A-Part 1, or just numeric start
if 'Part' in section_id:
return 'part'
# Count dots for numeric IDs (after removing Division prefix)
# A-1.1 → 1 dot → section
numeric_part = re.sub(r'^[A-C]-', '', section_id)
dots = numeric_part.count('.')
if dots == 0:
return 'part' # Just "1" or "2"
elif dots == 1: # 1.1
return 'section'
elif dots == 2: # 1.1.1
return 'subsection'
elif dots >= 3: # 1.1.1.1
return 'article'
return 'unknown'
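# Examples:
#   get_level_from_id('B')          -> 'division'
#   get_level_from_id('A-Part1')    -> 'part'
#   get_level_from_id('B-1.1')      -> 'section'
#   get_level_from_id('B-1.1.1')    -> 'subsection'
#   get_level_from_id('B-1.1.1.1')  -> 'article'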
def get_parent_id(full_id: str) -> Optional[str]:
"""Calculate parent ID from full section ID."""
# A-1.1.1.1 → A-1.1.1
match = re.match(r'^([A-C]-)?(.*)', full_id)
if match:
prefix = match.group(1) or ''
numeric = match.group(2)
parts = numeric.split('.')
if len(parts) > 1:
parent_numeric = '.'.join(parts[:-1])
return f"{prefix}{parent_numeric}"
return None
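# Examples:
#   get_parent_id('A-1.1.1.1') -> 'A-1.1.1'
#   get_parent_id('1.1')       -> '1'
#   get_parent_id('A-1')       -> None   (no higher numeric level to strip)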
def parse_docling_json(json_path: str) -> Tuple[List[Dict], Dict, Dict[int, str]]:
    """
    Parse Docling JSON and extract sections with coordinates.
    Strategy:
    1. Track the current Division from page_header, page_footer, AND section_header
    2. Extract section_header items with a Division prefix
    3. Collect content text for keyword extraction
    4. Handle duplicates (keep the larger page number = filter TOC entries)
    Returns:
        (sections, raw Docling data, page -> Division mapping) tuple
    """
with open(json_path, 'r', encoding='utf-8') as f:
data = json.load(f)
texts = data.get('texts', [])
# Build page → Division mapping from multiple sources
page_division = {}
# 1. From page_header: "Division A", "Division B", etc.
for item in texts:
if item.get('label') == 'page_header':
text = item.get('text', '')
match = re.match(r'^Division\s+([A-C])$', text)
if match:
prov = item.get('prov', [{}])[0]
page = prov.get('page_no', 0)
page_division[page] = match.group(1)
# 2. From page_footer: "Division A", "Division A 1-1", etc. (more consistent)
for item in texts:
if item.get('label') == 'page_footer':
text = item.get('text', '')
# Match "Division A", "Division A 1-1", "1-2 Division A", etc.
match = re.search(r'Division\s+([A-C])', text)
if match:
prov = item.get('prov', [{}])[0]
page = prov.get('page_no', 0)
page_division[page] = match.group(1)
# 3. From section_header: "Division A:" or "Division A" (standalone)
# Only from content pages (page > 15)
for item in texts:
if item.get('label') == 'section_header':
text = item.get('text', '')
# Match both "Division A:" and standalone "Division A"
match = re.match(r'^Division\s+([A-C])(?::|$)', text)
if match:
prov = item.get('prov', [{}])[0]
page = prov.get('page_no', 0)
# Skip TOC pages
if page > 15:
page_division[page] = match.group(1)
    # Fill gaps: propagate each Division forward until the next marker,
    # extending 100 pages past the last marker to cover trailing content pages
    if page_division:
        max_page = max(page_division) + 100
filled_division = {}
current_div = None
for page in range(1, max_page):
if page in page_division:
current_div = page_division[page]
filled_division[page] = current_div
page_division = filled_division
# Extract sections from section_headers
sections = []
seen_ids = {} # Track seen IDs to filter duplicates
# Also collect content for keywords
current_section_idx = -1
section_contents = [] # List of content strings per section
# Pattern for numeric section IDs: "1.1.1.1. Title" or "A-1.1.1.1. Title"
numeric_pattern = re.compile(r'^([A-C]-)?(\d+(?:\.\d+)*)\.\s*(.*)$')
# Pattern for Part: "Part 1 Title" or "Part 1. Title"
part_pattern = re.compile(r'^Part\s+(\d+)\.?\s*(.*)$', re.IGNORECASE)
    # Pattern for Appendix annotations: "A-1.1.1.1.(1) Title"
    # (kept for reference; not currently applied in the parsing loop below,
    #  since numeric_pattern's optional "[A-C]-" prefix matches most appendix headers)
    appendix_pattern = re.compile(r'^(A-\d+(?:\.\d+)*(?:\.\(\d+\))?)\s*(.*)$')
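    # Pattern examples (illustrative header texts):
    #   numeric_pattern: '1.1.1.1. Scope'       -> (None, '1.1.1.1', 'Scope')
    #   numeric_pattern: 'A-1.1.1.1. Guidance'  -> ('A-', '1.1.1.1', 'Guidance')
    #   part_pattern:    'Part 9 Housing'       -> ('9', 'Housing')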
for item in texts:
label = item.get('label', '')
text = item.get('text', '').strip()
prov = item.get('prov', [{}])[0]
page = prov.get('page_no', 0)
bbox = prov.get('bbox', None)
# Get current division for this page
current_division = page_division.get(page, None)
if label == 'section_header':
section_id = None
title = text
# Try numeric pattern first
match = numeric_pattern.match(text)
if match:
existing_prefix = match.group(1) # May be None or "A-"
numeric_id = match.group(2)
title = match.group(3) or numeric_id
# Build full ID with Division prefix
if existing_prefix:
# Already has prefix (like Appendix A-1.1.1.1)
section_id = f"{existing_prefix}{numeric_id}"
elif current_division:
section_id = f"{current_division}-{numeric_id}"
else:
section_id = numeric_id
# Try Part pattern
            elif match := part_pattern.match(text):
                part_num = match.group(1)
title = match.group(2) or f"Part {part_num}"
if current_division:
section_id = f"{current_division}-Part{part_num}"
else:
section_id = f"Part{part_num}"
# Division header itself
elif text.startswith('Division') and ':' in text:
# "Division A: Compliance..." → just "A"
match = re.match(r'^Division\s+([A-C]):', text)
if match:
section_id = match.group(1)
title = text
# Skip if no valid ID extracted
if not section_id:
continue
            # Skip TOC entries (anything before page 15),
            # but keep Division headers even when they appear early
            if page < 15 and not re.match(r'^[A-C]$', section_id):
continue
# Duplicate handling: keep the one with larger page (skip TOC)
if section_id in seen_ids:
existing_page = seen_ids[section_id]['page']
if page > existing_page:
# Update with this one (further in document = real content)
idx = seen_ids[section_id]['idx']
sections[idx] = {
"id": section_id,
"title": title,
"page": page,
"bbox": bbox,
"level": get_level_from_id(section_id),
"parent_id": get_parent_id(section_id),
"division": current_division,
}
seen_ids[section_id] = {'page': page, 'idx': idx}
                # else: keep the existing entry (this duplicate is not further
                # into the document, e.g. an index or TOC occurrence)
continue
# New section
section = {
"id": section_id,
"title": title,
"page": page,
"bbox": bbox,
"level": get_level_from_id(section_id),
"parent_id": get_parent_id(section_id),
"division": current_division,
}
seen_ids[section_id] = {'page': page, 'idx': len(sections)}
sections.append(section)
current_section_idx = len(sections) - 1
section_contents.append("")
# Collect content text for keywords
elif label in ('text', 'list_item', 'paragraph') and current_section_idx >= 0:
if current_section_idx < len(section_contents):
section_contents[current_section_idx] += " " + text
# Extract keywords for each section
for i, section in enumerate(sections):
if i < len(section_contents):
section['keywords'] = extract_keywords(section_contents[i])
else:
section['keywords'] = []
return sections, data, page_division
def generate_map(json_path: str, code_name: str = None, code_version: str = None) -> Dict:
"""Generate map JSON from Docling JSON output."""
json_path = Path(json_path)
# Auto-detect code name and version from filename
if code_name is None:
name = json_path.stem.upper()
code_name = re.sub(r'[\d_].*$', '', name) or name
if code_version is None:
match = re.search(r'(\d{4})', json_path.stem)
code_version = match.group(1) if match else "unknown"
print(f"Parsing: {json_path.name}")
print(f"Code: {code_name} {code_version}")
sections, raw_data, page_division = parse_docling_json(str(json_path))
print(f"Found {len(sections)} sections")
# Parse tables
table_sections = parse_tables(raw_data, page_division)
# Get source PDF metadata
origin = raw_data.get('origin', {})
source_pdf = {
"filename": origin.get('filename', ''),
}
# Build map structure
map_data = {
"code": code_name,
"version": code_version,
"generated": datetime.now().isoformat(),
"source_pdf": source_pdf,
"sections": [],
"tables": [], # Separate tables array for quick access
}
# Convert sections to output format
for section in sections:
section_dict = {
"id": section['id'],
"title": section['title'],
"page": section['page'],
"level": section['level'],
}
if section.get('parent_id'):
section_dict["parent_id"] = section['parent_id']
if section.get('bbox'):
section_dict["bbox"] = section['bbox']
if section.get('keywords'):
section_dict["keywords"] = section['keywords']
if section.get('division'):
section_dict["division"] = section['division']
map_data["sections"].append(section_dict)
# Add table sections to both sections and tables arrays
for table in table_sections:
# Add to sections (for unified search)
section_dict = {
"id": table['id'],
"title": table['title'],
"page": table['page'],
"level": "table",
"type": "table",
}
if table.get('bbox'):
section_dict["bbox"] = table['bbox']
if table.get('keywords'):
section_dict["keywords"] = table['keywords']
if table.get('division'):
section_dict["division"] = table['division']
if table.get('page_end'):
section_dict["page_end"] = table['page_end']
map_data["sections"].append(section_dict)
# Add to tables array (with full markdown)
table_entry = {
"id": table['id'],
"title": table['title'],
"page": table['page'],
"table_info": table.get('table_info', {}),
"markdown": table.get('markdown', ''),
"keywords": table.get('keywords', []),
}
if table.get('division'):
table_entry["division"] = table['division']
if table.get('page_end'):
table_entry["page_end"] = table['page_end']
map_data["tables"].append(table_entry)
return map_data
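# Minimal in-process usage sketch (file path is hypothetical):
#   map_data = generate_map('output/NBC2020.json', code_name='NBC', code_version='2020')
#   print(map_data['code'], len(map_data['sections']), len(map_data['tables']))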
def main():
parser = argparse.ArgumentParser(
description="Generate Map JSON from Docling JSON output (v2)"
)
parser.add_argument(
"input",
help="Docling JSON file or directory containing JSON file"
)
parser.add_argument(
"--output", "-o",
help="Output JSON file (default: maps/<code_name><version>.json)"
)
parser.add_argument(
"--code", "-c",
help="Code name (e.g., NBC, OBC). Auto-detected if not provided."
)
parser.add_argument(
"--version", "-v",
help="Code version (e.g., 2025). Auto-detected if not provided."
)
args = parser.parse_args()
input_path = Path(args.input)
# Find JSON file
    if input_path.is_dir():
        json_files = sorted(f for f in input_path.glob("*.json")
                            if not f.name.endswith('_meta.json'))
        if not json_files:
            print(f"No JSON files found in {input_path}")
            return
        json_path = json_files[0]  # first JSON file, in sorted order for determinism
else:
json_path = input_path
# Generate map
map_data = generate_map(str(json_path), args.code, args.version)
# Determine output path
if args.output:
output_path = Path(args.output)
else:
output_dir = Path("maps")
output_dir.mkdir(exist_ok=True)
output_path = output_dir / f"{map_data['code']}{map_data['version']}.json"
# Save map
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(map_data, f, ensure_ascii=False, indent=2)
print(f"\nSaved: {output_path}")
print(f"Sections: {len(map_data['sections'])} (including {len(map_data['tables'])} tables)")
# Show sample sections
if map_data['sections']:
print("\nSample sections:")
for section in map_data['sections'][:5]:
print(f" {section['id']}: page {section['page']} - {section['title'][:40]}")
# Show sample tables
if map_data['tables']:
print("\nSample tables:")
for table in map_data['tables'][:5]:
info = table.get('table_info', {})
print(f" {table['id']}: page {table['page']} ({info.get('rows', '?')}x{info.get('cols', '?')}) - {table['title'][:40]}")
# Show stats by division
divisions = {}
for s in map_data['sections']:
div = s.get('division', 'None')
divisions[div] = divisions.get(div, 0) + 1
print(f"\nBy Division: {divisions}")
if __name__ == "__main__":
main()