#!/usr/bin/env python3
"""
Map Generator for User's Guides (Commentary-based documents)
Handles UGP4 (Structural Commentaries) and UGNECB (Energy Code Guide)
Differs from the v2 generator, which handles Division A/B/C-based building codes.
"""
import re
import json
import argparse
from pathlib import Path
from collections import Counter
from typing import List, Dict, Optional, Tuple
from datetime import datetime
# Stopwords for keyword extraction
STOPWORDS = {
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these',
'those', 'it', 'its', 'if', 'then', 'than', 'so', 'such', 'no', 'not',
'only', 'same', 'other', 'into', 'over', 'under', 'after', 'before',
'between', 'each', 'all', 'both', 'any', 'some', 'more', 'most', 'less',
'shall', 'must', 'except', 'unless', 'provided', 'required',
'article', 'section', 'sentence', 'clause', 'part', 'commentary',
'see', 'refer', 'reference', 'following', 'example', 'figure', 'table',
}
def extract_keywords(text: str, max_keywords: int = 15) -> List[str]:
"""Extract meaningful keywords using simple TF approach."""
if not text:
return []
words = re.findall(r'[a-z][a-z0-9-]*[a-z0-9]|[a-z]', text.lower())
words = [w for w in words if w not in STOPWORDS and len(w) > 2]
counter = Counter(words)
return [word for word, _ in counter.most_common(max_keywords)]
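# A quick illustration of the extractor (the sentence is made up; on CPython
# 3.7+ ties in most_common() keep first-seen order):
#   extract_keywords("Snow loads on roofs depend on roof slope and snow density.")
#   -> ['snow', 'loads', 'roofs', 'depend', 'roof', 'slope', 'density']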
def slugify(text: str) -> str:
"""Convert text to URL-friendly slug."""
# Remove special characters, keep alphanumeric and spaces
text = re.sub(r'[^\w\s-]', '', text)
# Replace spaces with hyphens
text = re.sub(r'\s+', '-', text.strip())
    # Collapse consecutive hyphens into one
text = re.sub(r'-+', '-', text)
return text[:50] # Limit length
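# Illustrative inputs (note that case is preserved, so section IDs stay readable):
#   slugify("Snow Loads (Rain Loads)")  -> 'Snow-Loads-Rain-Loads'
#   slugify("Deflection & Vibration")   -> 'Deflection-Vibration'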
def parse_ugp4(json_path: str) -> Tuple[List[Dict], Dict]:
    """
    Parse UGP4 (Structural Commentaries).
    Structure: Commentary A, B, C... with subsection titles.
    Returns (sections, raw Docling data); each section carries id, title,
    page, level, bbox and keywords, plus parent_id on subsections.
    """
with open(json_path, 'r', encoding='utf-8') as f:
data = json.load(f)
texts = data.get('texts', [])
sections = []
seen_ids = {}
# Track current commentary
current_commentary = None
current_commentary_title = None
current_section_idx = -1
section_contents = []
# Commentary pattern: "Commentary A", "Commentary B", etc.
commentary_pattern = re.compile(r'^Commentary\s+([A-Z])(?:\s+(.+))?$')
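    # Illustrative matches (the title here is made up):
    #   "Commentary A"                    -> ('A', None)
    #   "Commentary I Wind Load Effects"  -> ('I', 'Wind Load Effects')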
for item in texts:
label = item.get('label', '')
text = item.get('text', '').strip()
        prov = (item.get('prov') or [{}])[0]  # tolerate a missing or empty prov list
page = prov.get('page_no', 0)
bbox = prov.get('bbox', None)
# Skip early pages (TOC, intro)
if page < 15:
continue
if label == 'section_header':
# Check if it's a Commentary header
match = commentary_pattern.match(text)
if match:
letter = match.group(1)
title = match.group(2) or ""
# Only create new commentary if letter changes
if letter != current_commentary:
current_commentary = letter
current_commentary_title = title
section_id = f"Commentary-{letter}"
if section_id not in seen_ids:
section = {
"id": section_id,
"title": f"Commentary {letter}" + (f": {title}" if title else ""),
"page": page,
"level": "commentary",
"bbox": bbox,
}
seen_ids[section_id] = len(sections)
sections.append(section)
current_section_idx = len(sections) - 1
section_contents.append("")
continue
# Skip if no current commentary yet
if not current_commentary:
continue
# Skip common non-content headers
skip_patterns = [
'Table of Contents', 'Notable Changes', 'References',
'CANADIAN COMMISSION', 'NRCC-CONST', 'First Printing'
]
if any(p in text for p in skip_patterns):
continue
# This is a subsection within the commentary
slug = slugify(text)
if not slug:
continue
section_id = f"{current_commentary}-{slug}"
# Handle duplicates - keep later occurrence
            if section_id in seen_ids:
                existing_idx = seen_ids[section_id]
                existing_page = sections[existing_idx]['page']
                if page > existing_page:
                    sections[existing_idx] = {
                        "id": section_id,
                        "title": text,
                        "page": page,
                        "level": "subsection",
                        "parent_id": f"Commentary-{current_commentary}",
                        "bbox": bbox,
                    }
                    # Point content collection at the kept occurrence and
                    # drop text gathered under the earlier one
                    section_contents[existing_idx] = ""
                    current_section_idx = existing_idx
                continue
section = {
"id": section_id,
"title": text,
"page": page,
"level": "subsection",
"parent_id": f"Commentary-{current_commentary}",
"bbox": bbox,
}
seen_ids[section_id] = len(sections)
sections.append(section)
current_section_idx = len(sections) - 1
section_contents.append("")
# Collect content for keywords
elif label in ('text', 'list_item', 'paragraph') and current_section_idx >= 0:
if current_section_idx < len(section_contents):
section_contents[current_section_idx] += " " + text
# Add keywords
for i, section in enumerate(sections):
if i < len(section_contents):
section['keywords'] = extract_keywords(section_contents[i])
else:
section['keywords'] = []
return sections, data
def parse_ugnecb(json_path: str) -> Tuple[List[Dict], Dict]:
    """
    Parse UGNECB (Energy Code Guide).
    Structure: Commentary on Part X with Article references.
    Returns (sections, raw Docling data), same section shape as parse_ugp4.
    """
with open(json_path, 'r', encoding='utf-8') as f:
data = json.load(f)
texts = data.get('texts', [])
sections = []
seen_ids = {}
current_part = None
current_section_idx = -1
section_contents = []
# Pattern for Part headers in page headers/footers
part_pattern = re.compile(r'Commentary on Part\s+(\d+)')
# Pattern for Article references
article_pattern = re.compile(r'\(Article\s+([\d.]+)\)')
# Pattern for Examples
example_pattern = re.compile(r'^Example\s+([\d-]+)')
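    # Illustrative matches for the three patterns:
    #   "Commentary on Part 3"   -> part '3'
    #   "... (Article 3.2.2.1)"  -> article '3.2.2.1'
    #   "Example 3-1 ..."        -> example '3-1'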
# First pass: identify Part boundaries from page headers
page_to_part = {}
for item in texts:
if item.get('label') in ('page_header', 'section_header'):
text = item.get('text', '')
match = part_pattern.search(text)
if match:
                prov = (item.get('prov') or [{}])[0]
page = prov.get('page_no', 0)
if page > 5: # Skip TOC
page_to_part[page] = match.group(1)
# Fill gaps
if page_to_part:
max_page = max(page_to_part.keys()) + 50
current_p = None
filled = {}
for pg in range(1, max_page):
if pg in page_to_part:
current_p = page_to_part[pg]
filled[pg] = current_p
page_to_part = filled
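    # Forward-fill sketch: {10: '3', 14: '4'} expands so pages 10-13 map to '3'
    # and 14 onward to '4'; pages before the first header stay None, and the
    # +50 above gives headroom past the last detected header.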
for item in texts:
label = item.get('label', '')
text = item.get('text', '').strip()
        prov = (item.get('prov') or [{}])[0]  # tolerate a missing or empty prov list
page = prov.get('page_no', 0)
bbox = prov.get('bbox', None)
# Skip early pages
if page < 8:
continue
# Get current part from page mapping
part_num = page_to_part.get(page)
if label == 'section_header':
            # "Commentary on Part X" recurs as a running page header; use it
            # to open the Part entry the first time each part appears
            if part_pattern.match(text):
# Create Part entry if new
if part_num and part_num != current_part:
current_part = part_num
section_id = f"Part{part_num}"
if section_id not in seen_ids:
section = {
"id": section_id,
"title": f"Part {part_num}",
"page": page,
"level": "part",
"bbox": bbox,
}
seen_ids[section_id] = len(sections)
sections.append(section)
current_section_idx = len(sections) - 1
section_contents.append("")
continue
            # Skip common headers (substring tests, so a broad token like
            # 'Energy' also drops any header that merely contains it)
skip_patterns = [
'Table of Contents', 'User\'s Guide', 'NRCC-CONST',
'Energy', 'Introduction', 'Development', 'Overview',
'Referenced Standards', 'Additional Information'
]
if any(p in text for p in skip_patterns):
continue
if not part_num:
continue
# Check for Article reference in title
article_match = article_pattern.search(text)
example_match = example_pattern.match(text)
if article_match:
# Article-based section
article_num = article_match.group(1)
section_id = f"Part{part_num}-Article{article_num}"
title = text
elif example_match:
# Example section
example_num = example_match.group(1)
section_id = f"Part{part_num}-Example{example_num}"
title = text
else:
# General subsection
slug = slugify(text)
if not slug or len(slug) < 3:
continue
section_id = f"Part{part_num}-{slug}"
title = text
# Handle duplicates
            if section_id in seen_ids:
                existing_idx = seen_ids[section_id]
                existing_page = sections[existing_idx]['page']
                if page > existing_page:
                    sections[existing_idx] = {
                        "id": section_id,
                        "title": title,
                        "page": page,
                        "level": "subsection",
                        "parent_id": f"Part{part_num}",
                        "bbox": bbox,
                    }
                    # Point content collection at the kept occurrence and
                    # drop text gathered under the earlier one
                    section_contents[existing_idx] = ""
                    current_section_idx = existing_idx
                continue
section = {
"id": section_id,
"title": title,
"page": page,
"level": "subsection",
"parent_id": f"Part{part_num}",
"bbox": bbox,
}
seen_ids[section_id] = len(sections)
sections.append(section)
current_section_idx = len(sections) - 1
section_contents.append("")
# Collect content for keywords
elif label in ('text', 'list_item', 'paragraph') and current_section_idx >= 0:
if current_section_idx < len(section_contents):
section_contents[current_section_idx] += " " + text
# Add keywords
for i, section in enumerate(sections):
if i < len(section_contents):
section['keywords'] = extract_keywords(section_contents[i])
else:
section['keywords'] = []
return sections, data
def generate_map(json_path: str, guide_type: str,
                 code_name: Optional[str] = None,
                 code_version: Optional[str] = None) -> Dict:
"""Generate map JSON from Docling JSON output."""
json_path = Path(json_path)
# Auto-detect from filename
if code_name is None:
name = json_path.stem.upper()
code_name = re.sub(r'[\d_].*$', '', name) or name
if code_version is None:
match = re.search(r'(\d{4})', json_path.stem)
code_version = match.group(1) if match else "unknown"
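    # e.g. a stem like "UGNECB_2020" (illustrative) yields code "UGNECB" and
    # version "2020"; stems without a 4-digit year fall back to "unknown"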
print(f"Parsing: {json_path.name}")
print(f"Guide: {code_name} {code_version} (type: {guide_type})")
# Parse based on guide type
if guide_type == 'ugp4':
sections, raw_data = parse_ugp4(str(json_path))
elif guide_type == 'ugnecb':
sections, raw_data = parse_ugnecb(str(json_path))
else:
raise ValueError(f"Unknown guide type: {guide_type}")
print(f"Found {len(sections)} sections")
# Get source PDF metadata
origin = raw_data.get('origin', {})
source_pdf = {"filename": origin.get('filename', '')}
# Build map
map_data = {
"code": code_name,
"version": code_version,
"document_type": "guide", # Mark as guide!
"generated": datetime.now().isoformat(),
"source_pdf": source_pdf,
"sections": sections
}
return map_data
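# Example invocations (script and path names are illustrative):
#   python map_generator_guides.py parsed/UGP4_2020.json --type ugp4
#   python map_generator_guides.py parsed/ --type ugnecb -o maps/UGNECB_2020.json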
def main():
parser = argparse.ArgumentParser(
description="Generate Map JSON for User's Guides (Commentary-based)"
)
parser.add_argument(
"input",
help="Docling JSON file or directory"
)
parser.add_argument(
"--type", "-t",
required=True,
choices=['ugp4', 'ugnecb'],
help="Guide type: ugp4 (Structural Commentaries) or ugnecb (Energy Guide)"
)
parser.add_argument(
"--output", "-o",
help="Output JSON file"
)
parser.add_argument(
"--code", "-c",
help="Code name (e.g., UGP4)"
)
parser.add_argument(
"--version", "-v",
help="Version (e.g., 2020)"
)
args = parser.parse_args()
input_path = Path(args.input)
# Find JSON file
    if input_path.is_dir():
        # Sort so the pick is deterministic when several JSON files exist
        json_files = sorted(f for f in input_path.glob("*.json")
                            if not f.name.endswith('_meta.json'))
        if not json_files:
            print(f"No JSON files found in {input_path}")
            return
        json_path = json_files[0]
else:
json_path = input_path
# Generate map
map_data = generate_map(str(json_path), args.type, args.code, args.version)
# Output path
if args.output:
output_path = Path(args.output)
else:
output_dir = Path("maps")
output_dir.mkdir(exist_ok=True)
output_path = output_dir / f"{map_data['code']}_{map_data['version']}.json"
# Save
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(map_data, f, ensure_ascii=False, indent=2)
print(f"\nSaved: {output_path}")
print(f"Sections: {len(map_data['sections'])}")
# Show samples
if map_data['sections']:
print("\nSample sections:")
for section in map_data['sections'][:10]:
print(f" {section['id']}: page {section['page']} - {section['title'][:50]}")
# Stats by level
levels = {}
for s in map_data['sections']:
lvl = s.get('level', 'unknown')
levels[lvl] = levels.get(lvl, 0) + 1
print(f"\nBy level: {levels}")
if __name__ == "__main__":
main()