#!/usr/bin/env python3
"""
PDF解析からJSON生成ツール
PDFファイルを詳細解析して、リッチなJSONデータモデルを生成
"""
import asyncio
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional
# PDF parsing libraries
try:
    import PyPDF2
    import fitz  # PyMuPDF
    PDF_LIBRARIES_AVAILABLE = True
except ImportError:
    print("⚠️ PDF解析ライブラリが見つかりません。基本的なテンプレートで生成します。")
    PDF_LIBRARIES_AVAILABLE = False
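# Note: both imports are attempted together, so PDF_LIBRARIES_AVAILABLE stays False unless
# PyMuPDF (installed as "pymupdf", imported as "fitz") and PyPDF2 are both present.
# Installation is assumed to look roughly like: pip install pymupdf PyPDF2
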
class PDFToJSONGenerator:
    """Generates JSON data models from parsed PDF documents."""

    def __init__(self):
        self.documentation_dir = Path("data/documentation")
        self.api_specs_dir = Path("data/api_specs")
        self.api_specs_dir.mkdir(parents=True, exist_ok=True)
        # Mapping from documentation sub-directory to API category name
        self.category_mapping = {
            "bousai-api": "防災情報API",
            "public-facility-api": "公共施設API",
            "tourism-api": "観光・産業API"
        }
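
    # Expected input layout (inferred from category_mapping and process_all_pdfs below;
    # data/documentation/ is assumed to exist already):
    #   data/documentation/bousai-api/*.pdf
    #   data/documentation/public-facility-api/*.pdf
    #   data/documentation/tourism-api/*.pdf
    # One schema is written per PDF to data/api_specs/<EntityType>.json, plus index.json.
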
    def extract_text_from_pdf(self, pdf_path: Path) -> str:
        """Extract plain text from a PDF file."""
        if not PDF_LIBRARIES_AVAILABLE:
            return ""
        text = ""
        try:
            # Primary path: extract text with PyMuPDF
            doc = fitz.open(pdf_path)
            for page in doc:
                text += page.get_text()
            doc.close()
        except Exception as e:
            print(f" ⚠️ PDF読み込みエラー: {pdf_path.name} - {e}")
            try:
                # Fallback: retry with PyPDF2, discarding any partial PyMuPDF output
                text = ""
                with open(pdf_path, 'rb') as f:
                    reader = PyPDF2.PdfReader(f)
                    for page in reader.pages:
                        text += page.extract_text()
            except Exception as e2:
                print(f" ❌ PDF読み込み失敗: {pdf_path.name} - {e2}")
        return text
    def analyze_pdf_content(self, pdf_path: Path, text: str) -> Dict[str, Any]:
        """Analyze PDF content and extract metadata."""
        entity_name = pdf_path.stem
        # Pull structured information out of the raw text
        analysis = {
            "entity_type": entity_name,
            "extracted_fields": self._extract_field_information(text),
            "data_types": self._identify_data_types(text),
            "relationships": self._find_relationships(text),
            "constraints": self._extract_constraints(text),
            "examples": self._extract_examples(text),
            "description": self._generate_description(entity_name, text)
        }
        return analysis
    def _extract_field_information(self, text: str) -> List[Dict[str, Any]]:
        """Extract field information from the text."""
        fields = []
        # Patterns for field definitions, including Japanese label styles
        field_patterns = [
            r'([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*([^\n]+)',  # field: description
            r'・\s*([^:\n]+):([^\n]+)',  # ・フィールド名:説明
            r'項目名\s*[::]\s*([^\n]+)',  # 項目名: 値
            r'属性\s*[::]\s*([^\n]+)',  # 属性: 値
        ]
        for pattern in field_patterns:
            matches = re.findall(pattern, text)
            for match in matches:
                # Single-group patterns make re.findall return bare strings; only
                # (name, description) pairs can be used here, so skip anything else.
                if not isinstance(match, tuple) or len(match) != 2:
                    continue
                field_name, description = match
                field_name = field_name.strip()
                description = description.strip()
                if len(field_name) > 1 and len(description) > 1:
                    # Guess the data type from the name and description
                    data_type = self._guess_data_type(field_name, description)
                    fields.append({
                        "name": self._normalize_field_name(field_name),
                        "description": description[:200],  # cap description at 200 characters
                        "type": data_type,
                        "required": self._is_required_field(field_name, description)
                    })
        return fields[:10]  # cap at 10 fields
    def _normalize_field_name(self, name: str) -> str:
        """Normalize a field name to an ASCII identifier."""
        # Map common Japanese labels to English names (simplified romanization)
        replacements = {
            '名前': 'name',
            '名称': 'name',
            '住所': 'address',
            '位置': 'location',
            '座標': 'coordinates',
            'ID': 'id',
            '識別子': 'id',
            '種別': 'type',
            '分類': 'category',
            '状態': 'status',
            '状況': 'status',
            '日時': 'dateTime',
            '時刻': 'time',
            '値': 'value',
            '容量': 'capacity',
            '人数': 'capacity'
        }
        for jp, en in replacements.items():
            if jp in name:
                return en
        # Otherwise strip the name down to alphanumerics and underscores
        normalized = re.sub(r'[^a-zA-Z0-9]', '_', name)
        normalized = re.sub(r'_+', '_', normalized).strip('_')
        return normalized.lower() if normalized else 'unknown_field'
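
    # Illustrative examples for _normalize_field_name (derived from the mapping and regex
    # above, not from any PDF):
    #   _normalize_field_name("避難所名称")  -> "name"       (matches '名称')
    #   _normalize_field_name("Shelter No.") -> "shelter_no"  (non-alphanumerics collapsed)
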
    def _guess_data_type(self, field_name: str, description: str) -> str:
        """Guess the data type from the field name and description."""
        text = (field_name + " " + description).lower()
        if any(keyword in text for keyword in ['座標', 'coordinate', '緯度', '経度', 'location']):
            return 'geo:json'
        elif any(keyword in text for keyword in ['日時', 'datetime', '時刻', 'time', '日付', 'date']):
            return 'DateTime'
        elif any(keyword in text for keyword in ['数', 'number', '値', 'value', '容量', 'capacity', '人数']):
            return 'Number'
        elif any(keyword in text for keyword in ['true', 'false', 'boolean', '有無', 'フラグ']):
            return 'Boolean'
        elif 'id' in text or '識別' in text:
            return 'Text'
        else:
            return 'Text'
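
    # Illustrative example: _guess_data_type("座標", "施設の緯度・経度") returns 'geo:json',
    # while text with no matching keywords falls through to the default 'Text'.
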
    def _is_required_field(self, field_name: str, description: str) -> bool:
        """Decide whether a field should be treated as required."""
        text = (field_name + " " + description).lower()
        # Keywords that suggest required vs. optional fields
        required_keywords = ['必須', 'required', 'id', '識別子', 'type', '種別']
        optional_keywords = ['任意', 'optional', '可能', 'オプション']
        if any(keyword in text for keyword in required_keywords):
            return True
        elif any(keyword in text for keyword in optional_keywords):
            return False
        else:
            # Treat id/type/name-like fields as required by default
            return field_name.lower() in ['id', 'type', 'name', '識別子', '種別', '名称']
    def _identify_data_types(self, text: str) -> List[str]:
        """Identify the data types referenced in the text."""
        types = set()
        if re.search(r'座標|緯度|経度|location|coordinate', text, re.IGNORECASE):
            types.add('geo:json')
        if re.search(r'日時|datetime|timestamp|時刻', text, re.IGNORECASE):
            types.add('DateTime')
        if re.search(r'数値|number|値|容量|人数', text, re.IGNORECASE):
            types.add('Number')
        if re.search(r'住所|address', text, re.IGNORECASE):
            types.add('PostalAddress')
        if re.search(r'URL|リンク|link', text, re.IGNORECASE):
            types.add('URL')
        return list(types)
    def _find_relationships(self, text: str) -> List[str]:
        """Identify relationships to other entities."""
        relationships = []
        # Entity names that may indicate a relationship
        entity_patterns = [
            r'避難所',
            r'避難場所',
            r'防災施設',
            r'医療機関',
            r'AED',
            r'センサー',
            r'カメラ',
            r'観光地',
            r'イベント'
        ]
        for pattern in entity_patterns:
            if re.search(pattern, text):
                relationships.append(pattern)
        return relationships[:5]  # cap at 5 relationships
    def _extract_constraints(self, text: str) -> Dict[str, Any]:
        """Extract constraint information."""
        constraints = {}
        # Numeric constraint patterns (最大/最小/上限/下限 followed by a number)
        number_patterns = [
            r'最大(\d+)',
            r'最小(\d+)',
            r'上限(\d+)',
            r'下限(\d+)'
        ]
        for pattern in number_patterns:
            matches = re.findall(pattern, text)
            if matches:
                # Use the Japanese keyword itself as the constraint key
                constraint_type = pattern.replace(r'(\d+)', '').replace('\\', '')
                constraints[constraint_type] = int(matches[0])
        return constraints
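
    # Illustrative example: text containing "収容人数は最大100人" yields {'最大': 100};
    # constraint keys stay as the Japanese keywords ('最大', '最小', '上限', '下限').
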
    def _extract_examples(self, text: str) -> List[str]:
        """Extract example or sample values."""
        examples = []
        # Patterns that introduce examples
        example_patterns = [
            r'例\s*[::]\s*([^\n]+)',
            r'サンプル\s*[::]\s*([^\n]+)',
            r'具体例\s*[::]\s*([^\n]+)'
        ]
        for pattern in example_patterns:
            matches = re.findall(pattern, text)
            for match in matches:
                examples.append(match.strip()[:100])  # cap each example at 100 characters
        return examples[:3]  # at most 3 examples
    def _generate_description(self, entity_name: str, text: str) -> str:
        """Generate a description for the entity."""
        # Build the description from the first few lines of extracted text
        lines = text.split('\n')[:10]  # first 10 lines
        description_text = ' '.join(line.strip() for line in lines if line.strip())
        if len(description_text) > 300:
            description_text = description_text[:300] + "..."
        return description_text if description_text else f"{entity_name}に関する情報を管理するエンティティ"
    def generate_enhanced_json_schema(self, pdf_path: Path, category: str) -> Dict[str, Any]:
        """Generate an enhanced JSON schema from a PDF."""
        entity_type = pdf_path.stem
        entity_name_jp = self._get_japanese_name(entity_type)
        # Extract and analyze the PDF text
        pdf_text = self.extract_text_from_pdf(pdf_path)
        analysis = self.analyze_pdf_content(pdf_path, pdf_text)
        # Base schema
        schema = {
            "schema_version": "2.0.0",
            "generated_at": datetime.now().isoformat(),
            "source": {
                "pdf_file": pdf_path.name,
                "pdf_path": str(pdf_path),
                "api_category": self.category_mapping.get(category, category),
                "text_extraction_success": bool(pdf_text)
            },
            "entity": {
                "type": entity_type,
                "name_ja": entity_name_jp,
                "description": analysis["description"],
                "category": self._classify_entity_category(entity_type),
                "fiware_service": "smartcity_yaizu",
                "fiware_service_path": f"/{entity_type}"
            },
            "attributes": self._build_enhanced_attributes(analysis),
            "api_specification": {
                "base_url": "https://api.smartcity-yaizu.jp",
                "endpoints": self._generate_api_endpoints(entity_type),
                "required_headers": {
                    "Fiware-Service": "smartcity_yaizu",
                    "Fiware-ServicePath": f"/{entity_type}",
                    "Content-Type": "application/json"
                }
            },
            "usage_examples": self._generate_usage_examples(entity_type),
            "relationships": {
                "related_entities": analysis["relationships"],
                "potential_links": self._find_potential_entity_links(entity_type)
            },
            "metadata": {
                "data_quality": self._assess_data_quality(analysis),
                "completeness": len(analysis["extracted_fields"]) > 0,
                "last_updated": datetime.now().isoformat()
            }
        }
        return schema
    def _get_japanese_name(self, entity_type: str) -> str:
        """Return the Japanese display name for an entity type."""
        name_mappings = {
            "Aed": "AED設置場所",
            "Event": "イベント一覧",
            "EventDetail": "イベント詳細",
            "PublicFacility": "公共施設",
            "SightseeingMapStore": "観光施設等一覧",
            "FactoryDirectSalesPlace": "工場併設直売所",
            "WeatherAlert": "警報・注意報",
            "WeatherForecast": "天候",
            "EvacuationShelter": "避難所開設状況",
            "EvacuationSpace": "指定緊急避難場所",
            "PrecipitationGauge": "雨量計",
            "StreamGauge": "河川水位計",
            "CameraInformation": "河川・海岸カメラ"
        }
        return name_mappings.get(entity_type, entity_type)
    def _classify_entity_category(self, entity_type: str) -> List[str]:
        """Classify the entity into one or more categories."""
        categories = []
        disaster_keywords = ['Evacuation', 'Disaster', 'Weather', 'Alert', 'Flood', 'Tsunami']
        infrastructure_keywords = ['Facility', 'Tank', 'Warehouse', 'Station', 'Building']
        environmental_keywords = ['Gauge', 'Sensor', 'Camera', 'Information']
        tourism_keywords = ['Event', 'Sightseeing', 'Tourism', 'Factory']
        medical_keywords = ['Aed', 'Hospital', 'Aid', 'Relief']
        entity_lower = entity_type.lower()
        if any(keyword.lower() in entity_lower for keyword in disaster_keywords):
            categories.append("disaster_management")
        if any(keyword.lower() in entity_lower for keyword in infrastructure_keywords):
            categories.append("infrastructure")
        if any(keyword.lower() in entity_lower for keyword in environmental_keywords):
            categories.append("environmental")
        if any(keyword.lower() in entity_lower for keyword in tourism_keywords):
            categories.append("tourism_industry")
        if any(keyword.lower() in entity_lower for keyword in medical_keywords):
            categories.append("medical_emergency")
        return categories if categories else ["general"]
    def _build_enhanced_attributes(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Build the enhanced attribute definitions."""
        attributes = {
            # Standard FIWARE attributes
            "id": {
                "type": "Text",
                "description": "エンティティの一意識別子",
                "required": True,
                "format": "uri",
                "example": f"urn:ngsi-ld:{analysis['entity_type']}:001"
            },
            "type": {
                "type": "Text",
                "description": "エンティティタイプ",
                "required": True,
                "constant": analysis['entity_type']
            },
            "dateCreated": {
                "type": "DateTime",
                "description": "作成日時",
                "required": False,
                "format": "date-time"
            },
            "dateModified": {
                "type": "DateTime",
                "description": "更新日時",
                "required": False,
                "format": "date-time"
            }
        }
        # Location (present on most entities)
        attributes["location"] = {
            "type": "geo:json",
            "description": "地理的位置情報",
            "required": False,
            "properties": {
                "type": {"type": "Text", "enum": ["Point"]},
                "coordinates": {"type": "Array", "items": "Number", "minItems": 2, "maxItems": 2}
            }
        }
        # Address information
        attributes["address"] = {
            "type": "PostalAddress",
            "description": "住所情報",
            "required": False,
            "properties": {
                "addressCountry": {"type": "Text", "default": "JP"},
                "addressRegion": {"type": "Text", "default": "静岡県"},
                "addressLocality": {"type": "Text", "default": "焼津市"},
                "streetAddress": {"type": "Text"}
            }
        }
        # Add the fields extracted from the PDF
        for field in analysis.get("extracted_fields", []):
            attributes[field["name"]] = {
                "type": field["type"],
                "description": field["description"],
                "required": field["required"]
            }
        return attributes
    def _generate_api_endpoints(self, entity_type: str) -> List[Dict[str, Any]]:
        """Generate API endpoint descriptions."""
        endpoints = [
            {
                "name": "全エンティティ取得",
                "method": "GET",
                "path": "/v2/entities",
                "description": f"全ての{entity_type}エンティティを取得",
                "parameters": {
                    "type": entity_type,
                    "limit": {"type": "integer", "default": 100},
                    "offset": {"type": "integer", "default": 0}
                }
            },
            {
                "name": "ID指定取得",
                "method": "GET",
                "path": "/v2/entities/{entityId}",
                "description": f"特定の{entity_type}エンティティを取得",
                "path_parameters": {
                    "entityId": {"type": "string", "description": "エンティティID"}
                }
            },
            {
                "name": "地理的検索",
                "method": "GET",
                "path": "/v2/entities",
                "description": f"地理的範囲内の{entity_type}エンティティを検索",
                "parameters": {
                    "type": entity_type,
                    "georel": {"type": "string", "example": "near;maxDistance:1000"},
                    "geometry": {"type": "string", "example": "point"},
                    "coords": {"type": "string", "example": "34.866,138.321"}
                }
            }
        ]
        return endpoints
    def _generate_usage_examples(self, entity_type: str) -> List[Dict[str, Any]]:
        """Generate usage examples."""
        examples = [
            {
                "name": "基本検索",
                "description": f"{entity_type}の一覧を取得",
                "curl_example": f"""curl -X GET "https://api.smartcity-yaizu.jp/v2/entities?type={entity_type}&limit=10" \\
  -H "Fiware-Service: smartcity_yaizu" \\
  -H "Fiware-ServicePath: /{entity_type}" """
            },
            {
                "name": "近隣検索",
                "description": "現在地から1km以内の施設を検索",
                "curl_example": f"""curl -X GET "https://api.smartcity-yaizu.jp/v2/entities?type={entity_type}&georel=near;maxDistance:1000&geometry=point&coords=34.866,138.321" \\
  -H "Fiware-Service: smartcity_yaizu" \\
  -H "Fiware-ServicePath: /{entity_type}" """
            }
        ]
        return examples
    def _find_potential_entity_links(self, entity_type: str) -> List[str]:
        """Identify potentially related entity types."""
        relationships = {
            "Aed": ["FirstAidStation", "ReliefHospital", "EvacuationShelter"],
            "EvacuationShelter": ["EvacuationSpace", "Aed", "FirstAidStation"],
            "WeatherAlert": ["WeatherForecast", "PrecipitationGauge", "StreamGauge"],
            "Event": ["EventDetail", "SightseeingMapStore"],
            "PublicFacility": ["Aed"]
        }
        return relationships.get(entity_type, [])
    def _assess_data_quality(self, analysis: Dict[str, Any]) -> str:
        """Assess the quality of the extracted data."""
        score = 0
        if len(analysis.get("extracted_fields", [])) > 0:
            score += 30
        if len(analysis.get("data_types", [])) > 0:
            score += 20
        if len(analysis.get("relationships", [])) > 0:
            score += 20
        if len(analysis.get("examples", [])) > 0:
            score += 15
        if len(analysis.get("constraints", {})) > 0:
            score += 15
        if score >= 80:
            return "high"
        elif score >= 50:
            return "medium"
        else:
            return "low"
    async def process_all_pdfs(self) -> Dict[str, Any]:
        """Process all PDF files and generate JSON schemas."""
        print("="*60)
        print("PDF→JSON 完全再生成ツール")
        print("="*60)
        results = {
            "generated_at": datetime.now().isoformat(),
            "total_generated": 0,
            "by_category": {},
            "generated_files": []
        }
        # Process PDFs per category directory
        for category_dir in self.documentation_dir.iterdir():
            if category_dir.is_dir() and category_dir.name in self.category_mapping:
                category = category_dir.name
                print(f"\n📂 処理中: {category} ({self.category_mapping[category]})")
                pdf_files = list(category_dir.glob("*.pdf"))
                print(f" 📄 PDF数: {len(pdf_files)} ファイル")
                category_results = []
                for pdf_file in pdf_files:
                    print(f" 🔄 生成中: {pdf_file.name}")
                    # Generate the JSON schema
                    json_schema = self.generate_enhanced_json_schema(pdf_file, category)
                    # Save it as a JSON file
                    json_filename = f"{pdf_file.stem}.json"
                    json_path = self.api_specs_dir / json_filename
                    with open(json_path, 'w', encoding='utf-8') as f:
                        json.dump(json_schema, f, ensure_ascii=False, indent=2)
                    category_results.append({
                        "pdf_file": pdf_file.name,
                        "json_file": json_filename,
                        "entity_type": pdf_file.stem,
                        "file_size": json_path.stat().st_size
                    })
                    print(f" ✅ 生成完了: {json_filename} ({json_path.stat().st_size:,} bytes)")
                results["by_category"][category] = {
                    "category_name": self.category_mapping[category],
                    "pdf_count": len(pdf_files),
                    "generated_count": len(category_results),
                    "files": category_results
                }
                results["total_generated"] += len(category_results)
                results["generated_files"].extend(category_results)
        # Write the combined index file
        index_file = self.api_specs_dir / "index.json"
        with open(index_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"\n📊 生成完了サマリー:")
        print(f" 📄 総生成数: {results['total_generated']} ファイル")
        for category, info in results["by_category"].items():
            print(f" 📂 {info['category_name']}: {info['generated_count']} ファイル")
        print(f" 📋 インデックス: {index_file}")
        print(f" 💾 保存先: {self.api_specs_dir}")
        return results

async def main():
    """Main entry point."""
    generator = PDFToJSONGenerator()
    results = await generator.process_all_pdfs()
    print(f"\n🎉 PDF→JSON変換が完了しました!")
    print(f"📊 {results['total_generated']}個のJSONファイルが生成されました。")

if __name__ == "__main__":
    asyncio.run(main())
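
# Usage sketch (assumptions: the script is saved as e.g. pdf_to_json_generator.py and the
# data/documentation/<category>/ directories exist; it takes no command-line arguments):
#   $ pip install pymupdf PyPDF2
#   $ python pdf_to_json_generator.py
# Schemas are written to data/api_specs/, one per PDF, plus an index.json summary.
# If the PDF libraries are missing, schemas are still generated but contain only the
# standard template attributes (no fields extracted from the PDFs).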