Yaizu Smart City MCP Server

by moma1992
pdf_to_json_generator.py (24.5 kB)
#!/usr/bin/env python3
"""
PDF-to-JSON generation tool.

Parses PDF files in detail and generates rich JSON data models.
"""
import asyncio
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional

# PDF parsing libraries
try:
    import PyPDF2
    import fitz  # PyMuPDF
    PDF_LIBRARIES_AVAILABLE = True
except ImportError:
    print("⚠️ PDF解析ライブラリが見つかりません。基本的なテンプレートで生成します。")
    PDF_LIBRARIES_AVAILABLE = False


class PDFToJSONGenerator:
    """Generates JSON schemas from parsed PDF files."""

    def __init__(self):
        self.documentation_dir = Path("data/documentation")
        self.api_specs_dir = Path("data/api_specs")
        self.api_specs_dir.mkdir(parents=True, exist_ok=True)

        # OpenAPI category mapping
        self.category_mapping = {
            "bousai-api": "防災情報API",
            "public-facility-api": "公共施設API",
            "tourism-api": "観光・産業API"
        }

    def extract_text_from_pdf(self, pdf_path: Path) -> str:
        """Extract text from a PDF file."""
        if not PDF_LIBRARIES_AVAILABLE:
            return ""

        text = ""
        try:
            # Extract text with PyMuPDF
            doc = fitz.open(pdf_path)
            for page in doc:
                text += page.get_text()
            doc.close()
        except Exception as e:
            print(f" ⚠️ PDF読み込みエラー: {pdf_path.name} - {e}")
            try:
                # Fallback: try PyPDF2
                with open(pdf_path, 'rb') as f:
                    reader = PyPDF2.PdfReader(f)
                    for page in reader.pages:
                        text += page.extract_text()
            except Exception as e2:
                print(f" ❌ PDF読み込み失敗: {pdf_path.name} - {e2}")

        return text

    def analyze_pdf_content(self, pdf_path: Path, text: str) -> Dict[str, Any]:
        """Analyze PDF content and extract metadata."""
        entity_name = pdf_path.stem

        # Extract information from the text
        analysis = {
            "entity_type": entity_name,
            "extracted_fields": self._extract_field_information(text),
            "data_types": self._identify_data_types(text),
            "relationships": self._find_relationships(text),
            "constraints": self._extract_constraints(text),
            "examples": self._extract_examples(text),
            "description": self._generate_description(entity_name, text)
        }

        return analysis

    def _extract_field_information(self, text: str) -> List[Dict[str, Any]]:
        """Extract field information from the text."""
        fields = []

        # Patterns for field names (Japanese and English)
        field_patterns = [
            r'([a-zA-Z_][a-zA-Z0-9_]*)\s*:\s*([^\n]+)',  # field: description
            r'・\s*([^:\n]+):([^\n]+)',  # ・field name:description
            r'項目名\s*[::]\s*([^\n]+)',  # item name: value
            r'属性\s*[::]\s*([^\n]+)',  # attribute: value
        ]

        for pattern in field_patterns:
            matches = re.findall(pattern, text)
            for match in matches:
                # re.findall returns plain strings for single-group patterns;
                # only (name, description) pairs can be unpacked below.
                if not isinstance(match, tuple) or len(match) != 2:
                    continue
                field_name, description = match
                field_name = field_name.strip()
                description = description.strip()

                if len(field_name) > 1 and len(description) > 1:
                    # Guess the data type
                    data_type = self._guess_data_type(field_name, description)

                    fields.append({
                        "name": self._normalize_field_name(field_name),
                        "description": description[:200],  # limit description to 200 characters
                        "type": data_type,
                        "required": self._is_required_field(field_name, description)
                    })

        return fields[:10]  # cap at 10 fields

    def _normalize_field_name(self, name: str) -> str:
        """Normalize a field name."""
        # Map Japanese terms to English names (simplified)
        replacements = {
            '名前': 'name', '名称': 'name',
            '住所': 'address',
            '位置': 'location', '座標': 'coordinates',
            'ID': 'id', '識別子': 'id',
            '種別': 'type', '分類': 'category',
            '状態': 'status', '状況': 'status',
            '日時': 'dateTime', '時刻': 'time',
            '値': 'value',
            '容量': 'capacity', '人数': 'capacity'
        }

        for jp, en in replacements.items():
            if jp in name:
                return en

        # Keep alphanumeric characters only
        normalized = re.sub(r'[^a-zA-Z0-9]', '_', name)
        normalized = re.sub(r'_+', '_', normalized).strip('_')

        return normalized.lower() if normalized else 'unknown_field'

    def _guess_data_type(self, field_name: str, description: str) -> str:
        """Guess the data type from a field name and description."""
        text = (field_name + " " + description).lower()

        if any(keyword in text for keyword in ['座標', 'coordinate', '緯度', '経度', 'location']):
            return 'geo:json'
        elif any(keyword in text for keyword in ['日時', 'datetime', '時刻', 'time', '日付', 'date']):
            return 'DateTime'
        elif any(keyword in text for keyword in ['数', 'number', '値', 'value', '容量', 'capacity', '人数']):
            return 'Number'
        elif any(keyword in text for keyword in ['true', 'false', 'boolean', '有無', 'フラグ']):
            return 'Boolean'
        elif 'id' in text or '識別' in text:
            return 'Text'
        else:
            return 'Text'

    def _is_required_field(self, field_name: str, description: str) -> bool:
        """Decide whether a field is required."""
        text = (field_name + " " + description).lower()

        # Keywords suggesting required vs. optional fields
        required_keywords = ['必須', 'required', 'id', '識別子', 'type', '種別']
        optional_keywords = ['任意', 'optional', '可能', 'オプション']

        if any(keyword in text for keyword in required_keywords):
            return True
        elif any(keyword in text for keyword in optional_keywords):
            return False
        else:
            # Treat ID/type-like fields as required
            return field_name.lower() in ['id', 'type', 'name', '識別子', '種別', '名称']

    def _identify_data_types(self, text: str) -> List[str]:
        """Identify the data types used in the text."""
        types = set()

        if re.search(r'座標|緯度|経度|location|coordinate', text, re.IGNORECASE):
            types.add('geo:json')
        if re.search(r'日時|datetime|timestamp|時刻', text, re.IGNORECASE):
            types.add('DateTime')
        if re.search(r'数値|number|値|容量|人数', text, re.IGNORECASE):
            types.add('Number')
        if re.search(r'住所|address', text, re.IGNORECASE):
            types.add('PostalAddress')
        if re.search(r'URL|リンク|link', text, re.IGNORECASE):
            types.add('URL')

        return list(types)

    def _find_relationships(self, text: str) -> List[str]:
        """Identify relationships to other entities."""
        relationships = []

        # Search for potentially related entity names
        entity_patterns = [
            r'避難所', r'避難場所', r'防災施設', r'医療機関',
            r'AED', r'センサー', r'カメラ', r'観光地', r'イベント'
        ]

        for pattern in entity_patterns:
            if re.search(pattern, text):
                relationships.append(pattern)

        return relationships[:5]  # cap at 5 relationships

    def _extract_constraints(self, text: str) -> Dict[str, Any]:
        """Extract constraints."""
        constraints = {}

        # Search for numeric constraints
        number_patterns = [
            r'最大(\d+)', r'最小(\d+)', r'上限(\d+)', r'下限(\d+)'
        ]

        for pattern in number_patterns:
            matches = re.findall(pattern, text)
            if matches:
                constraint_type = pattern.replace(r'(\d+)', '').replace('\\', '')
                constraints[constraint_type] = int(matches[0])

        return constraints

    def _extract_examples(self, text: str) -> List[str]:
        """Extract examples and sample values."""
        examples = []

        # Patterns that introduce examples
        example_patterns = [
            r'例\s*[::]\s*([^\n]+)',
            r'サンプル\s*[::]\s*([^\n]+)',
            r'具体例\s*[::]\s*([^\n]+)'
        ]

        for pattern in example_patterns:
            matches = re.findall(pattern, text)
            for match in matches:
                examples.append(match.strip()[:100])  # limit to 100 characters

        return examples[:3]  # at most 3 examples

    def _generate_description(self, entity_name: str, text: str) -> str:
        """Generate a description for the entity."""
        # Build a description from the first few lines of text
        lines = text.split('\n')[:10]  # first 10 lines
        description_text = ' '.join(line.strip() for line in lines if line.strip())

        if len(description_text) > 300:
            description_text = description_text[:300] + "..."

        return description_text if description_text else f"{entity_name}に関する情報を管理するエンティティ"

    def generate_enhanced_json_schema(self, pdf_path: Path, category: str) -> Dict[str, Any]:
        """Generate an enhanced JSON schema from a PDF."""
        entity_type = pdf_path.stem
        entity_name_jp = self._get_japanese_name(entity_type)

        # Extract and analyze the PDF text
        pdf_text = self.extract_text_from_pdf(pdf_path)
        analysis = self.analyze_pdf_content(pdf_path, pdf_text)

        # Base schema
        schema = {
            "schema_version": "2.0.0",
            "generated_at": datetime.now().isoformat(),
            "source": {
                "pdf_file": pdf_path.name,
                "pdf_path": str(pdf_path),
                "api_category": self.category_mapping.get(category, category),
                "text_extraction_success": bool(pdf_text)
            },
            "entity": {
                "type": entity_type,
                "name_ja": entity_name_jp,
                "description": analysis["description"],
                "category": self._classify_entity_category(entity_type),
                "fiware_service": "smartcity_yaizu",
                "fiware_service_path": f"/{entity_type}"
            },
            "attributes": self._build_enhanced_attributes(analysis),
            "api_specification": {
                "base_url": "https://api.smartcity-yaizu.jp",
                "endpoints": self._generate_api_endpoints(entity_type),
                "required_headers": {
                    "Fiware-Service": "smartcity_yaizu",
                    "Fiware-ServicePath": f"/{entity_type}",
                    "Content-Type": "application/json"
                }
            },
            "usage_examples": self._generate_usage_examples(entity_type),
            "relationships": {
                "related_entities": analysis["relationships"],
                "potential_links": self._find_potential_entity_links(entity_type)
            },
            "metadata": {
                "data_quality": self._assess_data_quality(analysis),
                "completeness": len(analysis["extracted_fields"]) > 0,
                "last_updated": datetime.now().isoformat()
            }
        }

        return schema

    def _get_japanese_name(self, entity_type: str) -> str:
        """Infer the Japanese name from an entity type."""
        name_mappings = {
            "Aed": "AED設置場所",
            "Event": "イベント一覧",
            "EventDetail": "イベント詳細",
            "PublicFacility": "公共施設",
            "SightseeingMapStore": "観光施設等一覧",
            "FactoryDirectSalesPlace": "工場併設直売所",
            "WeatherAlert": "警報・注意報",
            "WeatherForecast": "天候",
            "EvacuationShelter": "避難所開設状況",
            "EvacuationSpace": "指定緊急避難場所",
            "PrecipitationGauge": "雨量計",
            "StreamGauge": "河川水位計",
            "CameraInformation": "河川・海岸カメラ"
        }
        return name_mappings.get(entity_type, entity_type)

    def _classify_entity_category(self, entity_type: str) -> List[str]:
        """Classify the entity into categories."""
        categories = []

        disaster_keywords = ['Evacuation', 'Disaster', 'Weather', 'Alert', 'Flood', 'Tsunami']
        infrastructure_keywords = ['Facility', 'Tank', 'Warehouse', 'Station', 'Building']
        environmental_keywords = ['Gauge', 'Sensor', 'Camera', 'Information']
        tourism_keywords = ['Event', 'Sightseeing', 'Tourism', 'Factory']
        medical_keywords = ['Aed', 'Hospital', 'Aid', 'Relief']

        entity_lower = entity_type.lower()

        if any(keyword.lower() in entity_lower for keyword in disaster_keywords):
            categories.append("disaster_management")
        if any(keyword.lower() in entity_lower for keyword in infrastructure_keywords):
            categories.append("infrastructure")
        if any(keyword.lower() in entity_lower for keyword in environmental_keywords):
            categories.append("environmental")
        if any(keyword.lower() in entity_lower for keyword in tourism_keywords):
            categories.append("tourism_industry")
        if any(keyword.lower() in entity_lower for keyword in medical_keywords):
            categories.append("medical_emergency")

        return categories if categories else ["general"]

    def _build_enhanced_attributes(self, analysis: Dict[str, Any]) -> Dict[str, Any]:
        """Build the enhanced attribute definitions."""
        attributes = {
            # Standard FIWARE attributes
            "id": {
                "type": "Text",
                "description": "エンティティの一意識別子",
                "required": True,
                "format": "uri",
                "example": f"urn:ngsi-ld:{analysis['entity_type']}:001"
            },
            "type": {
                "type": "Text",
                "description": "エンティティタイプ",
                "required": True,
                "constant": analysis['entity_type']
            },
            "dateCreated": {
                "type": "DateTime",
                "description": "作成日時",
                "required": False,
                "format": "date-time"
            },
            "dateModified": {
                "type": "DateTime",
                "description": "更新日時",
                "required": False,
                "format": "date-time"
            }
        }

        # Location (present on most entities)
        attributes["location"] = {
            "type": "geo:json",
            "description": "地理的位置情報",
            "required": False,
            "properties": {
                "type": {"type": "Text", "enum": ["Point"]},
                "coordinates": {"type": "Array", "items": "Number", "minItems": 2, "maxItems": 2}
            }
        }

        # Address information
        attributes["address"] = {
            "type": "PostalAddress",
            "description": "住所情報",
            "required": False,
            "properties": {
                "addressCountry": {"type": "Text", "default": "JP"},
                "addressRegion": {"type": "Text", "default": "静岡県"},
                "addressLocality": {"type": "Text", "default": "焼津市"},
                "streetAddress": {"type": "Text"}
            }
        }

        # Add fields extracted from the PDF
        for field in analysis.get("extracted_fields", []):
            attributes[field["name"]] = {
                "type": field["type"],
                "description": field["description"],
                "required": field["required"]
            }

        return attributes

    def _generate_api_endpoints(self, entity_type: str) -> List[Dict[str, Any]]:
        """Generate API endpoint information."""
        endpoints = [
            {
                "name": "全エンティティ取得",
                "method": "GET",
                "path": "/v2/entities",
                "description": f"全ての{entity_type}エンティティを取得",
                "parameters": {
                    "type": entity_type,
                    "limit": {"type": "integer", "default": 100},
                    "offset": {"type": "integer", "default": 0}
                }
            },
            {
                "name": "ID指定取得",
                "method": "GET",
                "path": "/v2/entities/{entityId}",
                "description": f"特定の{entity_type}エンティティを取得",
                "path_parameters": {
                    "entityId": {"type": "string", "description": "エンティティID"}
                }
            },
            {
                "name": "地理的検索",
                "method": "GET",
                "path": "/v2/entities",
                "description": f"地理的範囲内の{entity_type}エンティティを検索",
                "parameters": {
                    "type": entity_type,
                    "georel": {"type": "string", "example": "near;maxDistance:1000"},
                    "geometry": {"type": "string", "example": "point"},
                    "coords": {"type": "string", "example": "34.866,138.321"}
                }
            }
        ]
        return endpoints

    def _generate_usage_examples(self, entity_type: str) -> List[Dict[str, Any]]:
        """Generate usage examples."""
        examples = [
            {
                "name": "基本検索",
                "description": f"{entity_type}の一覧を取得",
                "curl_example": f"""curl -X GET "https://api.smartcity-yaizu.jp/v2/entities?type={entity_type}&limit=10" \\
  -H "Fiware-Service: smartcity_yaizu" \\
  -H "Fiware-ServicePath: /{entity_type}"
"""
            },
            {
                "name": "近隣検索",
                "description": "現在地から1km以内の施設を検索",
                "curl_example": f"""curl -X GET "https://api.smartcity-yaizu.jp/v2/entities?type={entity_type}&georel=near;maxDistance:1000&geometry=point&coords=34.866,138.321" \\
  -H "Fiware-Service: smartcity_yaizu" \\
  -H "Fiware-ServicePath: /{entity_type}"
"""
            }
        ]
        return examples

    def _find_potential_entity_links(self, entity_type: str) -> List[str]:
        """Identify potentially related entities."""
        relationships = {
            "Aed": ["FirstAidStation", "ReliefHospital", "EvacuationShelter"],
            "EvacuationShelter": ["EvacuationSpace", "Aed", "FirstAidStation"],
            "WeatherAlert": ["WeatherForecast", "PrecipitationGauge", "StreamGauge"],
            "Event": ["EventDetail", "SightseeingMapStore"],
            "PublicFacility": ["Aed"]
        }
        return relationships.get(entity_type, [])

    def _assess_data_quality(self, analysis: Dict[str, Any]) -> str:
        """Assess data quality."""
        score = 0

        if len(analysis.get("extracted_fields", [])) > 0:
            score += 30
        if len(analysis.get("data_types", [])) > 0:
            score += 20
        if len(analysis.get("relationships", [])) > 0:
            score += 20
        if len(analysis.get("examples", [])) > 0:
            score += 15
        if len(analysis.get("constraints", {})) > 0:
            score += 15

        if score >= 80:
            return "high"
        elif score >= 50:
            return "medium"
        else:
            return "low"

    async def process_all_pdfs(self) -> Dict[str, Any]:
        """Process all PDF files and generate JSON."""
        print("="*60)
        print("PDF→JSON 完全再生成ツール")
        print("="*60)

        results = {
            "generated_at": datetime.now().isoformat(),
            "total_generated": 0,
            "by_category": {},
            "generated_files": []
        }

        # Process PDFs per category
        for category_dir in self.documentation_dir.iterdir():
            if category_dir.is_dir() and category_dir.name in self.category_mapping:
                category = category_dir.name
                print(f"\n📂 処理中: {category} ({self.category_mapping[category]})")

                pdf_files = list(category_dir.glob("*.pdf"))
                print(f" 📄 PDF数: {len(pdf_files)} ファイル")

                category_results = []

                for pdf_file in pdf_files:
                    print(f" 🔄 生成中: {pdf_file.name}")

                    # Generate the JSON schema
                    json_schema = self.generate_enhanced_json_schema(pdf_file, category)

                    # Save the JSON file
                    json_filename = f"{pdf_file.stem}.json"
                    json_path = self.api_specs_dir / json_filename

                    with open(json_path, 'w', encoding='utf-8') as f:
                        json.dump(json_schema, f, ensure_ascii=False, indent=2)

                    category_results.append({
                        "pdf_file": pdf_file.name,
                        "json_file": json_filename,
                        "entity_type": pdf_file.stem,
                        "file_size": json_path.stat().st_size
                    })

                    print(f" ✅ 生成完了: {json_filename} ({json_path.stat().st_size:,} bytes)")

                results["by_category"][category] = {
                    "category_name": self.category_mapping[category],
                    "pdf_count": len(pdf_files),
                    "generated_count": len(category_results),
                    "files": category_results
                }
                results["total_generated"] += len(category_results)
                results["generated_files"].extend(category_results)

        # Generate the combined index file
        index_file = self.api_specs_dir / "index.json"
        with open(index_file, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"\n📊 生成完了サマリー:")
        print(f" 📄 総生成数: {results['total_generated']} ファイル")
        for category, info in results["by_category"].items():
            print(f" 📂 {info['category_name']}: {info['generated_count']} ファイル")

        print(f" 📋 インデックス: {index_file}")
        print(f" 💾 保存先: {self.api_specs_dir}")

        return results


async def main():
    """Main entry point."""
    generator = PDFToJSONGenerator()
    results = await generator.process_all_pdfs()

    print(f"\n🎉 PDF→JSON変換が完了しました!")
    print(f"📊 {results['total_generated']}個のJSONファイルが生成されました。")


if __name__ == "__main__":
    asyncio.run(main())
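A minimal usage sketch (not part of the file above), assuming the data/documentation/<category>/*.pdf layout created in __init__ exists locally; Aed.pdf is a hypothetical input file:

import asyncio
from pathlib import Path
from pdf_to_json_generator import PDFToJSONGenerator

generator = PDFToJSONGenerator()

# Generate a schema for a single (hypothetical) PDF in the bousai-api category
schema = generator.generate_enhanced_json_schema(
    Path("data/documentation/bousai-api/Aed.pdf"),  # hypothetical path
    "bousai-api",
)
print(schema["entity"]["name_ja"], schema["metadata"]["data_quality"])

# Or run the full batch, exactly as the __main__ block does
asyncio.run(generator.process_all_pdfs())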

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/moma1992/smartcity-mcp'
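The same record can also be fetched from Python; a short sketch, assuming the endpoint is public and returns JSON:

import json
import urllib.request

url = "https://glama.ai/api/mcp/v1/servers/moma1992/smartcity-mcp"
with urllib.request.urlopen(url) as resp:  # no authentication assumed
    server_info = json.load(resp)
print(json.dumps(server_info, ensure_ascii=False, indent=2))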

If you have feedback or need assistance with the MCP directory API, please join our Discord server.