get_kvkk_document_markdown
Extract and format KVKK decision documents into Markdown with metadata for simplified access and analysis on the Yargı MCP server.
Instructions
Get KVKK decision document in Markdown format with metadata extraction
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| decision_url | Yes | KVKK decision URL from search results | |
| page_number | No | Page number for paginated Markdown content (1-indexed, accepts int). Default is 1 (first 5,000 characters). |
Implementation Reference
- mcp_server_main.py:1704-1766 (handler)MCP tool registration and handler function for get_kvkk_document_markdown. Thin wrapper that validates input URL, calls KvkkApiClient.get_decision_document, and handles errors by returning structured error responses using the KvkkDocumentMarkdown model.description="Get KVKK decision document in Markdown format with metadata extraction", annotations={ "readOnlyHint": True, "openWorldHint": False, "idempotentHint": True } ) async def get_kvkk_document_markdown( decision_url: str = Field(..., description="KVKK decision URL from search results"), page_number: int = Field(1, ge=1, description="Page number for paginated Markdown content (1-indexed, accepts int). Default is 1 (first 5,000 characters).") ) -> Dict[str, Any]: """Get KVKK decision as paginated Markdown.""" logger.info(f"KVKK document retrieval tool called for URL: {decision_url}") if not decision_url or not decision_url.strip(): return KvkkDocumentMarkdown( source_url=HttpUrl("https://www.kvkk.gov.tr"), title=None, decision_date=None, decision_number=None, subject_summary=None, markdown_chunk=None, current_page=page_number or 1, total_pages=0, is_paginated=False, error_message="Decision URL is required and cannot be empty." ).model_dump() try: # Validate URL format if not decision_url.startswith("https://www.kvkk.gov.tr/"): return KvkkDocumentMarkdown( source_url=HttpUrl(decision_url), title=None, decision_date=None, decision_number=None, subject_summary=None, markdown_chunk=None, current_page=page_number or 1, total_pages=0, is_paginated=False, error_message="Invalid KVKK decision URL format. URL must start with https://www.kvkk.gov.tr/" ).model_dump() result = await kvkk_client_instance.get_decision_document(decision_url, page_number or 1) logger.info(f"KVKK document retrieved successfully. 
Page {result.current_page}/{result.total_pages}, Content length: {len(result.markdown_chunk) if result.markdown_chunk else 0}") return result.model_dump() except Exception as e: logger.exception(f"Error retrieving KVKK document: {e}") return KvkkDocumentMarkdown( source_url=HttpUrl(decision_url), title=None, decision_date=None, decision_number=None, subject_summary=None, markdown_chunk=None, current_page=page_number or 1, total_pages=0, is_paginated=False, error_message=f"Error retrieving KVKK document: {str(e)}" ).model_dump()
# kvkk_mcp_module/client.py:279-367 (handler)
# Core implementation of KVKK document retrieval: fetches HTML from
# decision_url, extracts metadata and content using BeautifulSoup, converts
# the HTML to Markdown using MarkItDown, implements pagination by
# 5000-character chunks, and returns a structured KvkkDocumentMarkdown.
async def get_decision_document(self, decision_url: str, page_number: int = 1) -> KvkkDocumentMarkdown:
    """Retrieve and convert a KVKK decision document to paginated Markdown.

    Args:
        decision_url: Full URL of the KVKK decision page.
        page_number: 1-indexed page of the Markdown content to return;
            clamped into the valid range on the success path.

    Returns:
        KvkkDocumentMarkdown carrying either the requested chunk or a
        populated error_message — fetch/parse failures do not raise.
    """
    logger.info(f"KvkkApiClient: Getting decision document from: {decision_url}, page: {page_number}")

    def _error_result(message: str) -> KvkkDocumentMarkdown:
        # Shared builder for error-shaped responses (deduplicates the two
        # previously copy-pasted except blocks).
        # BUGFIX: HttpUrl(decision_url) raises pydantic.ValidationError for a
        # malformed URL, which previously escaped the except handlers; fall
        # back to the KVKK site root so the error response is always built.
        try:
            source = HttpUrl(decision_url)
        except Exception:
            source = HttpUrl("https://www.kvkk.gov.tr")
        return KvkkDocumentMarkdown(
            source_url=source,
            title=None,
            decision_date=None,
            decision_number=None,
            subject_summary=None,
            markdown_chunk=None,
            current_page=page_number,
            total_pages=0,
            is_paginated=False,
            error_message=message
        )

    try:
        # Fetch the decision page
        response = await self.http_client.get(decision_url)
        response.raise_for_status()

        # Extract metadata and raw HTML content from the page
        extracted_data = self._extract_decision_content_from_html(response.text, decision_url)

        # Convert HTML content to Markdown (None when extraction found nothing)
        full_markdown_content = None
        if extracted_data["html_content"]:
            full_markdown_content = self._convert_html_to_markdown(extracted_data["html_content"])

        if not full_markdown_content:
            return KvkkDocumentMarkdown(
                source_url=HttpUrl(decision_url),
                title=extracted_data["title"],
                decision_date=extracted_data["decision_date"],
                decision_number=extracted_data["decision_number"],
                subject_summary=extracted_data["subject_summary"],
                markdown_chunk=None,
                current_page=page_number,
                total_pages=0,
                is_paginated=False,
                error_message="Could not convert document content to Markdown"
            )

        # Calculate pagination over fixed-size character chunks
        content_length = len(full_markdown_content)
        total_pages = math.ceil(content_length / self.DOCUMENT_MARKDOWN_CHUNK_SIZE)
        if total_pages == 0:
            total_pages = 1

        # Clamp page number to the valid range rather than erroring out
        current_page_clamped = max(1, min(page_number, total_pages))

        # Slice out the requested chunk
        start_index = (current_page_clamped - 1) * self.DOCUMENT_MARKDOWN_CHUNK_SIZE
        end_index = start_index + self.DOCUMENT_MARKDOWN_CHUNK_SIZE
        markdown_chunk = full_markdown_content[start_index:end_index]

        return KvkkDocumentMarkdown(
            source_url=HttpUrl(decision_url),
            title=extracted_data["title"],
            decision_date=extracted_data["decision_date"],
            decision_number=extracted_data["decision_number"],
            subject_summary=extracted_data["subject_summary"],
            markdown_chunk=markdown_chunk,
            current_page=current_page_clamped,
            total_pages=total_pages,
            is_paginated=(total_pages > 1),
            error_message=None
        )
    except httpx.HTTPStatusError as e:
        error_msg = f"HTTP error {e.response.status_code} when fetching decision document"
        logger.error(f"KvkkApiClient: {error_msg}")
        return _error_result(error_msg)
    except Exception as e:
        error_msg = f"Unexpected error when fetching decision document: {str(e)}"
        # logger.exception keeps the traceback for genuinely unexpected errors
        logger.exception(f"KvkkApiClient: {error_msg}")
        return _error_result(error_msg)
# kvkk_mcp_module/models.py:33-49 (schema)
# Pydantic model defining the input/output schema for KVKK document
# responses, including paginated markdown chunks, metadata fields, and
# error handling.
class KvkkDocumentMarkdown(BaseModel):
    """Model for KVKK decision document content converted to paginated Markdown."""
    source_url: HttpUrl = Field(description="URL of the original KVKK decision page.")
    title: Optional[str] = Field(None, description="Title of the KVKK decision.")
    decision_date: Optional[str] = Field(None, description="Decision date (Karar Tarihi).")
    decision_number: Optional[str] = Field(None, description="Decision number (Karar No).")
    subject_summary: Optional[str] = Field(None, description="Subject summary (Konu Özeti).")
    markdown_chunk: Optional[str] = Field(None, description="A 5,000 character chunk of the Markdown content.")
    current_page: int = Field(description="The current page number of the markdown chunk (1-indexed).")
    total_pages: int = Field(description="Total number of pages for the full markdown content.")
    is_paginated: bool = Field(description="True if the full markdown content is split into multiple pages.")
    # BUGFIX: the description was a placeholder ("Value"); documented properly.
    error_message: Optional[str] = Field(None, description="Error message if the document could not be retrieved or converted; None on success.")

    class Config:
        # Serialize HttpUrl values as plain strings in JSON output.
        json_encoders = {
            HttpUrl: str
        }
# kvkk_mcp_module/client.py:194-278 (helper)
# Supporting helper methods: _extract_decision_content_from_html parses KVKK
# page HTML for metadata and content using BeautifulSoup;
# _convert_html_to_markdown converts the extracted HTML to Markdown using
# the MarkItDown library.
def _extract_decision_content_from_html(self, html: str, url: str) -> Dict[str, Any]:
    """Extract decision content from KVKK decision page HTML."""
    empty_result = {
        "title": None,
        "decision_date": None,
        "decision_number": None,
        "subject_summary": None,
        "html_content": None,
    }
    try:
        soup = BeautifulSoup(html, 'html.parser')

        # Page title: prefer the blog-post heading, fall back to <title>.
        heading = soup.find('h3', class_='blog-post-title')
        if heading:
            title = heading.get_text(strip=True)
        elif soup.title:
            title = soup.title.get_text(strip=True)
        else:
            title = None

        # Main content container, with a secondary fallback selector.
        container = soup.find('div', class_='blog-post-inner')
        if not container:
            container = soup.find('div', style='text-align:justify;')
        if not container:
            logger.warning(f"Could not find decision content div in {url}")
            return {**empty_result, "title": title}

        # Decision metadata lives in a 3-column table: label / sep / value.
        decision_date = None
        decision_number = None
        subject_summary = None
        table = container.find('table')
        if table:
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                if len(cells) < 3:
                    continue
                label = cells[0].get_text(strip=True)
                value = cells[2].get_text(strip=True)
                if 'Karar Tarihi' in label:
                    decision_date = value
                elif 'Karar No' in label:
                    decision_number = value
                elif 'Konu Özeti' in label:
                    subject_summary = value

        return {
            "title": title,
            "decision_date": decision_date,
            "decision_number": decision_number,
            "subject_summary": subject_summary,
            "html_content": str(container),
        }
    except Exception as e:
        logger.error(f"Error extracting content from HTML for {url}: {e}")
        return dict(empty_result)

def _convert_html_to_markdown(self, html_content: str) -> Optional[str]:
    """Convert HTML content to Markdown using MarkItDown with BytesIO to avoid filename length issues."""
    if not html_content:
        return None
    try:
        # Feed MarkItDown an in-memory byte stream so no temp file is needed.
        stream = io.BytesIO(html_content.encode('utf-8'))
        converter = MarkItDown(enable_plugins=False)
        return converter.convert(stream).text_content
    except Exception as e:
        logger.error(f"Error converting HTML to Markdown: {e}")
        return None