Law Scrapper MCP

content_processor.py•3.23 KiB

"""Content processing for legal acts - HTML/PDF to Markdown conversion.""" import logging import re from dataclasses import dataclass logger = logging.getLogger(__name__) @dataclass class Section: """Represents a section in a legal document.""" id: str title: str level: int # heading level (1=chapter, 2=article, 3=paragraph) start_pos: int end_pos: int | None = None content: str = "" class ContentProcessor: """Processes legal act content from HTML/PDF to structured Markdown.""" def html_to_markdown(self, html: str) -> str: """Convert HTML to Markdown using markdownify.""" from markdownify import markdownify md = markdownify(html, heading_style="ATX", strip=["img", "script", "style"]) # Clean up: remove excessive blank lines, normalize whitespace md = re.sub(r"\n{3,}", "\n\n", md) return md.strip() def pdf_to_text(self, pdf_bytes: bytes) -> str: """Extract text from PDF using pdfplumber.""" try: import io import pdfplumber text_parts = [] with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: for page in pdf.pages: page_text = page.extract_text() if page_text: text_parts.append(page_text) if not text_parts: return "" return "\n\n".join(text_parts) except Exception as e: logger.warning(f"PDF extraction failed: {e}") return "" def index_sections(self, markdown: str) -> list[Section]: """Parse markdown and create section index based on headings.""" sections = [] # Match markdown headings and also Polish legal patterns like "Art. 1.", "Rozdział 1" heading_pattern = re.compile( r"^(#{1,6})\s+(.+)$|^(Art\.\s*\d+[a-z]?\.?)(.*)$|^(Rozdział\s+\w+)(.*)$|^(DZIAŁ\s+\w+)(.*)$", re.MULTILINE, ) matches = list(heading_pattern.finditer(markdown)) for i, match in enumerate(matches): if match.group(1): # Markdown heading level = len(match.group(1)) title = match.group(2).strip() elif match.group(3): # Art. pattern level = 2 title = (match.group(3) + (match.group(4) or "")).strip() elif match.group(5): # Rozdział pattern level = 1 title = (match.group(5) + (match.group(6) or "")).strip() elif match.group(7): # DZIAŁ pattern level = 1 title = (match.group(7) + (match.group(8) or "")).strip() else: continue start_pos = match.start() end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(markdown) section_id = re.sub(r"[^\w\s.-]", "", title).strip().replace(" ", "_")[:50] sections.append( Section( id=section_id, title=title, level=level, start_pos=start_pos, end_pos=end_pos, content=markdown[start_pos:end_pos].strip(), ) ) return sections

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/numikel/law-scrapper-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

content_processor.py•3.23 KiB