Compliance Scanner MCP

Overview Schema Related Servers Score Discussions

pdf_reader.py•7.54 KiB

"""PDF processing module.

- read_pdf_to_text: convert a PDF to text
- search_keyword_in_pdf: search a PDF for a keyword
- get_pdf_info: extract basic PDF metadata
- list_regulation_files: list the files stored under regulations/
"""

from pathlib import Path
from typing import Optional, Dict, List

import pdfplumber

# Security: every path handed to this module must resolve inside this folder.
BASE_REGULATIONS_DIR = Path(__file__).parent.parent.parent / "regulations"


def _validate_path(pdf_path: str) -> Path:
    """Validate a PDF path and return it as a resolved absolute path.

    Absolute paths are used as given. Relative paths that start with
    "regulations/" are resolved against the directory three levels above
    this file; any other relative path is resolved against
    BASE_REGULATIONS_DIR. The result is then required to lie inside
    BASE_REGULATIONS_DIR.

    Args:
        pdf_path: Path to the PDF, absolute or relative.

    Returns:
        The resolved absolute path.

    Raises:
        ValueError: The resolved path is outside the regulations/ folder.
        FileNotFoundError: The resolved path does not exist.
    """
    candidate = Path(pdf_path)
    if not candidate.is_absolute():
        if pdf_path.startswith("regulations/"):
            # The caller already included the folder name, so anchor one
            # level above regulations/ to avoid doubling it.
            candidate = Path(__file__).parent.parent.parent / pdf_path
        else:
            candidate = BASE_REGULATIONS_DIR / pdf_path

    # resolve() collapses ".." segments and follows symlinks, so the
    # containment check below runs against the real target.
    full_path = candidate.resolve()

    try:
        full_path.relative_to(BASE_REGULATIONS_DIR.resolve())
    except ValueError:
        # "from None" hides the internal relative_to() message; the security
        # message is the one callers should see.
        raise ValueError(
            f"보안 오류: regulations/ 폴더 외부 접근이 차단되었습니다: {pdf_path}"
        ) from None

    if not full_path.exists():
        raise FileNotFoundError(f"PDF 파일을 찾을 수 없습니다: {pdf_path}")

    return full_path


def read_pdf_to_text(
    pdf_path: str,
    start_page: int = 1,
    end_page: Optional[int] = None
) -> str:
    """Convert a PDF to text, prefixing every page with a page marker.

    Each page is emitted as "[페이지 X/total]" followed by the page text, or
    by a "no text" placeholder when extraction yields nothing. Pages are
    separated by a blank line.

    Args:
        pdf_path: PDF path (e.g. "regulations/ISMS-P.pdf" or "ISMS-P.pdf").
        start_page: First page to read, 1-based (default 1). Values below 1
            are clamped to the first page.
        end_page: Last page to read, inclusive. None (or 0) means "through
            the last page"; values past the end are clamped. An end page
            that lies before the start page produces an empty string.

    Returns:
        The extracted text with page markers.

    Raises:
        FileNotFoundError: The PDF does not exist.
        ValueError: The path is outside regulations/, or start_page is past
            the last page.
        Exception: The PDF could not be parsed or read.
    """
    full_path = _validate_path(pdf_path)

    try:
        with pdfplumber.open(full_path) as pdf:
            total_pages = len(pdf.pages)

            # Convert the 1-based inclusive range to a 0-based half-open one.
            start_idx = max(0, start_page - 1)
            end_idx = min(total_pages, end_page if end_page else total_pages)

            if start_idx >= total_pages:
                raise ValueError(
                    f"시작 페이지({start_page})가 총 페이지 수({total_pages})를 초과합니다."
                )

            text_content = []
            for page_num in range(start_idx, end_idx):
                text = pdf.pages[page_num].extract_text()
                marker = f"[페이지 {page_num + 1}/{total_pages}]"
                body = text if text else "(텍스트 없음)"
                text_content.append(f"{marker}\n{body}")

            return "\n\n".join(text_content)

    # NOTE(review): this relies on pdfplumber exposing its pdfminer dependency
    # as the attribute `pdfplumber.pdfminer` - confirm for the pinned version.
    except pdfplumber.pdfminer.pdfparser.PDFSyntaxError as e:
        raise Exception(f"PDF 파싱 오류 (손상된 파일): {str(e)}") from e
    except (FileNotFoundError, ValueError):
        # Documented exception types pass through unwrapped.
        raise
    except Exception as e:
        raise Exception(f"PDF 읽기 오류: {str(e)}") from e


def search_keyword_in_pdf(pdf_path: str, keyword: str) -> Dict:
    """Search a PDF for a keyword, case-insensitively, line by line.

    At most three matching lines are returned per page, while the counters
    reflect every matching line.

    Args:
        pdf_path: PDF path.
        keyword: Keyword to look for. Must not be empty or whitespace-only.

    Returns:
        On success:
            {
                "keyword": str,
                "total_matches": int,       # matching lines over all pages
                "pages_with_matches": int,
                "results": [
                    {"page": int, "matches": List[str], "total_in_page": int}
                ]
            }
        On failure: {"error": str}.
    """
    try:
        full_path = _validate_path(pdf_path)
    except (FileNotFoundError, ValueError) as e:
        return {"error": str(e)}

    # An empty keyword is a substring of every line and would "match" the
    # whole document, so reject it up front.
    if not keyword or not keyword.strip():
        return {"error": "검색할 키워드가 비어 있습니다."}

    results: List[Dict] = []
    keyword_lower = keyword.lower()
    total_match_count = 0

    try:
        with pdfplumber.open(full_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                text = page.extract_text()
                if not text:
                    continue

                matches = [
                    line.strip()
                    for line in text.split("\n")
                    if keyword_lower in line.lower() and line.strip()
                ]
                if not matches:
                    continue

                total_match_count += len(matches)
                results.append({
                    "page": page_num,
                    "matches": matches[:3],  # cap at three lines per page
                    "total_in_page": len(matches),
                })

        return {
            "keyword": keyword,
            "total_matches": total_match_count,
            "pages_with_matches": len(results),
            "results": results,
        }

    except Exception as e:
        return {"error": f"검색 오류: {str(e)}"}


def get_pdf_info(pdf_path: str) -> Dict:
    """Return basic metadata for a PDF file.

    Args:
        pdf_path: PDF path.

    Returns:
        On success: {"pages": int, "size_mb": float, "path": str}, where
        size_mb is rounded to two decimals and path is the resolved absolute
        path. On failure: {"error": str}.
    """
    try:
        full_path = _validate_path(pdf_path)
    except (FileNotFoundError, ValueError) as e:
        return {"error": str(e)}

    try:
        with pdfplumber.open(full_path) as pdf:
            pages = len(pdf.pages)

        size_mb = round(full_path.stat().st_size / 1024 / 1024, 2)
        return {
            "pages": pages,
            "size_mb": size_mb,
            "path": str(full_path),
        }
    except Exception as e:
        return {"error": str(e)}


def list_regulation_files() -> List[Dict]:
    """List every non-hidden file under the regulations/ folder, recursively.

    Returns:
        A list, sorted by path, of dicts shaped as:
            {
                "path": str,      # relative to the parent of regulations/
                "name": str,
                "type": str,      # lower-cased extension, or "unknown"
                "size_mb": float,
                "pages": int      # PDFs only; -1 if the PDF cannot be opened
            }
        An empty list is returned when the folder does not exist.
    """
    regulations_dir = BASE_REGULATIONS_DIR
    if not regulations_dir.exists():
        return []

    files = []
    # rglob() order depends on the filesystem; sort for a stable listing.
    for file_path in sorted(regulations_dir.rglob("*")):
        if not file_path.is_file() or file_path.name.startswith("."):
            continue

        suffix = file_path.suffix.lower()
        info = {
            "path": str(file_path.relative_to(regulations_dir.parent)),
            "name": file_path.name,
            "type": suffix[1:] if suffix else "unknown",
            "size_mb": round(file_path.stat().st_size / 1024 / 1024, 2),
        }

        if suffix == ".pdf":
            # Best effort: an unreadable PDF is still listed, flagged with -1.
            try:
                with pdfplumber.open(file_path) as pdf:
                    info["pages"] = len(pdf.pages)
            except Exception:
                info["pages"] = -1

        files.append(info)

    return files

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Tae4an/compliance-scanner-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

pdf_reader.py•7.54 KiB