import os
from typing import Any, Dict, List

import fitz  # PyMuPDF
from bs4 import BeautifulSoup
from docx import Document
class DocumentProcessor:
    """Extract plain text and basic metadata from files on disk.

    Dispatch is by file extension. Every processor returns a dict with a
    ``"text"`` string and a ``"metadata"`` dict holding ``"title"``,
    ``"author"`` and ``"page_count"``. Only the PDF processor additionally
    returns a ``"pages"`` list.
    """

    @staticmethod
    def extract_text(filepath: str) -> Dict[str, Any]:
        """Extract text and metadata from a document.

        Args:
            filepath: Path to the file. The extension (compared
                case-insensitively) selects the processor.

        Returns:
            The dict produced by the processor matching the extension.

        Raises:
            ValueError: If the extension is not one of ``.pdf``, ``.docx``,
                ``.html``, ``.md`` or ``.txt``.
        """
        ext = os.path.splitext(filepath)[1].lower()
        processors = {
            '.pdf': DocumentProcessor._process_pdf,
            '.docx': DocumentProcessor._process_docx,
            '.html': DocumentProcessor._process_html,
            '.md': DocumentProcessor._process_text,
            '.txt': DocumentProcessor._process_text,
        }
        processor = processors.get(ext)
        if processor is None:
            raise ValueError(f"Unsupported file type: {ext}")
        return processor(filepath)

    @staticmethod
    def _process_pdf(filepath: str) -> Dict[str, Any]:
        """Read a PDF with PyMuPDF and return its text, per-page text and metadata.

        ``"pages"`` is a list of ``{"page": <1-based number>, "text": <str>}``
        entries and ``"text"`` is the concatenation of all page texts.
        """
        # The context manager closes the document even if extraction raises.
        with fitz.open(filepath) as doc:
            pages = [
                {"page": page_num, "text": page.get_text()}
                for page_num, page in enumerate(doc, 1)
            ]
            metadata = doc.metadata or {}
            page_count = len(doc)

        # Join once instead of growing a string page by page.
        text = "".join(entry["text"] for entry in pages)
        return {
            "text": text,
            "pages": pages,
            "metadata": {
                # Use ``or`` rather than a ``.get`` default: the key may be
                # present with an empty/None value, which should still fall
                # back to the filename / "Unknown".
                "title": metadata.get("title") or os.path.basename(filepath),
                "author": metadata.get("author") or "Unknown",
                "page_count": page_count,
            },
        }

    @staticmethod
    def _process_docx(filepath: str) -> Dict[str, Any]:
        """Read a .docx file and return its paragraph text and metadata.

        ``"text"`` is the text of ``doc.paragraphs`` joined with newlines.
        """
        doc = Document(filepath)
        paragraphs = doc.paragraphs
        text = "\n".join(para.text for para in paragraphs)
        return {
            "text": text,
            "metadata": {
                "title": os.path.basename(filepath),
                "author": doc.core_properties.author or "Unknown",
                # NOTE(review): this is the number of paragraphs, not a real
                # page count; kept unchanged for backward compatibility.
                "page_count": len(paragraphs),
            },
        }

    @staticmethod
    def _process_html(filepath: str) -> Dict[str, Any]:
        """Read a UTF-8 HTML file and return its visible text and metadata.

        ``<script>`` and ``<style>`` elements are removed before the text is
        extracted. The title comes from the ``<title>`` element, falling back
        to the file name when the element is missing or has no text.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'lxml')

        # Remove scripts and styles so their contents do not leak into the text.
        for tag in soup(["script", "style"]):
            tag.decompose()

        text = soup.get_text()
        title_tag = soup.find('title')
        # ``.string`` is None for an empty <title> or one with nested markup;
        # get_text() always yields a plain str, so the fallback below works.
        title_text = title_tag.get_text(strip=True) if title_tag is not None else ""
        return {
            "text": text,
            "metadata": {
                "title": title_text or os.path.basename(filepath),
                "author": "Unknown",
                "page_count": 1,
            },
        }

    @staticmethod
    def _process_text(filepath: str) -> Dict[str, Any]:
        """Read a UTF-8 text or Markdown file and return its content verbatim.

        The title is the file name, the author is "Unknown" and the page
        count is always 1.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            text = f.read()
        return {
            "text": text,
            "metadata": {
                "title": os.path.basename(filepath),
                "author": "Unknown",
                "page_count": 1,
            },
        }