"""
Document processing agent - extracts text from various file formats
"""
import os
from PyPDF2 import PdfReader
from docx import Document
class DocumentProcessor:
"""Agent responsible for extracting text from documents"""
def __init__(self):
self.supported_formats = ['.pdf', '.docx', '.txt']
def extract_text(self, file_path: str) -> str:
"""
Extract text from document based on file extension
Args:
file_path: Path to the document file
Returns:
Extracted text content
"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
file_ext = os.path.splitext(file_path)[1].lower()
if file_ext == '.pdf':
return self._extract_from_pdf(file_path)
elif file_ext == '.docx':
return self._extract_from_docx(file_path)
elif file_ext == '.txt':
return self._extract_from_txt(file_path)
else:
raise ValueError(f"Unsupported file format: {file_ext}")
def _extract_from_pdf(self, file_path: str) -> str:
"""Extract text from PDF file"""
try:
with open(file_path, 'rb') as file:
reader = PdfReader(file)
text = ""
for page in reader.pages:
text += page.extract_text() + "\n"
return text.strip()
except Exception as e:
raise RuntimeError(f"Error extracting PDF: {str(e)}")
def _extract_from_docx(self, file_path: str) -> str:
"""Extract text from DOCX file"""
try:
doc = Document(file_path)
text = ""
for paragraph in doc.paragraphs:
text += paragraph.text + "\n"
return text.strip()
except Exception as e:
raise RuntimeError(f"Error extracting DOCX: {str(e)}")
def _extract_from_txt(self, file_path: str) -> str:
"""Extract text from TXT file"""
try:
with open(file_path, 'r', encoding='utf-8') as file:
return file.read().strip()
except Exception as e:
raise RuntimeError(f"Error extracting TXT: {str(e)}")