import PyPDF2
from docx import Document
from typing import Optional
from pathlib import Path
import tempfile
import requests
from urllib.parse import urlparse
class ReaderTools:
    """Static helpers for reading TXT, DOCX and PDF documents.

    Documents may be given as a local path or as an http(s) URL; URLs are
    downloaded to a temporary file before being read.
    """

    @staticmethod
    def _guess_extension(url_path: str, content_type: Optional[str]) -> str:
        """Choose a file suffix for a download.

        Args:
            url_path: Path component of the URL being downloaded.
            content_type: Value of the Content-Type response header, or
                None when the header is absent.

        Returns:
            str: The lower-cased suffix of ``url_path`` when it has one.
                Otherwise '.pdf', '.docx' or '.txt' when the content type
                mentions 'pdf', 'word' or 'text/plain' respectively, '.bin'
                for any other content type, and '' when there is no
                content type at all.
        """
        ext = Path(url_path).suffix.lower()
        if ext or content_type is None:
            return ext
        content_type = content_type.lower()
        if 'pdf' in content_type:
            return '.pdf'
        if 'word' in content_type:
            return '.docx'
        if 'text/plain' in content_type:
            return '.txt'
        return '.bin'

    @staticmethod
    def _remove_quietly(path: Optional[str]) -> None:
        """Delete ``path`` if it is set, ignoring OS-level failures."""
        if not path:
            return
        try:
            Path(path).unlink()
        except OSError:
            # Deliberate best effort: a leftover temp file must not mask
            # the result or the error the caller actually cares about.
            pass

    @staticmethod
    def download_link_to_file(url: str) -> Optional[str]:
        """
        Download a file from a URL to a temporary file and return its path.

        The temporary file is created with ``delete=False``, so the caller
        owns it and is responsible for removing it when done.

        Args:
            url (str): The URL of the file to download

        Returns:
            Optional[str]: Path of the temporary file holding the download.
                Failures are reported by raising, not by returning None.

        Raises:
            ValueError: If the URL is empty, not a string, or lacks a
                scheme or network location
            requests.RequestException: If the HTTP request fails
            Exception: If any other error occurs while downloading
        """
        if not url or not isinstance(url, str):
            raise ValueError("URL must be a non-empty string")

        # Validate outside the try block so the ValueError reaches the
        # caller as documented instead of being re-wrapped below.
        parsed_url = urlparse(url)
        if not (parsed_url.scheme and parsed_url.netloc):
            raise ValueError(f"Invalid URL: {url}")

        temp_path = None
        succeeded = False
        try:
            # Context managers close both the session and the streamed
            # response on every exit path.
            with requests.Session() as session:
                with session.get(url, stream=True, timeout=30) as response:
                    response.raise_for_status()
                    ext = ReaderTools._guess_extension(
                        parsed_url.path,
                        response.headers.get('content-type'),
                    )
                    with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as temp_file:
                        temp_path = temp_file.name
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                temp_file.write(chunk)
            succeeded = True
            return temp_path
        except requests.RequestException as e:
            raise requests.RequestException(
                f"Failed to download file from {url}: {e}"
            ) from e
        except Exception as e:
            raise Exception(
                f"Unexpected error while downloading {url}: {e}"
            ) from e
        finally:
            # Do not leave a partially written file behind on failure.
            if not succeeded:
                ReaderTools._remove_quietly(temp_path)

    @staticmethod
    def read_document(input_file_path: str) -> str:
        """
        Read the content of a document file. Supports TXT, DOCX, and PDF formats.

        Args:
            input_file_path: Path to the input document file, or an
                http:// / https:// URL to download it from. A
                ``pathlib.Path`` is accepted as well as a string.

        Returns:
            str: Content of the document if successful, error message otherwise.
        """
        source = input_file_path
        downloaded_path = None
        try:
            if isinstance(source, Path):
                source = str(source)

            local_path = source
            # Match the full scheme prefix so a local file whose name merely
            # starts with "http" is not mistaken for a URL.
            if source.lower().startswith(('http://', 'https://')):
                downloaded_path = ReaderTools.download_link_to_file(source)
                local_path = downloaded_path

            file_path = Path(local_path)
            if not file_path.exists():
                return f"Error: File not found at {local_path}"

            readers = {
                '.txt': ReaderTools._read_txt,
                '.docx': ReaderTools._read_docx,
                '.pdf': ReaderTools._read_pdf,
            }
            reader = readers.get(file_path.suffix.lower())
            if reader is None:
                return "Error: Unsupported file format. Supported formats: .txt, .docx, .pdf"
            return reader(file_path)
        except Exception as e:
            return f"Error reading file {source}: {str(e)}"
        finally:
            # The temporary download is only needed while reading.
            ReaderTools._remove_quietly(downloaded_path)

    @staticmethod
    def _read_txt(file_path: Path) -> str:
        """Read content from a UTF-8 text file, dropping a leading BOM if present."""
        # 'utf-8-sig' decodes plain UTF-8 too; it only strips a leading BOM.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()

    @staticmethod
    def _read_docx(file_path: Path) -> str:
        """Read content from a DOCX file as newline-joined non-empty paragraphs."""
        # python-docx documents str paths and file-like objects as inputs.
        doc = Document(str(file_path))
        return '\n'.join(
            paragraph.text for paragraph in doc.paragraphs if paragraph.text
        )

    @staticmethod
    def _read_pdf(file_path: Path) -> str:
        """Read content from a PDF file as newline-joined per-page text."""
        with open(file_path, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            # Pages that yield no text are skipped.
            page_texts = (page.extract_text() for page in pdf_reader.pages)
            return '\n'.join(text for text in page_texts if text)