from __future__ import annotations
from pathlib import Path
from typing import Iterable, List
import pandas as pd
from pypdf import PdfReader
# Lower-cased file extensions (including the leading dot) that list_docs
# accepts when scanning a directory tree.
SUPPORTED_SUFFIXES = {
    ".csv",
    ".md",
    ".pdf",
    ".txt",
}
def list_docs(root: Path) -> List[Path]:
    """Return the supported document files found anywhere beneath *root*.

    The tree is walked recursively with ``root.rglob("*")``. An entry is kept
    only when it is a regular file and its lower-cased suffix is a member of
    ``SUPPORTED_SUFFIXES``. The resulting paths are returned in sorted order.
    """
    found: List[Path] = []
    for candidate in root.rglob("*"):
        # Directories (and anything else that is not a file) are never kept.
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() in SUPPORTED_SUFFIXES:
            found.append(candidate)
    found.sort()
    return found
def _read_text(path: Path) -> str:
return path.read_text(encoding="utf-8", errors="ignore")
def _read_csv(path: Path) -> str:
df = pd.read_csv(path)
content = df.to_csv(index=False)
return content if isinstance(content, str) else ""
def _read_pdf(path: Path) -> str:
    """Return the extracted text of every page of the PDF at *path*.

    Each page's ``extract_text()`` result is used, with a falsy result treated
    as an empty string. Pages whose text is empty or whitespace-only are
    skipped; the remaining page texts are joined with a single newline.
    """
    document = PdfReader(str(path))
    page_texts = (page.extract_text() or "" for page in document.pages)
    return "\n".join(chunk for chunk in page_texts if chunk.strip())
def read_doc(path: Path) -> str:
    """Read *path* as text using the reader that matches its file suffix.

    The suffix is compared case-insensitively: ``.txt`` and ``.md`` go to
    ``_read_text``, ``.csv`` to ``_read_csv`` and ``.pdf`` to ``_read_pdf``.

    Raises:
        ValueError: if the suffix is none of the above.
    """
    ext = path.suffix.lower()
    if ext == ".pdf":
        return _read_pdf(path)
    if ext == ".csv":
        return _read_csv(path)
    if ext in (".txt", ".md"):
        return _read_text(path)
    raise ValueError(f"Unsupported file type: {ext}")
def yield_file_texts(paths: Iterable[Path]) -> Iterable[tuple[Path, str]]:
    """Lazily yield ``(path, text)`` for every path that can be read.

    Each path is passed to ``read_doc``. If that call raises any
    ``Exception``, a ``[warn]`` line is printed and the path is skipped, so
    one bad file does not stop the rest from being processed.

    Args:
        paths: The files to read, in the order they should be yielded.

    Yields:
        A ``(path, text)`` tuple for each successfully read file.
    """
    for p in paths:
        # Only the read itself is guarded. The yield stays outside the try so
        # that an exception injected at the yield point (generator.throw) is
        # not mistaken for a read failure and silently swallowed.
        try:
            text = read_doc(p)
        except Exception as e:  # noqa: BLE001
            # Skip unreadable files but keep the pipeline going
            print(f"[warn] failed to read {p}: {e}")
            continue
        yield p, text