#!/usr/bin/env python3
"""RAG Evaluation Management CLI.
Commands:
generate - Generate ground truth answers from nfcorpus dataset
upload - Upload nfcorpus documents as Nextcloud notes
Usage:
# Generate ground truth
uv run python tools/rag_eval_cli.py generate
# Upload corpus to Nextcloud
uv run python tools/rag_eval_cli.py upload --nextcloud-url http://localhost:8000 --username admin --password admin
"""
import io
import json
import sys
import zipfile
from pathlib import Path
from typing import Any

import anyio
import click
import httpx
from datasets import load_dataset
from httpx import BasicAuth

# Add the repository root to sys.path so modules under tests/ are importable
sys.path.insert(0, str(Path(__file__).parent.parent))

from nextcloud_mcp_server.client import NextcloudClient
from tests.rag_evaluation.llm_providers import create_llm_provider

# Paths
FIXTURES_DIR = Path(__file__).parent.parent / "tests" / "rag_evaluation" / "fixtures"
CORPUS_DIR = FIXTURES_DIR / "nfcorpus"
GROUND_TRUTH_FILE = FIXTURES_DIR / "ground_truth.json"
NOTE_MAPPING_FILE = FIXTURES_DIR / "note_mapping.json"

# Dataset ZIP URL (used only to fetch the qrels file; corpus and queries
# are loaded via the HuggingFace datasets library)
NFCORPUS_URL = (
    "https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/nfcorpus.zip"
)

# Selected test queries (from ADR-013)
SELECTED_QUERIES = [
    "PLAIN-2630",  # Alkylphenol Endocrine Disruptors and Allergies
    "PLAIN-2660",  # How Long to Detox From Fish Before Pregnancy?
    "PLAIN-2510",  # Coffee and Artery Function
    "PLAIN-2430",  # Preventing Brain Loss with B Vitamins?
    "PLAIN-2690",  # Chronic Headaches and Pork Tapeworms
]


def ensure_corpus_downloaded(force_download: bool = False) -> Path:
    """Ensure the nfcorpus dataset is downloaded to the fixtures directory.

    Args:
        force_download: Force re-download even if the corpus exists

    Returns:
        Path to the corpus directory

    Raises:
        RuntimeError: If the download fails
    """
if CORPUS_DIR.exists() and not force_download:
click.echo(f"Corpus already exists at {CORPUS_DIR}")
return CORPUS_DIR
click.echo(f"Downloading nfcorpus dataset to {CORPUS_DIR}...")
# Create fixtures directory
FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
# Download using HuggingFace datasets library (handles caching)
try:
# Download corpus
click.echo(" Downloading corpus...")
corpus_dataset = load_dataset(
"BeIR/nfcorpus",
"corpus",
split="corpus",
)
# Download queries
click.echo(" Downloading queries...")
queries_dataset = load_dataset(
"BeIR/nfcorpus",
"queries",
split="queries",
)
# Save to local fixtures directory as JSONL
CORPUS_DIR.mkdir(parents=True, exist_ok=True)
# Save corpus
with open(CORPUS_DIR / "corpus.jsonl", "w") as f:
for doc in corpus_dataset:
f.write(json.dumps(doc) + "\n")
# Save queries
with open(CORPUS_DIR / "queries.jsonl", "w") as f:
for query in queries_dataset:
f.write(json.dumps(query) + "\n")
# Download qrels from BEIR directly (not available via HuggingFace)
click.echo(" Downloading qrels from BEIR ZIP...")
with httpx.Client(timeout=300.0) as client:
response = client.get(NFCORPUS_URL)
response.raise_for_status()
# Extract qrels from ZIP
with zipfile.ZipFile(io.BytesIO(response.content)) as zf:
# The qrels are in nfcorpus/qrels/test.tsv within the ZIP
qrels_path = "nfcorpus/qrels/test.tsv"
qrels_dir = CORPUS_DIR / "qrels"
qrels_dir.mkdir(parents=True, exist_ok=True)
qrels_content = zf.read(qrels_path).decode("utf-8")
with open(qrels_dir / "test.tsv", "w") as f:
f.write(qrels_content)
click.echo(f"Dataset downloaded to {CORPUS_DIR}")
return CORPUS_DIR
except Exception as e:
raise RuntimeError(f"Failed to download nfcorpus dataset: {e}") from e
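
# After a successful download, the fixtures layout (as written above) is:
#
#   tests/rag_evaluation/fixtures/nfcorpus/
#   ├── corpus.jsonl    # one JSON document per line
#   ├── queries.jsonl   # one JSON query per line
#   └── qrels/
#       └── test.tsv    # tab-separated (query-id, corpus-id, score) rows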


def load_corpus(corpus_dir: Path) -> dict[str, dict]:
    """Load corpus documents from the local directory.

    Args:
        corpus_dir: Path to the corpus directory

    Returns:
        Dict mapping document ID to document data
    """
corpus = {}
with open(corpus_dir / "corpus.jsonl") as f:
for line in f:
doc = json.loads(line)
corpus[doc["_id"]] = doc
return corpus
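
# Each corpus.jsonl record follows the BeIR schema read above, e.g.
# (illustrative _id; title/text elided):
#
#   {"_id": "MED-10", "title": "...", "text": "..."}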


def load_queries(corpus_dir: Path) -> dict[str, dict]:
    """Load queries from the local directory.

    Args:
        corpus_dir: Path to the corpus directory

    Returns:
        Dict mapping query ID to query data
    """
queries = {}
with open(corpus_dir / "queries.jsonl") as f:
for line in f:
query = json.loads(line)
queries[query["_id"]] = query
return queries


def load_qrels(corpus_dir: Path) -> dict[str, list[tuple[str, int]]]:
    """Load query relevance judgments from the local directory.

    Args:
        corpus_dir: Path to the corpus directory

    Returns:
        Dict mapping query ID to list of (doc_id, score) tuples
    """
qrels: dict[str, list[tuple[str, int]]] = {}
with open(corpus_dir / "qrels" / "test.tsv") as f:
next(f) # Skip header
for line in f:
query_id, corpus_id, score = line.strip().split("\t")
if query_id not in qrels:
qrels[query_id] = []
qrels[query_id].append((corpus_id, int(score)))
# Sort by score descending
for query_id in qrels:
qrels[query_id].sort(key=lambda x: x[1], reverse=True)
return qrels
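
# Example (illustrative IDs): a test.tsv row "PLAIN-2630<TAB>MED-10<TAB>2"
# becomes {"PLAIN-2630": [("MED-10", 2), ...]}, with each list sorted by
# relevance score, highest first.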


async def generate_ground_truth_answer(
    query_text: str, relevant_docs: list[dict[str, Any]], llm
) -> str:
    """Generate a ground truth answer from highly relevant documents.

    Args:
        query_text: The query/question
        relevant_docs: List of highly relevant documents (top 5)
        llm: LLM provider instance

    Returns:
        Generated ground truth answer
    """
# Construct context from documents
context_parts = []
for i, doc in enumerate(relevant_docs, 1):
context_parts.append(
f"Document {i}:\nTitle: {doc['title']}\nText: {doc['text']}\n"
)
context = "\n".join(context_parts)
# Generate ground truth
prompt = f"""Based on the following medical/biomedical documents, provide a comprehensive, factual answer to this question.
Question: {query_text}
{context}
Instructions:
- Provide a clear, well-structured answer that synthesizes information from the documents
- Focus on accuracy and completeness
- Use specific facts and findings from the documents
- Keep the answer concise but informative (2-4 paragraphs)
- Do not make up information not present in the documents
Answer:"""
click.echo(f" Generating answer for: {query_text}")
answer = await llm.generate(prompt, max_tokens=500)
click.echo(f" Generated {len(answer)} characters")
return answer.strip()
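
# Note: the llm provider passed in above is assumed (from its usage here and
# in the commands below) to expose an awaitable generate(prompt, max_tokens=...)
# returning a string, plus an awaitable close().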


@click.group()
def cli():
    """RAG Evaluation Management CLI.

    Manage ground truth generation and corpus upload for RAG evaluation tests.
    """


@cli.command()
@click.option(
"--provider",
type=click.Choice(["ollama", "anthropic"]),
default="ollama",
help="LLM provider to use for generation",
)
@click.option(
"--model",
help="Model name (default: llama3.2:1b for Ollama, claude-3-5-sonnet-20241022 for Anthropic)",
)
@click.option(
"--force-download",
is_flag=True,
help="Force re-download of nfcorpus dataset",
)
def generate(provider: str, model: str | None, force_download: bool):
    """Generate ground truth answers for RAG evaluation.

    This command:

    1. Downloads the nfcorpus dataset (if not already cached)
    2. For each selected query, extracts the highly relevant documents
    3. Uses an LLM to synthesize a reference answer
    4. Saves the ground truth to fixtures/ground_truth.json

    Environment variables:
        RAG_EVAL_PROVIDER: Provider type (ollama or anthropic)
        RAG_EVAL_OLLAMA_BASE_URL: Ollama base URL
        RAG_EVAL_OLLAMA_MODEL: Ollama model name
        RAG_EVAL_ANTHROPIC_API_KEY: Anthropic API key
        RAG_EVAL_ANTHROPIC_MODEL: Anthropic model name
    """

async def _generate():
click.echo("=" * 80)
click.echo("RAG Ground Truth Generation")
click.echo("=" * 80)
# Ensure corpus is downloaded
corpus_dir = ensure_corpus_downloaded(force_download)
# Load dataset
click.echo("\nLoading nfcorpus dataset...")
corpus = load_corpus(corpus_dir)
queries = load_queries(corpus_dir)
qrels = load_qrels(corpus_dir)
click.echo(f"Loaded {len(corpus)} documents, {len(queries)} queries")
# Create LLM provider
click.echo("\nInitializing LLM provider...")
try:
llm = create_llm_provider(
provider=provider,
ollama_model=model if provider == "ollama" else None,
anthropic_model=model if provider == "anthropic" else None,
)
provider_type = type(llm).__name__
click.echo(f"Using provider: {provider_type}")
except ValueError as e:
click.echo(f"\nError: {e}", err=True)
return 1
# Generate ground truth for each selected query
ground_truth_data = []
try:
for query_id in SELECTED_QUERIES:
if query_id not in queries:
click.echo(
f"\nWarning: Query {query_id} not found in dataset", err=True
)
continue
query = queries[query_id]
query_text = query["text"]
# Get highly relevant documents (score=2)
if query_id not in qrels:
click.echo(
f"\nWarning: No relevance judgments for {query_id}", err=True
)
continue
highly_relevant_doc_ids = [
doc_id for doc_id, score in qrels[query_id] if score == 2
]
if not highly_relevant_doc_ids:
click.echo(
f"\nWarning: No highly relevant docs for {query_id}", err=True
)
continue
# Get top 5 highly relevant documents
relevant_docs = []
for doc_id in highly_relevant_doc_ids[:5]:
if doc_id in corpus:
relevant_docs.append(corpus[doc_id])
if not relevant_docs:
click.echo(
f"\nWarning: Could not load documents for {query_id}", err=True
)
continue
# Generate ground truth answer
click.echo(f"\n{'-' * 80}")
ground_truth_answer = await generate_ground_truth_answer(
query_text, relevant_docs, llm
)
# Store result
ground_truth_data.append(
{
"query_id": query_id,
"query_text": query_text,
"ground_truth_answer": ground_truth_answer,
"expected_document_ids": highly_relevant_doc_ids,
"highly_relevant_count": len(highly_relevant_doc_ids),
}
)
click.echo(f" Preview: {ground_truth_answer[:200]}...")
finally:
await llm.close()
# Save ground truth
GROUND_TRUTH_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(GROUND_TRUTH_FILE, "w") as f:
json.dump(ground_truth_data, f, indent=2)
click.echo(f"\n{'=' * 80}")
click.echo(f"Generated {len(ground_truth_data)} ground truth answers")
click.echo(f"Saved to: {GROUND_TRUTH_FILE}")
click.echo("=" * 80)
return 0
sys.exit(anyio.run(_generate))
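
# For reference, each entry written to ground_truth.json has this shape
# (values illustrative, not taken from a real run):
#
#   {
#     "query_id": "PLAIN-2630",
#     "query_text": "...",
#     "ground_truth_answer": "...",
#     "expected_document_ids": ["MED-...", "..."],
#     "highly_relevant_count": 12
#   }
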
@cli.command()
@click.option(
"--nextcloud-url",
envvar="NEXTCLOUD_HOST",
required=True,
help="Nextcloud base URL (e.g., http://localhost:8000)",
)
@click.option(
"--username",
envvar="NEXTCLOUD_USERNAME",
required=True,
help="Nextcloud username",
)
@click.option(
"--password",
envvar="NEXTCLOUD_PASSWORD",
required=True,
help="Nextcloud password",
)
@click.option(
"--category",
default="nfcorpus_rag_eval",
help="Category/folder for uploaded notes",
)
@click.option(
"--force-download",
is_flag=True,
help="Force re-download of nfcorpus dataset",
)
@click.option(
"--force",
is_flag=True,
help="Delete all existing notes in the target category before uploading",
)
def upload(
    nextcloud_url: str,
    username: str,
    password: str,
    category: str,
    force_download: bool,
    force: bool,
):
    """Upload nfcorpus documents as Nextcloud notes.

    This command:

    1. Downloads the nfcorpus dataset (if not already cached)
    2. Optionally deletes existing notes in the target category (--force)
    3. Uploads all corpus documents as Nextcloud notes
    4. Saves the document ID → note ID mapping to fixtures/note_mapping.json

    The note mapping file is used by the pytest tests to map expected document
    IDs to actual note IDs in Nextcloud.
    """

async def _upload():
click.echo("=" * 80)
click.echo("Upload nfcorpus Corpus to Nextcloud")
click.echo("=" * 80)
# Ensure corpus is downloaded
corpus_dir = ensure_corpus_downloaded(force_download)
# Load corpus
click.echo("\nLoading corpus...")
corpus = load_corpus(corpus_dir)
click.echo(f"Loaded {len(corpus)} documents")
# Create Nextcloud client
click.echo(f"\nConnecting to Nextcloud at {nextcloud_url}...")
nc_client = NextcloudClient(
base_url=nextcloud_url,
username=username,
auth=BasicAuth(username, password),
)
try:
# Delete existing notes in category if force is specified
if force:
click.echo(
f"\n--force specified: Deleting existing notes in category '{category}'..."
)
# Collect notes to delete
notes_to_delete = []
async for note in nc_client.notes.get_all_notes():
if note.get("category") == category:
notes_to_delete.append(note["id"])
if not notes_to_delete:
click.echo(f"No existing notes found in category '{category}'")
else:
click.echo(f"Found {len(notes_to_delete)} notes to delete")
deleted_count = 0
delete_errors = []
delete_semaphore = anyio.Semaphore(20)
async def delete_note(note_id: int):
"""Delete a single note."""
nonlocal deleted_count
async with delete_semaphore:
try:
await nc_client.notes.delete_note(note_id)
deleted_count += 1
if deleted_count % 100 == 0:
click.echo(f" Deleted {deleted_count} notes...")
except Exception as e:
error_msg = f"Error deleting note {note_id}: {e}"
delete_errors.append(error_msg)
click.echo(f" {error_msg}", err=True)
# Delete all notes concurrently
async with anyio.create_task_group() as tg:
for note_id in notes_to_delete:
tg.start_soon(delete_note, note_id)
click.echo(
f"Deleted {deleted_count} existing notes in category '{category}'"
)
if delete_errors:
click.echo(
f"Encountered {len(delete_errors)} errors during deletion",
err=True,
)
# Upload documents concurrently
click.echo(f"\nUploading {len(corpus)} documents as notes (concurrent)...")
click.echo(f"Category: {category}")
note_mapping = {}
uploaded_count = 0
upload_errors = []
# Semaphore to limit concurrent uploads (avoid overwhelming server)
max_concurrent = 20
semaphore = anyio.Semaphore(max_concurrent)
async def upload_document(doc_id: str, doc: dict[str, Any]):
"""Upload a single document as a note."""
nonlocal uploaded_count
async with semaphore:
title = f"[{doc_id}] {doc['title'][:100]}" # Truncate long titles
content = doc["text"]
try:
note_data = await nc_client.notes.create_note(
title=title,
content=content,
category=category,
)
# Store mapping
note_id = note_data["id"]
note_mapping[doc_id] = note_id
uploaded_count += 1
# Progress indicator every 100 docs
if uploaded_count % 100 == 0:
click.echo(
f" Uploaded {uploaded_count}/{len(corpus)} documents..."
)
except Exception as e:
error_msg = f"Error uploading {doc_id}: {e}"
upload_errors.append(error_msg)
click.echo(f" {error_msg}", err=True)
# Upload all documents concurrently using task group
async with anyio.create_task_group() as tg:
for doc_id, doc in corpus.items():
tg.start_soon(upload_document, doc_id, doc)
click.echo(f"\nUploaded {uploaded_count} documents successfully")
if upload_errors:
click.echo(
f"Encountered {len(upload_errors)} errors during upload", err=True
)
# Save note mapping
with open(NOTE_MAPPING_FILE, "w") as f:
json.dump(note_mapping, f, indent=2)
click.echo(f"Saved note mapping to: {NOTE_MAPPING_FILE}")
click.echo(f" Mapped {len(note_mapping)} document IDs to note IDs")
finally:
# Close the Nextcloud client
await nc_client.close()
click.echo("=" * 80)
click.echo("Upload complete!")
click.echo("=" * 80)
return 0
sys.exit(anyio.run(_upload))
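
# For reference, note_mapping.json maps nfcorpus document IDs to Nextcloud
# note IDs, e.g. (illustrative values): {"MED-10": 4821, "MED-14": 4822, ...}
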
if __name__ == "__main__":
cli()