#!/usr/bin/env python3
"""
Validate an eval dataset: check that every referenced file exists and that all expected substrings are present.
Usage:
python scripts/validate_dataset.py [dataset_path] [repos_base_path]
Example:
python scripts/validate_dataset.py data/eval_dataset_v3.jsonl data/issues
"""
import json
import sys
from pathlib import Path
from typing import NamedTuple
class ValidationResult(NamedTuple):
question_id: str
file_path: str
    status: str  # 'OK', 'REPO_NOT_FOUND', 'FILE_NOT_FOUND', 'READ_ERROR', or 'SUBSTRING_NOT_FOUND'
missing_substrings: list[str]
details: str
def validate_dataset(dataset_path: str, repos_base_path: str) -> list[ValidationResult]:
"""Validate all entries in the dataset."""
results = []
dataset_file = Path(dataset_path)
repos_base = Path(repos_base_path)
if not dataset_file.exists():
print(f"❌ Dataset file not found: {dataset_path}")
sys.exit(1)
if not repos_base.exists():
print(f"❌ Repos base path not found: {repos_base_path}")
sys.exit(1)
with open(dataset_file, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
if not line:
continue
try:
entry = json.loads(line)
except json.JSONDecodeError as e:
print(f"❌ JSON parse error on line {line_num}: {e}")
continue
question_id = entry.get('id', f'line_{line_num}')
repo_path = entry.get('repo_path', '')
expected_items = entry.get('expected_items', [])
            # Resolve the repo directory (try <repo_path>/repo first, then <repo_path> itself)
repo_dir = repos_base / repo_path / 'repo'
if not repo_dir.exists():
repo_dir = repos_base / repo_path
if not repo_dir.exists():
results.append(ValidationResult(
question_id=question_id,
file_path=str(repo_path),
status='REPO_NOT_FOUND',
missing_substrings=[],
details=f"Repository directory not found: {repo_dir}"
))
continue
for item in expected_items:
file_path = item.get('file_path', '')
must_include = item.get('must_include_substrings', [])
# Build full path
full_path = repo_dir / file_path
# Check if file exists
if not full_path.exists():
results.append(ValidationResult(
question_id=question_id,
file_path=file_path,
status='FILE_NOT_FOUND',
missing_substrings=[],
details=f"File not found: {full_path}"
))
continue
# Read file content
try:
content = full_path.read_text(encoding='utf-8', errors='ignore')
except Exception as e:
results.append(ValidationResult(
question_id=question_id,
file_path=file_path,
status='READ_ERROR',
missing_substrings=[],
details=f"Cannot read file: {e}"
))
continue
# Check substrings
                missing = [s for s in must_include if s not in content]
if missing:
results.append(ValidationResult(
question_id=question_id,
file_path=file_path,
status='SUBSTRING_NOT_FOUND',
missing_substrings=missing,
details=f"Missing {len(missing)}/{len(must_include)} substrings"
))
else:
results.append(ValidationResult(
question_id=question_id,
file_path=file_path,
status='OK',
missing_substrings=[],
details=f"All {len(must_include)} substrings found"
))
return results
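# Programmatic use (illustrative sketch, with `path`/`repos` standing in for the
# dataset path and repos base path): callers that only need the failures can
# filter the returned results instead of printing the full report, e.g.
#   failures = [r for r in validate_dataset(path, repos) if r.status != 'OK']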
def print_results(results: list[ValidationResult]) -> tuple[int, int, int]:
"""Print validation results and return counts."""
ok_count = 0
warning_count = 0
error_count = 0
# Group by question
    questions = {}
    for r in results:
        questions.setdefault(r.question_id, []).append(r)
print("\n" + "=" * 80)
print("📊 DATASET VALIDATION REPORT")
print("=" * 80 + "\n")
for qid, items in questions.items():
all_ok = all(r.status == 'OK' for r in items)
if all_ok:
print(f"✅ {qid}")
ok_count += len(items)
else:
print(f"\n❌ {qid}")
for r in items:
if r.status == 'OK':
print(f" ✅ {r.file_path} - {r.details}")
ok_count += 1
elif r.status == 'SUBSTRING_NOT_FOUND':
print(f" ⚠️ {r.file_path} - {r.details}")
for sub in r.missing_substrings:
print(f" Missing: \"{sub}\"")
warning_count += 1
else:
print(f" ❌ {r.file_path} - {r.status}: {r.details}")
error_count += 1
# Summary
total = ok_count + warning_count + error_count
print("\n" + "=" * 80)
print("📈 SUMMARY")
print("=" * 80)
print(f" Total checks: {total}")
print(f" ✅ Passed: {ok_count}")
print(f" ⚠️ Missing subs: {warning_count}")
print(f" ❌ Errors: {error_count}")
print("=" * 80)
if error_count == 0 and warning_count == 0:
print("\n🎉 All validations passed!")
elif error_count == 0:
print(f"\n⚠️ {warning_count} substring issues found. Files exist but some expected text missing.")
else:
print(f"\n❌ {error_count} errors found. Some files don't exist!")
return ok_count, warning_count, error_count
def main():
# Default paths
dataset_path = "data/eval_dataset_v3.jsonl"
repos_base_path = "data/issues"
# Override with command line args
if len(sys.argv) > 1:
dataset_path = sys.argv[1]
if len(sys.argv) > 2:
repos_base_path = sys.argv[2]
print(f"🔍 Validating dataset: {dataset_path}")
print(f"📁 Repos base path: {repos_base_path}")
results = validate_dataset(dataset_path, repos_base_path)
ok, warnings, errors = print_results(results)
# Exit code: 0 if all OK, 1 if errors, 2 if only warnings
if errors > 0:
sys.exit(1)
elif warnings > 0:
sys.exit(2)
else:
sys.exit(0)
if __name__ == "__main__":
main()