ChatRAG-Bench.ipynb•4.01 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://huggingface.co/datasets/nvidia/ChatRAG-Bench/viewer/hybridial"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export a reproducible sample of the \"hybridial\" config, restricted to rows\n",
"# whose `messages` and `ctxs` entries are both at least half the maximum length.\n",
"name = \"hybridial\"\n",
"df = load_dataset(\"nvidia/ChatRAG-Bench\", name)[\"test\"].to_pandas()\n",
"\n",
"# Per-row lengths, computed once instead of re-running .apply(len) per use.\n",
"n_messages = df.messages.apply(len)\n",
"n_ctxs = df.ctxs.apply(len)\n",
"print(n_messages.max())\n",
"print(n_ctxs.max())\n",
"\n",
"long_rows = df.loc[\n",
"    (n_messages >= n_messages.max() // 2) & (n_ctxs >= n_ctxs.max() // 2)\n",
"]\n",
"# Both conditions must hold, so few (or zero) rows may qualify. DataFrame.sample\n",
"# raises ValueError when n exceeds the row count (replace=False), so cap n.\n",
"long_rows.sample(min(10, len(long_rows)), random_state=42).to_csv(\n",
"    f\"{name}_samples.csv.gz\", index=False\n",
")\n",
"# Seeded preview so the displayed rows are stable across Restart & Run All.\n",
"df.sample(min(10, len(df)), random_state=42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://huggingface.co/datasets/nvidia/ChatRAG-Bench/viewer/sqa"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export a reproducible sample of the \"sqa\" config, restricted to rows whose\n",
"# `messages` entry is at least half the maximum length.\n",
"name = \"sqa\"\n",
"df = load_dataset(\"nvidia/ChatRAG-Bench\", name)[\"test\"].to_pandas()\n",
"\n",
"# Per-row length of `messages`, computed once and reused below.\n",
"n_messages = df.messages.apply(len)\n",
"print(n_messages.max())\n",
"\n",
"long_rows = df.loc[n_messages >= n_messages.max() // 2]\n",
"# DataFrame.sample raises ValueError when n exceeds the row count\n",
"# (replace=False), so cap the request at the rows actually available.\n",
"long_rows.sample(min(100, len(long_rows)), random_state=42).to_csv(\n",
"    f\"{name}_samples.csv.gz\", index=False\n",
")\n",
"# Seeded preview so the displayed rows are stable across Restart & Run All.\n",
"df.sample(min(10, len(df)), random_state=42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://huggingface.co/datasets/nvidia/ChatRAG-Bench/viewer/doqa_cooking"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export a reproducible sample of the \"doqa_cooking\" config, restricted to rows\n",
"# whose `messages` entry is at least half the maximum length.\n",
"name = \"doqa_cooking\"\n",
"df = load_dataset(\"nvidia/ChatRAG-Bench\", name)[\"test\"].to_pandas()\n",
"\n",
"# Per-row length of `messages`, computed once and reused below.\n",
"n_messages = df.messages.apply(len)\n",
"print(n_messages.max())\n",
"\n",
"long_rows = df.loc[n_messages >= n_messages.max() // 2]\n",
"# DataFrame.sample raises ValueError when n exceeds the row count\n",
"# (replace=False), so cap the request at the rows actually available.\n",
"long_rows.sample(min(100, len(long_rows)), random_state=42).to_csv(\n",
"    f\"{name}_samples.csv.gz\", index=False\n",
")\n",
"# Seeded preview so the displayed rows are stable across Restart & Run All.\n",
"df.sample(min(10, len(df)), random_state=42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://huggingface.co/datasets/nvidia/ChatRAG-Bench/viewer/doqa_travel"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export a reproducible sample of the \"doqa_travel\" config, restricted to rows\n",
"# whose `messages` entry is at least half the maximum length.\n",
"name = \"doqa_travel\"\n",
"df = load_dataset(\"nvidia/ChatRAG-Bench\", name)[\"test\"].to_pandas()\n",
"\n",
"# Per-row length of `messages`, computed once and reused below.\n",
"n_messages = df.messages.apply(len)\n",
"print(n_messages.max())\n",
"\n",
"long_rows = df.loc[n_messages >= n_messages.max() // 2]\n",
"# DataFrame.sample raises ValueError when n exceeds the row count\n",
"# (replace=False), so cap the request at the rows actually available.\n",
"long_rows.sample(min(100, len(long_rows)), random_state=42).to_csv(\n",
"    f\"{name}_samples.csv.gz\", index=False\n",
")\n",
"# Seeded preview so the displayed rows are stable across Restart & Run All.\n",
"df.sample(min(10, len(df)), random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export a reproducible sample of the \"doqa_movies\" config, restricted to rows\n",
"# whose `messages` entry is at least half the maximum length.\n",
"name = \"doqa_movies\"\n",
"df = load_dataset(\"nvidia/ChatRAG-Bench\", name)[\"test\"].to_pandas()\n",
"\n",
"# Per-row length of `messages`, computed once and reused below.\n",
"n_messages = df.messages.apply(len)\n",
"print(n_messages.max())\n",
"\n",
"long_rows = df.loc[n_messages >= n_messages.max() // 2]\n",
"# DataFrame.sample raises ValueError when n exceeds the row count\n",
"# (replace=False), so cap the request at the rows actually available.\n",
"long_rows.sample(min(100, len(long_rows)), random_state=42).to_csv(\n",
"    f\"{name}_samples.csv.gz\", index=False\n",
")\n",
"# Seeded preview so the displayed rows are stable across Restart & Run All.\n",
"df.sample(min(10, len(df)), random_state=42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://huggingface.co/datasets/nvidia/ChatQA-Training-Data/viewer/synthetic_convqa"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export a reproducible sample of the \"synthetic_convqa\" config (train split of\n",
"# nvidia/ChatQA-Training-Data), restricted to rows whose `messages` entry is at\n",
"# least half the maximum length.\n",
"name = \"synthetic_convqa\"\n",
"df = load_dataset(\"nvidia/ChatQA-Training-Data\", name)[\"train\"].to_pandas()\n",
"\n",
"# Per-row length of `messages`, computed once and reused below.\n",
"n_messages = df.messages.apply(len)\n",
"print(n_messages.max())\n",
"\n",
"long_rows = df.loc[n_messages >= n_messages.max() // 2]\n",
"# DataFrame.sample raises ValueError when n exceeds the row count\n",
"# (replace=False), so cap the request at the rows actually available.\n",
"long_rows.sample(min(100, len(long_rows)), random_state=42).to_csv(\n",
"    f\"{name}_samples.csv.gz\", index=False\n",
")\n",
"# Seeded preview so the displayed rows are stable across Restart & Run All.\n",
"df.sample(min(10, len(df)), random_state=42)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}