wrangle_wiki_toxic.ipynb•1.06 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Wrangle Wiki Toxic"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"split = \"test\"\n",
"df = pd.read_parquet(\n",
" f\"https://huggingface.co/api/datasets/OxAISH-AL-LLM/wiki_toxic/parquet/default/{split}/0.parquet\"\n",
")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df[\"toxic\"] = df[\"label\"].map(bool)\n",
"df = df.drop(columns=[\"label\"])\n",
"df = df.rename(columns={\"comment_text\": \"text\"})\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.to_json(f\"wiki_toxic-{split}.jsonl\", orient=\"records\", lines=True)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}