evaluate_traces.ipynb•5.26 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import phoenix as px\n",
"\n",
"pxc = px.Client()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from phoenix.session.evaluation import get_retrieved_documents\n",
"\n",
"# Fetch the retrieved documents recorded for the \"research_assistant\" project,\n",
"# then preview the first rows.\n",
"retrieved_documents = get_retrieved_documents(\n",
"    pxc,\n",
"    project_name=\"research_assistant\",\n",
")\n",
"retrieved_documents.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nest_asyncio\n",
"\n",
"from phoenix.evals import (\n",
" OpenAIModel,\n",
" llm_classify,\n",
")\n",
"\n",
"# Patch asyncio so an event loop can be re-entered.\n",
"# NOTE(review): presumably required because the notebook kernel already runs a loop - confirm.\n",
"nest_asyncio.apply()\n",
"\n",
"# Judge model for the evaluation; temperature 0.0 keeps sampling randomness to a minimum.\n",
"model = OpenAIModel(\n",
"    model=\"gpt-4o\",\n",
"    temperature=0.0,\n",
")\n",
"\n",
"# Prompt template for the bias classifier. It has two placeholders, {input} and {reference}\n",
"# (presumably filled by llm_classify from the matching dataframe columns - confirm).\n",
"# The labels in the example response must stay in sync with the `rails` passed to\n",
"# llm_classify, otherwise a response that copies the example cannot be matched to a rail.\n",
"bias_detection_prompt = \"\"\"\n",
"You are an AI assistant designed to detect bias in documents. Given an original question\n",
"and a document, your task is to analyze the document and determine if there is any bias present.\n",
"\n",
"Please follow these steps:\n",
"1. Read the original question carefully.\n",
"2. Read the document provided.\n",
"3. Identify any statements, language, or information in the document that may indicate bias.\n",
"4. Provide a classification of the document's bias level:\n",
"Unbiased, Somewhat Unbiased, Somewhat Biased, Biased.\n",
"5. Provide a detailed explanation for your classification, citing specific parts of the\n",
"document that influenced your decision.\n",
"\n",
"Original Question: {input}\n",
"Document: {reference}\n",
"\n",
"Your analysis should be thorough and objective. Please ensure that your explanation\n",
"is clear and concise.\n",
"\n",
"Example response:\n",
"************\n",
"EXPLANATION: An explanation of your reasoning for the label you chose\n",
"LABEL: \"biased\", \"unbiased\", \"somewhat biased\", \"somewhat unbiased\"\n",
"************\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Classify every retrieved document with the bias prompt, keeping the model's explanation.\n",
"bias_classifications = llm_classify(\n",
"    dataframe=retrieved_documents,\n",
"    template=bias_detection_prompt,\n",
"    model=model,\n",
"    rails=[\"Unbiased\", \"Biased\", \"Somewhat Biased\", \"Somewhat Unbiased\"],\n",
"    provide_explanation=True,\n",
")\n",
"\n",
"# Numeric score per label: 1 is fully unbiased, 0 is fully biased.\n",
"score_by_label = {\"unbiased\": 1, \"somewhat unbiased\": 0.75, \"somewhat biased\": 0.5, \"biased\": 0}\n",
"\n",
"# The rails above are title-cased while the score keys are lowercase, so normalise the\n",
"# label text before the lookup instead of relying on how the library cases its output.\n",
"# Labels that match no key (e.g. unparsable or missing responses) get NaN, not a score.\n",
"bias_classifications[\"score\"] = (\n",
"    bias_classifications[\"label\"].astype(str).str.strip().str.lower().map(score_by_label)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"bias_classifications.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Score assigned to each bias label. Key order matters: when an average is equally close\n",
"# to two scores, the key listed first (the less biased label) wins.\n",
"label_by_score = {1: \"unbiased\", 0.75: \"somewhat unbiased\", 0.5: \"somewhat biased\", 0: \"biased\"}\n",
"\n",
"\n",
"def nearest_bias_label(average_score):\n",
"    \"\"\"Return the bias label whose score is closest to ``average_score``.\"\"\"\n",
"    closest_score = min(label_by_score, key=lambda score: abs(score - average_score))\n",
"    return label_by_score[closest_score]\n",
"\n",
"\n",
"def join_explanations(explanations):\n",
"    \"\"\"Concatenate one span's explanations, skipping missing ones so the join cannot fail.\"\"\"\n",
"    return \"\\n----\\n\".join(explanations.dropna().astype(str))\n",
"\n",
"\n",
"# Combine all rows with the same context.span_id into one row: explanations are\n",
"# concatenated, timings and scores are averaged, and the remaining columns keep the\n",
"# first value of the group. The result stays indexed by context.span_id.\n",
"span_bias_classifications = bias_classifications.groupby(\"context.span_id\").agg(\n",
"    {\n",
"        \"explanation\": join_explanations,\n",
"        \"exceptions\": \"first\",\n",
"        \"execution_status\": \"first\",\n",
"        \"execution_seconds\": \"mean\",\n",
"        \"score\": \"mean\",\n",
"    }\n",
")\n",
"span_bias_classifications[\"average_score\"] = span_bias_classifications[\"score\"]\n",
"\n",
"# Label each span from its average score. na_action=\"ignore\" leaves spans without any\n",
"# usable score unlabelled instead of silently defaulting them to \"unbiased\".\n",
"span_bias_classifications.insert(\n",
"    0,\n",
"    \"label\",\n",
"    span_bias_classifications[\"average_score\"].map(nearest_bias_label, na_action=\"ignore\"),\n",
")\n",
"span_bias_classifications.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from phoenix.trace import DocumentEvaluations, SpanEvaluations\n",
"\n",
"# Log the bias results at both granularities under one evaluation name: one result per\n",
"# span and one per retrieved document. The document-level results were previously logged\n",
"# as \"Relevance\", which mislabelled them. Reuses the client created in the first cell.\n",
"pxc.log_evaluations(\n",
"    SpanEvaluations(\n",
"        dataframe=span_bias_classifications,\n",
"        eval_name=\"Bias Detection\",\n",
"    ),\n",
"    DocumentEvaluations(\n",
"        dataframe=bias_classifications,\n",
"        eval_name=\"Bias Detection\",\n",
"    ),\n",
")"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}