# run_analysis.py
"""
Direct Analysis Runner - Generate Excel with Modular Architecture
Run this script to create your experiment analysis Excel file directly.
The analysis functions are now modular and reusable:
- Statistical analysis (t-tests): src/statflow/analysis/statistical_analysis.py
- Table generators: src/statflow/analysis/table_generators.py
- AI insights: src/statflow/analysis/ai_insights.py
Usage:
python run_analysis.py
"""
import json
import sys
import os
from datetime import datetime
# Add src to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
import mysql.connector
from mysql.connector import Error
import openpyxl
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
import openpyxl.utils
from statflow.query_builder import QueryBuilder
from statflow.analysis import (
create_ttest_table,
generate_ai_insights,
create_avg_metric_table,
create_section_summary_table,
create_avg_gq_table,
generate_word_report_with_real_data,
create_ssd_vs_alt_ttest_table
)
def load_config():
    """Load database configuration from config.json in the current directory.

    Returns:
        dict: Parsed configuration (MySQL credentials, output paths, etc.).

    Exits:
        Calls sys.exit(1) with a user-facing message when the file is
        missing or contains invalid JSON.
    """
    try:
        # Explicit UTF-8 so the config parses identically on every platform
        # (the default locale encoding on Windows is not UTF-8).
        with open('config.json', 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print("[!] Error: config.json not found!")
        print("    Please create config.json with your MySQL credentials.")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"[!] Error parsing config.json: {e}")
        sys.exit(1)
def run_combined_query(config):
    """
    Run the combined query across both databases.

    Returns:
        tuple: (columns, data, query_builder) or (None, None, None) on error
    """
    primary = config['mysql_dump']
    try:
        # One server connection is enough: the generated query joins every
        # configured database schema by name.
        connection = mysql.connector.connect(
            host=primary['host'],
            port=primary['port'],
            user=primary['user'],
            password=primary['password']
        )
        cursor = connection.cursor(dictionary=True)
        # Collect every 'mysql_dump*' database config (sorted for a
        # deterministic order) and hand them to the modular query builder.
        databases = [config[key] for key in sorted(config) if key.startswith('mysql_dump')]
        builder = QueryBuilder(databases)
        print("[*] Running query for userIds with complete analysis...")
        cursor.execute(builder.build_query())
        rows = cursor.fetchall()
        names = cursor.column_names
        cursor.close()
        connection.close()
        # Report the expert/novice split as a quick sanity check.
        n_experts = sum(r['pre_test_binary'] == 'expert' for r in rows)
        n_novices = sum(r['pre_test_binary'] == 'novice' for r in rows)
        print(f"[+] Fetched {len(rows)} users: {n_experts} experts, {n_novices} novices")
        return names, rows, builder
    except Error as e:
        print(f"[!] MySQL Error: {e}")
        return None, None, None
def export_to_excel(columns, data, output_path, query_builder, config, update_existing=True):
    """
    Export data to Excel with comprehensive formatting and analysis.

    Builds (or refreshes) the "Expert_Novice_Analysis" sheet: merged section
    headers, color-coded data rows, an AVERAGE formula row, bold section
    borders, freeze panes, and the modular analysis tables (t-tests,
    per-graph averages, SSD-vs-ALT summaries) positioned under each section.

    Args:
        columns: List of column names from query
        data: List of dictionaries containing the data
        output_path: Path where Excel file should be saved
        query_builder: QueryBuilder instance for getting header sections
        config: Configuration dictionary
        update_existing: If True, update existing file; if False, create new

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Load existing workbook or create new one
        if update_existing and os.path.exists(output_path):
            print(f"[*] Updating existing file: {output_path}")
            workbook = openpyxl.load_workbook(output_path)
            # Remove old sheet names if they exist (to prevent duplicates)
            if "Combined_Data" in workbook.sheetnames:
                del workbook["Combined_Data"]
            if "Expert_Novice_Analysis" in workbook.sheetnames:
                del workbook["Expert_Novice_Analysis"]
            # Create new Expert_Novice_Analysis sheet at the beginning
            if workbook.sheetnames:
                workbook.create_sheet("Expert_Novice_Analysis", 0)
            else:
                workbook.create_sheet("Expert_Novice_Analysis")
            sheet = workbook["Expert_Novice_Analysis"]
        else:
            print(f"[*] Creating new file: {output_path}")
            workbook = openpyxl.Workbook()
            sheet = workbook.active
            sheet.title = "Expert_Novice_Analysis"
        # Helper alias for get_column_letter
        gcl = get_column_letter
        # Shared immutable style objects, built once and reused on every cell
        # (openpyxl styles are immutable, so sharing them is safe and avoids
        # re-allocating identical Side/Border objects inside the cell loops).
        light_grey_border = Side(style='thin', color='D3D3D3')
        grey_box = Border(
            left=light_grey_border,
            right=light_grey_border,
            top=light_grey_border,
            bottom=light_grey_border
        )
        # Row 1: Merged headers (dynamic from query builder)
        header_sections = query_builder.get_header_sections()
        for section in header_sections:
            cell_range = f"{section['start_col']}1:{section['end_col']}1"
            sheet.merge_cells(cell_range)
            cell = sheet[f"{section['start_col']}1"]
            cell.value = section['name']
            cell.font = Font(bold=True, size=12, color="1F4E78")
            cell.alignment = Alignment(horizontal="center", vertical="center")
        # Row 2: Column headers
        column_mapping = query_builder.get_column_mapping()
        for col_idx, col_name in enumerate(columns, start=1):
            cell = sheet.cell(row=2, column=col_idx)
            display_name = column_mapping.get(col_name, col_name)
            cell.value = display_name
            cell.font = Font(bold=True, size=10)
            cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
            # Add light grey border to headers
            cell.border = grey_box
        # Map column indices to section names for coloring
        col_to_section = {}
        for section in header_sections:
            start_col_idx = openpyxl.utils.column_index_from_string(section['start_col'])
            end_col_idx = openpyxl.utils.column_index_from_string(section['end_col'])
            for col_idx in range(start_col_idx, end_col_idx + 1):
                col_to_section[col_idx] = section['name']
        # Track the row where experts end (to add bold line separator)
        expert_end_row = None
        # Define colors for each section
        section_colors = {
            'User characteristics': 'FDE9D9',  # Orange
            'Time': 'E8F8E8',                  # Green
            'Accuracy': 'E6F2FF',              # Blue
            'Satisfaction': 'F4EBFA',          # Plum
            'Graph Questions': 'F2F2F2'        # Light Gray
        }
        # Data rows (starting from row 3)
        data_start_row = 3
        for row_idx, row_data in enumerate(data, start=data_start_row):
            for col_idx, col_name in enumerate(columns, start=1):
                cell = sheet.cell(row=row_idx, column=col_idx)
                value = row_data.get(col_name)
                # Handle None values
                if value is None:
                    cell.value = ""
                else:
                    cell.value = value
                # Apply section-based coloring and light grey borders
                section_name = col_to_section.get(col_idx)
                if section_name and section_name in section_colors:
                    color = section_colors[section_name]
                    cell.fill = PatternFill(start_color=color, end_color=color, fill_type="solid")
                    # Add light grey border for better distinction
                    cell.border = grey_box
                # Alignment
                cell.alignment = Alignment(horizontal="center", vertical="center")
                # Number formatting for time columns (if it looks like a time value)
                if col_name.startswith('TIME_') and isinstance(value, (int, float)):
                    cell.number_format = '0.00'
            # Track where experts end (data is ordered experts first, so the
            # last row flagged 'expert' marks the separator position)
            if row_data.get('pre_test_binary') == 'expert':
                expert_end_row = row_idx
        # Add bold border between experts and novices
        if expert_end_row:
            thick_bottom = Side(style='medium', color='000000')
            for col_idx in range(1, len(columns) + 1):
                cell = sheet.cell(row=expert_end_row, column=col_idx)
                current_border = cell.border
                cell.border = Border(
                    left=current_border.left,
                    right=current_border.right,
                    top=current_border.top,
                    bottom=thick_bottom
                )
        # AVERAGE row (after all data)
        average_row = data_start_row + len(data)
        # Set row height for average row
        sheet.row_dimensions[average_row].height = 20
        for col_idx, col_name in enumerate(columns, start=1):
            cell = sheet.cell(row=average_row, column=col_idx)
            # Special handling for certain columns
            if col_name == 'unique_userId':
                cell.value = "AVERAGE"
                cell.font = Font(bold=True, size=11, color="FFFFFF")
            elif col_name == 'pre_study_testscore':
                cell.value = ""  # Don't average the test score
            elif col_name == 'pre_test_binary':
                cell.value = ""  # Don't average binary values
            else:
                # Calculate average for numeric columns (Excel formula so the
                # value stays live if the sheet is edited later)
                col_letter = gcl(col_idx)
                cell.value = f'=AVERAGE({col_letter}{data_start_row}:{col_letter}{average_row-1})'
                cell.number_format = '0.00'
                cell.font = Font(bold=True, size=10, color="FFFFFF")
            # Orange background for average row with light grey border
            cell.fill = PatternFill(start_color="F4B084", end_color="F4B084", fill_type="solid")
            cell.alignment = Alignment(horizontal="center", vertical="center")
            # Add light grey border for consistency
            cell.border = grey_box
        # Add bold borders around each section
        thick_border = Side(style='medium', color='000000')
        for section in header_sections:
            # Get column indices
            start_col_idx = openpyxl.utils.column_index_from_string(section['start_col'])
            end_col_idx = openpyxl.utils.column_index_from_string(section['end_col'])
            # Apply thick borders to section edges (rows 1 to average_row);
            # rebuild from the cell's current border so existing edges survive
            for row_idx in range(1, average_row + 1):
                # Left border
                left_cell = sheet.cell(row=row_idx, column=start_col_idx)
                left_cell.border = Border(
                    left=thick_border,
                    right=left_cell.border.right,
                    top=left_cell.border.top,
                    bottom=left_cell.border.bottom
                )
                # Right border
                right_cell = sheet.cell(row=row_idx, column=end_col_idx)
                right_cell.border = Border(
                    left=right_cell.border.left,
                    right=thick_border,
                    top=right_cell.border.top,
                    bottom=right_cell.border.bottom
                )
        # Auto-adjust column widths
        for col_idx in range(1, len(columns) + 1):
            max_length = 0
            col_letter = gcl(col_idx)
            for row in sheet[col_letter]:
                try:
                    if row.value:
                        max_length = max(max_length, len(str(row.value)))
                except Exception:
                    # Merged/odd cells may not stringify cleanly; skip them
                    pass
            adjusted_width = min(max_length + 2, 30)  # Cap at 30
            sheet.column_dimensions[col_letter].width = max(adjusted_width, 12)
        # Freeze panes: Keep headers (rows 1-2) and first column (unique_userId) visible while scrolling
        # Freeze at B3 means everything above row 3 and left of column B will be frozen
        sheet.freeze_panes = 'B3'
        # ===== ADD STATISTICAL ANALYSIS TABLES =====
        # Determine row ranges for experts and novices
        expert_start = data_start_row
        expert_end = expert_end_row if expert_end_row else data_start_row
        novice_start = expert_end + 1 if expert_end_row else data_start_row
        novice_end = average_row - 1
        # Start t-test tables below the average row
        ttest_start_row = average_row + 3
        # Find column positions for each section
        time_section_col = None
        accuracy_section_col = None
        satisfaction_section_col = None
        graph_questions_section_col = None
        for section in header_sections:
            if section['name'] == 'Time':
                time_section_col = openpyxl.utils.column_index_from_string(section['start_col'])
            elif section['name'] == 'Accuracy':
                accuracy_section_col = openpyxl.utils.column_index_from_string(section['start_col'])
            elif section['name'] == 'Satisfaction':
                satisfaction_section_col = openpyxl.utils.column_index_from_string(section['start_col'])
            elif section['name'] == 'Graph Questions':
                graph_questions_section_col = openpyxl.utils.column_index_from_string(section['start_col'])
        # Create TIME T-TEST table under Time section
        if time_section_col:
            # Graphs 1-5 are SSD, 6-10 are the ALT condition
            ssd_time_graphs = [
                ('TIME_EQU_1', 'EQU-1'), ('TIME_STU_2', 'STU-2'), ('TIME_GOA_3', 'GOA-3'),
                ('TIME_PAT_4', 'PAT-4'), ('TIME_ENR_5', 'ENR-5')
            ]
            alt_time_graphs = [
                ('TIME_EQU_6', 'EQU-6'), ('TIME_STU_7', 'STU-7'), ('TIME_GOA_8', 'GOA-8'),
                ('TIME_PAT_9', 'PAT-9'), ('TIME_ENR_10', 'ENR-10')
            ]
            time_aggregates = [
                ('AVG_TIME_SSD', 'Avg SSD'),
                ('AVG_TIME_ALT', 'Avg ALT')
            ]
            time_overall = ('AVG_OVERALL_TIME', 'Overall Avg')
            time_ttest_end_row = create_ttest_table(
                sheet, ttest_start_row, time_section_col,
                "TIME T-TEST", "27AE60", "16A085", "E8F8F5",
                ssd_time_graphs, alt_time_graphs, columns,
                expert_start, expert_end, novice_start, novice_end,
                time_aggregates, time_overall
            )
            # Add Average Time table below the t-test table
            avg_time_table_start = time_ttest_end_row + 2
            avg_time_end_row = create_avg_metric_table(
                sheet, avg_time_table_start, time_section_col, columns,
                data_start_row, average_row - 1, 'TIME',
                "AVERAGE TIME IN SECONDS BY GRAPH TYPE", "27AE60", "16A085"
            )
            # Add Time Summary table (SSD vs ALT)
            time_summary_start = avg_time_end_row + 2
            create_section_summary_table(
                sheet, time_summary_start, time_section_col, columns,
                average_row, 'TIME',
                "OVERALL TIME: SSD vs ALT", "27AE60", "16A085"
            )
        # Create ACCURACY T-TEST table under Accuracy section
        if accuracy_section_col:
            ssd_acc_graphs = [
                ('ACC_EQU_1', 'EQU-1'), ('ACC_STU_2', 'STU-2'), ('ACC_GOA_3', 'GOA-3'),
                ('ACC_PAT_4', 'PAT-4'), ('ACC_ENR_5', 'ENR-5')
            ]
            alt_acc_graphs = [
                ('ACC_EQU_6', 'EQU-6'), ('ACC_STU_7', 'STU-7'), ('ACC_GOA_8', 'GOA-8'),
                ('ACC_PAT_9', 'PAT-9'), ('ACC_ENR_10', 'ENR-10')
            ]
            accuracy_aggregates = [
                ('AVG_ACC_SSD', 'Avg SSD'),
                ('AVG_ACC_ALT', 'Avg ALT')
            ]
            acc_overall = ('AVG_OVERALL_ACC', 'Overall Avg')
            acc_ttest_end_row = create_ttest_table(
                sheet, ttest_start_row, accuracy_section_col,
                "ACCURACY T-TEST", "3498DB", "2980B9", "EBF5FB",
                ssd_acc_graphs, alt_acc_graphs, columns,
                expert_start, expert_end, novice_start, novice_end,
                accuracy_aggregates, acc_overall
            )
            # Add Average Accuracy table below the t-test table
            avg_acc_table_start = acc_ttest_end_row + 2
            avg_acc_end_row = create_avg_metric_table(
                sheet, avg_acc_table_start, accuracy_section_col, columns,
                data_start_row, average_row - 1, 'ACCURACY',
                "AVERAGE ACCURACY BY GRAPH TYPE", "3498DB", "2980B9"
            )
            # Add Accuracy Summary table (SSD vs ALT)
            acc_summary_start = avg_acc_end_row + 2
            create_section_summary_table(
                sheet, acc_summary_start, accuracy_section_col, columns,
                average_row, 'ACCURACY',
                "OVERALL ACCURACY: SSD vs ALT", "3498DB", "2980B9"
            )
        # Create SATISFACTION T-TEST table under Satisfaction section
        if satisfaction_section_col:
            ssd_sat_graphs = [
                ('SAT_EQU_1', 'EQU-1'), ('SAT_STU_2', 'STU-2'), ('SAT_GOA_3', 'GOA-3'),
                ('SAT_PAT_4', 'PAT-4'), ('SAT_ENR_5', 'ENR-5')
            ]
            alt_sat_graphs = [
                ('SAT_EQU_6', 'EQU-6'), ('SAT_STU_7', 'STU-7'), ('SAT_GOA_8', 'GOA-8'),
                ('SAT_PAT_9', 'PAT-9'), ('SAT_ENR_10', 'ENR-10')
            ]
            satisfaction_aggregates = [
                ('AVG_SAT_SSD', 'Avg SSD'),
                ('AVG_SAT_ALT', 'Avg ALT')
            ]
            sat_overall = ('AVG_OVERALL_SAT', 'Overall Avg')
            sat_ttest_end_row = create_ttest_table(
                sheet, ttest_start_row, satisfaction_section_col,
                "SATISFACTION T-TEST", "9B59B6", "8E44AD", "F4ECF7",
                ssd_sat_graphs, alt_sat_graphs, columns,
                expert_start, expert_end, novice_start, novice_end,
                satisfaction_aggregates, sat_overall
            )
            # Add Average Satisfaction table below the t-test table
            avg_sat_table_start = sat_ttest_end_row + 2
            avg_sat_end_row = create_avg_metric_table(
                sheet, avg_sat_table_start, satisfaction_section_col, columns,
                data_start_row, average_row - 1, 'SATISFACTION',
                "AVERAGE SATISFACTION BY GRAPH TYPE", "9B59B6", "8E44AD"
            )
            # Add Satisfaction Summary table (SSD vs ALT)
            sat_summary_start = avg_sat_end_row + 2
            create_section_summary_table(
                sheet, sat_summary_start, satisfaction_section_col, columns,
                average_row, 'SATISFACTION',
                "OVERALL SATISFACTION: SSD vs ALT", "9B59B6", "8E44AD"
            )
        # Create GRAPH QUESTIONS T-TEST table under Graph Questions section
        if graph_questions_section_col:
            # Graph questions are only for graphs 1-5 (SSD graphs)
            # Each graph has 2 questions (GQ1 and GQ2)
            gq_questions = [
                ('GQ_EQU_1_1', 'EQU-1-1'), ('GQ_EQU_1_2', 'EQU-1-2'),
                ('GQ_STU_2_1', 'STU-2-1'), ('GQ_STU_2_2', 'STU-2-2'),
                ('GQ_GOA_3_1', 'GOA-3-1'), ('GQ_GOA_3_2', 'GOA-3-2'),
                ('GQ_PAT_4_1', 'PAT-4-1'), ('GQ_PAT_4_2', 'PAT-4-2'),
                ('GQ_ENR_5_1', 'ENR-5-1'), ('GQ_ENR_5_2', 'ENR-5-2')
            ]
            # No ALT graphs for graph questions, so pass empty list
            alt_gq_questions = []
            gq_aggregates = [
                ('TOTAL_GQ_SCORE', 'Total GQ Score'),
                ('AVG_GQ_SCORE', 'Avg GQ Score')
            ]
            gq_overall = ('AVG_GQ_SCORE', 'Overall Avg')
            gq_ttest_end_row = create_ttest_table(
                sheet, ttest_start_row, graph_questions_section_col,
                "GRAPH QUESTIONS T-TEST", "F39C12", "E67E22", "FEF5E7",
                gq_questions, alt_gq_questions, columns,
                expert_start, expert_end, novice_start, novice_end,
                gq_aggregates, gq_overall
            )
            # Add Average Graph Questions table below the t-test table
            avg_gq_table_start = gq_ttest_end_row + 2
            create_avg_gq_table(
                sheet, avg_gq_table_start, graph_questions_section_col, columns,
                data_start_row, average_row - 1,
                "AVERAGE GRAPH QUESTIONS PERFORMANCE BY GRAPH TYPE", "F39C12", "E67E22"
            )
        # ===== GENERATE AI INSIGHTS =====
        # AI insights generation is now handled by the modular function
        generate_ai_insights(workbook, data, columns, config)
        # Save workbook
        workbook.save(output_path)
        print(f"[+] Excel file saved: {output_path}")
        print(f"    - Row 1: Section headers (User characteristics, Time, Accuracy, Satisfaction, Graph Questions)")
        print(f"    - Row 2: Bold column headers")
        print(f"    - Row 3-{average_row-1}: {len(data)} data rows with color-coded sections:")
        print(f"      - User characteristics: Orange")
        print(f"      - Time: Green")
        print(f"      - Accuracy: Blue")
        print(f"      - Satisfaction: Plum")
        print(f"      - Graph Questions: Light Gray")
        print(f"    - Bold horizontal line separating Experts from Novices")
        print(f"    - Row {average_row}: AVERAGE row (orange background with white text)")
        print(f"    - Bold vertical borders around each section for visual distinction")
        print(f"    - Light grey borders on all colored cells for better readability")
        print(f"    - Freeze panes: Headers (rows 1-2) and first column stay visible when scrolling")
        print(f"    - T-Test Analysis tables starting at row {ttest_start_row}:")
        print(f"      * TIME T-Test positioned under Time section (includes SSD, ALT, Aggregates, Total Overall)")
        print(f"      * ACCURACY T-Test positioned under Accuracy section (includes SSD, ALT, Aggregates, Total Overall)")
        print(f"      * SATISFACTION T-Test positioned under Satisfaction section (includes SSD, ALT, Aggregates, Total Overall)")
        print(f"      * GRAPH QUESTIONS T-Test positioned under Graph Questions section (all 10 questions, Aggregates, Total Overall)")
        print(f"    - Average Metric tables (below each t-test):")
        print(f"      * Average Time in Seconds by Graph Type")
        print(f"      * Average Accuracy by Graph Type")
        print(f"      * Average Satisfaction by Graph Type")
        print(f"      * Average Graph Questions Performance by Graph Type")
        print(f"    - Overall Summary tables (SSD vs ALT comparison under each section)")
        if config.get('openai', {}).get('enabled', False):
            print(f"    - AI Insights sheet with comprehensive analysis")
        print(f"    - Other sheets preserved (if any existed)")
        return True
    except Exception as e:
        print(f"[!] Error creating Excel file: {e}")
        return False
def main():
    """Main execution.

    Pipeline: load config.json -> run the combined MySQL query ->
    export the formatted Excel workbook -> generate the Word report,
    printing progress and a detailed summary along the way.
    """
    print("=" * 60)
    print("  EXPERIMENT ANALYSIS - Direct Excel Export")
    print("  Now with Modular Architecture!")
    print("=" * 60)
    print()
    # Load configuration
    print("[*] Loading configuration from config.json...")
    config = load_config()
    # Check credentials: refuse to run with the placeholder template values
    if config['mysql_dump']['user'] == 'your_username':
        print()
        print("[!] WARNING: You need to update config.json first!")
        print("    Please edit config.json and set your MySQL credentials:")
        print("    - user: your MySQL username")
        print("    - password: your MySQL password")
        print()
        sys.exit(1)
    print(f"[+] Configuration loaded")
    print(f"    - Database 1: {config['mysql_dump']['database']}")
    print(f"    - Database 2: {config['mysql_dump_2']['database']}")
    print()
    # Run query
    columns, data, query_builder = run_combined_query(config)
    if columns is None or data is None or query_builder is None:
        print()
        print("[!] Failed to fetch data. Please check:")
        print("    - MySQL server is running")
        print("    - Database credentials are correct")
        print("    - Both databases exist")
        print("    - Table 'mainstudy_responses' exists in both databases")
        sys.exit(1)
    print()
    # Prepare output path
    output_dir = config['excel_output']['default_path']
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        print(f"[*] Creating output directory: {output_dir}")
        os.makedirs(output_dir)
    # Use fixed filename (will update same file each time)
    output_file = os.path.join(output_dir, "experiment_analysis.xlsx")
    # Check if file exists
    if os.path.exists(output_file):
        print(f"[!] File already exists: {output_file}")
        print(f"    The Combined_Data sheet will be refreshed with latest data.")
    # Export to Excel (will update if exists)
    print(f"[*] Exporting to Excel...")
    success = export_to_excel(columns, data, output_file, query_builder, config, update_existing=True)
    # Generate Word document
    print(f"[*] Generating Word document...")
    word_output_file = os.path.join(output_dir, "experiment_analysis.docx")
    # Extract analysis data for Word document (currently left empty; the
    # Word generator recomputes what it needs from the raw data)
    ttest_results = {}
    avg_metrics = {}
    summary_tables = {}
    # Generate AI insights if enabled
    ai_insights = None
    if config.get('openai', {}).get('enabled', False):
        try:
            # NOTE(review): export_to_excel calls generate_ai_insights with
            # (workbook, data, columns, config) while this call passes only
            # (data, config) — verify which signature the analysis module
            # actually exposes.
            ai_insights = generate_ai_insights(data, config)
        except Exception as e:
            print(f"[!] AI insights generation failed: {e}")
            ai_insights = None
    # Generate Word document; a failure here is non-fatal because the
    # Excel export has already been attempted independently
    try:
        word_path = generate_word_report_with_real_data(
            data, columns, config,
            ttest_results, avg_metrics, summary_tables,
            ai_insights, word_output_file
        )
        print(f"[+] Word document saved: {word_path}")
    except Exception as e:
        print(f"[!] Word document generation failed: {e}")
        print("    Excel file was still created successfully")
    print()
    print("=" * 60)
    if success:
        print("[SUCCESS]")
        print()
        print(f"Your analysis is ready at:")
        print(f"  Excel: {output_file}")
        print(f"  Word: {word_output_file}")
        print()
        print("What's in the files:")
        print(f"  [+] Row 1 Headers:")
        print(f"      - 'User characteristics' (B1:C1)")
        print(f"      - 'Time' (D1:S1) - 16 columns with SSD/ALT/Overall aggregates")
        print(f"      - 'Accuracy' (T1:AI1) - 16 columns with SSD/ALT/Overall aggregates")
        print(f"      - 'Satisfaction' (AJ1:AY1) - 16 columns with SSD/ALT/Overall aggregates")
        print(f"      - 'Graph Questions' (AZ1:BK1) - 12 columns with Total/Avg aggregates")
        print(f"  [+] Row 2: Column headers")
        print(f"      - Unique UserId | pre_study_testscore | pre_test_binary")
        print(f"      - Time: EQU-1 to ENR-10, Total/Avg SSD, Total/Avg ALT, Total/Avg Overall Time")
        print(f"      - Accuracy: EQU-1 to ENR-10, Total/Avg SSD, Total/Avg ALT, Total/Avg Overall Acc")
        print(f"      - Satisfaction: EQU-1 to ENR-10, Total/Avg SSD, Total/Avg ALT, Total/Avg Overall Sat")
        print(f"      - Graph Questions: EQU-1-1/2 to ENR-5-1/2, Total/Avg GQ Score")
        print(f"  [+] Row 3-{3+len(data)-1}: {len(data)} users with complete data")
        print(f"  [+] Color-coded sections (Orange, Green, Blue, Plum, Light Gray)")
        print(f"  [+] Bold horizontal line separating Experts from Novices")
        print(f"  [+] Row {3+len(data)}: AVERAGE row (orange background, white text)")
        print(f"      Note: pre_study_testscore excluded from average")
        print(f"  [+] Bold vertical borders separating each section visually")
        print(f"  [+] Light grey borders on all colored cells for better distinction")
        print(f"  [+] Freeze panes enabled: Headers and first column stay visible while scrolling")
        print(f"  [+] T-Test Analysis tables below the data:")
        print(f"      - TIME T-Test positioned under Time section columns")
        print(f"      - ACCURACY T-Test positioned under Accuracy section columns")
        print(f"      - SATISFACTION T-Test positioned under Satisfaction section columns")
        print(f"      - GRAPH QUESTIONS T-Test positioned under Graph Questions section columns")
        print(f"      - Time/Accuracy/Satisfaction compare Experts vs Novices for:")
        print(f"        * SSD (1-5)")
        print(f"        * ALT (6-10)")
        print(f"        * Aggregates (Avg SSD, Avg ALT)")
        print(f"        * TOTAL OVERALL (Highlighted row with overall average comparison)")
        print(f"      - Graph Questions compares Experts vs Novices for:")
        print(f"        * All 10 questions (EQU-1-1/2, STU-2-1/2, GOA-3-1/2, PAT-4-1/2, ENR-5-1/2)")
        print(f"        * Aggregates (Total GQ Score, Avg GQ Score)")
        print(f"        * TOTAL OVERALL (Highlighted row with overall average comparison)")
        print(f"  [+] Average Metric tables by Graph Type (EQU, STU, GOA, PAT, ENR):")
        print(f"      - Average Time in Seconds table under Time section")
        print(f"      - Average Accuracy table under Accuracy section")
        print(f"      - Average Satisfaction table under Satisfaction section")
        print(f"      - Average Graph Questions Performance table under Graph Questions section")
        print(f"      - Time/Accuracy/Satisfaction show SSD vs ALT values for all graph types")
        print(f"      - Graph Questions shows GQ1 vs GQ2 values for all graph types")
        print(f"  [+] Overall Summary tables (SSD vs ALT comparison):")
        print(f"      - OVERALL TIME: SSD vs ALT (under Time section)")
        print(f"      - OVERALL ACCURACY: SSD vs ALT (under Accuracy section)")
        print(f"      - OVERALL SATISFACTION: SSD vs ALT (under Satisfaction section)")
        if config.get('openai', {}).get('enabled', False):
            print(f"  [+] AI-Generated Insights (second sheet):")
            print(f"      - Comprehensive analysis of experimental results")
            print(f"      - Key insights, recommendations, and practical implications")
        print(f"  [+] Word Document Features:")
        print(f"      - Professional academic report format")
        print(f"      - All analysis tables in clean, readable format")
        print(f"      - Statistical t-test results with interpretation")
        print(f"      - Performance metrics by graph type")
        print(f"      - Overall summary tables (SSD vs ALT)")
        print(f"      - AI-generated insights and recommendations")
        print(f"      - Experiment summary and participant statistics")
        # Count experts and novices by database (userIds are prefixed with
        # 'L1_' / 'L2_' according to their source database)
        l1_experts = sum(1 for row in data if row['pre_test_binary'] == 'expert' and row['unique_userId'].startswith('L1_'))
        l2_experts = sum(1 for row in data if row['pre_test_binary'] == 'expert' and row['unique_userId'].startswith('L2_'))
        l1_novices = sum(1 for row in data if row['pre_test_binary'] == 'novice' and row['unique_userId'].startswith('L1_'))
        l2_novices = sum(1 for row in data if row['pre_test_binary'] == 'novice' and row['unique_userId'].startswith('L2_'))
        print(f"  1. L1 experts: {l1_experts}")
        print(f"  2. L2 experts: {l2_experts}")
        print(f"  3. L1 novices: {l1_novices}")
        print(f"  4. L2 novices: {l2_novices}")
        print()
        print("[+] The code is now modular and reusable!")
        print("    Analysis modules are in: src/statflow/analysis/")
        print("    - statistical_analysis.py: T-test tables")
        print("    - table_generators.py: Average metrics and summaries")
        print("    - ai_insights.py: AI-powered insights")
    else:
        print("[!] Failed to create Excel file")
        print("    Check the error messages above")
    print("=" * 60)
    print()
# Script entry point: run the full analysis pipeline when executed directly.
if __name__ == "__main__":
    main()