"""
A/B testing module for App Store Optimization.
Plans and tracks A/B tests for metadata and visual assets.
"""
from typing import Dict, List, Any, Optional
import math
class ABTestPlanner:
    """Plans and tracks A/B tests for ASO elements.

    The planner keeps every designed test in ``self.active_tests`` (a list of
    plain dicts) and offers helpers to size a test, evaluate its results with
    a two-proportion z-test, and turn the outcome into recommendations.
    """

    # Minimum detectable effect sizes (conservative estimates)
    MIN_EFFECT_SIZES = {
        'icon': 0.10,         # 10% conversion improvement
        'screenshot': 0.08,   # 8% conversion improvement
        'title': 0.05,        # 5% conversion improvement
        'description': 0.03   # 3% conversion improvement
    }

    # Statistical confidence levels
    CONFIDENCE_LEVELS = {
        'high': 0.95,         # 95% confidence
        'standard': 0.90,     # 90% confidence
        'exploratory': 0.80   # 80% confidence
    }

    # Rounded z-scores for the most common percentiles. They are kept (rather
    # than replaced by exact quantiles) so sample sizes computed for the
    # built-in confidence levels stay identical to earlier results.
    _Z_SCORE_TABLE = {
        0.80: 0.84,
        0.85: 1.04,
        0.90: 1.28,
        0.95: 1.645,
        0.975: 1.96,
        0.99: 2.33
    }

    def __init__(self):
        """Initialize A/B test planner with an empty list of active tests."""
        self.active_tests = []

    def design_test(
        self,
        test_type: str,
        variant_a: Dict[str, Any],
        variant_b: Dict[str, Any],
        hypothesis: str,
        success_metric: str = 'conversion_rate'
    ) -> Dict[str, Any]:
        """
        Design an A/B test with hypothesis and variables.

        The design is appended to ``self.active_tests`` so it can later be
        looked up by its ``test_id``.

        Args:
            test_type: Type of test ('icon', 'screenshot', 'title', 'description')
            variant_a: Control variant details
            variant_b: Test variant details
            hypothesis: Expected outcome hypothesis
            success_metric: Metric to optimize

        Returns:
            Test design with configuration
        """
        test_design = {
            'test_id': self._generate_test_id(test_type),
            'test_type': test_type,
            'hypothesis': hypothesis,
            'variants': {
                'a': {
                    'name': 'Control',
                    'details': variant_a,
                    'traffic_split': 0.5
                },
                'b': {
                    'name': 'Variation',
                    'details': variant_b,
                    'traffic_split': 0.5
                }
            },
            'success_metric': success_metric,
            'secondary_metrics': self._get_secondary_metrics(test_type),
            # Unknown test types fall back to a 5% minimum effect.
            'minimum_effect_size': self.MIN_EFFECT_SIZES.get(test_type, 0.05),
            'recommended_confidence': 'standard',
            'best_practices': self._get_test_best_practices(test_type)
        }
        self.active_tests.append(test_design)
        return test_design

    def calculate_sample_size(
        self,
        baseline_conversion: float,
        minimum_detectable_effect: float,
        confidence_level: str = 'standard',
        power: float = 0.80
    ) -> Dict[str, Any]:
        """
        Calculate required sample size for statistical significance.

        Args:
            baseline_conversion: Current conversion rate, strictly between 0 and 1
            minimum_detectable_effect: Relative effect size to detect (non-zero)
            confidence_level: 'high', 'standard', or 'exploratory'
            power: Statistical power, strictly between 0 and 1 (typically 0.80 or 0.90)

        Returns:
            Sample size calculation with duration estimates

        Raises:
            KeyError: If ``confidence_level`` is not a key of CONFIDENCE_LEVELS.
            ValueError: If a rate, the effect size or the power is outside
                its valid range.
        """
        alpha = 1 - self.CONFIDENCE_LEVELS[confidence_level]

        # Fail early with a clear message instead of a ZeroDivisionError or
        # "math domain error" from deep inside the formula.
        if not 0 < baseline_conversion < 1:
            raise ValueError('baseline_conversion must be between 0 and 1 (exclusive)')
        if minimum_detectable_effect == 0:
            raise ValueError('minimum_detectable_effect must be non-zero')
        if not 0 < power < 1:
            raise ValueError('power must be between 0 and 1 (exclusive)')

        # Expected conversion for variant B
        expected_conversion_b = baseline_conversion * (1 + minimum_detectable_effect)
        if not 0 < expected_conversion_b < 1:
            raise ValueError(
                'baseline_conversion * (1 + minimum_detectable_effect) must be '
                'between 0 and 1 (exclusive)'
            )

        # Z-scores for alpha and beta
        z_alpha = self._get_z_score(1 - alpha / 2)  # Two-tailed test
        z_beta = self._get_z_score(power)

        # Pooled variance term 2 * p * (1 - p) of the two proportions
        p_pooled = (baseline_conversion + expected_conversion_b) / 2
        sd_pooled = math.sqrt(2 * p_pooled * (1 - p_pooled))

        # Sample size per variant
        n_per_variant = math.ceil(
            ((z_alpha + z_beta) ** 2 * sd_pooled ** 2) /
            ((expected_conversion_b - baseline_conversion) ** 2)
        )
        total_sample_size = n_per_variant * 2

        # Estimate duration based on typical traffic
        duration_estimates = self._estimate_test_duration(
            total_sample_size,
            baseline_conversion
        )

        return {
            'sample_size_per_variant': n_per_variant,
            'total_sample_size': total_sample_size,
            'baseline_conversion': baseline_conversion,
            'expected_conversion_improvement': minimum_detectable_effect,
            'expected_conversion_b': expected_conversion_b,
            'confidence_level': confidence_level,
            'statistical_power': power,
            'duration_estimates': duration_estimates,
            'recommendations': self._generate_sample_size_recommendations(
                n_per_variant,
                duration_estimates
            )
        }

    def calculate_significance(
        self,
        variant_a_conversions: int,
        variant_a_visitors: int,
        variant_b_conversions: int,
        variant_b_visitors: int
    ) -> Dict[str, Any]:
        """
        Calculate statistical significance of test results.

        Uses a two-proportion z-test with an unpooled standard error and a
        two-tailed p-value. A variant with no visitors is treated as having a
        conversion rate of 0.

        Args:
            variant_a_conversions: Conversions for control
            variant_a_visitors: Visitors for control
            variant_b_conversions: Conversions for variation
            variant_b_visitors: Visitors for variation

        Returns:
            Significance analysis with decision recommendation

        Raises:
            ValueError: If a variant with visitors reports negative
                conversions or more conversions than visitors.
        """
        for label, conversions, visitors in (
            ('variant_a', variant_a_conversions, variant_a_visitors),
            ('variant_b', variant_b_conversions, variant_b_visitors),
        ):
            if visitors > 0 and not 0 <= conversions <= visitors:
                raise ValueError(
                    f'{label}_conversions must be between 0 and {label}_visitors'
                )

        # Calculate conversion rates
        rate_a = variant_a_conversions / variant_a_visitors if variant_a_visitors > 0 else 0
        rate_b = variant_b_conversions / variant_b_visitors if variant_b_visitors > 0 else 0

        # Calculate improvement. The relative lift is undefined for a
        # zero-conversion control and is reported as 0 in that case.
        if rate_a > 0:
            relative_improvement = (rate_b - rate_a) / rate_a
        else:
            relative_improvement = 0
        absolute_improvement = rate_b - rate_a

        # Calculate standard error
        se_a = math.sqrt(rate_a * (1 - rate_a) / variant_a_visitors) if variant_a_visitors > 0 else 0
        se_b = math.sqrt(rate_b * (1 - rate_b) / variant_b_visitors) if variant_b_visitors > 0 else 0
        se_diff = math.sqrt(se_a**2 + se_b**2)

        # Calculate z-score
        z_score = absolute_improvement / se_diff if se_diff > 0 else 0

        # Two-tailed p-value, taken from the lower tail so that very large
        # |z| does not lose precision through 1 - (value close to 1).
        p_value = 2 * self._standard_normal_cdf(-abs(z_score))

        # Determine significance
        is_significant_95 = p_value < 0.05
        is_significant_90 = p_value < 0.10

        # Generate decision. The absolute difference is passed along so the
        # winner is still identified when the relative lift is undefined.
        decision = self._generate_test_decision(
            relative_improvement,
            is_significant_95,
            is_significant_90,
            variant_a_visitors + variant_b_visitors,
            absolute_improvement=absolute_improvement
        )

        return {
            'variant_a': {
                'conversions': variant_a_conversions,
                'visitors': variant_a_visitors,
                'conversion_rate': round(rate_a, 4)
            },
            'variant_b': {
                'conversions': variant_b_conversions,
                'visitors': variant_b_visitors,
                'conversion_rate': round(rate_b, 4)
            },
            'improvement': {
                'absolute': round(absolute_improvement, 4),
                'relative_percentage': round(relative_improvement * 100, 2)
            },
            'statistical_analysis': {
                'z_score': round(z_score, 3),
                'p_value': round(p_value, 4),
                'is_significant_95': is_significant_95,
                'is_significant_90': is_significant_90,
                'confidence_level': '95%' if is_significant_95 else ('90%' if is_significant_90 else 'Not significant')
            },
            'decision': decision
        }

    def track_test_results(
        self,
        test_id: str,
        results_data: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Track ongoing test results and provide recommendations.

        Args:
            test_id: Test identifier
            results_data: Current test results. Must contain
                'variant_a_conversions', 'variant_a_visitors',
                'variant_b_conversions' and 'variant_b_visitors'; may contain
                'required_sample_size' (defaults to 10000).

        Returns:
            Test tracking report with next steps, or ``{'error': ...}`` when
            the test ID is unknown.

        Raises:
            KeyError: If a mandatory key is missing from ``results_data``.
        """
        # Find test
        test = next((t for t in self.active_tests if t['test_id'] == test_id), None)
        if not test:
            return {'error': f'Test {test_id} not found'}

        # Calculate significance
        significance = self.calculate_significance(
            results_data['variant_a_conversions'],
            results_data['variant_a_visitors'],
            results_data['variant_b_conversions'],
            results_data['variant_b_visitors']
        )

        # Calculate test progress
        total_visitors = results_data['variant_a_visitors'] + results_data['variant_b_visitors']
        required_sample = results_data.get('required_sample_size', 10000)
        if required_sample > 0:
            progress_percentage = min((total_visitors / required_sample) * 100, 100)
        else:
            # Nothing is required, so the sample is complete by definition
            # (also avoids dividing by zero).
            progress_percentage = 100

        # Generate recommendations
        recommendations = self._generate_tracking_recommendations(
            significance,
            progress_percentage,
            test['test_type']
        )

        return {
            'test_id': test_id,
            'test_type': test['test_type'],
            'progress': {
                'total_visitors': total_visitors,
                'required_sample_size': required_sample,
                'progress_percentage': round(progress_percentage, 1),
                'is_complete': progress_percentage >= 100
            },
            'current_results': significance,
            'recommendations': recommendations,
            'next_steps': self._determine_next_steps(
                significance,
                progress_percentage
            )
        }

    def generate_test_report(
        self,
        test_id: str,
        final_results: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Generate final test report with insights and recommendations.

        Args:
            test_id: Test identifier
            final_results: Final test results. Must contain the four
                conversion/visitor counts; 'duration_days' is optional.

        Returns:
            Comprehensive test report, or ``{'error': ...}`` when the test ID
            is unknown.

        Raises:
            KeyError: If a mandatory key is missing from ``final_results``.
        """
        test = next((t for t in self.active_tests if t['test_id'] == test_id), None)
        if not test:
            return {'error': f'Test {test_id} not found'}

        significance = self.calculate_significance(
            final_results['variant_a_conversions'],
            final_results['variant_a_visitors'],
            final_results['variant_b_conversions'],
            final_results['variant_b_visitors']
        )

        # Generate insights
        insights = self._generate_test_insights(
            test,
            significance,
            final_results
        )

        # Implementation plan
        implementation_plan = self._create_implementation_plan(
            test,
            significance
        )

        return {
            'test_summary': {
                'test_id': test_id,
                'test_type': test['test_type'],
                'hypothesis': test['hypothesis'],
                'duration_days': final_results.get('duration_days', 'N/A')
            },
            'results': significance,
            'insights': insights,
            'implementation_plan': implementation_plan,
            'learnings': self._extract_learnings(test, significance)
        }

    def _generate_test_id(self, test_type: str) -> str:
        """Generate a test ID that is unique among the active tests.

        The ID is ``<test_type>_<unix timestamp>``. Because the timestamp has
        one-second resolution, a numeric suffix (``_2``, ``_3``, ...) is
        appended when that ID is already taken.
        """
        import time

        base_id = f"{test_type}_{int(time.time())}"
        taken = {t.get('test_id') for t in self.active_tests}
        if base_id not in taken:
            return base_id

        suffix = 2
        while f"{base_id}_{suffix}" in taken:
            suffix += 1
        return f"{base_id}_{suffix}"

    def _get_secondary_metrics(self, test_type: str) -> List[str]:
        """Get secondary metrics to track for test type.

        Unknown test types get ``['tap_through_rate']``.
        """
        metrics_map = {
            'icon': ['tap_through_rate', 'impression_count', 'brand_recall'],
            'screenshot': ['tap_through_rate', 'time_on_page', 'scroll_depth'],
            'title': ['impression_count', 'tap_through_rate', 'search_visibility'],
            'description': ['time_on_page', 'scroll_depth', 'tap_through_rate']
        }
        return metrics_map.get(test_type, ['tap_through_rate'])

    def _get_test_best_practices(self, test_type: str) -> List[str]:
        """Get best practices for specific test type.

        Unknown test types get a single generic recommendation.
        """
        practices_map = {
            'icon': [
                'Test only one element at a time (color vs. style vs. symbolism)',
                'Ensure icon is recognizable at small sizes (60x60px)',
                'Consider cultural context for global audience',
                'Test against top competitor icons'
            ],
            'screenshot': [
                'Test order of screenshots (users see first 2-3)',
                'Use captions to tell story',
                'Show key features and benefits',
                'Test with and without device frames'
            ],
            'title': [
                'Test keyword variations, not major rebrand',
                'Keep brand name consistent',
                'Ensure title fits within character limits',
                'Test on both search and browse contexts'
            ],
            'description': [
                'Test structure (bullet points vs. paragraphs)',
                'Test call-to-action placement',
                'Test feature vs. benefit focus',
                'Maintain keyword density'
            ]
        }
        return practices_map.get(test_type, ['Test one variable at a time'])

    def _estimate_test_duration(
        self,
        required_sample_size: int,
        baseline_conversion: float
    ) -> Dict[str, Any]:
        """Estimate test duration based on typical traffic levels.

        ``baseline_conversion`` is accepted for interface compatibility but is
        not used by the estimate.
        """
        # Assume different daily traffic scenarios
        traffic_scenarios = {
            'low': 100,      # 100 page views/day
            'medium': 1000,  # 1000 page views/day
            'high': 10000    # 10000 page views/day
        }
        estimates = {}
        for scenario, daily_views in traffic_scenarios.items():
            days = math.ceil(required_sample_size / daily_views)
            estimates[scenario] = {
                'daily_page_views': daily_views,
                'estimated_days': days,
                'estimated_weeks': round(days / 7, 1)
            }
        return estimates

    def _generate_sample_size_recommendations(
        self,
        sample_size: int,
        duration_estimates: Dict[str, Any]
    ) -> List[str]:
        """Generate recommendations based on sample size and duration."""
        recommendations = []
        if sample_size > 50000:
            recommendations.append(
                "Large sample size required - consider testing smaller effect size or increasing traffic"
            )
        if duration_estimates['medium']['estimated_days'] > 30:
            recommendations.append(
                "Long test duration - consider higher minimum detectable effect or focus on high-impact changes"
            )
        if duration_estimates['low']['estimated_days'] > 60:
            recommendations.append(
                "Insufficient traffic for reliable testing - consider user acquisition or broader targeting"
            )
        if not recommendations:
            recommendations.append("Sample size and duration are reasonable for this test")
        return recommendations

    def _get_z_score(self, percentile: float) -> float:
        """Get the z-score (standard normal quantile) for a percentile.

        Common percentiles use the rounded values of ``_Z_SCORE_TABLE``; the
        match is tolerant so floating-point noise (for example from
        ``1 - alpha / 2``) cannot miss an entry. Any other percentile is
        computed exactly rather than silently defaulting to 1.96.

        Raises:
            ValueError: If ``percentile`` is not strictly between 0 and 1.
        """
        for known_percentile, z_value in self._Z_SCORE_TABLE.items():
            if math.isclose(percentile, known_percentile, rel_tol=0.0, abs_tol=1e-9):
                return z_value

        if not 0 < percentile < 1:
            raise ValueError('percentile must be between 0 and 1 (exclusive)')

        # Local import, matching the file's existing style for rarely used modules.
        from statistics import NormalDist
        return NormalDist().inv_cdf(percentile)

    def _standard_normal_cdf(self, z: float) -> float:
        """Standard normal cumulative distribution function.

        Computed with the complementary error function, which stays accurate
        in both tails.
        """
        return 0.5 * math.erfc(-z / math.sqrt(2.0))

    def _generate_test_decision(
        self,
        improvement: float,
        is_significant_95: bool,
        is_significant_90: bool,
        total_visitors: int,
        absolute_improvement: Optional[float] = None
    ) -> Dict[str, Any]:
        """Generate test decision and recommendation.

        Args:
            improvement: Relative improvement of B over A (0.10 == 10%)
            is_significant_95: Whether the result is significant at 95%
            is_significant_90: Whether the result is significant at 90%
            total_visitors: Visitors across both variants
            absolute_improvement: Optional absolute rate difference (B - A).
                When given it decides which variant won; this matters when
                the control rate is 0 and the relative improvement is
                therefore reported as 0.

        Returns:
            Dict with 'decision', 'rationale' and 'action'.
        """
        if total_visitors < 1000:
            return {
                'decision': 'continue',
                'rationale': 'Insufficient data - continue test to reach minimum sample size',
                'action': 'Keep test running'
            }

        if is_significant_95:
            confidence = '95%'
        elif is_significant_90:
            confidence = '90%'
        else:
            return {
                'decision': 'inconclusive',
                'rationale': 'No statistically significant difference detected',
                'action': 'Either keep A or test different hypothesis'
            }

        direction = improvement if absolute_improvement is None else absolute_improvement
        if direction <= 0:
            return {
                'decision': 'keep_a',
                'rationale': f'Variant A performs better with {confidence} confidence',
                'action': 'Keep current version (A)'
            }

        # B won. A relative lift cannot be quoted against a zero-conversion
        # control, so describe the win without a percentage in that case.
        if improvement > 0:
            lift = f'{improvement*100:.1f}% improvement'
        else:
            lift = 'an improvement over a zero-conversion control'

        if is_significant_95:
            return {
                'decision': 'implement_b',
                'rationale': f'Variant B shows {lift} with 95% confidence',
                'action': 'Implement Variant B'
            }
        return {
            'decision': 'implement_b_cautiously',
            'rationale': f'Variant B shows {lift} with 90% confidence',
            'action': 'Consider implementing B, monitor closely'
        }

    def _generate_tracking_recommendations(
        self,
        significance: Dict[str, Any],
        progress: float,
        test_type: str
    ) -> List[str]:
        """Generate recommendations for ongoing test.

        ``test_type`` is accepted for interface compatibility but does not
        influence the recommendations.
        """
        recommendations = []
        if progress < 50:
            recommendations.append(
                f"Test is {progress:.0f}% complete - continue collecting data"
            )
        elif progress < 100:
            # Without this branch the 50-99% range produced no guidance at all.
            recommendations.append(
                f"Test is {progress:.0f}% complete - avoid concluding early, "
                "continue until the required sample size is reached"
            )
        else:
            if significance['statistical_analysis']['is_significant_95']:
                recommendations.append(
                    "Sufficient data collected with significant results - ready to conclude test"
                )
            else:
                recommendations.append(
                    "Sample size reached but no significant difference - consider extending test or concluding"
                )
        return recommendations

    def _determine_next_steps(
        self,
        significance: Dict[str, Any],
        progress: float
    ) -> str:
        """Determine next steps for test.

        Every decision emitted by ``_generate_test_decision`` has its own
        message; anything else is reported as inconclusive.
        """
        if progress < 100:
            return f"Continue test until reaching 100% sample size (currently {progress:.0f}%)"

        decision = significance.get('decision', {}).get('decision', 'inconclusive')
        if decision == 'implement_b':
            return "Implement Variant B and monitor metrics for 2 weeks"
        elif decision == 'implement_b_cautiously':
            return "Consider implementing Variant B cautiously and monitor metrics closely (90% confidence only)"
        elif decision == 'keep_a':
            return "Keep Variant A and design new test with different hypothesis"
        elif decision == 'continue':
            return "Continue test - too few total visitors for a reliable decision"
        else:
            return "Test inconclusive - either keep A or design new test"

    def _generate_test_insights(
        self,
        test: Dict[str, Any],
        significance: Dict[str, Any],
        results: Dict[str, Any]
    ) -> List[str]:
        """Generate insights from test results.

        ``results`` is accepted for interface compatibility but is not read.
        """
        insights = []
        improvement = significance['improvement']['relative_percentage']

        if significance['statistical_analysis']['is_significant_95']:
            insights.append(
                f"Strong evidence: Variant B {'improved' if improvement > 0 else 'decreased'} "
                f"conversion by {abs(improvement):.1f}% with 95% confidence"
            )

        insights.append(
            f"Tested {test['test_type']} changes: {test['hypothesis']}"
        )

        # Add context-specific insights
        if test['test_type'] == 'icon' and improvement > 5:
            insights.append(
                "Icon change had substantial impact - visual first impression is critical"
            )
        return insights

    def _create_implementation_plan(
        self,
        test: Dict[str, Any],
        significance: Dict[str, Any]
    ) -> List[Dict[str, str]]:
        """Create implementation plan for winning variant.

        A plan is only produced for an 'implement_b' decision; every other
        outcome yields an empty list.
        """
        plan = []
        if significance.get('decision', {}).get('decision') == 'implement_b':
            plan.append({
                'step': '1. Update store listing',
                'details': f"Replace {test['test_type']} with Variant B across all platforms"
            })
            plan.append({
                'step': '2. Monitor metrics',
                'details': 'Track conversion rate for 2 weeks to confirm sustained improvement'
            })
            plan.append({
                'step': '3. Document learnings',
                'details': 'Record insights for future optimization'
            })
        return plan

    def _extract_learnings(
        self,
        test: Dict[str, Any],
        significance: Dict[str, Any]
    ) -> List[str]:
        """Extract key learnings from test."""
        learnings = []
        improvement = significance['improvement']['relative_percentage']
        learnings.append(
            f"Testing {test['test_type']} can yield {abs(improvement):.1f}% conversion change"
        )
        if test['test_type'] == 'title':
            learnings.append(
                "Title changes affect search visibility and user perception"
            )
        elif test['test_type'] == 'screenshot':
            learnings.append(
                "First 2-3 screenshots are critical for conversion"
            )
        return learnings
def plan_ab_test(
    test_type: str,
    variant_a: Dict[str, Any],
    variant_b: Dict[str, Any],
    hypothesis: str,
    baseline_conversion: float
) -> Dict[str, Any]:
    """
    Plan an A/B test in one call: design it, then size it.

    A fresh ABTestPlanner is created for the call. The sample size is
    computed for the test type's entry in MIN_EFFECT_SIZES (0.05 when the
    type is not listed), using the planner's default confidence level and
    power.

    Args:
        test_type: Type of test
        variant_a: Control variant
        variant_b: Test variant
        hypothesis: Test hypothesis
        baseline_conversion: Current conversion rate

    Returns:
        Dict with the test design under 'test_design' and the sample size
        calculation under 'sample_size_requirements'
    """
    ab_planner = ABTestPlanner()

    # Step 1: register the design with the planner.
    design = ab_planner.design_test(
        test_type=test_type,
        variant_a=variant_a,
        variant_b=variant_b,
        hypothesis=hypothesis,
    )

    # Step 2: size the test for the effect expected from this kind of change.
    effect_size = ab_planner.MIN_EFFECT_SIZES.get(test_type, 0.05)
    requirements = ab_planner.calculate_sample_size(
        baseline_conversion=baseline_conversion,
        minimum_detectable_effect=effect_size,
    )

    plan: Dict[str, Any] = {}
    plan['test_design'] = design
    plan['sample_size_requirements'] = requirements
    return plan