Understand null hypotheses, p-values, t-tests, and how to run rigorous A/B tests that produce trustworthy conclusions.
A/B tests. Drug trials. Feature experiments. They all use hypothesis testing. The framework is simple: state what you'd expect by chance (the null hypothesis), measure what actually happened, and calculate how surprised you should be (the p-value: the probability of seeing a result at least this extreme if the null hypothesis were true).
import numpy as np
from scipy import stats
from statsmodels.stats.power import TTestIndPower
# Simulated A/B test on order value: generate two groups, compare their means
# with a t-test, and report the standardized effect size.

# Seed NumPy's global RNG so the simulated data (and every number printed
# below) is reproducible from run to run.
np.random.seed(42)

# Control group: old checkout flow (200 orders drawn around a $50 average).
control = np.random.normal(loc=50, scale=15, size=200)
# Treatment group: new checkout flow (200 orders drawn around a $54 average).
treatment = np.random.normal(loc=54, scale=15, size=200)

# Two-sample t-test for independent groups. equal_var=False selects Welch's
# variant, which does not assume the two groups share a variance.
# NOTE: control is passed first, so t_stat is negative when the treatment
# mean is higher -- the opposite sign of the "Difference" printed below.
t_stat, p_value = stats.ttest_ind(control, treatment, equal_var=False)

mean_diff = treatment.mean() - control.mean()
print(f"Control mean: ${control.mean():.2f}")
print(f"Treatment mean: ${treatment.mean():.2f}")
print(f"Difference: ${mean_diff:.2f}")
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.4f}")
print(f"Significant: {p_value < 0.05}")

# Effect size (Cohen's d): mean difference in units of the pooled standard
# deviation. Use the unbiased sample variances (ddof=1); NumPy's default
# ddof=0 is the population formula and slightly overstates d. Averaging the
# two variances equals the pooled variance only because both groups have the
# same size here.
pooled_std = np.sqrt((control.var(ddof=1) + treatment.var(ddof=1)) / 2)
cohens_d = mean_diff / pooled_std
print(f"Cohen's d: {cohens_d:.3f}")  # 0.2=small, 0.5=medium, 0.8=large
# Power analysis: how many samples do I need (per group) to detect a
# meaningful effect with an independent two-sample t-test?
analysis = TTestIndPower()
# solve_power solves for whichever parameter is left unspecified -- here the
# per-group sample size, given the effect size, alpha, and power.
n = analysis.solve_power(
    effect_size=0.3,  # minimum detectable effect (Cohen's d)
    alpha=0.05,  # significance level
    power=0.80,  # 80% chance of detecting the effect if real
)
# Always round UP: a fractional requirement rounded to nearest (e.g. 175.4
# -> 175) would leave the test slightly under the target power.
print(f"Required sample size per group: {int(np.ceil(n))}")