Build linear and logistic regression models, interpret coefficients, and understand what makes a regression result meaningful vs spurious.
Regression is the workhorse of data analysis. It answers the fundamental question: what is the relationship between this variable and that outcome? Linear regression predicts a continuous outcome. Logistic regression predicts the probability of a binary outcome. Together they cover the large majority of real-world prediction problems.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
import statsmodels.api as sm
# --- Linear regression: predicting house prices ---
# Synthetic data with a known generating process, so the recovered
# coefficients can be checked against the ground truth:
#   price = 100 + 0.15*sqft + 20*bedrooms - 1.5*age + noise   (price in $K)
np.random.seed(42)
n = 500
sqft = np.random.uniform(800, 4000, n)
bedrooms = np.random.randint(1, 6, n)
age = np.random.uniform(0, 50, n)
price = 100 + 0.15 * sqft + 20 * bedrooms - 1.5 * age + np.random.normal(0, 30, n)

X = np.column_stack([sqft, bedrooms, age])
y = price

# Hold out 20% for evaluation; fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(f"R²: {r2_score(y_test, y_pred):.3f}")
print(f"MAE: ${mean_absolute_error(y_test, y_pred):.0f}K")
print(f"Coefficients: sqft={model.coef_[0]:.2f}, beds={model.coef_[1]:.2f}, age={model.coef_[2]:.2f}")

# The coefficient for sqft (about 0.15) means: holding all else constant, each
# additional square foot is associated with a ~$150 increase in price (price is
# measured in $K). This "all else constant" assumption is critical — and it's
# where most misinterpretations happen.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# --- Logistic regression: predicting customer churn ---
# Churn is simulated from a known logistic model, so the fitted coefficients
# can be sanity-checked: higher monthly charge and more support calls increase
# churn risk, while longer tenure decreases it.
np.random.seed(42)
n = 1000
tenure = np.random.uniform(1, 60, n)
monthly_charge = np.random.uniform(20, 120, n)
support_calls = np.random.poisson(2, n)
churn_prob = 1 / (1 + np.exp(-(-2 + 0.05 * monthly_charge - 0.03 * tenure + 0.3 * support_calls)))
churn = np.random.binomial(1, churn_prob)

X = np.column_stack([tenure, monthly_charge, support_calls])
# Fix: the original split had no random_state, so results changed on every run
# (inconsistent with the linear-regression example above). stratify=churn keeps
# the churn rate the same in the train and test sets — important because the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, churn, test_size=0.2, random_state=42, stratify=churn
)

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]  # P(churn=1) for each test customer

print(classification_report(y_test, y_pred))
print(f"AUC-ROC: {roc_auc_score(y_test, y_prob):.3f}")