ETL is the backbone of data engineering: you extract data from a source, transform it into the right shape, and load it into a destination. Today you will build a Python ETL pipeline that reads a CSV file, cleans the data, and writes the results to a SQLite database.
# etl_pipeline.py
import csv
import sqlite3
from datetime import datetime
from pathlib import Path
# Extract
def extract(filepath: str) -> list[dict]:
    """Read every row of the CSV at *filepath* as a column-name -> value dict."""
    with open(filepath, newline='', encoding='utf-8') as source:
        reader = csv.DictReader(source)
        return [row for row in reader]
# Transform
def transform(records: list[dict]) -> list[dict]:
    """Clean and normalize raw CSV rows.

    Rows missing a name or email, or carrying an unparseable signup date,
    are skipped rather than aborting the whole batch.

    Returns dicts with keys: 'name' (title-cased), 'email' (lower-cased),
    'signup_date' (datetime.date), 'active' (bool; True unless a 'status'
    value other than 'active' is present).
    """
    default_date = '2026-01-01'
    clean = []
    for r in records:
        # Skip rows with missing required fields
        if not r.get('email') or not r.get('name'):
            continue
        # `or` (not a .get default) so that an *empty* 'date' value falls
        # back to the default too — dict.get only defaults on a missing key.
        raw_date = (r.get('date') or default_date).strip()
        try:
            signup = datetime.strptime(raw_date, '%Y-%m-%d').date()
        except ValueError:
            # Malformed date: drop this row instead of crashing the pipeline.
            continue
        clean.append({
            'name': r['name'].strip().title(),
            'email': r['email'].strip().lower(),
            'signup_date': signup,
            'active': r.get('status', 'active') == 'active',
        })
    return clean
# Load
def load(records: list[dict], db_path: str) -> int:
    """Insert cleaned records into the `users` table of the SQLite DB at *db_path*.

    Creates the table on first use. Duplicate emails are silently skipped
    (the column has a UNIQUE constraint). Returns the number of rows
    actually inserted by this call.
    """
    # Convert values explicitly: relying on sqlite3's implicit date adapter
    # is deprecated since Python 3.12, and booleans are stored as 0/1 ints.
    rows = [{
        'name': r['name'],
        'email': r['email'],
        'signup_date': str(r['signup_date']),  # date -> 'YYYY-MM-DD'; no-op for str
        'active': int(bool(r['active'])),
    } for r in records]
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        cur.execute('''CREATE TABLE IF NOT EXISTS users (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT, email TEXT UNIQUE, signup_date TEXT, active INTEGER
        )''')
        cur.executemany('''INSERT OR IGNORE INTO users
            (name, email, signup_date, active) VALUES
            (:name, :email, :signup_date, :active)''', rows)
        conn.commit()
        # executemany sums per-row change counts; rows skipped by OR IGNORE
        # contribute 0, so this is the count of rows actually inserted.
        return cur.rowcount
    finally:
        conn.close()  # always release the connection, even on error
if __name__ == '__main__':
    # Run the full pipeline: CSV source -> cleaned rows -> SQLite warehouse.
    raw_rows = extract('users.csv')
    cleaned = transform(raw_rows)
    inserted = load(cleaned, 'warehouse.db')
    print(f'Loaded {inserted} new records')