When data exceeds memory, you need a distributed engine. PySpark runs Spark on your laptop for learning, but scales to hundreds of nodes in the cloud. Today you will process data using Spark DataFrames.
pip install pyspark
# or use Databricks Community Edition (free)
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, DateType
# Create (or reuse) a local SparkSession. 'local[*]' runs Spark in-process
# using all available cores; replace the master with a cluster URL when
# deploying. Parenthesized chaining replaces backslash continuations (PEP 8).
spark = (
    SparkSession.builder
    .appName('SalesAnalysis')
    .master('local[*]')
    .getOrCreate()
)

# Read CSV with schema inference. NOTE: inferSchema=True makes Spark scan the
# file an extra time to guess types; for production, pass an explicit
# StructType schema (the types are already imported above for that purpose).
df = spark.read.csv('sales.csv', header=True, inferSchema=True)
df.printSchema()   # inspect the inferred column types
df.show(5)         # action: materializes the first 5 rows
# DataFrame transformations (lazy — nothing runs until an action).
# Built as named intermediate steps rather than one long chain.

# Keep only completed orders.
completed_orders = df.filter(F.col('status') == 'completed')

# Derive per-row revenue and the calendar month of each order.
# NOTE(review): F.month() assumes 'order_date' was inferred as a date or
# timestamp type — confirm against the printed schema.
enriched = (
    completed_orders
    .withColumn('revenue', F.col('quantity') * F.col('unit_price'))
    .withColumn('month', F.month('order_date'))
)

# Aggregate revenue per (month, category), best sellers first within a month.
result = (
    enriched
    .groupBy('month', 'category')
    .agg(
        F.sum('revenue').alias('total_revenue'),
        F.count('*').alias('order_count'),
        F.avg('revenue').alias('avg_order_value'),
    )
    .orderBy('month', F.desc('total_revenue'))
)

# Action — triggers execution of the whole lazy plan above.
result.show()

# Write to Parquet (columnar, compressed, fast).
result.write.mode('overwrite').parquet('output/monthly_revenue/')
# Window functions in Spark.
from pyspark.sql.window import Window

# Per-customer window, rows ordered by order date.
window = Window.partitionBy('customer_id').orderBy('order_date')

# BUG FIX 1: 'revenue' exists only on the aggregated result, not on df —
# derive it here before summing it over the window (otherwise Spark raises
# an AnalysisException for the unresolved column).
ranked = (
    df
    .withColumn('revenue', F.col('quantity') * F.col('unit_price'))
    .withColumn('order_rank', F.row_number().over(window))
    .withColumn('running_total', F.sum('revenue').over(window))
)

# BUG FIX 2: stop the session only after all DataFrame work — the original
# called spark.stop() before the window operations, which would fail on a
# stopped SparkSession.
spark.stop()