from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, sequence


def main():
    # 2. Shuffle Spill / Memory Pressure
    # Force a small memory budget so the sort below has to spill to disk.
    # Note: spark.executor.memory only takes effect when executors run in
    # their own JVMs; under --master local[*] everything runs in the driver,
    # so cap the heap with --driver-memory instead (see the usage note at
    # the end of this file).
    spark = SparkSession.builder \
        .appName("Ex_Spill_Memory") \
        .config("spark.executor.memory", "512m") \
        .config("spark.memory.fraction", "0.2") \
        .config("spark.sql.shuffle.partitions", "1") \
        .getOrCreate()
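    # Sanity check (optional sketch): confirm the low-memory settings were
    # actually applied; the keys were set on the builder above, so get()
    # will find them on the SparkConf.
    conf = spark.sparkContext.getConf()
    print("spark.memory.fraction =", conf.get("spark.memory.fraction"))
    print("spark.sql.shuffle.partitions =", conf.get("spark.sql.shuffle.partitions"))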
print("Generating Data for Spill...")
# Generate enough data to exceed the small memory buffer
# Each row is small, but we generate millions
df = spark.range(0, 1000000).withColumn("data", explode(sequence(col("id"), col("id") + 10)))
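    # Rough sizing (approximate, assuming the unified memory model): a 512 MB
    # heap leaves (512 MB - ~300 MB reserved) * 0.2, roughly 42 MB, for
    # execution + storage, while ~11M rows of two 8-byte longs are ~170 MB
    # raw, so the sort below cannot complete purely in memory.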
print("Triggering Sort (Spill)...")
# Sorting a large dataset on a single partition with low memory guarantees a disk spill
# This should trigger "Shuffle and Spill Analysis" and "Memory Pressure"
df.sort("data").count()
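    # Where to observe it: the stage page of the Spark UI (localhost:4040
    # while the job runs) shows "Spill (Memory)" / "Spill (Disk)" columns
    # when a spill occurred, and the executor logs emit INFO messages like
    # "spilling sort data of ... to disk" from the external sorter.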
    spark.stop()


if __name__ == "__main__":
    main()
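# Usage sketch (assumes a local Spark 3.x install; the script name is
# illustrative). In local mode the executor shares the driver JVM, so cap
# the driver heap to make the 512 MB budget real:
#
#   spark-submit --master "local[2]" --driver-memory 512m ex_spill_memory.py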