from pyspark.sql import SparkSession
import pyspark.sql.functions as F
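import json
import urllib.request


# Hedged sketch, not part of the original job: query Spark's monitoring
# REST API while the session is still alive to confirm the sort stage
# actually spilled. Assumes the UI sits at the default localhost:4040;
# memoryBytesSpilled and diskBytesSpilled are standard fields in the
# per-stage REST payload.
def print_spill_metrics(spark):
    app_id = spark.sparkContext.applicationId
    url = f"http://localhost:4040/api/v1/applications/{app_id}/stages"
    with urllib.request.urlopen(url) as resp:
        stages = json.load(resp)
    for stage in stages:
        if stage["memoryBytesSpilled"] or stage["diskBytesSpilled"]:
            print(f"Stage {stage['stageId']}: "
                  f"spilled {stage['memoryBytesSpilled']} bytes in memory, "
                  f"{stage['diskBytesSpilled']} bytes to disk")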


def main():
    spark = SparkSession.builder \
        .appName("SpillJob") \
        .config("spark.executor.memory", "512m") \
        .config("spark.memory.fraction", "0.1") \
        .getOrCreate()

    # Generate enough data to force a spill under the low memory cap:
    # 2 million rows, each carrying a 1000-character string (~2 GB of raw
    # string data), versus roughly 0.1 * (512 MB - 300 MB reserved) ~= 21 MB
    # of unified execution/storage memory.
    df = spark.range(0, 2000000).withColumn("str", F.expr("repeat('a', 1000)"))

    # The sort forces a full shuffle; with so little execution memory, the
    # external sorter should spill to disk.
    sorted_df = df.orderBy("str")

    # A bare count() can let the optimizer drop the orderBy as redundant
    # (EliminateSorts), so materialize the sorted rows through the built-in
    # no-op sink (Spark 3.0+), then count the base range (same row count)
    # without re-running the sort.
    sorted_df.write.format("noop").mode("overwrite").save()
    print(f"Count: {df.count()}")

    spark.stop()

if __name__ == "__main__":
    main()
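
# To run (the file name spill_job.py is an assumption):
#   spark-submit spill_job.py
# Note that in local mode the executor shares the driver JVM, so the
# spark.executor.memory setting above only bites on a real cluster; for a
# local run, cap the heap with spark-submit --driver-memory 512m instead.
# A spill shows up in the Spark UI stage page as nonzero "Spill (Memory)"
# and "Spill (Disk)" columns.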