from pyspark.sql import SparkSession
import pyspark.sql.functions as F


def main():
    """
    FIXED: Applying recommendations from spill job analysis.

    Original issues:
    - Low executor memory (512m)
    - Low memory fraction (0.1)
    - Large string creation causing memory pressure
    - orderBy on large data
    """
    spark = SparkSession.builder \
        .appName("SpillJobFixed") \
        .config("spark.executor.memory", "2g") \
        .config("spark.memory.fraction", "0.6") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .getOrCreate()
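
    # Optional sanity check (a sketch, not part of the original fix): read the
    # memory settings back from the live session. spark.conf.get() accepts a
    # default, so this does not fail if a deployment drops or overrides the key.
    print("executor memory:", spark.conf.get("spark.executor.memory", "not set"))
    print("memory fraction:", spark.conf.get("spark.memory.fraction", "not set"))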

    # FIXED: Reduce string size or use a more efficient approach.
    # Instead of 1000 chars per row, use 100.
    df = spark.range(0, 2000000).withColumn("str", F.expr("repeat('a', 100)"))
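    # Rough sizing, as an estimate only: 2,000,000 rows x ~100 bytes of string payload
    # is on the order of 200 MB, versus roughly 2 GB with the original 1000-character
    # strings, so the sort now fits far more comfortably within the 2g executors.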

    # FIXED: If a full sort is not needed, use limit.
    # Also increase shuffle partitions to reduce per-partition sort size.
    spark.conf.set("spark.sql.shuffle.partitions", str(spark.sparkContext.defaultParallelism * 2))
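    # Reasoning (hedged; exact numbers depend on your cluster): more shuffle partitions
    # means each sort task holds a smaller slice of the data in execution memory, which
    # lowers the chance of spilling to disk. The 2x defaultParallelism value here is a
    # starting point, not a tuned number.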

    # If only the top N rows are needed:
    top_n = df.orderBy("str").limit(1000)
    print(f"Top N count: {top_n.count()}")

    # If a full sort is needed, it is now more efficient with the improved config:
    # sorted_df = df.orderBy("str")
    # print(f"Count: {sorted_df.count()}")

    spark.stop()


if __name__ == "__main__":
    main()