from pyspark.sql import SparkSession
import pyspark.sql.functions as F


def main():
    """
    FIXED: use reduceByKey instead of groupByKey.
    reduceByKey combines values locally (a map-side combine) before shuffling,
    so only the reduced partial sums cross the network.
    """
    spark = SparkSession.builder.appName("ReduceByKeyJob").getOrCreate()

    # Create an RDD of (key, value) pairs: 100 distinct keys, 100,000 values.
    data = [(i % 100, i) for i in range(100000)]
    rdd = spark.sparkContext.parallelize(data)

    # FIXED: reduceByKey combines values locally first, so only the reduced
    # partial sums are sent across the network.
    sums = rdd.reduceByKey(lambda a, b: a + b)
    result = sums.collect()
    print(f"Computed {len(result)} sums")
    # ALTERNATIVE: the DataFrame API (usually even better). Catalyst plans the
    # partial (map-side) aggregation automatically, and the built-in F.sum avoids
    # shipping a Python lambda to the executors.
    df = spark.createDataFrame(data, ["key", "value"])
    df_sums = df.groupBy("key").agg(F.sum("value").alias("sum"))
    df_result = df_sums.collect()
    print(f"DataFrame computed {len(df_result)} sums")
    spark.stop()


if __name__ == "__main__":
    main()