job_groupbykey.py
from pyspark.sql import SparkSession


def main():
    """
    BAD: Using groupByKey instead of reduceByKey/aggregateByKey.
    groupByKey shuffles every value for every key across the network,
    which is very inefficient.
    """
    spark = SparkSession.builder.appName("GroupByKeyJob").getOrCreate()

    # Create an RDD of (key, value) pairs: 100 distinct keys, 100,000 values
    data = [(i % 100, i) for i in range(100000)]
    rdd = spark.sparkContext.parallelize(data)

    # BAD: groupByKey shuffles ALL values.
    # Every value for each key is sent across the network before summing.
    grouped = rdd.groupByKey()
    sums = grouped.mapValues(lambda values: sum(values))

    result = sums.collect()
    print(f"Computed {len(result)} sums")

    spark.stop()


if __name__ == "__main__":
    main()
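For comparison, here is a minimal sketch of the efficient version the docstring alludes to. With reduceByKey, Spark combines values on each partition before the shuffle, so only one partial sum per key per partition crosses the network instead of every raw value. The app name "ReduceByKeyJob" and the file structure below are illustrative and not part of the original repo.

from pyspark.sql import SparkSession


def main():
    """
    GOOD: reduceByKey performs map-side combining, so only partial
    sums (one per key per partition) are shuffled across the network.
    """
    spark = SparkSession.builder.appName("ReduceByKeyJob").getOrCreate()

    # Same data shape as the groupByKey job above
    data = [(i % 100, i) for i in range(100000)]
    rdd = spark.sparkContext.parallelize(data)

    # GOOD: partial sums are computed locally on each partition,
    # then merged after a much smaller shuffle
    sums = rdd.reduceByKey(lambda a, b: a + b)

    result = sums.collect()
    print(f"Computed {len(result)} sums")

    spark.stop()


if __name__ == "__main__":
    main()

Both jobs produce identical results; the difference is purely in shuffle volume, which is exactly the kind of pattern an optimizer can flag.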