from pyspark.sql import SparkSession
import random


def main():
    spark = SparkSession.builder.appName("InefficientJob").getOrCreate()
    sc = spark.sparkContext
    # Create skewed data: key 0 appears roughly 90% of the time
    data = []
    for _ in range(10000):
        if random.random() < 0.9:
            data.append((0, "skewed"))
        else:
            data.append((random.randint(1, 100), "normal"))
    rdd = sc.parallelize(data, 10)
    # Inefficient: groupByKey ships every value for a key to a single reducer
    # before anything is combined, so it should be flagged by our parser/recommender.
    grouped = rdd.groupByKey().mapValues(list).collect()
    print(f"Grouped count: {len(grouped)}")
    # Forcing a spill is unlikely with a dataset this small, even with low executor
    # memory, so we rely on code analysis to flag groupByKey and on skew analysis
    # to surface the hot key 0 if it is visible in the metrics.
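    # Sketch of one way to mitigate the skew on key 0 (illustration only; this demo
    # deliberately keeps the skewed groupByKey above): salt the key so the hot key's
    # records spread across several reducers, aggregate the salted keys, then strip
    # the salt and combine. SALT_BUCKETS is an assumed illustrative constant.
    SALT_BUCKETS = 8
    salted_counts = (
        rdd.map(lambda kv: ((kv[0], random.randint(0, SALT_BUCKETS - 1)), 1))
        .reduceByKey(lambda a, b: a + b)            # partial counts per salted key
        .map(lambda kv: (kv[0][0], kv[1]))          # drop the salt
        .reduceByKey(lambda a, b: a + b)            # combine partials per real key
        .collect()
    )
    print(f"Distinct keys after salted aggregation: {len(salted_counts)}")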
    spark.stop()


if __name__ == "__main__":
    main()