from pyspark.sql import SparkSession
import pyspark.sql.functions as F

def main():
    spark = SparkSession.builder.appName("SkewedJob").getOrCreate()

    # Create skewed data: 100,000 rows share key 1 and 10,000 rows get
    # distinct keys 0..9999, so roughly 91% of rows are skewed to key=1.
    large_skew_data = []
    for _ in range(100000):
        large_skew_data.append((1, "skewed"))
    for i in range(10000):
        large_skew_data.append((i, "uniform"))

    df = spark.createDataFrame(large_skew_data, ["id", "val"]).repartition(10)

    # Force a shuffle with skew: every row with key 1 hashes to the same
    # shuffle partition, so that one task takes much longer than the rest.
    grouped = df.groupBy("id").count()
    grouped.collect()
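
    # Optional diagnostic (a sketch, not part of the original job): count rows
    # per shuffle partition after hash-partitioning on "id" to make the skew
    # visible; spark_partition_id() reports the partition each row landed in.
    sizes = (df.repartition("id")
               .withColumn("part", F.spark_partition_id())
               .groupBy("part").count()
               .orderBy(F.desc("count")))
    sizes.show(5)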

    print("Skew job finished")
    spark.stop()
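

def salted_groupby_sketch(df):
    """Hedged sketch of one common skew mitigation: key salting.

    Not part of the original job; SALT_BUCKETS and the two-stage aggregation
    are illustrative assumptions. Appending a random salt to the hot key
    spreads its rows across several shuffle partitions; a first aggregation
    per (id, salt) does the heavy lifting in parallel, and a second
    aggregation without the salt combines the partial counts.

    Usage (illustrative): salted_groupby_sketch(df).collect()
    """
    SALT_BUCKETS = 8  # assumed bucket count; tune to the observed skew
    salted = df.withColumn("salt", (F.rand() * SALT_BUCKETS).cast("int"))
    partial = salted.groupBy("id", "salt").count()
    return partial.groupBy("id").agg(F.sum("count").alias("count"))
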
if __name__ == "__main__":
    main()