package examples
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import scala.util.Random
/**
 * Demonstrates a data-skew-prone shuffle join in Spark SQL.
 *
 * Builds a left dataset where ~90% of the rows share a single hot key
 * (`skewedKey`) and joins it against a small reference table without any
 * mitigation (no salting, no broadcast hint), so the skew is visible in the
 * resulting stage/task metrics.
 */
object ExSkewJoin {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("ExSkewJoin_Scala")
      .getOrCreate()
    import spark.implicits._

    // Left side: 100k rows; ~90% of them land on the single hot key so one
    // join partition receives the bulk of the data.
    val skewedKey = 1
    val leftData = (0 until 100000).map { i =>
      if (Random.nextDouble() < 0.9) (skewedKey, s"data_$i")
      else (Random.nextInt(100), s"data_$i") // remaining 10% spread over keys 0-99
    }.toDF("id", "value")

    // Right side: small reference table with one row per key 0-99.
    val rightData = (0 until 100).map(i => (i, s"ref_$i")).toDF("id", "ref_value")

    // Deliberately inefficient join: plain equi-join forces a shuffle on `id`
    // with no salting or broadcast, so the hot key triggers skew at execution.
    // `count()` is the action that materializes the join.
    leftData.join(rightData, "id").count()

    spark.stop()
  }
}