from pyspark.sql import SparkSession
import pyspark.sql.functions as F
def main():
spark = SparkSession.builder.appName("MissingPredicateJob").getOrCreate()
# Create two large tables
df1 = spark.range(100000).toDF("id")
df1 = df1.withColumn("value", F.rand() * 100)
df1 = df1.withColumn("category", (F.col("id") % 10).cast("string"))
df2 = spark.range(100000).toDF("id")
df2 = df2.withColumn("amount", F.rand() * 1000)
df2 = df2.withColumn("category", (F.col("id") % 10).cast("string"))
# BAD: Join without filtering first
# Should filter on category before joining
result = df1.join(df2, "id", "inner")
# Filter AFTER join (inefficient)
filtered = result.filter(F.col("category") == "5")
print(f"Count: {filtered.count()}")
spark.stop()
if __name__ == "__main__":
main()