We provide all the information about MCP servers via our MCP API. For example:
curl -X GET 'https://glama.ai/api/mcp/v1/servers/ravipesala/spark_mcp_optimizer'
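The same endpoint can also be queried from Python. The snippet below is a minimal sketch, assuming the endpoint returns JSON and that no authentication is needed for public server listings; it only prints the raw response rather than any particular fields.

# Minimal sketch: fetch the server metadata from the MCP API.
# Assumes the endpoint returns JSON and the `requests` package is installed.
import requests

response = requests.get(
    "https://glama.ai/api/mcp/v1/servers/ravipesala/spark_mcp_optimizer",
    timeout=10,
)
response.raise_for_status()
server_info = response.json()
print(server_info)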
If you have feedback or need assistance with the MCP directory API, please join our Discord server.
job_small_files.py:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


def main():
    spark = SparkSession.builder \
        .appName("SmallFilesJob") \
        .config("spark.sql.files.maxPartitionBytes", "128k") \
        .getOrCreate()

    # Simulate reading many small files by creating many small partitions.
    # This causes the small-file explosion problem.
    df = spark.range(100000).repartition(1000)  # 1000 tiny partitions

    # Write out as many small files
    output_path = "/tmp/small_files_output"
    df.write.mode("overwrite").parquet(output_path)

    # Read back - will create many tasks
    df_read = spark.read.parquet(output_path)

    # Simple aggregation - inefficient due to many small partitions
    result = df_read.groupBy(F.col("id") % 10).count()
    result.show()

    spark.stop()


if __name__ == "__main__":
    main()
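For contrast, a common mitigation for a job like this is to compact the data into a small number of partitions before writing. The sketch below is illustrative only and not part of the repository; it assumes the same generated data and simply uses coalesce() to reduce the ~1000 tiny partitions to a handful of larger output files.

# Illustrative sketch (assumption, not the project's code): compacting output
# with coalesce() so Parquet files are fewer and larger.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SmallFilesFix").getOrCreate()

df = spark.range(100000).repartition(1000)  # same skewed layout as above
df.coalesce(8).write.mode("overwrite").parquet("/tmp/compacted_output")  # ~8 larger files

spark.stop()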