vorobuev
5/8/2020 - 8:08 AM

Spark Partitions

// This helps to right-size the output files of an existing dataset
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SaveMode

// when looking up the directory in the Hive warehouse, use the lower-case path
val path = new Path("/apps/hive/warehouse/rpdc_2630_gen.db/hdim_truncated")
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
val dirSize = fs.getContentSummary(path).getLength
// aim for roughly 512 MB per file; round up and keep at least one file,
// otherwise integer division yields 0 for directories smaller than 512 MB
val fileNum = math.max(1, math.ceil(dirSize.toDouble / (512L * 1024 * 1024)).toInt)
val df = spark.table("rpdc_2630_gen.HDIM_TRUNCATED")
df.coalesce(fileNum)
  .write
  .mode(SaveMode.Append)
  .format("orc")
  .saveAsTable("rpdc_2630_gen.RPOANMINFO_HDIM")
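
A minimal sketch that wraps the same idea into a reusable helper. The function name writeWithTargetFileSize, its parameters, and the 512 MB default are assumptions for illustration, not part of the original note. It also handles the case coalesce cannot: coalesce only merges partitions, so if more output files are needed than the DataFrame currently has partitions, repartition is used instead.

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.{SaveMode, SparkSession}

// Hypothetical helper: estimate the number of output files from the size of the
// source directory on HDFS, then append the table with that many partitions.
def writeWithTargetFileSize(spark: SparkSession,
                            sourceDir: String,
                            sourceTable: String,
                            targetTable: String,
                            targetFileBytes: Long = 512L * 1024 * 1024): Unit = {
  val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
  val dirSize = fs.getContentSummary(new Path(sourceDir)).getLength
  // round up and keep at least one output file
  val fileNum = math.max(1, math.ceil(dirSize.toDouble / targetFileBytes).toInt)

  val df = spark.table(sourceTable)
  // coalesce only reduces the partition count; to split into more files than
  // the current partition count, a full repartition (with shuffle) is needed
  val sized =
    if (fileNum < df.rdd.getNumPartitions) df.coalesce(fileNum)
    else df.repartition(fileNum)

  sized.write.mode(SaveMode.Append).format("orc").saveAsTable(targetTable)
}

// Example call using the tables from the note above:
// writeWithTargetFileSize(spark,
//   "/apps/hive/warehouse/rpdc_2630_gen.db/hdim_truncated",
//   "rpdc_2630_gen.HDIM_TRUNCATED",
//   "rpdc_2630_gen.RPOANMINFO_HDIM")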