Quotes

Friday, November 13, 2020

Hive and Spark/Scala snippets: comparing two ORC datasets column-by-column, and dropping a Hive table partition

 import org.apache.spark.sql.{Column, SaveMode, SparkSession}

 val df1=spark.read.format("orc").option("header", "true").option("delimiter","|").load("s3a:

val df2=spark.read.format("orc").option("header", "true").option("delimiter","|").load("s3a:


val dropColLst ="insert_datetime_utc,extract_datetime"

val dropColumns = dropColLst.split(",")


 val leftTableFilteredDF = df1.select(df1.columns.filter(colName => !dropColumns.contains(colName)).map(colName => new Column(colName)): _*)

 val rightTableFilteredDF = df2.select(df2.columns.filter(colName => !dropColumns.contains(colName)).map(colName => new Column(colName)): _*)

 val df12 = leftTableFilteredDF.except(rightTableFilteredDF)

val df21 = rightTableFilteredDF.except(leftTableFilteredDF)


    export SPARK_MAJOR_VERSION=2

-- Inspect existing partitions before dropping anything.
SHOW PARTITIONS schema.tablename;

-- Drop one monthly partition; IF EXISTS makes this a no-op (no error) when absent.
ALTER TABLE schema.tablename DROP IF EXISTS PARTITION(partition_by_month_id=202010);