From c0ec591925d3b6002274edec8576d4f14a6dabd1 Mon Sep 17 00:00:00 2001 From: BInwei Yang Date: Mon, 19 Sep 2022 23:36:57 -0700 Subject: [PATCH] [OPPRO-368] Add parquet support in readme. Remove dwrf (#381) --- backends-velox/workload/tpch/run_tpch/tpch_parquet.scala | 2 +- docs/Velox.md | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/backends-velox/workload/tpch/run_tpch/tpch_parquet.scala b/backends-velox/workload/tpch/run_tpch/tpch_parquet.scala index 067e1a2afcda..82903b5c5cb7 100644 --- a/backends-velox/workload/tpch/run_tpch/tpch_parquet.scala +++ b/backends-velox/workload/tpch/run_tpch/tpch_parquet.scala @@ -5,7 +5,7 @@ import java.util.Arrays import sys.process._ //Configurations: -var parquet_file_path = "/PATH/TO/TPCH_DWRF_PATH" +var parquet_file_path = "/PATH/TO/TPCH_PARQUET_PATH" var gluten_root = "/PATH/TO/GLUTEN" def time[R](block: => R): R = { diff --git a/docs/Velox.md b/docs/Velox.md index 50d0c844db87..ca2b1bc30e8f 100644 --- a/docs/Velox.md +++ b/docs/Velox.md @@ -39,23 +39,21 @@ In Gluten, all 22 queries can be fully offloaded into Velox for computing. ### Data preparation -Parquet format still have performance issue in Velox. We use dwrf format instead. Refer to [Test TPCH on Velox backend](../backends-velox/workload/tpch/README.md) for How to convert parquet to dwrf format during data generation. - Considering current Velox does not fully support Decimal and Date data type, the [datagen script](../backends-velox/workload/tpch/gen_data/parquet_dataset/tpch_datagen_parquet.scala) transforms "Decimal-to-Double" and "Date-to-String". As a result, we need to modify the TPCH queries a bit. You can find the [modified TPC-H queries](../backends-velox/workload/tpch/tpch.queries.updated/). ### Submit the Spark SQL job -Submit test script from spark-shell. You can find the scala code to [Run TPC-H](../backends-velox/workload/tpch/run_tpch/tpch_dwrf.scala) as an example. 
Please remember to modify the location of TPC-H files as well as TPC-H queries in backends-velox/workload/tpch/run_tpch/tpch_dwrf.scala before you run the testing. +Submit test script from spark-shell. You can find the scala code to [Run TPC-H](../backends-velox/workload/tpch/run_tpch/tpch_parquet.scala) as an example. Please remember to modify the location of TPC-H files as well as TPC-H queries in backends-velox/workload/tpch/run_tpch/tpch_parquet.scala before you run the testing. ``` -var dwrf_file_path = "/PATH/TO/TPCH_DWRF_PATH" +var parquet_file_path = "/PATH/TO/TPCH_PARQUET_PATH" var gluten_root = "/PATH/TO/GLUTEN" ``` Below script shows an example about how to run the testing, you should modify the parameters such as executor cores, memory, offHeap size based on your environment. ```shell script -cat tpch_dwrf.scala | spark-shell --name tpch_powertest_velox --master yarn --deploy-mode client --conf spark.plugins=io.glutenproject.GlutenPlugin --conf --conf spark.gluten.sql.columnar.backend.lib=velox --conf spark.driver.extraClassPath=${gluten_jvm_jar} --conf spark.executor.extraClassPath=${gluten_jvm_jar} --conf spark.memory.offHeap.size=20g --conf spark.sql.sources.useV1SourceList=avro --num-executors 6 --executor-cores 6 --driver-memory 20g --executor-memory 25g --conf spark.executor.memoryOverhead=5g --conf spark.driver.maxResultSize=32g +cat tpch_parquet.scala | spark-shell --name tpch_powertest_velox --master yarn --deploy-mode client --conf spark.plugins=io.glutenproject.GlutenPlugin --conf spark.gluten.sql.columnar.backend.lib=velox --conf spark.driver.extraClassPath=${gluten_jvm_jar} --conf spark.executor.extraClassPath=${gluten_jvm_jar} --conf spark.memory.offHeap.size=20g --conf spark.sql.sources.useV1SourceList=avro --num-executors 6 --executor-cores 6 --driver-memory 20g --executor-memory 25g --conf spark.executor.memoryOverhead=5g --conf spark.driver.maxResultSize=32g ``` ### Result