chore: [comet-parquet-exec] Unit test fixes, default scan impl to native_comet #1265

Merged · 19 commits · Jan 13, 2025
7 changes: 6 additions & 1 deletion common/src/main/java/org/apache/comet/parquet/Native.java
@@ -249,7 +249,12 @@ public static native void setPageV2(
* @return a handle to the record batch reader, used in subsequent calls.
*/
public static native long initRecordBatchReader(
String filePath, long fileSize, long start, long length, byte[] requiredSchema);
String filePath,
long fileSize,
long start,
long length,
byte[] requiredSchema,
String sessionTimezone);

// arrow native version of read batch
/**
@@ -346,7 +346,7 @@ public void init() throws URISyntaxException, IOException {

this.handle =
Native.initRecordBatchReader(
filePath, fileSize, start, length, serializedRequestedArrowSchema);
filePath, fileSize, start, length, serializedRequestedArrowSchema, timeZoneId);
isInitialized = true;
}

26 changes: 14 additions & 12 deletions common/src/main/scala/org/apache/comet/CometConf.scala
@@ -77,24 +77,26 @@ object CometConf extends ShimCometConf {
.booleanConf
.createWithDefault(false)

val SCAN_NATIVE = "native"
val SCAN_NATIVE_FULL = "native_full"
val SCAN_NATIVE_RECORDBATCH = "native_recordbatch"
val SCAN_NATIVE_COMET = "native_comet"
val SCAN_NATIVE_DATAFUSION = "native_datafusion"
val SCAN_NATIVE_ICEBERG_COMPAT = "native_iceberg_compat"

val COMET_NATIVE_SCAN_IMPL: ConfigEntry[String] = conf("spark.comet.scan.impl")
.doc(
"The implementation of Comet Native Scan to use. Available modes are 'native'," +
"'native_full', and 'native_recordbatch'. " +
"'native' is for the original Comet native scan which uses a jvm based parquet file " +
"reader and native column decoding. Supports simple types only " +
"'native_full' is a fully native implementation of scan based on DataFusion" +
"'native_recordbatch' is a native implementation that exposes apis to read parquet " +
"columns natively.")
s"The implementation of Comet Native Scan to use. Available modes are '$SCAN_NATIVE_COMET'," +
s"'$SCAN_NATIVE_DATAFUSION', and '$SCAN_NATIVE_ICEBERG_COMPAT'. " +
s"'$SCAN_NATIVE_COMET' is for the original Comet native scan which uses a jvm based " +
"parquet file reader and native column decoding. Supports simple types only " +
s"'$SCAN_NATIVE_DATAFUSION' is a fully native implementation of scan based on DataFusion" +
s"'$SCAN_NATIVE_ICEBERG_COMPAT' is a native implementation that exposes apis to read " +
"parquet columns natively.")
.internal()
.stringConf
.transform(_.toLowerCase(Locale.ROOT))
.checkValues(Set(SCAN_NATIVE, SCAN_NATIVE_FULL, SCAN_NATIVE_RECORDBATCH))
.createWithDefault(sys.env.getOrElse("NATIVE_SCAN_IMPL", SCAN_NATIVE_FULL))
.checkValues(Set(SCAN_NATIVE_COMET, SCAN_NATIVE_DATAFUSION, SCAN_NATIVE_ICEBERG_COMPAT))
.createWithDefault(sys.env
.getOrElse("COMET_PARQUET_SCAN_IMPL", SCAN_NATIVE_COMET)
.toLowerCase(Locale.ROOT))

val COMET_PARQUET_PARALLEL_IO_ENABLED: ConfigEntry[Boolean] =
conf("spark.comet.parquet.read.parallel.io.enabled")
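A rough illustration, not part of this diff: the renamed scan implementation could be selected at runtime through the config key defined above. The Spark session and app name below are hypothetical; only the key and the three accepted values come from CometConf.

import org.apache.spark.sql.SparkSession

// Hypothetical session; the key "spark.comet.scan.impl" and the values
// native_comet, native_datafusion, and native_iceberg_compat come from
// CometConf above. The default is now native_comet, and the
// COMET_PARQUET_SCAN_IMPL environment variable overrides that default at startup.
val spark = SparkSession.builder().appName("comet-scan-impl-demo").getOrCreate()
spark.conf.set("spark.comet.scan.impl", "native_datafusion")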
2 changes: 0 additions & 2 deletions native/Cargo.lock

Some generated files are not rendered by default.

2 changes: 0 additions & 2 deletions native/Cargo.toml
@@ -37,9 +37,7 @@ arrow = { version = "53.2.0", features = ["prettyprint", "ffi", "chrono-tz"] }
arrow-array = { version = "53.2.0" }
arrow-buffer = { version = "53.2.0" }
arrow-data = { version = "53.2.0" }
arrow-ipc = { version = "53.2.0" }
arrow-schema = { version = "53.2.0" }
flatbuffers = { version = "24.3.25" }
parquet = { version = "53.2.0", default-features = false, features = ["experimental"] }
datafusion = { version = "43.0.0", default-features = false, features = ["unicode_expressions", "crypto_expressions", "parquet"] }
datafusion-common = { version = "43.0.0" }
2 changes: 0 additions & 2 deletions native/core/Cargo.toml
@@ -40,8 +40,6 @@ arrow-array = { workspace = true }
arrow-buffer = { workspace = true }
arrow-data = { workspace = true }
arrow-schema = { workspace = true }
arrow-ipc = { workspace = true }
flatbuffers = { workspace = true }
parquet = { workspace = true, default-features = false, features = ["experimental"] }
futures = { workspace = true }
mimalloc = { version = "*", default-features = false, optional = true }
7 changes: 5 additions & 2 deletions native/core/src/execution/planner.rs
@@ -1156,8 +1156,11 @@ impl PhysicalPlanner {
table_parquet_options.global.pushdown_filters = true;
table_parquet_options.global.reorder_filters = true;

let mut spark_parquet_options =
SparkParquetOptions::new(EvalMode::Legacy, "UTC", false);
let mut spark_parquet_options = SparkParquetOptions::new(
EvalMode::Legacy,
scan.session_timezone.as_str(),
false,
);
spark_parquet_options.allow_cast_unsigned_ints = true;

let mut builder = ParquetExecBuilder::new(file_scan_config)
15 changes: 8 additions & 7 deletions native/core/src/parquet/mod.rs
@@ -46,6 +46,7 @@ use self::util::jni::TypePromotionInfo;
use crate::execution::operators::ExecutionError;
use crate::execution::utils::SparkArrowConvert;
use crate::parquet::data_type::AsBytes;
use crate::parquet::parquet_support::SparkParquetOptions;
use crate::parquet::schema_adapter::SparkSchemaAdapterFactory;
use arrow::buffer::{Buffer, MutableBuffer};
use arrow_array::{Array, RecordBatch};
@@ -59,8 +60,6 @@ use datafusion_execution::{SendableRecordBatchStream, TaskContext};
use futures::{poll, StreamExt};
use jni::objects::{JBooleanArray, JByteArray, JLongArray, JPrimitiveArray, JString, ReleaseMode};
use jni::sys::jstring;
use parquet::arrow::arrow_reader::ParquetRecordBatchReader;
use parquet_support::SparkParquetOptions;
use read::ColumnReader;
use util::jni::{convert_column_descriptor, convert_encoding, deserialize_schema, get_file_path};

@@ -608,7 +607,6 @@ enum ParquetReaderState {
struct BatchContext {
runtime: tokio::runtime::Runtime,
batch_stream: Option<SendableRecordBatchStream>,
batch_reader: Option<ParquetRecordBatchReader>,
current_batch: Option<RecordBatch>,
reader_state: ParquetReaderState,
}
@@ -640,14 +638,14 @@ pub unsafe extern "system" fn Java_org_apache_comet_parquet_Native_initRecordBatchReader(
start: jlong,
length: jlong,
required_schema: jbyteArray,
session_timezone: jstring,
) -> jlong {
try_unwrap_or_throw(&e, |mut env| unsafe {
let path: String = env
.get_string(&JString::from_raw(file_path))
.unwrap()
.into();
let batch_stream: Option<SendableRecordBatchStream>;
let batch_reader: Option<ParquetRecordBatchReader> = None;
// TODO: (ARROW NATIVE) Use the common global runtime
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
@@ -681,8 +679,13 @@ pub unsafe extern "system" fn Java_org_apache_comet_parquet_Native_initRecordBatchReader(
// TODO: Maybe these are configs?
table_parquet_options.global.pushdown_filters = true;
table_parquet_options.global.reorder_filters = true;
let session_timezone: String = env
.get_string(&JString::from_raw(session_timezone))
.unwrap()
.into();

let mut spark_parquet_options = SparkParquetOptions::new(EvalMode::Legacy, "UTC", false);
let mut spark_parquet_options =
SparkParquetOptions::new(EvalMode::Legacy, session_timezone.as_str(), false);
spark_parquet_options.allow_cast_unsigned_ints = true;

let builder2 = ParquetExecBuilder::new(file_scan_config)
@@ -704,7 +707,6 @@ pub unsafe extern "system" fn Java_org_apache_comet_parquet_Native_initRecordBatchReader(
let ctx = BatchContext {
runtime,
batch_stream,
batch_reader,
current_batch: None,
reader_state: ParquetReaderState::Init,
};
@@ -725,7 +727,6 @@ pub extern "system" fn Java_org_apache_comet_parquet_Native_readNextRecordBatch(
let batch_stream = context.batch_stream.as_mut().unwrap();
let runtime = &context.runtime;

// let mut stream = batch_stream.as_mut();
loop {
let next_item = batch_stream.next();
let poll_batch: Poll<Option<datafusion_common::Result<RecordBatch>>> =
2 changes: 1 addition & 1 deletion native/core/src/parquet/parquet_support.rs
@@ -818,7 +818,7 @@ fn cast_struct_to_struct(
Ok(Arc::new(StructArray::new(
to_fields.clone(),
cast_fields,
array.nulls().map(|nulls| nulls.clone()),
array.nulls().cloned(),
)))
}
_ => unreachable!(),
1 change: 1 addition & 0 deletions native/proto/src/proto/operator.proto
@@ -90,6 +90,7 @@ message NativeScan {
repeated spark.spark_expression.Expr data_filters = 6;
repeated SparkFilePartition file_partitions = 7;
repeated int64 projection_vector = 8;
string session_timezone = 9;
}

message Projection {
2 changes: 1 addition & 1 deletion native/spark-expr/src/cast.rs
@@ -838,7 +838,7 @@ fn cast_struct_to_struct(
Ok(Arc::new(StructArray::new(
to_fields.clone(),
cast_fields,
array.nulls().map(|nulls| nulls.clone()),
array.nulls().cloned(),
)))
}
_ => unreachable!(),
@@ -206,7 +206,7 @@ class CometSparkSessionExtensions
// here and then it gets replaced with `CometNativeScanExec` in `CometExecRule`
// but that only happens if `COMET_EXEC_ENABLED` is enabled
&& COMET_EXEC_ENABLED.get()
&& COMET_NATIVE_SCAN_IMPL.get() == CometConf.SCAN_NATIVE_FULL =>
&& COMET_NATIVE_SCAN_IMPL.get() == CometConf.SCAN_NATIVE_DATAFUSION =>
logInfo("Comet extension enabled for v1 full native Scan")
CometScanExec(scanExec, session)

@@ -377,7 +377,7 @@ class CometSparkSessionExtensions
plan.transformUp {
// Fully native scan for V1
case scan: CometScanExec
if COMET_NATIVE_SCAN_IMPL.get.equals(CometConf.SCAN_NATIVE_FULL) =>
if COMET_NATIVE_SCAN_IMPL.get().equals(CometConf.SCAN_NATIVE_DATAFUSION) =>
val nativeOp = QueryPlanSerde.operator2Proto(scan).get
CometNativeScanExec(nativeOp, scan.wrapped, scan.session)

@@ -101,7 +101,7 @@ class CometParquetFileFormat extends ParquetFileFormat with MetricsSupport with
// Comet specific configurations
val capacity = CometConf.COMET_BATCH_SIZE.get(sqlConf)
val nativeRecordBatchReaderEnabled =
CometConf.COMET_NATIVE_SCAN_IMPL.get(sqlConf).equals(CometConf.SCAN_NATIVE_RECORDBATCH)
CometConf.COMET_NATIVE_SCAN_IMPL.get(sqlConf).equals(CometConf.SCAN_NATIVE_ICEBERG_COMPAT)

(file: PartitionedFile) => {
val sharedConf = broadcastedHadoopConf.value.value
@@ -2517,7 +2517,7 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde with CometExprShim

// Fully native scan for V1
case scan: CometScanExec
if CometConf.COMET_NATIVE_SCAN_IMPL.get(conf) == CometConf.SCAN_NATIVE_FULL =>
if CometConf.COMET_NATIVE_SCAN_IMPL.get(conf) == CometConf.SCAN_NATIVE_DATAFUSION =>
val nativeScanBuilder = OperatorOuterClass.NativeScan.newBuilder()
nativeScanBuilder.setSource(op.simpleStringWithNodeId())

@@ -2578,6 +2578,7 @@
nativeScanBuilder.addAllDataSchema(dataSchema.toIterable.asJava)
nativeScanBuilder.addAllRequiredSchema(requiredSchema.toIterable.asJava)
nativeScanBuilder.addAllPartitionSchema(partitionSchema.toIterable.asJava)
nativeScanBuilder.setSessionTimezone(conf.getConfString("spark.sql.session.timeZone"))

Some(result.setNativeScan(nativeScanBuilder).build())

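A rough illustration, not part of this diff: the only fact taken from the change above is that QueryPlanSerde now copies the value of "spark.sql.session.timeZone" into the NativeScan message, so the native DataFusion reader interprets timestamps in the same zone as the JVM session. The session and the Parquet path below are hypothetical.

import org.apache.spark.sql.SparkSession

// Hypothetical session and path; the standard Spark session timezone set here
// is what the new session_timezone field carries down to the native scan.
val spark = SparkSession.builder().appName("session-timezone-demo").getOrCreate()
spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
spark.read.parquet("/tmp/events").show()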
@@ -474,7 +474,7 @@ case class CometScanExec(
object CometScanExec extends DataTypeSupport {

override def isAdditionallySupported(dt: DataType): Boolean = {
if (CometConf.COMET_NATIVE_SCAN_IMPL.get() == CometConf.SCAN_NATIVE_RECORDBATCH) {
if (CometConf.COMET_NATIVE_SCAN_IMPL.get() == CometConf.SCAN_NATIVE_ICEBERG_COMPAT) {
// TODO add array and map
dt match {
case s: StructType => s.fields.map(_.dataType).forall(isTypeSupported)