From 14e839fd6f2ded67241196bfd3cdfa74aea7128e Mon Sep 17 00:00:00 2001 From: coastalwhite Date: Wed, 15 Jan 2025 16:02:33 +0100 Subject: [PATCH] fix hive columns --- .../polars-arrow/src/array/binview/mutable.rs | 13 +++++++++++++ crates/polars-io/Cargo.toml | 1 + crates/polars-io/src/parquet/read/read_impl.rs | 11 ++++++++--- crates/polars-io/src/predicates.rs | 18 ++++++++++++------ crates/polars/Cargo.toml | 1 + 5 files changed, 35 insertions(+), 9 deletions(-) diff --git a/crates/polars-arrow/src/array/binview/mutable.rs b/crates/polars-arrow/src/array/binview/mutable.rs index 21ea23d1348f..799af65b8fc5 100644 --- a/crates/polars-arrow/src/array/binview/mutable.rs +++ b/crates/polars-arrow/src/array/binview/mutable.rs @@ -589,6 +589,19 @@ impl MutableBinaryViewArray { } pub fn extend_from_array(&mut self, other: &BinaryViewArrayGeneric) { + let slf_len = self.len(); + match (&mut self.validity, other.validity()) { + (None, None) => {}, + (Some(v), None) => v.extend_constant(other.len(), true), + (v @ None, Some(other)) => { + let mut bm = MutableBitmap::with_capacity(slf_len + other.len()); + bm.extend_constant(slf_len, true); + bm.extend_from_bitmap(other); + *v = Some(bm); + } + (Some(slf), Some(other)) => slf.extend_from_bitmap(other), + } + if other.total_buffer_len() == 0 { self.views.extend(other.views().iter().copied()); } else { diff --git a/crates/polars-io/Cargo.toml b/crates/polars-io/Cargo.toml index e06a7aad1b8a..149324225fe1 100644 --- a/crates/polars-io/Cargo.toml +++ b/crates/polars-io/Cargo.toml @@ -99,6 +99,7 @@ timezones = [ "polars-json?/timezones", ] dtype-time = ["polars-core/dtype-time", "polars-core/temporal", "polars-time/dtype-time"] +dtype-duration = ["polars-core/dtype-duration", "polars-time/dtype-duration"] dtype-struct = ["polars-core/dtype-struct"] dtype-decimal = ["polars-core/dtype-decimal", "polars-json?/dtype-decimal"] fmt = ["polars-core/fmt"] diff --git a/crates/polars-io/src/parquet/read/read_impl.rs b/crates/polars-io/src/parquet/read/read_impl.rs index 6f39e634ebc2..30e394efbfc5 100644 --- a/crates/polars-io/src/parquet/read/read_impl.rs +++ b/crates/polars-io/src/parquet/read/read_impl.rs @@ -298,12 +298,16 @@ fn rg_to_dfs_prefiltered( } let do_parquet_expr = std::env::var("POLARS_NO_PARQUET_EXPR").as_deref() != Ok("1") - && live_columns.len() == 1 + && live_columns.len() == 1 // Only do it with one column for now + && hive_partition_columns.is_none_or(|hc| { + !hc.iter() + .any(|c| c.name().as_str() == live_columns[0].as_str()) + }) // No hive columns && !schema .get(live_columns[0].as_str()) .unwrap() .dtype() - .is_nested(); + .is_nested(); // No nested columns let column_exprs = do_parquet_expr.then(|| { live_columns .iter() @@ -430,10 +434,11 @@ fn rg_to_dfs_prefiltered( PlSmallStr::EMPTY, [BooleanArray::new(ArrowDataType::Boolean, f.clone(), None)], ))?; - unsafe { df.column_extend_unchecked(live_columns) }; + unsafe { df.column_extend_unchecked(live_columns) } } else { df = DataFrame::new(live_columns).unwrap(); } + filter_mask = f.clone(); } else { df = unsafe { DataFrame::new_no_checks(md.num_rows(), live_columns.clone()) }; diff --git a/crates/polars-io/src/predicates.rs b/crates/polars-io/src/predicates.rs index 8fca362de24e..e01eca0256d4 100644 --- a/crates/polars-io/src/predicates.rs +++ b/crates/polars-io/src/predicates.rs @@ -143,17 +143,23 @@ fn cast_to_parquet_scalar(scalar: Scalar) -> Option { A::Int8(v) => P::Int8(v), A::Int16(v) => P::Int16(v), - A::Int32(v) | A::Date(v) => P::Int32(v), - A::Int64(v) - | A::Datetime(v, _, _) - | A::DatetimeOwned(v, _, _) - | A::Duration(v, _) - | A::Time(v) => P::Int64(v), + A::Int32(v) => P::Int32(v), + A::Int64(v) => P::Int64(v), + + #[cfg(feature = "dtype-time")] + A::Date(v) => P::Int32(v), + #[cfg(feature = "dtype-datetime")] + A::Datetime(v, _, _) | A::DatetimeOwned(v, _, _) => P::Int64(v), + #[cfg(feature = "dtype-duration")] + A::Duration(v, _) => P::Int64(v), + #[cfg(feature = "dtype-time")] + A::Time(v) => P::Int64(v), A::Float32(v) => P::Float32(v), A::Float64(v) => P::Float64(v), // @TODO: Cast to string + #[cfg(feature = "dtype-categorical")] A::Categorical(_, _, _) | A::CategoricalOwned(_, _, _) | A::Enum(_, _, _) diff --git a/crates/polars/Cargo.toml b/crates/polars/Cargo.toml index d256582503cf..7aff19a1853f 100644 --- a/crates/polars/Cargo.toml +++ b/crates/polars/Cargo.toml @@ -298,6 +298,7 @@ dtype-datetime = [ ] dtype-duration = [ "polars-core/dtype-duration", + "polars-io/dtype-duration", "polars-lazy?/dtype-duration", "polars-time?/dtype-duration", "polars-ops/dtype-duration",