Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: Single loop concatenate #20773

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ impl CategoricalChunked {
Field::new(name.clone(), self.dtype().clone())
}

/// Returns a mutable reference to the `physical` field, i.e. the
/// `Logical<CategoricalType, UInt32Type>` stored inside this categorical array.
pub fn logical_mut(&mut self) -> &mut Logical<CategoricalType, UInt32Type> {
&mut self.physical
}

/// Returns `true` when `self.len()` is zero.
pub fn is_empty(&self) -> bool {
self.len() == 0
}
Expand Down Expand Up @@ -347,8 +351,8 @@ impl LogicalType for CategoricalChunked {
match self.physical.0.get_unchecked(i) {
Some(i) => match self.dtype() {
DataType::Enum(_, _) => AnyValue::Enum(i, self.get_rev_map(), SyncPtr::new_null()),
DataType::Categorical(_, _) => {
AnyValue::Categorical(i, self.get_rev_map(), SyncPtr::new_null())
DataType::Categorical(_, ord) => {
AnyValue::Categorical(i, self.get_rev_map(), SyncPtr::new_null(), *ord)
},
_ => unimplemented!(),
},
Expand Down Expand Up @@ -553,7 +557,7 @@ mod test {
);
assert!(matches!(
s.get(0)?,
AnyValue::Categorical(0, RevMapping::Local(_, _), _)
AnyValue::Categorical(0, RevMapping::Local(_, _), _, _)
));

let groups = s.group_tuples(false, true);
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/logical/date.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ impl LogicalType for DateChunked {
}

fn get_any_value(&self, i: usize) -> PolarsResult<AnyValue<'_>> {
self.0.get_any_value(i).map(|av| av.as_date())
self.0.try_get_any_value(i).map(|av| av.as_date())
}

unsafe fn get_any_value_unchecked(&self, i: usize) -> AnyValue<'_> {
Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/logical/datetime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ impl LogicalType for DatetimeChunked {

fn get_any_value(&self, i: usize) -> PolarsResult<AnyValue<'_>> {
self.0
.get_any_value(i)
.try_get_any_value(i)
.map(|av| av.as_datetime(self.time_unit(), self.time_zone().as_ref()))
}

Expand Down
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/logical/duration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ impl LogicalType for DurationChunked {

fn get_any_value(&self, i: usize) -> PolarsResult<AnyValue<'_>> {
self.0
.get_any_value(i)
.try_get_any_value(i)
.map(|av| av.as_duration(self.time_unit()))
}
unsafe fn get_any_value_unchecked(&self, i: usize) -> AnyValue<'_> {
Expand Down
28 changes: 28 additions & 0 deletions crates/polars-core/src/chunked_array/logical/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,36 @@ where
/// Borrows the wrapped physical `ChunkedArray<T>` (tuple field `0`).
pub fn physical(&self) -> &ChunkedArray<T> {
&self.0
}

/// Builds a new `Field` that pairs the name of the physical array's field with
/// the logical dtype reported by `LogicalType::dtype`.
pub fn field(&self) -> Field {
let name = self.0.ref_field().name();
// Call the trait method explicitly so the logical dtype is the one that gets cloned.
Field::new(name.clone(), LogicalType::dtype(self).clone())
}
}

impl<K: PolarsDataType, T: PolarsDataType<IsLogical = FalseT>> Logical<K, T>
where
    Self: LogicalType,
    ChunkedArray<T>: ChunkAnyValue,
{
    /// Appends column `i` of every frame in `dfs` onto this logical array.
    ///
    /// When `check_dtypes` is set, the dtype of column `i` in every frame with a non-zero
    /// width is first compared against `self.dtype()`; the first mismatch aborts with the
    /// `append` error before anything is appended. The actual append is then delegated to
    /// the physical array with its own dtype check switched off.
    ///
    /// # Safety
    ///
    /// The call is forwarded to `append_gather_unchecked` on the wrapped physical array, so
    /// whatever that method requires of `dfs` and `i` must hold here as well.
    pub unsafe fn append_gather_unchecked(
        &mut self,
        dfs: &[DataFrame],
        i: usize,
        check_names: bool,
        check_dtypes: bool,
    ) -> PolarsResult<()> {
        if check_dtypes {
            // Zero-width frames carry no column `i`, so they are left out of the comparison.
            let frames_with_columns = dfs.iter().filter(|frame| frame.width() != 0);
            for frame in frames_with_columns {
                polars_ensure!(self.dtype() == frame.get_columns()[i].dtype(), append);
            }
        }

        // SAFETY: the caller's obligations are passed through unchanged to the physical array.
        unsafe { self.0.append_gather_unchecked(dfs, i, check_names, false) }
    }
}
2 changes: 1 addition & 1 deletion crates/polars-core/src/chunked_array/logical/time.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ impl LogicalType for TimeChunked {

#[cfg(feature = "dtype-time")]
fn get_any_value(&self, i: usize) -> PolarsResult<AnyValue<'_>> {
self.0.get_any_value(i).map(|av| av.as_time())
self.0.try_get_any_value(i).map(|av| av.as_time())
}
unsafe fn get_any_value_unchecked(&self, i: usize) -> AnyValue<'_> {
self.0.get_any_value_unchecked(i).as_time()
Expand Down
114 changes: 114 additions & 0 deletions crates/polars-core/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,120 @@ impl<T: PolarsDataType> ChunkedArray<T> {
}
}

impl<T> ChunkedArray<T>
where
    T: PolarsDataType<IsLogical = FalseT>,
    Self: ChunkAnyValue,
{
    /// Appends column `i` of every frame in `dfs` onto `self` in a single pass.
    ///
    /// Frames with a width of zero and columns without rows are skipped. For every other
    /// column the chunks are cloned onto `self`, `length` and `null_count` are updated, and
    /// the `CAN_FAST_EXPLODE_LIST` and sortedness flags are only kept while they still hold
    /// for the concatenated data.
    ///
    /// # Errors
    ///
    /// - `ShapeMismatch` when `check_names` is set and a column's name differs from
    ///   `self.name()`.
    /// - The `append` error when `check_dtypes` is set and a column's dtype differs from
    ///   `self.dtype()`.
    ///
    /// Columns processed before the failing frame stay appended; the statistics flags are
    /// committed after every appended column so they describe that partial result.
    ///
    /// # Safety
    ///
    /// NOTE(review): the contract is not spelled out in the visible code. Presumably column
    /// `i` of every non-zero-width frame must really be backed by a `ChunkedArray<T>`, since
    /// it is reinterpreted as `&Self` — confirm against the callers.
    pub unsafe fn append_gather_unchecked(
        &mut self,
        dfs: &[DataFrame],
        i: usize,
        check_names: bool,
        check_dtypes: bool,
    ) -> PolarsResult<()> {
        // Rough guess: every frame contributes about as many chunks as `self` has now.
        let estimated_num_chunks = self.chunks.len().max(1) * dfs.len();
        self.chunks.reserve(estimated_num_chunks);

        // Working copy of the flags; written back after every appended column.
        let mut flags = self.flags.get_mut();

        for other_df in dfs {
            if other_df.width() == 0 {
                continue;
            }

            let ca: &Self = other_df.get_columns()[i]
                .as_materialized_series()
                .as_ref()
                .as_ref();

            if check_names {
                polars_ensure!(
                    self.name() == ca.name(),
                    ShapeMismatch: "unable to vstack, column names don't match: {:?} and {:?}",
                    self.name(), ca.name(),
                );
            }

            if check_dtypes {
                polars_ensure!(self.dtype() == ca.dtype(), append);
            }

            if ca.is_empty() {
                continue;
            }

            // Index of the first appended element once `ca` has been added.
            let boundary_idx = self.length;
            self.length += ca.len();
            self.null_count += ca.null_count();
            self.chunks.extend(ca.chunks.iter().cloned());

            if self.len() == ca.len() {
                // `self` held no rows before this append, so it inherits the flags of `ca`.
                flags = ca.get_flags()
                    & (StatisticsFlags::CAN_FAST_EXPLODE_LIST | StatisticsFlags::IS_SORTED_ANY);
            } else if flags.is_sorted_any() || flags.can_fast_explode_list() {
                let ca_flags = ca.get_flags();

                // Drop CAN_FAST_EXPLODE_LIST unless `ca` carries it as well.
                let ca_can_fast_explode_list = ca_flags & StatisticsFlags::CAN_FAST_EXPLODE_LIST;
                flags &= !(ca_can_fast_explode_list ^ StatisticsFlags::CAN_FAST_EXPLODE_LIST);

                let is_sorted = flags.is_sorted();
                let ca_is_sorted = ca_flags.is_sorted();

                // Assume the order is lost; restored below only when it provably survives.
                flags.set_sorted(IsSorted::Not);

                // Preserve the sorted flag if possible.
                if !matches!(is_sorted, IsSorted::Not) && is_sorted == ca_is_sorted {
                    // SAFETY: `self` was non-empty before the append (otherwise the first
                    // branch was taken) and `ca` is non-empty, so both indices are in bounds.
                    let lst = self.get_any_value_static_unchecked(boundary_idx - 1);
                    let fst = self.get_any_value_static_unchecked(boundary_idx);

                    // Statistics of the data `self` held before `ca` was appended.
                    let prev_len = self.len() - ca.len();
                    let prev_null_count = self.null_count() - ca.null_count();
                    let ca_all_null = ca.null_count() == ca.len();

                    let is_still_sorted = match (&lst, &fst) {
                        // Nulls meet at the boundary: they stay on one end only when one of
                        // the two sides consists of nulls exclusively.
                        (AnyValue::Null, AnyValue::Null) => {
                            ca_all_null || prev_null_count == prev_len
                        },
                        // NOTE(review): `lst` is null here, so the previous null count is at
                        // least one and this arm never keeps the flag. That is conservative
                        // but safe; left as in the original.
                        (AnyValue::Null, _) => prev_null_count == 0,
                        // `ca` starts with nulls. They may only trail the previous values
                        // when those contain no nulls AND `ca` holds nothing but nulls;
                        // otherwise nulls end up between two runs of values.
                        (_, AnyValue::Null) => prev_null_count == 0 && ca_all_null,
                        (_, _) => {
                            let mut res = true;

                            if self.null_count() > 0 {
                                // SAFETY: We know that self has at least one element
                                let are_nulls_first =
                                    unsafe { self.get_any_value_unchecked(0) }.is_null();
                                let are_nulls_last =
                                    unsafe { self.get_any_value_unchecked(self.len() - 1) }
                                        .is_null();

                                // Nulls must sit on exactly one end of the combined data.
                                res &= are_nulls_first != are_nulls_last;
                            }

                            // @NOTE: This only works when the logical type and the physical type
                            // have the same ordering. This is not the case for
                            // Categorical(ordering = 'lexical') so that needs a special case.
                            res &= match is_sorted {
                                IsSorted::Ascending => lst <= fst,
                                IsSorted::Descending => lst >= fst,
                                IsSorted::Not => unreachable!(),
                            };

                            res
                        },
                    };

                    if is_still_sorted {
                        flags.set_sorted(is_sorted);
                    }
                }
            }

            // Commit per column so an error on a later frame cannot leave stale flags behind.
            self.flags = StatisticsFlagsIM::new(flags);
        }

        Ok(())
    }
}

impl<T> ChunkedArray<T>
where
T: PolarsDataType,
Expand Down
63 changes: 6 additions & 57 deletions crates/polars-core/src/chunked_array/ops/any_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ pub(crate) unsafe fn arr_to_any_value<'a>(
}
},
#[cfg(feature = "dtype-categorical")]
DataType::Categorical(rev_map, _) => {
DataType::Categorical(rev_map, ord) => {
let arr = &*(arr as *const dyn Array as *const UInt32Array);
let v = arr.value_unchecked(idx);
AnyValue::Categorical(v, rev_map.as_ref().unwrap().as_ref(), SyncPtr::new_null())
AnyValue::Categorical(v, rev_map.as_ref().unwrap().as_ref(), SyncPtr::new_null(), *ord)
},
#[cfg(feature = "dtype-categorical")]
DataType::Enum(rev_map, _) => {
Expand Down Expand Up @@ -163,11 +163,12 @@ impl<'a> AnyValue<'a> {
if arr.is_valid_unchecked(idx) {
let v = arr.value_unchecked(idx);
match fld.dtype() {
DataType::Categorical(Some(rev_map), _) => {
DataType::Categorical(Some(rev_map), ord) => {
AnyValue::Categorical(
v,
rev_map,
SyncPtr::from_const(values),
*ord,
)
},
DataType::Enum(Some(rev_map), _) => {
Expand Down Expand Up @@ -210,17 +211,6 @@ macro_rules! get_any_value_unchecked {
}};
}

// Bounds-checked element access shared by the `ChunkAnyValue` impls:
// bails with an out-of-bounds error when `$index >= $self.len()`, otherwise
// returns `Ok` of `$self.get_any_value_unchecked($index)`.
macro_rules! get_any_value {
($self:ident, $index:expr) => {{
if $index >= $self.len() {
polars_bail!(oob = $index, $self.len());
}
// SAFETY:
// bounds are checked
Ok(unsafe { $self.get_any_value_unchecked($index) })
}};
}

impl<T> ChunkAnyValue for ChunkedArray<T>
where
T: PolarsNumericType,
Expand All @@ -229,65 +219,41 @@ where
unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue {
get_any_value_unchecked!(self, index)
}

fn get_any_value(&self, index: usize) -> PolarsResult<AnyValue> {
get_any_value!(self, index)
}
}

/// `ChunkAnyValue` for `BooleanChunked`; both accessors expand the shared macros.
impl ChunkAnyValue for BooleanChunked {
/// Unchecked access via `get_any_value_unchecked!`.
#[inline]
unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue {
get_any_value_unchecked!(self, index)
}

/// Checked access: `get_any_value!` rejects `index >= self.len()` before reading.
fn get_any_value(&self, index: usize) -> PolarsResult<AnyValue> {
get_any_value!(self, index)
}
}

/// `ChunkAnyValue` for `StringChunked`; both accessors expand the shared macros.
impl ChunkAnyValue for StringChunked {
/// Unchecked access via `get_any_value_unchecked!`.
#[inline]
unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue {
get_any_value_unchecked!(self, index)
}

/// Checked access: `get_any_value!` rejects `index >= self.len()` before reading.
fn get_any_value(&self, index: usize) -> PolarsResult<AnyValue> {
get_any_value!(self, index)
}
}

/// `ChunkAnyValue` for `BinaryChunked`; both accessors expand the shared macros.
impl ChunkAnyValue for BinaryChunked {
/// Unchecked access via `get_any_value_unchecked!`.
#[inline]
unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue {
get_any_value_unchecked!(self, index)
}

/// Checked access: `get_any_value!` rejects `index >= self.len()` before reading.
fn get_any_value(&self, index: usize) -> PolarsResult<AnyValue> {
get_any_value!(self, index)
}
}

/// `ChunkAnyValue` for `BinaryOffsetChunked`; both accessors expand the shared macros.
impl ChunkAnyValue for BinaryOffsetChunked {
/// Unchecked access via `get_any_value_unchecked!`.
#[inline]
unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue {
get_any_value_unchecked!(self, index)
}

/// Checked access: `get_any_value!` rejects `index >= self.len()` before reading.
fn get_any_value(&self, index: usize) -> PolarsResult<AnyValue> {
get_any_value!(self, index)
}
}

/// `ChunkAnyValue` for `ListChunked`; both accessors expand the shared macros.
impl ChunkAnyValue for ListChunked {
/// Unchecked access via `get_any_value_unchecked!`.
#[inline]
unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue {
get_any_value_unchecked!(self, index)
}

/// Checked access: `get_any_value!` rejects `index >= self.len()` before reading.
fn get_any_value(&self, index: usize) -> PolarsResult<AnyValue> {
get_any_value!(self, index)
}
}

#[cfg(feature = "dtype-array")]
Expand All @@ -296,10 +262,6 @@ impl ChunkAnyValue for ArrayChunked {
unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue {
get_any_value_unchecked!(self, index)
}

fn get_any_value(&self, index: usize) -> PolarsResult<AnyValue> {
get_any_value!(self, index)
}
}

#[cfg(feature = "object")]
Expand All @@ -311,31 +273,18 @@ impl<T: PolarsObject> ChunkAnyValue for ObjectChunked<T> {
Some(v) => AnyValue::Object(v),
}
}

fn get_any_value(&self, index: usize) -> PolarsResult<AnyValue> {
get_any_value!(self, index)
}
}

impl ChunkAnyValue for NullChunked {
#[inline]
unsafe fn get_any_value_unchecked(&self, _index: usize) -> AnyValue {
unsafe fn get_any_value_unchecked(&self, index: usize) -> AnyValue {
debug_assert!(index < NullChunked::len(self));
AnyValue::Null
}

fn get_any_value(&self, _index: usize) -> PolarsResult<AnyValue> {
Ok(AnyValue::Null)
}
}

#[cfg(feature = "dtype-struct")]
impl ChunkAnyValue for StructChunked {
/// Gets AnyValue from LogicalType
fn get_any_value(&self, i: usize) -> PolarsResult<AnyValue<'_>> {
polars_ensure!(i < self.len(), oob = i, self.len());
unsafe { Ok(self.get_any_value_unchecked(i)) }
}

unsafe fn get_any_value_unchecked(&self, i: usize) -> AnyValue<'_> {
let (chunk_idx, idx) = index_to_chunked_index(self.chunks.iter().map(|c| c.len()), i);
if let DataType::Struct(flds) = self.dtype() {
Expand Down
Loading
Loading