Skip to content

Commit

Permalink
feat: implement StringColumn using StringViewArray
Browse files Browse the repository at this point in the history
  • Loading branch information
andylokandy committed Oct 15, 2024
1 parent 86cd608 commit 0c1473b
Show file tree
Hide file tree
Showing 38 changed files with 500 additions and 1,235 deletions.
4 changes: 2 additions & 2 deletions src/common/arrow/src/arrow/array/binview/mutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ pub struct MutableBinaryViewArray<T: ViewType + ?Sized> {
pub(super) validity: Option<MutableBitmap>,
pub(super) phantom: std::marker::PhantomData<T>,
/// Total bytes length if we would concatenate them all.
pub(super) total_bytes_len: usize,
pub total_bytes_len: usize,
/// Total bytes in the buffer (excluding remaining capacity)
pub(super) total_buffer_len: usize,
pub total_buffer_len: usize,
}

impl<T: ViewType + ?Sized> Clone for MutableBinaryViewArray<T> {
Expand Down
289 changes: 123 additions & 166 deletions src/query/expression/src/converts/arrow2/from.rs

Large diffs are not rendered by default.

56 changes: 11 additions & 45 deletions src/query/expression/src/converts/arrow2/to.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ fn table_type_to_arrow_type(ty: &TableDataType) -> ArrowDataType {
None,
),
TableDataType::Boolean => ArrowDataType::Boolean,
TableDataType::Binary => ArrowDataType::LargeBinary,
TableDataType::String => ArrowDataType::LargeUtf8,
TableDataType::Binary => ArrowDataType::BinaryView,
TableDataType::String => ArrowDataType::Utf8View,
TableDataType::Number(ty) => with_number_type!(|TYPE| match ty {
NumberDataType::TYPE => ArrowDataType::TYPE,
}),
Expand Down Expand Up @@ -135,7 +135,7 @@ fn table_type_to_arrow_type(ty: &TableDataType) -> ArrowDataType {
}
TableDataType::Bitmap => ArrowDataType::Extension(
ARROW_EXT_TYPE_BITMAP.to_string(),
Box::new(ArrowDataType::LargeBinary),
Box::new(ArrowDataType::BinaryView),
None,
),
TableDataType::Tuple {
Expand All @@ -157,17 +157,17 @@ fn table_type_to_arrow_type(ty: &TableDataType) -> ArrowDataType {
}
TableDataType::Variant => ArrowDataType::Extension(
ARROW_EXT_TYPE_VARIANT.to_string(),
Box::new(ArrowDataType::LargeBinary),
Box::new(ArrowDataType::BinaryView),
None,
),
TableDataType::Geometry => ArrowDataType::Extension(
ARROW_EXT_TYPE_GEOMETRY.to_string(),
Box::new(ArrowDataType::LargeBinary),
Box::new(ArrowDataType::BinaryView),
None,
),
TableDataType::Geography => ArrowDataType::Extension(
ARROW_EXT_TYPE_GEOGRAPHY.to_string(),
Box::new(ArrowDataType::LargeBinary),
Box::new(ArrowDataType::BinaryView),
None,
),
}
Expand Down Expand Up @@ -304,32 +304,10 @@ impl Column {
)
.unwrap(),
),
Column::Binary(col) => {
let offsets: Buffer<i64> =
col.offsets().iter().map(|offset| *offset as i64).collect();
Box::new(
databend_common_arrow::arrow::array::BinaryArray::<i64>::try_new(
arrow_type,
unsafe { OffsetsBuffer::new_unchecked(offsets) },
col.data().clone(),
None,
)
.unwrap(),
)
}
Column::String(col) => {
let offsets: Buffer<i64> =
col.offsets().iter().map(|offset| *offset as i64).collect();
Box::new(
databend_common_arrow::arrow::array::Utf8Array::<i64>::try_new(
arrow_type,
unsafe { OffsetsBuffer::new_unchecked(offsets) },
col.data().clone(),
None,
)
.unwrap(),
)
}
Column::Binary(col) => Box::new(col.clone().into_inner()),
Column::String(col) => unsafe {
Box::new(col.clone().into_inner().to_utf8view_unchecked())
},
Column::Timestamp(col) => Box::new(
databend_common_arrow::arrow::array::PrimitiveArray::<i64>::try_new(
arrow_type,
Expand Down Expand Up @@ -401,19 +379,7 @@ impl Column {
Column::Bitmap(col)
| Column::Variant(col)
| Column::Geometry(col)
| Column::Geography(GeographyColumn(col)) => {
let offsets: Buffer<i64> =
col.offsets().iter().map(|offset| *offset as i64).collect();
Box::new(
databend_common_arrow::arrow::array::BinaryArray::<i64>::try_new(
arrow_type,
unsafe { OffsetsBuffer::new_unchecked(offsets) },
col.data().clone(),
None,
)
.unwrap(),
)
}
| Column::Geography(GeographyColumn(col)) => Box::new(col.clone().into_inner()),
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -239,48 +239,19 @@ impl<'a> Selector<'a> {
Some(validity) => {
// search each row's bytes for the needle (rows are no longer stored in one contiguous buffer)
if let LikePattern::SurroundByPercent(searcher) = like_pattern {
let needle = searcher.needle();
let needle_byte_len = needle.len();
let data = column.data().as_slice();
let offsets = column.offsets().as_slice();
let mut idx = 0;
let mut pos = (*offsets.first().unwrap()) as usize;
let end = (*offsets.last().unwrap()) as usize;

while pos < end && idx < count {
if let Some(p) = searcher.search(&data[pos..end]) {
while offsets[idx + 1] as usize <= pos + p {
let ret = NOT && validity.get_bit_unchecked(idx);
update_index(
ret,
idx as u32,
true_selection,
false_selection,
);
idx += 1;
}

// check if the substring is in bound
let ret =
pos + p + needle_byte_len <= offsets[idx + 1] as usize;

let ret = if NOT {
validity.get_bit_unchecked(idx) && !ret
} else {
validity.get_bit_unchecked(idx) && ret
};
update_index(ret, idx as u32, true_selection, false_selection);

pos = offsets[idx + 1] as usize;
idx += 1;
for idx in 0u32..count as u32 {
let ret = if NOT {
validity.get_bit_unchecked(idx as usize)
&& !searcher
.search(column.index_unchecked_bytes(idx as usize))
.is_some()
} else {
break;
}
}
while idx < count {
let ret = NOT && validity.get_bit_unchecked(idx);
update_index(ret, idx as u32, true_selection, false_selection);
idx += 1;
validity.get_bit_unchecked(idx as usize)
&& searcher
.search(column.index_unchecked_bytes(idx as usize))
.is_some()
};
update_index(ret, idx, true_selection, false_selection);
}
} else {
for idx in 0u32..count as u32 {
Expand All @@ -300,40 +271,17 @@ impl<'a> Selector<'a> {
None => {
// search each row's bytes for the needle (rows are no longer stored in one contiguous buffer)
if let LikePattern::SurroundByPercent(searcher) = like_pattern {
let needle = searcher.needle();
let needle_byte_len = needle.len();
let data = column.data().as_slice();
let offsets = column.offsets().as_slice();
let mut idx = 0;
let mut pos = (*offsets.first().unwrap()) as usize;
let end = (*offsets.last().unwrap()) as usize;

while pos < end && idx < count {
if let Some(p) = searcher.search(&data[pos..end]) {
while offsets[idx + 1] as usize <= pos + p {
update_index(
NOT,
idx as u32,
true_selection,
false_selection,
);
idx += 1;
}
// check if the substring is in bound
let ret =
pos + p + needle_byte_len <= offsets[idx + 1] as usize;
let ret = if NOT { !ret } else { ret };
update_index(ret, idx as u32, true_selection, false_selection);

pos = offsets[idx + 1] as usize;
idx += 1;
for idx in 0u32..count as u32 {
let ret = if NOT {
!searcher
.search(column.index_unchecked_bytes(idx as usize))
.is_some()
} else {
break;
}
}
while idx < count {
update_index(NOT, idx as u32, true_selection, false_selection);
idx += 1;
searcher
.search(column.index_unchecked_bytes(idx as usize))
.is_some()
};
update_index(ret, idx, true_selection, false_selection);
}
} else {
for idx in 0u32..count as u32 {
Expand Down
35 changes: 5 additions & 30 deletions src/query/expression/src/kernels/concat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ use crate::kernels::utils::set_vec_len_by_ptr;
use crate::store_advance_aligned;
use crate::types::array::ArrayColumnBuilder;
use crate::types::binary::BinaryColumn;
use crate::types::binary::BinaryColumnBuilder;
use crate::types::decimal::DecimalColumn;
use crate::types::geography::GeographyColumn;
use crate::types::geometry::GeometryType;
Expand Down Expand Up @@ -387,37 +388,11 @@ impl Column {
cols: impl Iterator<Item = BinaryColumn> + Clone,
num_rows: usize,
) -> BinaryColumn {
// [`BinaryColumn`] consists of [`data`] and [`offset`], we build [`data`] and [`offset`] respectively,
// and then call `BinaryColumn::new(data.into(), offsets.into())` to create [`BinaryColumn`].
let mut offsets: Vec<u64> = Vec::with_capacity(num_rows + 1);
let mut data_size = 0;

// Build [`offset`] and calculate `data_size` required by [`data`].
offsets.push(0);
for col in cols.clone() {
let mut start = col.offsets()[0];
for end in col.offsets()[1..].iter() {
data_size += end - start;
start = *end;
offsets.push(data_size);
}
}

// Build [`data`].
let mut data: Vec<u8> = Vec::with_capacity(data_size as usize);
let mut data_ptr = data.as_mut_ptr();

unsafe {
for col in cols {
let offsets = col.offsets();
let col_data = &(col.data().as_slice())
[offsets[0] as usize..offsets[offsets.len() - 1] as usize];
copy_advance_aligned(col_data.as_ptr(), &mut data_ptr, col_data.len());
}
set_vec_len_by_ptr(&mut data, data_ptr);
let mut builder = BinaryColumnBuilder::with_capacity(num_rows, 0);
for col in cols {
builder.append_column(&col);
}

BinaryColumn::new(data.into(), offsets.into())
builder.build()
}

pub fn concat_string_types(
Expand Down
Loading

0 comments on commit 0c1473b

Please sign in to comment.