Skip to content

Commit

Permalink
feat: implement StringColumn using StringViewArray (#16610)
Browse files Browse the repository at this point in the history
* feat: implement StringColumn using StringViewArray

* fix

* convert binaryview between arrow1 and arrow2

* fix

* fix

* fix

* fix

* fix

* fix some issue

* fix view slice bug

* fix view slice bug

* fix

* support native read write

* fix

* fix

* fix tests

* add with_data_type

* add with_data_type

* fix gen_random_uuid commit row

* move record batch to block

* remove unused dep

* fix lint

* fix commit row

* fix commit row

* fix size

* fix size

* add NewBinaryColumnBuilder and NewStringColumnBuilder

* fix incorrect serialize_size

* fix incorrect serialize_size

* lint

* lint

* fix tests

* use binary state

* use binary state

* update tests

* update tests

* update tests

* fix native view encoding

* fix

* [ci skip] update kernel concat for view types

* [ci skip] improve kernels for view types

* [ci skip] only string type use string view type

* [ci skip] only string type use string view type

* fix tests

* [ci skip] fix tests

* [ci skip] fix

* fix

* use NewStringColumnBuilder

* rename NewString -> String

* fmt

* [ci skip] update tests

* optimize take

* add bench

* fix tests

* update

* improve compare

* implement compare using string view prefix

* fix

* fix

* fix

* fix-length

* disable spill

* [ci skip] add put_and_commit

* [ci skip] update

* update test

* lint

* [ci skip] add maybe gc

* fix endianness

* fix endianness

* fix

* update string compare

* update

---------

Co-authored-by: sundy-li <[email protected]>
  • Loading branch information
andylokandy and sundy-li authored Nov 8, 2024
1 parent a645b0d commit f4e599f
Show file tree
Hide file tree
Showing 173 changed files with 4,535 additions and 4,355 deletions.
35 changes: 16 additions & 19 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -219,9 +219,9 @@ arrow-ipc = { version = "53" }
arrow-ord = { version = "53" }
arrow-schema = { version = "53", features = ["serde"] }
arrow-select = { version = "53" }
arrow-udf-js = "0.5.0"
arrow-udf-python = "0.4.0"
arrow-udf-wasm = "0.4.0"
arrow-udf-js = { git = "https://github.com/arrow-udf/arrow-udf", rev = "80b09d6" }
arrow-udf-python = { git = "https://github.com/arrow-udf/arrow-udf", rev = "80b09d6" }
arrow-udf-wasm = { git = "https://github.com/arrow-udf/arrow-udf", rev = "80b09d6" }
async-backtrace = "0.2"
async-channel = "1.7.1"
async-compression = { git = "https://github.com/datafuse-extras/async-compression", rev = "dc81082", features = [
Expand Down
1 change: 0 additions & 1 deletion src/common/arrow/src/arrow/array/binview/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ unsafe impl<T: ViewType + ?Sized> ToFfi for BinaryViewArrayGeneric<T> {
validity,
views: self.views.clone(),
buffers: self.buffers.clone(),
raw_buffers: self.raw_buffers.clone(),
phantom: Default::default(),
total_bytes_len: AtomicU64::new(self.total_bytes_len.load(Ordering::Relaxed)),
total_buffer_len: self.total_buffer_len,
Expand Down
76 changes: 76 additions & 0 deletions src/common/arrow/src/arrow/array/binview/from.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,89 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use arrow_data::ArrayData;
use arrow_data::ArrayDataBuilder;
use arrow_schema::DataType;

use crate::arrow::array::Arrow2Arrow;
use crate::arrow::array::BinaryViewArray;
use crate::arrow::array::BinaryViewArrayGeneric;
use crate::arrow::array::MutableBinaryViewArray;
use crate::arrow::array::Utf8ViewArray;
use crate::arrow::array::ViewType;
use crate::arrow::bitmap::Bitmap;

/// Builds an immutable view array from an iterator of optional values.
///
/// The items are first gathered by a [`MutableBinaryViewArray`], which is then
/// converted into the immutable array type.
impl<T: ViewType + ?Sized, P: AsRef<T>> FromIterator<Option<P>> for BinaryViewArrayGeneric<T> {
    #[inline]
    fn from_iter<I: IntoIterator<Item = Option<P>>>(iter: I) -> Self {
        // Accumulate into the mutable builder, then freeze it.
        let mutable = MutableBinaryViewArray::<T>::from_iter(iter);
        mutable.into()
    }
}

/// Conversions between [`BinaryViewArray`] and arrow-rs [`ArrayData`].
impl Arrow2Arrow for BinaryViewArray {
    /// Exports this array as an `ArrayData` of type `DataType::BinaryView`.
    ///
    /// The views buffer is added first, followed by each data buffer in order;
    /// the validity bitmap (if any) becomes the null buffer.
    fn to_data(&self) -> ArrayData {
        let builder = ArrayDataBuilder::new(DataType::BinaryView)
            .len(self.len())
            .add_buffer(self.views.clone().into())
            .add_buffers(
                self.buffers
                    .iter()
                    .map(|buffer| buffer.clone().into())
                    .collect::<Vec<_>>(),
            )
            .nulls(self.validity.clone().map(Into::into));
        // SAFETY: `build_unchecked` skips validation. All buffers and the
        // validity come straight from `self`, which is presumed to already
        // uphold the view-array invariants.
        unsafe { builder.build_unchecked() }
    }

    /// Imports an `ArrayData` as a [`BinaryViewArray`].
    ///
    /// Buffer 0 is read as the views buffer and all remaining buffers as data
    /// buffers. The views are restricted to the logical window
    /// `[data.offset(), data.offset() + data.len())` so that they line up with
    /// `data.nulls()`, which arrow-rs already reports relative to that window.
    ///
    /// # Panics
    ///
    /// Panics if `data` has no buffers. Presumably also panics if the views
    /// buffer is shorter than `offset + len` elements (behavior of `sliced`).
    fn from_data(data: &ArrayData) -> Self {
        // Honor the logical offset/length of `data`; taking the whole buffer
        // would yield the wrong rows (and a views/validity length mismatch)
        // for a sliced `ArrayData`.
        let views = crate::arrow::buffer::Buffer::from(data.buffers()[0].clone())
            .sliced(data.offset(), data.len());
        let buffers = data.buffers()[1..]
            .iter()
            .map(|buffer| crate::arrow::buffer::Buffer::from(buffer.clone()))
            .collect();
        let validity = data
            .nulls()
            .map(|nulls| Bitmap::from_null_buffer(nulls.clone()));
        // SAFETY: the constructor performs no validation; `data` is assumed to
        // be a well-formed BinaryView `ArrayData` whose views reference only
        // the data buffers passed alongside them.
        unsafe {
            Self::new_unchecked_unknown_md(
                crate::arrow::datatypes::DataType::BinaryView,
                views,
                buffers,
                validity,
                None,
            )
        }
    }
}

/// Conversions between [`Utf8ViewArray`] and arrow-rs [`ArrayData`].
impl Arrow2Arrow for Utf8ViewArray {
    /// Exports this array as an `ArrayData` of type `DataType::Utf8View`.
    ///
    /// The views buffer is added first, followed by each data buffer in order;
    /// the validity bitmap (if any) becomes the null buffer.
    fn to_data(&self) -> ArrayData {
        let builder = ArrayDataBuilder::new(DataType::Utf8View)
            .len(self.len())
            .add_buffer(self.views.clone().into())
            .add_buffers(
                self.buffers
                    .iter()
                    .map(|buffer| buffer.clone().into())
                    .collect::<Vec<_>>(),
            )
            .nulls(self.validity.clone().map(Into::into));
        // SAFETY: `build_unchecked` skips validation. All buffers and the
        // validity come straight from `self`, which is presumed to already
        // uphold the view-array invariants.
        unsafe { builder.build_unchecked() }
    }

    /// Imports an `ArrayData` as a [`Utf8ViewArray`].
    ///
    /// Buffer 0 is read as the views buffer and all remaining buffers as data
    /// buffers. The views are restricted to the logical window
    /// `[data.offset(), data.offset() + data.len())` so that they line up with
    /// `data.nulls()`, which arrow-rs already reports relative to that window.
    ///
    /// NOTE(review): the array is built through an `unchecked` constructor, so
    /// UTF-8 validity of the payload is presumably not verified here — confirm
    /// that callers only pass trusted `Utf8View` data.
    ///
    /// # Panics
    ///
    /// Panics if `data` has no buffers. Presumably also panics if the views
    /// buffer is shorter than `offset + len` elements (behavior of `sliced`).
    fn from_data(data: &ArrayData) -> Self {
        // Honor the logical offset/length of `data`; taking the whole buffer
        // would yield the wrong rows (and a views/validity length mismatch)
        // for a sliced `ArrayData`.
        let views = crate::arrow::buffer::Buffer::from(data.buffers()[0].clone())
            .sliced(data.offset(), data.len());
        let buffers = data.buffers()[1..]
            .iter()
            .map(|buffer| crate::arrow::buffer::Buffer::from(buffer.clone()))
            .collect();
        let validity = data
            .nulls()
            .map(|nulls| Bitmap::from_null_buffer(nulls.clone()));
        // SAFETY: the constructor performs no validation; `data` is assumed to
        // be a well-formed Utf8View `ArrayData` whose views reference only the
        // data buffers passed alongside them.
        unsafe {
            Self::new_unchecked_unknown_md(
                crate::arrow::datatypes::DataType::Utf8View,
                views,
                buffers,
                validity,
                None,
            )
        }
    }
}
Loading

0 comments on commit f4e599f

Please sign in to comment.