Skip to content

Commit

Permalink
Add Fields abstraction (apache#3955)
Browse files Browse the repository at this point in the history
  • Loading branch information
tustvold committed Mar 27, 2023
1 parent 71ecc39 commit e48e93f
Show file tree
Hide file tree
Showing 10 changed files with 292 additions and 95 deletions.
5 changes: 3 additions & 2 deletions arrow-array/src/array/map_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ mod tests {
use crate::cast::AsArray;
use crate::types::UInt32Type;
use crate::{Int32Array, UInt32Array};
use arrow_schema::Fields;
use std::sync::Arc;

use super::*;
Expand Down Expand Up @@ -496,10 +497,10 @@ mod tests {
fn test_from_array_data_validation() {
// A DictionaryArray has similar buffer layout to a MapArray
// but the meaning of the values differs
let struct_t = DataType::Struct(vec![
let struct_t = DataType::Struct(Fields::from(vec![
Field::new("keys", DataType::Int32, true),
Field::new("values", DataType::UInt32, true),
]);
]));
let dict_t = DataType::Dictionary(Box::new(DataType::Int32), Box::new(struct_t));
let _ = MapArray::from(ArrayData::new_empty(&dict_t));
}
Expand Down
8 changes: 4 additions & 4 deletions arrow-array/src/array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,7 @@ mod tests {
use crate::cast::{as_union_array, downcast_array};
use crate::downcast_run_array;
use arrow_buffer::{Buffer, MutableBuffer};
use arrow_schema::{Field, UnionMode};
use arrow_schema::{Field, Fields, UnionMode};

#[test]
fn test_empty_primitive() {
Expand Down Expand Up @@ -785,7 +785,7 @@ mod tests {
// It is possible to create a null struct containing a non-nullable child
// see https://github.com/apache/arrow-rs/pull/3244 for details
let struct_type =
DataType::Struct(vec![Field::new("data", DataType::Int64, false)]);
DataType::Struct(vec![Field::new("data", DataType::Int64, false)].into());
let array = new_null_array(&struct_type, 9);

let a = array.as_any().downcast_ref::<StructArray>().unwrap();
Expand Down Expand Up @@ -828,10 +828,10 @@ mod tests {
let data_type = DataType::Map(
Box::new(Field::new(
"entry",
DataType::Struct(vec![
DataType::Struct(Fields::from(vec![
Field::new("key", DataType::Utf8, false),
Field::new("value", DataType::Int32, true),
]),
])),
false,
)),
false,
Expand Down
6 changes: 3 additions & 3 deletions arrow-array/src/array/struct_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,11 @@ impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
len = Some(child_datum_len)
}
child_data.push(child_datum.clone());
fields.push(Field::new(
fields.push(Arc::new(Field::new(
field_name,
array.data_type().clone(),
child_datum.nulls().is_some(),
));
)));

if let Some(child_nulls) = child_datum.nulls() {
null = Some(if let Some(null_buffer) = &null {
Expand All @@ -176,7 +176,7 @@ impl TryFrom<Vec<(&str, ArrayRef)>> for StructArray {
}
let len = len.unwrap();

let builder = ArrayData::builder(DataType::Struct(fields))
let builder = ArrayData::builder(DataType::Struct(fields.into()))
.len(len)
.null_bit_buffer(null)
.child_data(child_data);
Expand Down
8 changes: 4 additions & 4 deletions arrow-array/src/builder/struct_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use crate::builder::*;
use crate::{Array, ArrayRef, StructArray};
use arrow_buffer::Buffer;
use arrow_data::ArrayData;
use arrow_schema::{DataType, Field, IntervalUnit, TimeUnit};
use arrow_schema::{DataType, Field, Fields, IntervalUnit, TimeUnit};
use std::any::Any;
use std::sync::Arc;

Expand All @@ -29,7 +29,7 @@ use std::sync::Arc;
/// Note that callers should make sure that methods of all the child field builders are
/// properly called to maintain the consistency of the data structure.
pub struct StructBuilder {
fields: Vec<Field>,
fields: Fields,
field_builders: Vec<Box<dyn ArrayBuilder>>,
null_buffer_builder: NullBufferBuilder,
}
Expand Down Expand Up @@ -180,7 +180,7 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box<dyn ArrayBuilde

impl StructBuilder {
/// Creates a new `StructBuilder`
pub fn new(fields: Vec<Field>, field_builders: Vec<Box<dyn ArrayBuilder>>) -> Self {
pub fn new(fields: Fields, field_builders: Vec<Box<dyn ArrayBuilder>>) -> Self {
Self {
fields,
field_builders,
Expand All @@ -189,7 +189,7 @@ impl StructBuilder {
}

/// Creates a new `StructBuilder` from [`Fields`], allocating each child builder with `capacity`
pub fn from_fields(fields: Vec<Field>, capacity: usize) -> Self {
pub fn from_fields(fields: Fields, capacity: usize) -> Self {
let mut builders = Vec::with_capacity(fields.len());
for field in &fields {
builders.push(make_builder(field.data_type(), capacity));
Expand Down
3 changes: 2 additions & 1 deletion arrow-data/src/data/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1874,7 +1874,8 @@ mod tests {
)
.unwrap();

let data_type = DataType::Struct(vec![Field::new("x", DataType::Int32, true)]);
let field = Arc::new(Field::new("x", DataType::Int32, true));
let data_type = DataType::Struct(vec![field].into());

let arr_data = ArrayData::builder(data_type)
.len(5)
Expand Down
42 changes: 23 additions & 19 deletions arrow-schema/src/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
use std::fmt;

use crate::field::Field;
use crate::Fields;

/// The set of datatypes that are supported by this implementation of Apache Arrow.
///
Expand Down Expand Up @@ -182,7 +183,7 @@ pub enum DataType {
/// A single LargeList array can store up to [`i64::MAX`] elements in total
LargeList(Box<Field>),
/// A nested datatype that contains a number of sub-fields.
Struct(Vec<Field>),
Struct(Fields),
/// A nested datatype that can represent slots of differing types. Components:
///
/// 1. [`Field`] for each possible child type the Union can hold
Expand Down Expand Up @@ -482,7 +483,8 @@ impl DataType {
| DataType::FixedSizeList(field, _)
| DataType::LargeList(field)
| DataType::Map(field, _) => field.size(),
DataType::Struct(fields) | DataType::Union(fields, _, _) => {
DataType::Struct(fields) => fields.size(),
DataType::Union(fields, _, _) => {
fields
.iter()
.map(|field| field.size() - std::mem::size_of_val(field))
Expand Down Expand Up @@ -534,18 +536,18 @@ mod tests {
let last_name = Field::new("last_name", DataType::Utf8, false)
.with_metadata(HashMap::default());

let person = DataType::Struct(vec![
let person = DataType::Struct(Fields::from(vec![
first_name,
last_name,
Field::new(
"address",
DataType::Struct(vec![
DataType::Struct(Fields::from(vec![
Field::new("street", DataType::Utf8, false),
Field::new("zip", DataType::UInt16, false),
]),
])),
false,
),
]);
]));

let serialized = serde_json::to_string(&person).unwrap();

Expand Down Expand Up @@ -592,24 +594,26 @@ mod tests {
assert!(!list_e.equals_datatype(&list_g));
assert!(!list_f.equals_datatype(&list_g));

let list_h = DataType::Struct(vec![Field::new("f1", list_e, true)]);
let list_i = DataType::Struct(vec![Field::new("f1", list_f.clone(), true)]);
let list_j = DataType::Struct(vec![Field::new("f1", list_f.clone(), false)]);
let list_k = DataType::Struct(vec![
let list_h = DataType::Struct(Fields::from(vec![Field::new("f1", list_e, true)]));
let list_i =
DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), true)]));
let list_j =
DataType::Struct(Fields::from(vec![Field::new("f1", list_f.clone(), false)]));
let list_k = DataType::Struct(Fields::from(vec![
Field::new("f1", list_f.clone(), false),
Field::new("f2", list_g.clone(), false),
Field::new("f3", DataType::Utf8, true),
]);
let list_l = DataType::Struct(vec![
]));
let list_l = DataType::Struct(Fields::from(vec![
Field::new("ff1", list_f.clone(), false),
Field::new("ff2", list_g.clone(), false),
Field::new("ff3", DataType::LargeUtf8, true),
]);
let list_m = DataType::Struct(vec![
]));
let list_m = DataType::Struct(Fields::from(vec![
Field::new("ff1", list_f, false),
Field::new("ff2", list_g, false),
Field::new("ff3", DataType::Utf8, true),
]);
]));
assert!(list_h.equals_datatype(&list_i));
assert!(!list_h.equals_datatype(&list_j));
assert!(!list_k.equals_datatype(&list_l));
Expand All @@ -618,18 +622,18 @@ mod tests {

#[test]
fn create_struct_type() {
let _person = DataType::Struct(vec![
let _person = DataType::Struct(Fields::from(vec![
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new(
"address",
DataType::Struct(vec![
DataType::Struct(Fields::from(vec![
Field::new("street", DataType::Utf8, false),
Field::new("zip", DataType::UInt16, false),
]),
])),
false,
),
]);
]));
}

#[test]
Expand Down
49 changes: 28 additions & 21 deletions arrow-schema/src/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ use std::collections::HashMap;
use std::hash::{Hash, Hasher};

use crate::datatype::DataType;
use crate::schema::SchemaBuilder;

/// A reference-counted [`Field`].
///
/// This is an alias for `Arc<Field>`, so cloning a `FieldRef` only bumps the
/// reference count and shares the same underlying [`Field`] instead of
/// copying it.
pub type FieldRef = std::sync::Arc<Field>;

/// Describes a single column in a [`Schema`](super::Schema).
///
Expand Down Expand Up @@ -230,7 +234,8 @@ impl Field {

fn _fields(dt: &DataType) -> Vec<&Field> {
match dt {
DataType::Struct(fields) | DataType::Union(fields, _, _) => {
DataType::Struct(fields) => fields.iter().flat_map(|f| f.fields()).collect(),
DataType::Union(fields, _, _) => {
fields.iter().flat_map(|f| f.fields()).collect()
}
DataType::List(field)
Expand Down Expand Up @@ -326,15 +331,9 @@ impl Field {
match &mut self.data_type {
DataType::Struct(nested_fields) => match &from.data_type {
DataType::Struct(from_nested_fields) => {
for from_field in from_nested_fields {
match nested_fields
.iter_mut()
.find(|self_field| self_field.name == from_field.name)
{
Some(self_field) => self_field.try_merge(from_field)?,
None => nested_fields.push(from_field.clone()),
}
}
let mut builder = SchemaBuilder::new();
nested_fields.iter().chain(from_nested_fields).try_for_each(|f| builder.try_merge(f))?;
*nested_fields = builder.finish().fields;
}
_ => {
return Err(ArrowError::SchemaError(
Expand Down Expand Up @@ -479,6 +478,7 @@ impl std::fmt::Display for Field {
#[cfg(test)]
mod test {
use super::*;
use crate::Fields;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

Expand Down Expand Up @@ -525,29 +525,29 @@ mod test {

let field = Field::new(
"struct<dict1, list[struct<dict2, list[struct<dict1]>]>",
DataType::Struct(vec![
DataType::Struct(Fields::from(vec![
dict1.clone(),
Field::new(
"list[struct<dict1, list[struct<dict2>]>]",
DataType::List(Box::new(Field::new(
"struct<dict1, list[struct<dict2>]>",
DataType::Struct(vec![
DataType::Struct(Fields::from(vec![
dict1.clone(),
Field::new(
"list[struct<dict2>]",
DataType::List(Box::new(Field::new(
"struct<dict2>",
DataType::Struct(vec![dict2.clone()]),
DataType::Struct(vec![dict2.clone()].into()),
false,
))),
false,
),
]),
])),
false,
))),
false,
),
]),
])),
false,
);

Expand Down Expand Up @@ -632,14 +632,18 @@ mod test {
fn test_contains_transitivity() {
let child_field = Field::new("child1", DataType::Float16, false);

let mut field1 = Field::new("field1", DataType::Struct(vec![child_field]), false);
let mut field1 = Field::new(
"field1",
DataType::Struct(Fields::from(vec![child_field])),
false,
);
field1.set_metadata(HashMap::from([(String::from("k1"), String::from("v1"))]));

let mut field2 = Field::new("field1", DataType::Struct(vec![]), true);
let mut field2 = Field::new("field1", DataType::Struct(Fields::default()), true);
field2.set_metadata(HashMap::from([(String::from("k2"), String::from("v2"))]));
field2.try_merge(&field1).unwrap();

let mut field3 = Field::new("field1", DataType::Struct(vec![]), false);
let mut field3 = Field::new("field1", DataType::Struct(Fields::default()), false);
field3.set_metadata(HashMap::from([(String::from("k3"), String::from("v3"))]));
field3.try_merge(&field2).unwrap();

Expand All @@ -665,11 +669,14 @@ mod test {
let child_field1 = Field::new("child1", DataType::Float16, false);
let child_field2 = Field::new("child2", DataType::Float16, false);

let field1 =
Field::new("field1", DataType::Struct(vec![child_field1.clone()]), true);
let field1 = Field::new(
"field1",
DataType::Struct(vec![child_field1.clone()].into()),
true,
);
let field2 = Field::new(
"field1",
DataType::Struct(vec![child_field1, child_field2]),
DataType::Struct(vec![child_field1, child_field2].into()),
true,
);

Expand Down
Loading

0 comments on commit e48e93f

Please sign in to comment.