From 1b656f15f65f2eea91f68ddc91c8a5003a2161bd Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 23 Jan 2025 17:06:29 +0100 Subject: [PATCH 1/5] Expose metadata in SchemaEngineVisitor in ffi api --- ffi/examples/read-table/schema.h | 35 +++++++++++++++++++++--- ffi/src/scan.rs | 2 +- ffi/src/schema.rs | 47 ++++++++++++++++++++++++++++---- 3 files changed, 74 insertions(+), 10 deletions(-) diff --git a/ffi/examples/read-table/schema.h b/ffi/examples/read-table/schema.h index b4a64b4e6..b8b903970 100644 --- a/ffi/examples/read-table/schema.h +++ b/ffi/examples/read-table/schema.h @@ -85,6 +85,23 @@ void print_list(SchemaBuilder* builder, uintptr_t list_id, int indent, int paren } } +void print_metadata(const char *name, const CStringMap* metadata) +{ +#ifdef VERBOSE + char* key_str = "delta.columnMapping.physicalName"; + KernelStringSlice key = { key_str, strlen(key_str) }; + char* value = get_from_map(metadata, key, allocate_string); + if (value) { + printf("Physical name of %s is %s\n", name, value); + free(value); + } else { + printf("No physical name\n"); + } +#else + (void)metadata; +#endif +} + // declare all our visitor methods uintptr_t make_field_list(void* data, uintptr_t reserve) { @@ -109,11 +126,13 @@ void visit_struct( uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, + const CStringMap * metadata, uintptr_t child_list_id) { SchemaBuilder* builder = data; char* name_ptr = allocate_string(name); PRINT_CHILD_VISIT("struct", name_ptr, sibling_list_id, "Children", child_list_id); + print_metadata(name_ptr, metadata); SchemaItem* struct_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "struct", is_nullable); struct_item->children = child_list_id; } @@ -123,12 +142,14 @@ void visit_array( uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, + const CStringMap * metadata, uintptr_t child_list_id) { SchemaBuilder* builder = data; char* name_ptr = malloc(sizeof(char) * (name.len + 22)); snprintf(name_ptr, name.len + 1, "%s", name.ptr); snprintf(name_ptr + name.len, 22, " (is nullable: %s)", is_nullable ? "true" : "false"); + print_metadata(name_ptr, metadata); PRINT_CHILD_VISIT("array", name_ptr, sibling_list_id, "Types", child_list_id); SchemaItem* array_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "array", is_nullable); array_item->children = child_list_id; @@ -139,12 +160,14 @@ void visit_map( uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, + const CStringMap * metadata, uintptr_t child_list_id) { SchemaBuilder* builder = data; char* name_ptr = malloc(sizeof(char) * (name.len + 22)); snprintf(name_ptr, name.len + 1, "%s", name.ptr); snprintf(name_ptr + name.len, 22, " (is nullable: %s)", is_nullable ? "true" : "false"); + print_metadata(name_ptr, metadata); PRINT_CHILD_VISIT("map", name_ptr, sibling_list_id, "Types", child_list_id); SchemaItem* map_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "map", is_nullable); map_item->children = child_list_id; @@ -155,6 +178,7 @@ void visit_decimal( uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, + const CStringMap * metadata, uint8_t precision, uint8_t scale) { @@ -162,6 +186,7 @@ void visit_decimal( char* name_ptr = allocate_string(name); char* type = malloc(19 * sizeof(char)); snprintf(type, 19, "decimal(%u)(%d)", precision, scale); + print_metadata(name_ptr, metadata); PRINT_NO_CHILD_VISIT(type, name_ptr, sibling_list_id); add_to_list(&builder->lists[sibling_list_id], name_ptr, type, is_nullable); } @@ -171,18 +196,20 @@ void visit_simple_type( uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, + const CStringMap * metadata, char* type) { SchemaBuilder* builder = data; char* name_ptr = allocate_string(name); + print_metadata(name_ptr, metadata); PRINT_NO_CHILD_VISIT(type, name_ptr, sibling_list_id); add_to_list(&builder->lists[sibling_list_id], name_ptr, type, is_nullable); } -#define DEFINE_VISIT_SIMPLE_TYPE(typename) \ - void visit_##typename(void* data, uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable)\ - { \ - visit_simple_type(data, sibling_list_id, name, is_nullable, #typename); \ +#define DEFINE_VISIT_SIMPLE_TYPE(typename) \ + void visit_##typename(void* data, uintptr_t sibling_list_id, struct KernelStringSlice name, bool is_nullable, const CStringMap * metadata)\ + { \ + visit_simple_type(data, sibling_list_id, name, is_nullable, metadata, #typename); \ } DEFINE_VISIT_SIMPLE_TYPE(string) diff --git a/ffi/src/scan.rs b/ffi/src/scan.rs index 5d3b5047b..ecdda9c92 100644 --- a/ffi/src/scan.rs +++ b/ffi/src/scan.rs @@ -270,7 +270,7 @@ type CScanCallback = extern "C" fn( ); pub struct CStringMap { - values: HashMap, + pub values: HashMap, } #[no_mangle] diff --git a/ffi/src/schema.rs b/ffi/src/schema.rs index b74051133..d2e44e27d 100644 --- a/ffi/src/schema.rs +++ b/ffi/src/schema.rs @@ -1,7 +1,9 @@ use std::os::raw::c_void; +use crate::scan::CStringMap; use crate::{handle::Handle, kernel_string_slice, KernelStringSlice, SharedSnapshot}; use delta_kernel::schema::{ArrayType, DataType, MapType, PrimitiveType, StructType}; + /// The `EngineSchemaVisitor` defines a visitor system to allow engines to build their own /// representation of a schema from a particular schema within kernel. /// @@ -28,7 +30,6 @@ use delta_kernel::schema::{ArrayType, DataType, MapType, PrimitiveType, StructTy /// that element's (already-visited) children. /// 4. The [`visit_schema`] method returns the id of the list of top-level columns // WARNING: the visitor MUST NOT retain internal references to the string slices passed to visitor methods -// TODO: struct field metadata #[repr(C)] pub struct EngineSchemaVisitor { /// opaque state pointer @@ -44,6 +45,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, child_list_id: usize, ), @@ -54,6 +56,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, child_list_id: usize, ), @@ -65,6 +68,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, child_list_id: usize, ), @@ -74,6 +78,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, precision: u8, scale: u8, ), @@ -84,6 +89,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `long` belonging to the list identified by `sibling_list_id`. @@ -92,6 +98,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit an `integer` belonging to the list identified by `sibling_list_id`. @@ -100,6 +107,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `short` belonging to the list identified by `sibling_list_id`. @@ -108,6 +116,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `byte` belonging to the list identified by `sibling_list_id`. @@ -116,6 +125,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `float` belonging to the list identified by `sibling_list_id`. @@ -124,6 +134,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `double` belonging to the list identified by `sibling_list_id`. @@ -132,6 +143,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `boolean` belonging to the list identified by `sibling_list_id`. @@ -140,6 +152,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit `binary` belonging to the list identified by `sibling_list_id`. @@ -148,6 +161,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `date` belonging to the list identified by `sibling_list_id`. @@ -156,6 +170,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `timestamp` belonging to the list identified by `sibling_list_id`. @@ -164,6 +179,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), /// Visit a `timestamp` with no timezone belonging to the list identified by `sibling_list_id`. @@ -172,6 +188,7 @@ pub struct EngineSchemaVisitor { sibling_list_id: usize, name: KernelStringSlice, is_nullable: bool, + metadata: &CStringMap, ), } @@ -193,10 +210,14 @@ pub unsafe extern "C" fn visit_schema( fn visit_struct_fields(visitor: &EngineSchemaVisitor, s: &StructType) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, s.fields.len()); for field in s.fields() { + let metadata = CStringMap { + values: field.metadata_with_string_values(), + }; visit_schema_item( field.name(), field.data_type(), field.is_nullable(), + &metadata, visitor, child_list_id, ); @@ -208,12 +229,14 @@ pub unsafe extern "C" fn visit_schema( visitor: &EngineSchemaVisitor, at: &ArrayType, contains_null: bool, + metadata: &CStringMap, ) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, 1); visit_schema_item( "array_element", &at.element_type, contains_null, + metadata, visitor, child_list_id, ); @@ -224,13 +247,22 @@ pub unsafe extern "C" fn visit_schema( visitor: &EngineSchemaVisitor, mt: &MapType, value_contains_null: bool, + metadata: &CStringMap, ) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, 2); - visit_schema_item("map_key", &mt.key_type, false, visitor, child_list_id); + visit_schema_item( + "map_key", + &mt.key_type, + false, + metadata, + visitor, + child_list_id, + ); visit_schema_item( "map_value", &mt.value_type, value_contains_null, + metadata, visitor, child_list_id, ); @@ -242,6 +274,7 @@ pub unsafe extern "C" fn visit_schema( name: &str, data_type: &DataType, is_nullable: bool, + metadata: &CStringMap, visitor: &EngineSchemaVisitor, sibling_list_id: usize, ) { @@ -251,7 +284,8 @@ pub unsafe extern "C" fn visit_schema( visitor.data, sibling_list_id, kernel_string_slice!(name), - is_nullable + is_nullable, + metadata $(, $extra_args) * ) }; @@ -261,11 +295,14 @@ pub unsafe extern "C" fn visit_schema( DataType::Map(mt) => { call!( visit_map, - visit_map_types(visitor, mt, mt.value_contains_null) + visit_map_types(visitor, mt, mt.value_contains_null, metadata) ) } DataType::Array(at) => { - call!(visit_array, visit_array_item(visitor, at, at.contains_null)) + call!( + visit_array, + visit_array_item(visitor, at, at.contains_null, metadata) + ) } DataType::Primitive(PrimitiveType::Decimal(precision, scale)) => { call!(visit_decimal, *precision, *scale) From 7ff0f0168673cb93a46d08a42e4e562ec1a6fd5c Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 23 Jan 2025 19:15:29 +0100 Subject: [PATCH 2/5] Fix review comments --- ffi/examples/read-table/schema.h | 13 +++++++------ ffi/src/scan.rs | 8 +++++++- ffi/src/schema.rs | 4 +--- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/ffi/examples/read-table/schema.h b/ffi/examples/read-table/schema.h index b8b903970..8c29675a6 100644 --- a/ffi/examples/read-table/schema.h +++ b/ffi/examples/read-table/schema.h @@ -85,7 +85,7 @@ void print_list(SchemaBuilder* builder, uintptr_t list_id, int indent, int paren } } -void print_metadata(const char *name, const CStringMap* metadata) +void print_physical_name(const char *name, const CStringMap* metadata) { #ifdef VERBOSE char* key_str = "delta.columnMapping.physicalName"; @@ -98,6 +98,7 @@ void print_metadata(const char *name, const CStringMap* metadata) printf("No physical name\n"); } #else + (void)name; (void)metadata; #endif } @@ -132,7 +133,7 @@ void visit_struct( SchemaBuilder* builder = data; char* name_ptr = allocate_string(name); PRINT_CHILD_VISIT("struct", name_ptr, sibling_list_id, "Children", child_list_id); - print_metadata(name_ptr, metadata); + print_physical_name(name_ptr, metadata); SchemaItem* struct_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "struct", is_nullable); struct_item->children = child_list_id; } @@ -149,7 +150,7 @@ void visit_array( char* name_ptr = malloc(sizeof(char) * (name.len + 22)); snprintf(name_ptr, name.len + 1, "%s", name.ptr); snprintf(name_ptr + name.len, 22, " (is nullable: %s)", is_nullable ? "true" : "false"); - print_metadata(name_ptr, metadata); + print_physical_name(name_ptr, metadata); PRINT_CHILD_VISIT("array", name_ptr, sibling_list_id, "Types", child_list_id); SchemaItem* array_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "array", is_nullable); array_item->children = child_list_id; @@ -167,7 +168,7 @@ void visit_map( char* name_ptr = malloc(sizeof(char) * (name.len + 22)); snprintf(name_ptr, name.len + 1, "%s", name.ptr); snprintf(name_ptr + name.len, 22, " (is nullable: %s)", is_nullable ? "true" : "false"); - print_metadata(name_ptr, metadata); + print_physical_name(name_ptr, metadata); PRINT_CHILD_VISIT("map", name_ptr, sibling_list_id, "Types", child_list_id); SchemaItem* map_item = add_to_list(&builder->lists[sibling_list_id], name_ptr, "map", is_nullable); map_item->children = child_list_id; @@ -186,7 +187,7 @@ void visit_decimal( char* name_ptr = allocate_string(name); char* type = malloc(19 * sizeof(char)); snprintf(type, 19, "decimal(%u)(%d)", precision, scale); - print_metadata(name_ptr, metadata); + print_physical_name(name_ptr, metadata); PRINT_NO_CHILD_VISIT(type, name_ptr, sibling_list_id); add_to_list(&builder->lists[sibling_list_id], name_ptr, type, is_nullable); } @@ -201,7 +202,7 @@ void visit_simple_type( { SchemaBuilder* builder = data; char* name_ptr = allocate_string(name); - print_metadata(name_ptr, metadata); + print_physical_name(name_ptr, metadata); PRINT_NO_CHILD_VISIT(type, name_ptr, sibling_list_id); add_to_list(&builder->lists[sibling_list_id], name_ptr, type, is_nullable); } diff --git a/ffi/src/scan.rs b/ffi/src/scan.rs index ecdda9c92..dc44df493 100644 --- a/ffi/src/scan.rs +++ b/ffi/src/scan.rs @@ -270,7 +270,13 @@ type CScanCallback = extern "C" fn( ); pub struct CStringMap { - pub values: HashMap, + values: HashMap, +} + +impl CStringMap { + pub fn new(values: HashMap) -> CStringMap { + CStringMap { values } + } } #[no_mangle] diff --git a/ffi/src/schema.rs b/ffi/src/schema.rs index d2e44e27d..d33c93282 100644 --- a/ffi/src/schema.rs +++ b/ffi/src/schema.rs @@ -210,9 +210,7 @@ pub unsafe extern "C" fn visit_schema( fn visit_struct_fields(visitor: &EngineSchemaVisitor, s: &StructType) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, s.fields.len()); for field in s.fields() { - let metadata = CStringMap { - values: field.metadata_with_string_values(), - }; + let metadata = CStringMap::new(field.metadata_with_string_values()); visit_schema_item( field.name(), field.data_type(), From 95a89fabee4604d50369ff99811835b24957445a Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 24 Jan 2025 13:22:17 +0100 Subject: [PATCH 3/5] Use empty map for metadata in map/array values --- ffi/src/scan.rs | 10 ++++++++-- ffi/src/schema.rs | 17 +++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/ffi/src/scan.rs b/ffi/src/scan.rs index dc44df493..4ff799d23 100644 --- a/ffi/src/scan.rs +++ b/ffi/src/scan.rs @@ -274,8 +274,14 @@ pub struct CStringMap { } impl CStringMap { - pub fn new(values: HashMap) -> CStringMap { - CStringMap { values } + pub fn new(values: HashMap) -> Self { + Self { values } + } + + pub fn default() -> Self { + Self { + values: Default::default(), + } } } diff --git a/ffi/src/schema.rs b/ffi/src/schema.rs index d33c93282..e0ef4048f 100644 --- a/ffi/src/schema.rs +++ b/ffi/src/schema.rs @@ -227,14 +227,14 @@ pub unsafe extern "C" fn visit_schema( visitor: &EngineSchemaVisitor, at: &ArrayType, contains_null: bool, - metadata: &CStringMap, ) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, 1); + let metadata = CStringMap::default(); visit_schema_item( "array_element", &at.element_type, contains_null, - metadata, + &metadata, visitor, child_list_id, ); @@ -245,14 +245,14 @@ pub unsafe extern "C" fn visit_schema( visitor: &EngineSchemaVisitor, mt: &MapType, value_contains_null: bool, - metadata: &CStringMap, ) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, 2); + let metadata = CStringMap::default(); visit_schema_item( "map_key", &mt.key_type, false, - metadata, + &metadata, visitor, child_list_id, ); @@ -260,7 +260,7 @@ pub unsafe extern "C" fn visit_schema( "map_value", &mt.value_type, value_contains_null, - metadata, + &metadata, visitor, child_list_id, ); @@ -293,14 +293,11 @@ pub unsafe extern "C" fn visit_schema( DataType::Map(mt) => { call!( visit_map, - visit_map_types(visitor, mt, mt.value_contains_null, metadata) + visit_map_types(visitor, mt, mt.value_contains_null) ) } DataType::Array(at) => { - call!( - visit_array, - visit_array_item(visitor, at, at.contains_null, metadata) - ) + call!(visit_array, visit_array_item(visitor, at, at.contains_null)) } DataType::Primitive(PrimitiveType::Decimal(precision, scale)) => { call!(visit_decimal, *precision, *scale) From 8a28ac138f5455f1dcb450d78a7c3f99ec6e8ddc Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 28 Jan 2025 14:24:11 +0100 Subject: [PATCH 4/5] Review feedback --- ffi/src/scan.rs | 10 ++++++---- ffi/src/schema.rs | 3 +-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/ffi/src/scan.rs b/ffi/src/scan.rs index 4ff799d23..9cfaec806 100644 --- a/ffi/src/scan.rs +++ b/ffi/src/scan.rs @@ -274,10 +274,6 @@ pub struct CStringMap { } impl CStringMap { - pub fn new(values: HashMap) -> Self { - Self { values } - } - pub fn default() -> Self { Self { values: Default::default(), @@ -285,6 +281,12 @@ impl CStringMap { } } +impl From> for CStringMap { + fn from(val: HashMap) -> Self { + Self { values: val } + } +} + #[no_mangle] /// allow probing into a CStringMap. If the specified key is in the map, kernel will call /// allocate_fn with the value associated with the key and return the value returned from that diff --git a/ffi/src/schema.rs b/ffi/src/schema.rs index e0ef4048f..23da22bc3 100644 --- a/ffi/src/schema.rs +++ b/ffi/src/schema.rs @@ -210,12 +210,11 @@ pub unsafe extern "C" fn visit_schema( fn visit_struct_fields(visitor: &EngineSchemaVisitor, s: &StructType) -> usize { let child_list_id = (visitor.make_field_list)(visitor.data, s.fields.len()); for field in s.fields() { - let metadata = CStringMap::new(field.metadata_with_string_values()); visit_schema_item( field.name(), field.data_type(), field.is_nullable(), - &metadata, + &field.metadata_with_string_values().into(), visitor, child_list_id, ); From 69b5da26e691bcd4c61e101c8025ccb9569650ef Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 29 Jan 2025 11:59:33 +0100 Subject: [PATCH 5/5] Review feedback --- ffi/src/scan.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/ffi/src/scan.rs b/ffi/src/scan.rs index 9cfaec806..f2fee7643 100644 --- a/ffi/src/scan.rs +++ b/ffi/src/scan.rs @@ -269,18 +269,11 @@ type CScanCallback = extern "C" fn( partition_map: &CStringMap, ); +#[derive(Default)] pub struct CStringMap { values: HashMap, } -impl CStringMap { - pub fn default() -> Self { - Self { - values: Default::default(), - } - } -} - impl From> for CStringMap { fn from(val: HashMap) -> Self { Self { values: val }