From 43be262125a5dec7fc241afe4dd8c607888134e9 Mon Sep 17 00:00:00 2001 From: tycho garen Date: Fri, 26 Jul 2024 10:46:17 -0400 Subject: [PATCH] feat: table support for excel --- crates/datasources/src/excel/mod.rs | 53 ++++++++++++++++--- crates/protogen/proto/metastore/options.proto | 1 + .../protogen/src/metastore/types/options.rs | 3 ++ .../sqlbuiltins/src/functions/table/excel.rs | 11 ++-- .../src/functions/table/object_store.rs | 1 + crates/sqlexec/src/dispatch/external.rs | 2 + crates/sqlexec/src/planner/session_planner.rs | 6 +++ 7 files changed, 67 insertions(+), 10 deletions(-) diff --git a/crates/datasources/src/excel/mod.rs b/crates/datasources/src/excel/mod.rs index 7e8e3390e..7d5026ebe 100644 --- a/crates/datasources/src/excel/mod.rs +++ b/crates/datasources/src/excel/mod.rs @@ -34,20 +34,36 @@ impl ExcelTable { store_access: Arc, source_url: DatasourceUrl, sheet_name: Option, + table_name: Option, has_header: bool, ) -> Result { match source_url { DatasourceUrl::File(path) => { let path = ioutil::resolve_path(&path)?; - let mut sheet = calamine::open_workbook_auto(path)?; + let mut sheets = calamine::open_workbook_auto(path)?; let first_sheet = sheet_name.unwrap_or_else(|| { - let sheets = sheet.sheet_names(); - sheets.first().expect("file has a sheet").to_owned() + let names = sheets.sheet_names(); + names.first().expect("file has a sheet").to_owned() }); Ok(ExcelTable { - cell_range: sheet.worksheet_range(&first_sheet)?, + cell_range: match &table_name { + Some(name) => { + match sheets { + calamine::Sheets::Xlsx(mut sheet) => { + sheet.load_tables()?; + + sheet.table_by_name(name)?.data().to_owned() + } + _ => return Err(ExcelError::Load( + "specifying excel table is only compatible with xslx workbooks" + .to_string(), + )), + } + } + None => sheets.worksheet_range(&first_sheet)?, + }, has_header, }) } @@ -72,7 +88,8 @@ impl ExcelTable { let store = accessor.into_object_store(); - excel_table_from_object(store.as_ref(), meta, sheet_name, has_header).await + excel_table_from_object(store.as_ref(), meta, sheet_name, table_name, has_header) + .await } } } @@ -82,20 +99,41 @@ pub async fn excel_table_from_object( store: &dyn ObjectStore, meta: ObjectMeta, sheet_name: Option, + table_name: Option, has_header: bool, ) -> Result { let bs = store.get(&meta.location).await?.bytes().await?; let buffer = Cursor::new(bs); - let mut sheets: Sheets<_> = calamine::open_workbook_auto_from_rs(buffer)?; + let mut sheets: Sheets<_> = match table_name { + Some(_) => Sheets::Xlsx(calamine::open_workbook_from_rs::, _>( + buffer, + )?), + None => calamine::open_workbook_auto_from_rs(buffer)?, + }; let first_sheet = sheet_name.unwrap_or_else(|| { let sheets = sheets.sheet_names(); sheets.first().expect("file has a sheet").to_owned() }); + Ok(ExcelTable { - cell_range: sheets.worksheet_range(&first_sheet)?, + cell_range: match &table_name { + Some(name) => match sheets { + calamine::Sheets::Xlsx(mut sheet) => { + sheet.load_tables()?; + + sheet.table_by_name(name)?.data().to_owned() + } + _ => { + return Err(ExcelError::Load( + "specifying excel table is only compatible with xslx workbooks".to_string(), + )) + } + }, + None => sheets.worksheet_range(&first_sheet)?, + }, has_header, }) } @@ -108,6 +146,7 @@ fn xlsx_sheet_value_to_record_batch( infer_schema_length: usize, ) -> Result { let schema = infer_schema(&r, has_header, infer_schema_length)?; + let arrays = schema .fields() .iter() diff --git a/crates/protogen/proto/metastore/options.proto b/crates/protogen/proto/metastore/options.proto index 37e984ad6..f43ed2992 100644 --- a/crates/protogen/proto/metastore/options.proto +++ b/crates/protogen/proto/metastore/options.proto @@ -223,6 +223,7 @@ message TableOptionsExcel { optional string compression = 4; optional string sheet_name = 5; optional bool has_header = 6; + optional string table_name = 7; } message TableOptionsSnowflake { diff --git a/crates/protogen/src/metastore/types/options.rs b/crates/protogen/src/metastore/types/options.rs index 9e397a2c4..38a1bb8a2 100644 --- a/crates/protogen/src/metastore/types/options.rs +++ b/crates/protogen/src/metastore/types/options.rs @@ -1385,6 +1385,7 @@ pub struct TableOptionsExcel { pub file_type: Option, pub compression: Option, pub sheet_name: Option, + pub table_name: Option, pub has_header: Option, } @@ -1408,6 +1409,7 @@ impl TryFrom for TableOptionsExcel { compression: value.compression, sheet_name: value.sheet_name, has_header: value.has_header, + table_name: value.table_name, }) } } @@ -1421,6 +1423,7 @@ impl From for options::TableOptionsExcel { compression: value.compression, sheet_name: value.sheet_name, has_header: value.has_header, + table_name: value.table_name, } } } diff --git a/crates/sqlbuiltins/src/functions/table/excel.rs b/crates/sqlbuiltins/src/functions/table/excel.rs index 09a2e4367..42ed202c5 100644 --- a/crates/sqlbuiltins/src/functions/table/excel.rs +++ b/crates/sqlbuiltins/src/functions/table/excel.rs @@ -23,9 +23,9 @@ pub struct ExcelScan; impl ConstBuiltinFunction for ExcelScan { const NAME: &'static str = "read_excel"; - const DESCRIPTION: &'static str = "Reads an Excel file from the local filesystem"; + const DESCRIPTION: &'static str = "Reads an Excel file."; const EXAMPLE: &'static str = - "SELECT * FROM read_excel('file:///path/to/file.xlsx', sheet_name => 'Sheet1')"; + "SELECT * FROM read_excel('file:///path/to/file.xlsx', sheet_name => 'Sheet1', table_name => 'table')"; const FUNCTION_TYPE: FunctionType = FunctionType::TableReturning; const ALIASES: &'static [&'static str] = &["read_xlsx"]; @@ -99,7 +99,12 @@ impl TableFunc for ExcelScan { .transpose()? .unwrap_or(100); - let table = ExcelTable::open(store_access, source_url, sheet_name, has_header) + let table_name: Option = opts + .remove("table_name") + .map(FuncParamValue::try_into) + .transpose()?; + + let table = ExcelTable::open(store_access, source_url, sheet_name, table_name, has_header) .await .map_err(|e| DataFusionError::External(Box::new(e)))?; diff --git a/crates/sqlbuiltins/src/functions/table/object_store.rs b/crates/sqlbuiltins/src/functions/table/object_store.rs index a61e5c53c..2a5bb26a8 100644 --- a/crates/sqlbuiltins/src/functions/table/object_store.rs +++ b/crates/sqlbuiltins/src/functions/table/object_store.rs @@ -728,6 +728,7 @@ impl TableFunc for CloudUpload { storage.store.inner.as_ref(), meta, /* sheet_name = */ None, + /* table_name = */ None, /* has_header = */ true, ) .await?; diff --git a/crates/sqlexec/src/dispatch/external.rs b/crates/sqlexec/src/dispatch/external.rs index e46faa92a..cbcd3d7c2 100644 --- a/crates/sqlexec/src/dispatch/external.rs +++ b/crates/sqlexec/src/dispatch/external.rs @@ -445,6 +445,7 @@ impl<'a> ExternalDispatcher<'a> { storage_options, has_header, sheet_name, + table_name, .. }) => { let source_url = DatasourceUrl::try_new(location)?; @@ -454,6 +455,7 @@ impl<'a> ExternalDispatcher<'a> { store_access, source_url, sheet_name.to_owned(), + table_name.to_owned(), has_header.unwrap_or(true), ) .await?; diff --git a/crates/sqlexec/src/planner/session_planner.rs b/crates/sqlexec/src/planner/session_planner.rs index 42bee5f10..270cbccd1 100644 --- a/crates/sqlexec/src/planner/session_planner.rs +++ b/crates/sqlexec/src/planner/session_planner.rs @@ -966,6 +966,11 @@ impl<'a> SessionPlanner<'a> { .get("sheet_name") .map(|val| val.to_owned()); + let table_name = storage_options + .inner + .get("sheet_name") + .map(|val| val.to_owned()); + let has_header = storage_options .inner .get("has_header") @@ -985,6 +990,7 @@ impl<'a> SessionPlanner<'a> { file_type: None, compression: None, sheet_name, + table_name, has_header, } .into()